In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [2]:
features_df = pd.read_csv('combined_user_features.csv')
print("Số dòng dữ liệu:", features_df.shape[0])

Số dòng dữ liệu: 565841


In [3]:
features_df.head(5)

Unnamed: 0,profile_id,in_degree,out_degree,betweenness,closeness,eigenvector,gnn_emb_0,gnn_emb_1,gnn_emb_2,gnn_emb_3,...,gnn_emb_6,gnn_emb_7,gnn_emb_8,gnn_emb_9,gnn_emb_10,gnn_emb_11,gnn_emb_12,gnn_emb_13,gnn_emb_14,gnn_emb_15
0,1,0,0,0.0,0.0,0.0,-0.249802,-0.649477,-0.131534,0.479486,...,0.43203,-0.735447,-0.482806,0.592678,-0.285395,-0.57582,0.427692,1.062022,-0.851154,-0.041554
1,4121646,0,0,0.0,0.0,0.0,-0.731473,-0.433804,-1.185664,0.464575,...,-0.172377,0.093307,0.645131,0.896207,0.532153,-1.274242,0.340453,1.114578,-0.929969,-0.690296
2,836686,0,0,0.0,0.0,0.0,-0.455413,1.207451,-0.555592,-1.088797,...,0.30863,-1.103349,0.127243,1.545743,0.979178,0.983625,-0.643899,-0.172187,0.027617,0.127494
3,3670821,0,0,0.0,0.0,0.0,-0.678496,0.332202,-0.463523,-0.758592,...,1.536808,-1.390308,1.246898,-0.62658,1.668759,-0.586472,-0.526108,0.027588,-0.50869,-1.233127
4,11025609,0,0,0.0,0.0,0.0,1.200557,0.711367,0.490672,1.843835,...,-0.594223,-2.256658,-0.906822,1.060348,0.957034,-0.033136,-0.958256,-1.564844,-0.964178,0.363324


In [4]:
features_df['follower_following_ratio'] = features_df.apply(
    lambda row: row['in_degree'] / row['out_degree'] if row['out_degree'] > 0 else row['in_degree'], axis=1)

In [5]:
features_df.head(5)

Unnamed: 0,profile_id,in_degree,out_degree,betweenness,closeness,eigenvector,gnn_emb_0,gnn_emb_1,gnn_emb_2,gnn_emb_3,...,gnn_emb_7,gnn_emb_8,gnn_emb_9,gnn_emb_10,gnn_emb_11,gnn_emb_12,gnn_emb_13,gnn_emb_14,gnn_emb_15,follower_following_ratio
0,1,0,0,0.0,0.0,0.0,-0.249802,-0.649477,-0.131534,0.479486,...,-0.735447,-0.482806,0.592678,-0.285395,-0.57582,0.427692,1.062022,-0.851154,-0.041554,0.0
1,4121646,0,0,0.0,0.0,0.0,-0.731473,-0.433804,-1.185664,0.464575,...,0.093307,0.645131,0.896207,0.532153,-1.274242,0.340453,1.114578,-0.929969,-0.690296,0.0
2,836686,0,0,0.0,0.0,0.0,-0.455413,1.207451,-0.555592,-1.088797,...,-1.103349,0.127243,1.545743,0.979178,0.983625,-0.643899,-0.172187,0.027617,0.127494,0.0
3,3670821,0,0,0.0,0.0,0.0,-0.678496,0.332202,-0.463523,-0.758592,...,-1.390308,1.246898,-0.62658,1.668759,-0.586472,-0.526108,0.027588,-0.50869,-1.233127,0.0
4,11025609,0,0,0.0,0.0,0.0,1.200557,0.711367,0.490672,1.843835,...,-2.256658,-0.906822,1.060348,0.957034,-0.033136,-0.958256,-1.564844,-0.964178,0.363324,0.0


In [None]:
# Chọn các chỉ số mạng và một phần vector nhúng (có thể dùng toàn bộ hoặc trích chọn các chiều quan trọng)
gnn_emb_cols = [col for col in features_df.columns if col.startswith('gnn_emb_')]
selected_cols = ['in_degree', 'out_degree', 'follower_following_ratio', 'betweenness', 'closeness', 'eigenvector'] + gnn_emb_cols

X = features_df[selected_cols].values

In [7]:
#Chuẩn hóa
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [8]:
# 5. Xác định số lượng cụm (k) bằng Silhouette Score (tùy chọn)
# ---------------------------
silhouette_scores = {}
for k in range(2, 10):
    kmeans_temp = KMeans(n_clusters=k, random_state=42)
    labels_temp = kmeans_temp.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels_temp)
    silhouette_scores[k] = score
    print(f"Number of clusters: {k}, Silhouette Score: {score:.4f}")


KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(6,4))
plt.plot(list(silhouette_scores.keys()), list(silhouette_scores.values()), marker='o')
plt.xlabel("Number of clusters")
plt.ylabel("Silhouette Score")
plt.title("Silhouette Score for different cluster counts")
plt.show()

In [None]:
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
features_df['cluster'] = kmeans.fit_predict(X_scaled)

In [None]:
# 6. Kiểm tra trung tâm các cụm để xem đặc trưng nào nổi bật
# ---------------------------
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
centers_df = pd.DataFrame(cluster_centers, columns=selected_cols)
print("Trung tâm các cụm:")
print(centers_df)

In [None]:
role_map = {
    0: 'Influencers',       # Ví dụ: cụm có tỷ lệ follower/following cao, betweenness cao
    1: 'Active Followers',  # Ví dụ: cụm có out_degree lớn nhưng in_degree thấp
    2: 'Community Leaders', # Ví dụ: cụm có closeness hoặc degree cao
    3: 'Lurkers',           # Ví dụ: cụm có cả in_degree và out_degree nhỏ
    4: 'Bridge Builders'    # Ví dụ: cụm có betweenness cao, đóng vai trò cầu nối
}

features_df['role'] = features_df['cluster'].map(role_map)

In [None]:
features_df.to_csv('user_roles_clustered.csv', index=False)
print("Đã lưu kết quả phân cụm và gán nhãn vai trò vào file 'user_roles_clustered.csv'")
print("Phân bố các vai trò:")
print(features_df['role'].value_counts())