In [11]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
import pandas as pd
import joblib
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter, defaultdict
import igraph as ig
import leidenalg
import random

In [13]:
# Set Pandas to display all columns
pd.set_option("display.max_columns", None)

# 1. Loading data
df = pd.read_csv("SocialMediaDataset.csv")

df_train = df.copy()

# Data preprocessing: Convert the string of interest columns to a list
_df = df_train.copy()
_df['Interests'] = _df['Interests'].apply(lambda x: [i.strip().strip("'") for i in x.split(',')])

In [14]:
# 2. 
B = nx.Graph()

# Add user nodes and interest nodes, and establish user-interest connections
for _, row in _df.iterrows():
    user_id = row['UserID']
    B.add_node(user_id, bipartite=0)  # User Node
    for interest in row['Interests']:
        B.add_node(interest, bipartite=1)  # Interest Nodes
        B.add_edge(user_id, interest)

# Extract the list of user nodes
user_nodes = [n for n, d in B.nodes(data=True) if d['bipartite'] == 0]

In [15]:
# 3. User-user weighted projection graph
user_graph = nx.bipartite.weighted_projected_graph(B, user_nodes)

In [16]:
# 4. The Leiden algorithm is used for community detection
edges = [(u, v, d['weight']) for u, v, d in user_graph.edges(data=True)]
G_ig = ig.Graph.TupleList(edges, weights=True, directed=False)

# Leiden clustering is performed using modularity optimization
partition = leidenalg.find_partition(G_ig, leidenalg.ModularityVertexPartition)

# Create the mapping from the user ID to the clustering ID
leiden_partition = {}
for cluster_id, cluster_nodes in enumerate(partition):
    for node_index in cluster_nodes:
        user_id = G_ig.vs[node_index]['name']
        leiden_partition[user_id] = cluster_id

In [17]:
# 5. Add the clustering labels to the data frame
_df['Cluster'] = _df['UserID'].map(leiden_partition)

In [18]:
# 6. Generate labels (most common interests) for each cluster
top_n = 2
cluster_interests = defaultdict(list)
for _, row in _df.iterrows():
    cluster = row['Cluster']
    interests = row['Interests']
    if cluster is not None:
        cluster_interests[cluster].extend(interests)

cluster_tags = {cluster: " & ".join([i[0] for i in Counter(interests).most_common(top_n)]) for cluster, interests in cluster_interests.items()}

# Add the clustering labels to the data frame
_df['Cluster_tag'] = _df['Cluster'].map(cluster_tags)

# Sample
print(_df.head())

   UserID             Name Gender         DOB  \
0   75722      Kevin Tabor   Male    1962/9/2   
1   80185  Bethany Buckley   Male   1983/7/28   
2   19865     Marsha Ayala   Male  1958/12/18   
3   76700     Sheryl Hower   Male   1964/4/12   
4   92992    Robin Garrett   Male  1975/11/26   

                                       Interests       City  \
0                 [Nature, Parenting and family]  Liverpool   
1                [Business and entrepreneurship]  Stockport   
2  [Fashion, DIY and crafts, Outdoor activities]     Bolton   
3                [Business and entrepreneurship]  Stockport   
4                                       [Travel]  Liverpool   

                  Occupation  Budget  Age Age Range  Cluster  \
0            Project Manager    6376   63     56-65        6   
1               Data Analyst    4205   42     36-45        2   
2                    Cleaner    2725   67       66+        1   
3  Quality Assurance Analyst    4556   61     56-65        2   
4     

In [28]:
# 7. Secondary clustering

# Define a function for automatically finding the optimal number of clusters
def find_optimal_clusters(data, max_k=10):
    from sklearn.metrics import silhouette_score
    silhouette_scores = []
    for k in range(2, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(data)
        score = silhouette_score(data, labels)
        silhouette_scores.append((k, score))
    # Return the number of clusters with the highest silhouette score.
    optimal_k = max(silhouette_scores, key=lambda x: x[1])[0]
    return optimal_k

for cluster_id in _df['Cluster'].unique():
    sub_df = _df[_df['Cluster'] == cluster_id].copy()

    # Perform secondary clustering directly without considering the age range
    if len(sub_df) > 0 and all(col in sub_df.columns for col in ['Age', 'Budget']):
        scaler = StandardScaler()
        sub_features = scaler.fit_transform(sub_df[['Age', 'Budget']])
        optimal_k = find_optimal_clusters(sub_features)
        kmeans = KMeans(n_clusters=optimal_k, random_state=42)
        sub_df['SubCluster'] = kmeans.fit_predict(sub_features)
        sub_df['SubCluster'] = sub_df['SubCluster'].apply(lambda x: f"{cluster_id}.{x+1}")
        _df.loc[sub_df.index, 'SubCluster'] = sub_df['SubCluster']

        # Obtain the mean and standard deviation for anti-standardization
        means = scaler.mean_
        scales = scaler.scale_

        # Generate secondary label explanations and anti-standardized centroids
        centers = kmeans.cluster_centers_
        for i, center in enumerate(centers):
            real_center = center * scales + means
            avg_age, avg_budget = real_center[0], real_center[1]
            print(f"Cluster {cluster_id}.{i+1} - Avg Age: {avg_age:.1f}, Avg Budget: {avg_budget:.1f}")
            _df.loc[sub_df[sub_df['SubCluster'] == f"{cluster_id}.{i+1}"].index, 'SubCluster_Description'] = f"Avg Age: {avg_age:.1f}, Avg Budget: {avg_budget:.1f}"

    else:
        # If it is empty after filtering, retain the original cluster and mark it as a special subcluster：ClusterID.0
        sub_df['SubCluster'] = f"{cluster_id}.0"
        _df.loc[sub_df.index, 'SubCluster'] = sub_df['SubCluster']
        _df.loc[sub_df.index, 'SubCluster_Description'] = "No valid subcluster"



Cluster 6.1 - Avg Age: 60.0, Avg Budget: 3992.8
Cluster 6.2 - Avg Age: 31.9, Avg Budget: 3789.6
Cluster 6.3 - Avg Age: 42.3, Avg Budget: 6376.3




Cluster 2.1 - Avg Age: 43.9, Avg Budget: 6743.3
Cluster 2.2 - Avg Age: 32.7, Avg Budget: 3932.0
Cluster 2.3 - Avg Age: 59.6, Avg Budget: 4175.7




Cluster 1.1 - Avg Age: 32.7, Avg Budget: 3565.0
Cluster 1.2 - Avg Age: 59.2, Avg Budget: 5916.1
Cluster 1.3 - Avg Age: 46.4, Avg Budget: 9588.0
Cluster 1.4 - Avg Age: 33.3, Avg Budget: 5915.2
Cluster 1.5 - Avg Age: 58.0, Avg Budget: 3560.1
Cluster 7.1 - Avg Age: 58.8, Avg Budget: 3605.3
Cluster 7.2 - Avg Age: 32.8, Avg Budget: 5720.0
Cluster 7.3 - Avg Age: 33.3, Avg Budget: 3372.9
Cluster 7.4 - Avg Age: 60.2, Avg Budget: 6508.0




Cluster 0.1 - Avg Age: 60.0, Avg Budget: 4020.0
Cluster 0.2 - Avg Age: 45.9, Avg Budget: 6780.6
Cluster 0.3 - Avg Age: 32.9, Avg Budget: 4042.3
Cluster 4.1 - Avg Age: 31.0, Avg Budget: 4577.4
Cluster 4.2 - Avg Age: 53.1, Avg Budget: 6977.6
Cluster 4.3 - Avg Age: 56.9, Avg Budget: 3703.1




Cluster 5.1 - Avg Age: 32.9, Avg Budget: 4371.4
Cluster 5.2 - Avg Age: 57.6, Avg Budget: 3697.9
Cluster 5.3 - Avg Age: 55.0, Avg Budget: 6758.3
Cluster 3.1 - Avg Age: 47.4, Avg Budget: 6718.3
Cluster 3.2 - Avg Age: 31.9, Avg Budget: 4085.9
Cluster 3.3 - Avg Age: 58.9, Avg Budget: 3927.2
   UserID  Cluster SubCluster             SubCluster_Description
0   75722        6        6.3  Avg Age: 42.3, Avg Budget: 6376.3
1   80185        2        2.2  Avg Age: 32.7, Avg Budget: 3932.0
2   19865        1        1.5  Avg Age: 58.0, Avg Budget: 3560.1
3   76700        2        2.3  Avg Age: 59.6, Avg Budget: 4175.7
4   92992        7        7.1  Avg Age: 58.8, Avg Budget: 3605.3




In [29]:
output_path = "clustered_social_media_data.csv"

social_df = pd.read_csv("SocialMediaDataset.csv")

social_df = pd.merge(social_df, _df[['UserID', 'Cluster', 'Cluster_tag', 'SubCluster', 'SubCluster_Description']], on='UserID', how='left')

social_df.to_csv(output_path, index=False)
print(f"road: {output_path}")

road: clustered_social_media_data.csv
