In [327]:
import copy
import pandas as pd
import datetime as dt
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px
import numpy as np

In [328]:
df = pd.read_csv('../data/Live_20210128.csv')

In [329]:
df.head(
)

Unnamed: 0,status_id,status_type,status_published,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,num_sads,num_angrys,Column1,Column2,Column3,Column4
0,1,video,4/22/2018 6:00,529,512,262,432,92,3,1,1,0,,,,
1,2,photo,4/21/2018 22:45,150,0,0,150,0,0,0,0,0,,,,
2,3,video,4/21/2018 6:17,227,236,57,204,21,1,1,0,0,,,,
3,4,photo,4/21/2018 2:29,111,0,0,111,0,0,0,0,0,,,,
4,5,photo,4/18/2018 3:22,213,0,0,204,9,0,0,0,0,,,,


In [330]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7050 entries, 0 to 7049
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   status_id         7050 non-null   int64  
 1   status_type       7050 non-null   object 
 2   status_published  7050 non-null   object 
 3   num_reactions     7050 non-null   int64  
 4   num_comments      7050 non-null   int64  
 5   num_shares        7050 non-null   int64  
 6   num_likes         7050 non-null   int64  
 7   num_loves         7050 non-null   int64  
 8   num_wows          7050 non-null   int64  
 9   num_hahas         7050 non-null   int64  
 10  num_sads          7050 non-null   int64  
 11  num_angrys        7050 non-null   int64  
 12  Column1           0 non-null      float64
 13  Column2           0 non-null      float64
 14  Column3           0 non-null      float64
 15  Column4           0 non-null      float64
dtypes: float64(4), int64(10), object(2)
memory

In [331]:
df.drop(columns=[f'Column{i}' for i in range(1,5)], inplace=True)

In [332]:
df.status_published = pd.to_datetime(df.status_published)

In [333]:
time_features = ['year', 'month', 'day', 'dayofweek', 'hour',]
for feature in time_features:
    df[feature] = getattr(df.status_published.dt, feature)

In [334]:
df.drop(columns='status_published', inplace=True)

In [335]:
df.isna().sum() / df.shape[0]

status_id        0.0
status_type      0.0
num_reactions    0.0
num_comments     0.0
num_shares       0.0
num_likes        0.0
num_loves        0.0
num_wows         0.0
num_hahas        0.0
num_sads         0.0
num_angrys       0.0
year             0.0
month            0.0
day              0.0
dayofweek        0.0
hour             0.0
dtype: float64

In [336]:
df.select_dtypes(include='object').columns

Index(['status_type'], dtype='object')

In [337]:
num_pl = Pipeline(steps=[
    ('scaler', RobustScaler())
])
cat_pl = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pl, df.select_dtypes(include='number').columns),
    ('cat', cat_pl, df.select_dtypes(include='object').columns)
], remainder='passthrough')

base_trainer = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', KMeans(n_clusters=3))
])


In [338]:
base_trainer.fit(df)

In [339]:
base_trainer.predict(df)

array([1, 0, 0, ..., 0, 0, 0], dtype=int32)

In [340]:
df

Unnamed: 0,status_id,status_type,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,num_sads,num_angrys,year,month,day,dayofweek,hour
0,1,video,529,512,262,432,92,3,1,1,0,2018,4,22,6,6
1,2,photo,150,0,0,150,0,0,0,0,0,2018,4,21,5,22
2,3,video,227,236,57,204,21,1,1,0,0,2018,4,21,5,6
3,4,photo,111,0,0,111,0,0,0,0,0,2018,4,21,5,2
4,5,photo,213,0,0,204,9,0,0,0,0,2018,4,18,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7045,7046,photo,89,0,0,89,0,0,0,0,0,2016,9,24,5,2
7046,7047,photo,16,0,0,14,1,0,1,0,0,2016,9,23,4,11
7047,7048,photo,2,0,0,1,1,0,0,0,0,2016,9,21,2,23
7048,7049,photo,351,12,22,349,2,0,0,0,0,2016,9,20,1,0


In [341]:
prep_df = pd.DataFrame(
    data=preprocessor.fit_transform(df),
    columns=preprocessor.get_feature_names_out())
prep_df

Unnamed: 0,num__status_id,num__num_reactions,num__num_comments,num__num_shares,num__num_likes,num__num_loves,num__num_wows,num__num_hahas,num__num_sads,num__num_angrys,num__year,num__month,num__day,num__dayofweek,num__hour,cat__status_type_link,cat__status_type_photo,cat__status_type_status,cat__status_type_video
0,-1.000000,2.324257,22.086957,65.50,2.229508,30.666667,3.0,1.0,1.0,0.0,0.5,-0.285714,0.400000,0.75,-0.142857,0.0,0.0,0.0,1.0
1,-0.999716,0.448020,-0.173913,0.00,0.548435,0.000000,0.0,0.0,0.0,0.0,0.5,-0.285714,0.333333,0.50,2.142857,0.0,1.0,0.0,0.0
2,-0.999433,0.829208,10.086957,14.25,0.870343,7.000000,1.0,1.0,0.0,0.0,0.5,-0.285714,0.333333,0.50,-0.142857,0.0,0.0,0.0,1.0
3,-0.999149,0.254950,-0.173913,0.00,0.315946,0.000000,0.0,0.0,0.0,0.0,0.5,-0.285714,0.333333,0.50,-0.714286,0.0,1.0,0.0,0.0
4,-0.998865,0.759901,-0.173913,0.00,0.870343,3.000000,0.0,0.0,0.0,0.0,0.5,-0.285714,0.133333,-0.25,-0.571429,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7045,0.998865,0.146040,-0.173913,0.00,0.184799,0.000000,0.0,0.0,0.0,0.0,-0.5,0.428571,0.533333,0.50,-0.714286,0.0,1.0,0.0,0.0
7046,0.999149,-0.215347,-0.173913,0.00,-0.262295,0.333333,0.0,1.0,0.0,0.0,-0.5,0.428571,0.466667,0.25,0.571429,0.0,1.0,0.0,0.0
7047,0.999433,-0.284653,-0.173913,0.00,-0.339791,0.333333,0.0,0.0,0.0,0.0,-0.5,0.428571,0.333333,-0.25,2.285714,0.0,1.0,0.0,0.0
7048,0.999716,1.443069,0.347826,5.50,1.734724,0.666667,0.0,0.0,0.0,0.0,-0.5,0.428571,0.266667,-0.50,-1.000000,0.0,1.0,0.0,0.0


In [342]:
k_values = range(2, 11)
silhouette_scores = []
for k in k_values:
    base_trainer.set_params(model=KMeans(n_clusters=k, random_state=42))
    labels = base_trainer.fit_predict(df)
    score = silhouette_score(prep_df, labels)
    silhouette_scores.append(score)


In [343]:
px.line(x=list(k_values), y=silhouette_scores, title='Silhouette Score vs. K')

In [344]:
kmeans_trainer = copy.deepcopy(base_trainer)
kmeans_trainer.named_steps['model'].init ='random'
kmeanspp_trainer = copy.deepcopy(base_trainer)
kmeanspp_trainer.named_steps['model'].init ='k-means++'

In [345]:
kmeans_trainer.fit(df)

In [346]:
kmeans_trainer.named_steps['model'].init

'random'

In [347]:
kmeanspp_trainer.fit(df)

In [348]:
kmeanspp_trainer.named_steps['model'].init

'k-means++'

In [349]:
kmeans_labels = kmeans_trainer.predict(df)
kmeanspp_labels = kmeanspp_trainer.predict(df)

In [350]:
kmeans_silhouette = silhouette_score(prep_df, kmeans_labels)
kmeanspp_silhouette = silhouette_score(prep_df, kmeanspp_labels)

In [351]:
kmeans_silhouette, kmeanspp_silhouette 

(0.10442163718964508, 0.8043704724165319)

In [352]:
kmeans_trainer.named_steps['model'].inertia_, kmeanspp_trainer.named_steps['model'].inertia_

(2995823.6136863595, 2147354.5410483656)

In [360]:
res_df = pd.DataFrame(
    data={
        'kmeans': {
            'silhouette_score': kmeans_silhouette,
            'inertia': kmeans_trainer.named_steps['model'].inertia_
        },
        'kmeans++': {
            'silhouette_score': kmeanspp_silhouette,
            'inertia': kmeanspp_trainer.named_steps['model'].inertia_
        }
    }
).round(2)
res_df


Unnamed: 0,kmeans,kmeans++
silhouette_score,0.1,0.8
inertia,2995823.61,2147354.54


The comparison of Kmeans and Kmeans++ clustering algorithms based on silhouette_score and inertia reveals significant differences in performance. The silhouette_score, which measures how similar an object is to its own cluster compared to other clusters, is substantially higher for Kmeans++ (0.8) than for Kmeans (0.1). This indicates that Kmeans++ provides better-defined and more distinct clusters. In terms of inertia, which represents the sum of squared distances of samples to their nearest cluster center, Kmeans++ again outperforms Kmeans with a lower value (2.14735e+06 vs. 2.99582e+06), suggesting more compact clusters. Overall, Kmeans++ demonstrates superior clustering effectiveness, delivering clearer and more cohesive cluster formations compared to Kmeans.

In [355]:
n_iterations = 50
inertia_list = []
silhouette_list = []


for i in range(n_iterations):
    kmeans = kmeans_trainer
    kmeans.fit_predict(df)
    inertia_list.append(kmeans.named_steps['model'].inertia_)
    silhouette_list.append(silhouette_score(prep_df, kmeans.named_steps['model'].labels_))


avg_inertia = np.mean(inertia_list)
std_inertia = np.std(inertia_list)
avg_silhouette = np.mean(silhouette_list)
std_silhouette = np.std(silhouette_list)


performance_df = pd.DataFrame({
    'Inertia': inertia_list,
    'Silhouette Score': silhouette_list
})


avg_performance = {
    'Average Inertia': avg_inertia,
    'Inertia Std Dev': std_inertia,
    'Average Silhouette Score': avg_silhouette,
    'Silhouette Score Std Dev': std_silhouette
}

avg_performance

{'Average Inertia': 2995823.6136863595,
 'Inertia Std Dev': 1.3170890159654385e-10,
 'Average Silhouette Score': 0.10442163718964509,
 'Silhouette Score Std Dev': 1.3877787807814457e-17}

In [356]:
n_iterations = 50
inertia_list = []
silhouette_list = []


for i in range(n_iterations):
    kmeanspp = kmeanspp_trainer
    kmeanspp.fit_predict(df)
    inertia_list.append(kmeanspp.named_steps['model'].inertia_)
    silhouette_list.append(silhouette_score(prep_df, kmeanspp.named_steps['model'].labels_))


avg_inertia = np.mean(inertia_list)
std_inertia = np.std(inertia_list)
avg_silhouette = np.mean(silhouette_list)
std_silhouette = np.std(silhouette_list)


performance_df = pd.DataFrame({
    'Inertia': inertia_list,
    'Silhouette Score': silhouette_list
})


avg_performance = {
    'Average Inertia': avg_inertia,
    'Inertia Std Dev': std_inertia,
    'Average Silhouette Score': avg_silhouette,
    'Silhouette Score Std Dev': std_silhouette
}

avg_performance

{'Average Inertia': 2147354.5410483656,
 'Inertia Std Dev': 6.585445079827192e-11,
 'Average Silhouette Score': 0.8043704724165319,
 'Silhouette Score Std Dev': 0.0}

The comparison between Kmeans and Kmeans++ clustering reveals significant differences in their performance metrics. Kmeans has an average inertia of 2995823.61 with a negligible standard deviation of approximately 0.00, while Kmeans++ shows a much lower average inertia of 2147354.54, also with an almost zero standard deviation. This indicates that Kmeans++ achieves a tighter clustering with data points closer to their centroids. In terms of silhouette scores, Kmeans++ outperforms Kmeans significantly, with an average silhouette score of 0.80 compared to Kmeans' 0.10, both with negligible standard deviations. The higher silhouette score for Kmeans++ suggests better-defined clusters with higher intra-cluster similarity and inter-cluster dissimilarity. These differences are primarily due to the initialization method: Kmeans++ selects initial centroids in a way that spreads them out, leading to more optimal and stable clustering results.