In [167]:
import pandas as pd
import datetime as dt
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px

In [151]:
# Load the raw export into a DataFrame; the relative path is resolved against
# the kernel's current working directory.
csv_path = '../data/Live_20210128.csv'
df = pd.read_csv(csv_path)

In [152]:
# Peek at the first five rows (head's default) to sanity-check the load.
df.head(n=5)

Unnamed: 0,status_id,status_type,status_published,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,num_sads,num_angrys,Column1,Column2,Column3,Column4
0,1,video,4/22/2018 6:00,529,512,262,432,92,3,1,1,0,,,,
1,2,photo,4/21/2018 22:45,150,0,0,150,0,0,0,0,0,,,,
2,3,video,4/21/2018 6:17,227,236,57,204,21,1,1,0,0,,,,
3,4,photo,4/21/2018 2:29,111,0,0,111,0,0,0,0,0,,,,
4,5,photo,4/18/2018 3:22,213,0,0,204,9,0,0,0,0,,,,


In [153]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7050 entries, 0 to 7049
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   status_id         7050 non-null   int64  
 1   status_type       7050 non-null   object 
 2   status_published  7050 non-null   object 
 3   num_reactions     7050 non-null   int64  
 4   num_comments      7050 non-null   int64  
 5   num_shares        7050 non-null   int64  
 6   num_likes         7050 non-null   int64  
 7   num_loves         7050 non-null   int64  
 8   num_wows          7050 non-null   int64  
 9   num_hahas         7050 non-null   int64  
 10  num_sads          7050 non-null   int64  
 11  num_angrys        7050 non-null   int64  
 12  Column1           0 non-null      float64
 13  Column2           0 non-null      float64
 14  Column3           0 non-null      float64
 15  Column4           0 non-null      float64
dtypes: float64(4), int64(10), object(2)
memory

In [154]:
# Remove Column1..Column4, which df.info() reports as having 0 non-null values.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it once the columns are already gone is a
# no-op instead of a KeyError.
df = df.drop(columns=[f'Column{i}' for i in range(1, 5)], errors='ignore')

In [155]:
# Convert the publication timestamps from strings to datetime values so the
# .dt accessor can be used on the column.
df['status_published'] = pd.to_datetime(df['status_published'])

In [156]:
# Calendar components to derive from the publication timestamp; each name is
# an attribute of the pandas datetime accessor.
time_features = ['year', 'month', 'day', 'dayofweek', 'hour']
# Fetch the accessor once and reuse it for every component.
published_dt = df['status_published'].dt
for feature in time_features:
    df[feature] = getattr(published_dt, feature)

In [157]:
# Discard the raw timestamp column.
# Rebinding `df` (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column has been removed is a
# no-op instead of a KeyError.
df = df.drop(columns='status_published', errors='ignore')

In [158]:
# Fraction of missing values per column (the mean of a boolean mask is the
# share of True entries).
df.isna().mean()

status_id        0.0
status_type      0.0
num_reactions    0.0
num_comments     0.0
num_shares       0.0
num_likes        0.0
num_loves        0.0
num_wows         0.0
num_hahas        0.0
num_sads         0.0
num_angrys       0.0
year             0.0
month            0.0
day              0.0
dayofweek        0.0
hour             0.0
dtype: float64

In [159]:
# List the object-dtype columns; these are the ones routed to the one-hot
# encoder when the preprocessor is built.
df.select_dtypes(include='object').columns

Index(['status_type'], dtype='object')

In [160]:
# Columns that identify a row rather than describe it. They carry no
# engagement signal, so they must not contribute to cluster distances.
id_columns = ['status_id']

# Resolve the feature lists from the current frame, excluding identifiers.
numeric_features = [
    col for col in df.select_dtypes(include='number').columns
    if col not in id_columns
]
categorical_features = list(df.select_dtypes(include='object').columns)

# Numeric branch: RobustScaler centres on the median and scales by the
# interquartile range.
num_pl = Pipeline(steps=[
    ('scaler', RobustScaler())
])
# Categorical branch: one-hot encode each category level.
cat_pl = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

# remainder='drop' discards anything not listed above (i.e. the identifier
# columns) instead of passing it through unscaled into the model.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pl, numeric_features),
    ('cat', cat_pl, categorical_features)
], remainder='drop')

# Baseline pipeline: preprocessing followed by a 3-cluster KMeans. The seed
# makes the baseline reproducible and matches the random_state used by the
# k-sweep cell.
base_trainer = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', KMeans(n_clusters=3, random_state=42))
])


In [161]:
# Fit the preprocessing steps and the KMeans model on the full frame.
base_trainer.fit(df)

In [162]:
# Cluster label assigned to each row by the fitted pipeline.
base_trainer.predict(df)

array([1, 0, 0, ..., 0, 0, 0], dtype=int32)

In [163]:
# Display the frame after feature engineering (raw timestamp replaced by its
# calendar components).
df

Unnamed: 0,status_id,status_type,num_reactions,num_comments,num_shares,num_likes,num_loves,num_wows,num_hahas,num_sads,num_angrys,year,month,day,dayofweek,hour
0,1,video,529,512,262,432,92,3,1,1,0,2018,4,22,6,6
1,2,photo,150,0,0,150,0,0,0,0,0,2018,4,21,5,22
2,3,video,227,236,57,204,21,1,1,0,0,2018,4,21,5,6
3,4,photo,111,0,0,111,0,0,0,0,0,2018,4,21,5,2
4,5,photo,213,0,0,204,9,0,0,0,0,2018,4,18,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7045,7046,photo,89,0,0,89,0,0,0,0,0,2016,9,24,5,2
7046,7047,photo,16,0,0,14,1,0,1,0,0,2016,9,23,4,11
7047,7048,photo,2,0,0,1,1,0,0,0,0,2016,9,21,2,23
7048,7049,photo,351,12,22,349,2,0,0,0,0,2016,9,20,1,0


In [164]:
# Materialise the preprocessed feature matrix as a labelled DataFrame.
# The transformer is fitted first so that the output feature names exist.
prep_matrix = preprocessor.fit_transform(df)
prep_columns = preprocessor.get_feature_names_out()
prep_df = pd.DataFrame(prep_matrix, columns=prep_columns)


In [165]:
# Sweep candidate cluster counts and record the silhouette score of each fit.
k_values = range(2, 11)
silhouette_scores = []
for k in k_values:
    # Swap the pipeline's final step for a fresh, seeded KMeans with this k.
    candidate = KMeans(n_clusters=k, random_state=42)
    base_trainer.set_params(model=candidate)
    # fit_predict refits the whole pipeline on df and returns the labels.
    labels = base_trainer.fit_predict(df)
    # Score the labels against the preprocessed feature matrix.
    score = silhouette_score(prep_df, labels)
    silhouette_scores.append(score)


In [170]:
# Plot silhouette score against k. Array inputs are named 'x' and 'y' by
# plotly express, so map those names to readable axis labels; the figure must
# be understandable on its own when the notebook is skimmed.
px.line(
    x=list(k_values),
    y=silhouette_scores,
    title='Silhouette Score vs. K',
    labels={'x': 'Number of clusters (k)', 'y': 'Silhouette score'},
)