In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [14]:
df = pd.read_csv('../data/ohe_data.csv', nrows=50_000)

df.drop(columns='Unnamed: 0', inplace=True)

df['Sunrise_Sunset'] = [0 if c == 'Night' else 1 for c in df['Sunrise_Sunset']]
df['Sunrise_Sunset'].dtypes

In [12]:
df['Severity'].value_counts(normalize=True)

0    0.69126
1    0.30874
Name: Severity, dtype: float64

In [15]:
df.dtypes.head(30)

Severity               int64
Start_Time            object
Start_Lat            float64
Start_Lng            float64
Temperature(F)       float64
Humidity(%)          float64
Pressure(in)         float64
Visibility(mi)       float64
Wind_Speed(mph)      float64
Precipitation(in)    float64
Amenity                 bool
Bump                    bool
Crossing                bool
Give_Way                bool
Junction                bool
No_Exit                 bool
Railway                 bool
Roundabout              bool
Station                 bool
Stop                    bool
Traffic_Calming         bool
Traffic_Signal          bool
Sunrise_Sunset         int64
wc_clouds              int64
wc_dust                int64
wc_fair                int64
wc_fog                 int64
wc_ice                 int64
wc_rain                int64
wc_snow                int64
dtype: object

In [16]:
X = df.drop(columns=['Severity', 'Start_Time'])
ss = StandardScaler()
X_ss = ss.fit_transform(X)

In [22]:
scores = []
for k in range(50,141,10):
    km = KMeans(n_clusters=k)
    km.fit(X_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(X_ss, km.labels_)}')

50: inertia - 3286114.530213488; sil. sc. - 0.14852520090768492
60: inertia - 2926211.6795525188; sil. sc. - 0.1496719233181056
70: inertia - 2540817.4972780403; sil. sc. - 0.15960933911055072
80: inertia - 2360970.127593463; sil. sc. - 0.1472462276388036
90: inertia - 2240704.3142350954; sil. sc. - 0.1468450018882237
100: inertia - 2110514.6319977692; sil. sc. - 0.14717548158054075
110: inertia - 2061222.6593761188; sil. sc. - 0.13889207549872026
120: inertia - 2018106.2303980337; sil. sc. - 0.14175058377657493
130: inertia - 1978624.2506793847; sil. sc. - 0.14678548685540482
140: inertia - 1932608.9321873796; sil. sc. - 0.1373631591264277


In [23]:
# around 70 seems most optimal
for k in range(60,80,1):
    km = KMeans(n_clusters=k)
    km.fit(X_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(X_ss, km.labels_)}')

60: inertia - 2848039.919012704; sil. sc. - 0.15799948456315327
61: inertia - 2803681.647139657; sil. sc. - 0.16560839871571706
62: inertia - 2797584.609546416; sil. sc. - 0.1678242362314089
63: inertia - 2763416.0796622685; sil. sc. - 0.15257213673263748
64: inertia - 2737411.198233245; sil. sc. - 0.14540049385387654
65: inertia - 2731026.373549943; sil. sc. - 0.16176855251302782
66: inertia - 2643020.759121263; sil. sc. - 0.16543955676407607
67: inertia - 2700089.95563216; sil. sc. - 0.1472614284654666
68: inertia - 2626815.0126450663; sil. sc. - 0.1402486006031571
69: inertia - 2571192.809010043; sil. sc. - 0.15551366237385858
70: inertia - 2508792.645792486; sil. sc. - 0.13645217095725243
71: inertia - 2555734.510117646; sil. sc. - 0.155379809863537
72: inertia - 2464683.699807533; sil. sc. - 0.17064024884182483
73: inertia - 2529467.384625546; sil. sc. - 0.14670412799038396
74: inertia - 2457870.0304189627; sil. sc. - 0.14755379373411034
75: inertia - 2425283.543161718; sil. sc. -

### Bigger Data

Do this all on a bigger data set, focused on around 72 clusters

In [24]:
df = pd.read_csv('../data/ohe_data.csv', nrows=250_000)

df.drop(columns='Unnamed: 0', inplace=True)

df['Sunrise_Sunset'] = [0 if c == 'Night' else 1 for c in df['Sunrise_Sunset']]
df['Sunrise_Sunset'].dtypes

dtype('int64')

In [25]:
X = df.drop(columns=['Severity', 'Start_Time'])
ss = StandardScaler()
X_ss = ss.fit_transform(X)

In [26]:
for k in range(70,75,1):
    km = KMeans(n_clusters=k)
    km.fit(X_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(X_ss, km.labels_)}')

70: inertia - 15163136.085931206; sil. sc. - 0.12609130854465286
71: inertia - 15058712.70890541; sil. sc. - 0.12686663633837392
72: inertia - 14654394.857499084; sil. sc. - 0.1368473556030636
73: inertia - 14762153.168655181; sil. sc. - 0.1326741866239145
74: inertia - 14574868.15045363; sil. sc. - 0.13051806244168793
