In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../data/ohe_data.csv', nrows=50_000)

df.drop(columns='Unnamed: 0', inplace=True)

df['Sunrise_Sunset'] = [0 if c == 'Night' else 1 for c in df['Sunrise_Sunset']]
df['Sunrise_Sunset'].dtypes

dtype('int64')

In [3]:
df['Severity'].value_counts(normalize=True)

0    0.69126
1    0.30874
Name: Severity, dtype: float64

In [4]:
df.dtypes.head(30)

Severity               int64
Start_Time            object
Start_Lat            float64
Start_Lng            float64
Temperature(F)       float64
Humidity(%)          float64
Pressure(in)         float64
Visibility(mi)       float64
Wind_Speed(mph)      float64
Precipitation(in)    float64
Amenity                 bool
Bump                    bool
Crossing                bool
Give_Way                bool
Junction                bool
No_Exit                 bool
Railway                 bool
Roundabout              bool
Station                 bool
Stop                    bool
Traffic_Calming         bool
Traffic_Signal          bool
Sunrise_Sunset         int64
wc_clouds              int64
wc_dust                int64
wc_fair                int64
wc_fog                 int64
wc_ice                 int64
wc_rain                int64
wc_snow                int64
dtype: object

In [5]:
X = df.drop(columns=['Severity', 'Start_Time'])
ss = StandardScaler()
X_ss = ss.fit_transform(X)

In [6]:
scores = []
for k in range(50,141,10):
    km = KMeans(n_clusters=k)
    km.fit(X_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(X_ss, km.labels_)}')

50: inertia - 3313565.4725475227; sil. sc. - 0.12187353829774644
60: inertia - 2822306.054124759; sil. sc. - 0.1505045744513503
70: inertia - 2553737.0919758705; sil. sc. - 0.16056819273985284
80: inertia - 2347451.8677110146; sil. sc. - 0.15686078627119016
90: inertia - 2218720.9812716935; sil. sc. - 0.14847466114847252
100: inertia - 2136514.582605255; sil. sc. - 0.1432005112356197
110: inertia - 2082134.4059141988; sil. sc. - 0.13926533774490157
120: inertia - 2021257.4915352573; sil. sc. - 0.14491145257370014
130: inertia - 1969407.6656587443; sil. sc. - 0.14625027788689948
140: inertia - 1941038.43836347; sil. sc. - 0.14369975588249892


In [7]:
# around 70 seems most optimal
for k in range(60,80,1):
    km = KMeans(n_clusters=k)
    km.fit(X_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(X_ss, km.labels_)}')

60: inertia - 2836741.9398328224; sil. sc. - 0.16359112301804743
61: inertia - 2846061.4950390155; sil. sc. - 0.13674738874175973
62: inertia - 2837181.5864065653; sil. sc. - 0.15659902592540886
63: inertia - 2822829.0809917683; sil. sc. - 0.14122536569252692
64: inertia - 2734469.469527257; sil. sc. - 0.14848090649044662
65: inertia - 2733802.601371605; sil. sc. - 0.15779192847709697
66: inertia - 2698561.99453916; sil. sc. - 0.14621833837877457
67: inertia - 2635548.7531827064; sil. sc. - 0.14758102242966611
68: inertia - 2593762.6941094017; sil. sc. - 0.15798177517964965
69: inertia - 2616385.9888505824; sil. sc. - 0.14708664294381368
70: inertia - 2569636.579710776; sil. sc. - 0.12788002355258066
71: inertia - 2531651.519083031; sil. sc. - 0.16199632654497761
72: inertia - 2507076.1540872105; sil. sc. - 0.15324175305288557
73: inertia - 2502311.5736236777; sil. sc. - 0.1491149139030701
74: inertia - 2472112.075041918; sil. sc. - 0.1540437463168656
75: inertia - 2456603.4140201663; 

### Bigger Data

Do this all on a bigger data set, focused on around 72 clusters

In [8]:
df = pd.read_csv('../data/ohe_data.csv', nrows=250_000)

df.drop(columns='Unnamed: 0', inplace=True)

df['Sunrise_Sunset'] = [0 if c == 'Night' else 1 for c in df['Sunrise_Sunset']]
df['Sunrise_Sunset'].dtypes

dtype('int64')

In [9]:
X = df.drop(columns=['Severity', 'Start_Time'])
ss = StandardScaler()
X_ss = ss.fit_transform(X)

In [10]:
for k in range(70,75,1):
    km = KMeans(n_clusters=k)
    km.fit(X_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(X_ss, km.labels_)}')

70: inertia - 15102963.923698124; sil. sc. - 0.12153038322906985
71: inertia - 14881413.903198672; sil. sc. - 0.12815005126500925
72: inertia - 14819377.30846994; sil. sc. - 0.13382105740883685
73: inertia - 14856922.529910833; sil. sc. - 0.11568998946338367
74: inertia - 14295920.398657776; sil. sc. - 0.12659499749994202
