In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('../data/ohe_data.csv')

df.drop(columns='Unnamed: 0', inplace=True)

df['Sunrise_Sunset'] = [0 if c == 'Night' else 1 for c in df['Sunrise_Sunset']]
df['Sunrise_Sunset'].dtypes

dtype('int64')

In [3]:
df['Severity'].value_counts(normalize=True)

0    0.900065
1    0.099935
Name: Severity, dtype: float64

In [11]:
fdf = df.drop(columns=['Start_Time', 'Start_Lat', 'Start_Lng'])

In [15]:
# create small test dataframe
small_df = fdf.sample(25_000)
small_df['Severity'].value_counts(normalize=True)

0    0.90404
1    0.09596
Name: Severity, dtype: float64

In [16]:
# scale for clustering analysis
ss = StandardScaler()
sdf_ss = ss.fit_transform(small_df.drop(columns=['Severity']))

In [None]:
scores = []
for k in range(10,141,10):
    km = KMeans(n_clusters=k)
    km.fit(sdf_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(sdf_ss, km.labels_)}')

10: inertia - 2845250.264014109; sil. sc. - 0.015575619885546664
20: inertia - 2600334.8365504583; sil. sc. - 0.043197530600394475
30: inertia - 2365807.316805193; sil. sc. - 0.07803691779383151
40: inertia - 2166041.797590909; sil. sc. - 0.08038494198931735
50: inertia - 1962323.5786319734; sil. sc. - 0.09989947326087897
60: inertia - 1755965.0012384648; sil. sc. - 0.1031354403782469
70: inertia - 1543151.460735703; sil. sc. - 0.11599750584838851
80: inertia - 1467746.8952168296; sil. sc. - 0.10823791994527497
90: inertia - 1348343.5438660788; sil. sc. - 0.12277504698206704
100: inertia - 1317826.321274452; sil. sc. - 0.11289717990964365
110: inertia - 1298872.5701850743; sil. sc. - 0.10879657771247253
120: inertia - 1281000.8120257142; sil. sc. - 0.09786263419106474
130: inertia - 1264799.2160791862; sil. sc. - 0.10050261087251522
140: inertia - 1253165.7958225398; sil. sc. - 0.10170607422190682


In [21]:
# around 90 seems most optimal
for k in range(80,100,1):
    km = KMeans(n_clusters=k)
    km.fit(sdf_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(sdf_ss, km.labels_)}')

80: inertia - 1437910.240260226; sil. sc. - 0.11337199638791788
81: inertia - 1452392.6371697467; sil. sc. - 0.11086693010843897
82: inertia - 1420067.4461575227; sil. sc. - 0.11046301389839593
83: inertia - 1395015.8814785776; sil. sc. - 0.11394509153060792
84: inertia - 1421993.938659459; sil. sc. - 0.11244288743161747
85: inertia - 1414517.8985592406; sil. sc. - 0.1249890563467794
86: inertia - 1383796.8738670428; sil. sc. - 0.1199640043983344
87: inertia - 1397253.3520210965; sil. sc. - 0.1279625185506179
88: inertia - 1377666.370910612; sil. sc. - 0.1127558807711444
89: inertia - 1366679.7128369096; sil. sc. - 0.11809501202173227
90: inertia - 1388863.1306831255; sil. sc. - 0.12367602905385691
91: inertia - 1352903.2554381732; sil. sc. - 0.11752308828981675
92: inertia - 1375060.5760331657; sil. sc. - 0.11163683829040193
93: inertia - 1345034.5409877796; sil. sc. - 0.11667627355606257
94: inertia - 1350604.0578207793; sil. sc. - 0.11238444350993591
95: inertia - 1341267.6384041577

This is not giving good scores. Abandon.

## Try with fewer features

In [31]:
# Weather features only
small_df.columns[1:7], small_df.columns[20:29]

(Index(['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
        'Wind_Speed(mph)', 'Precipitation(in)'],
       dtype='object'),
 Index(['wc_clouds', 'wc_dust', 'wc_fair', 'wc_fog', 'wc_ice', 'wc_rain',
        'wc_snow', 'wc_thunder', 'wc_wind'],
       dtype='object'))

In [34]:
# weather columns
weather_cols = list(small_df.columns[1:7]) + list(small_df.columns[20:29])
weather_df = small_df[weather_cols]

In [35]:
ss = StandardScaler()
wdf_ss = ss.fit_transform(weather_df)

In [39]:
scores = []
for k in range(4,16,1):
    km = KMeans(n_clusters=k)
    km.fit(wdf_ss)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {silhouette_score(wdf_ss, km.labels_)}')

4: inertia - 259596.3285492221; sil. sc. - 0.32271414099809087
5: inertia - 231177.90796105185; sil. sc. - 0.3495565651456479
6: inertia - 205549.11936193297; sil. sc. - 0.36466307820014093
7: inertia - 180446.0054250816; sil. sc. - 0.3732268730738112
8: inertia - 156976.41191856883; sil. sc. - 0.37930459127580257
9: inertia - 131931.91529602534; sil. sc. - 0.3981221474312042
10: inertia - 112896.47024829878; sil. sc. - 0.398658343828902
11: inertia - 100097.55302819115; sil. sc. - 0.4113648830729991
12: inertia - 89298.26045695662; sil. sc. - 0.3462787270142472
13: inertia - 81640.85265798401; sil. sc. - 0.3030284984324199
14: inertia - 76270.9000165284; sil. sc. - 0.3054638726259491
15: inertia - 73580.92327160662; sil. sc. - 0.2611952223503386


In [44]:
# 11 looks optimal on 25k points, not bad sil. score. Try on bigger set
fwdf = fdf.sample(100_000)
fwdf_ss = ss.fit_transform(fwdf[weather_cols])
km = KMeans(n_clusters = 11)
km.fit(fwdf_ss)
print(f'inertia: {km.inertia_}; sil. sc.: {silhouette_score(fwdf_ss, km.labels_)}')

inertia: 431198.00098080083; sil. sc.: 0.35085061790367233


The above took quite a bit of time, and is still only a small subsection of the data. As can be seen, 0.35 silhouette score is a bit lower than the 0.41 for 25,000 data points. The time and resource costs of running a KMeans Clustering on the full dataset is prohibitive. Abandon.