In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the one-hot-encoded dataset and discard the 'Unnamed: 0' column
# (presumably an index column written out with the CSV).
df = pd.read_csv('../data/ohe_data.csv').drop(columns='Unnamed: 0')

# Binary-encode time of day: 'Night' -> 0, any other value -> 1.
df['Sunrise_Sunset'] = (df['Sunrise_Sunset'] != 'Night').astype('int64')
df['Sunrise_Sunset'].dtype

dtype('int64')

In [3]:
# Class balance of the target: fraction of rows in each Severity class.
df.loc[:, 'Severity'].value_counts(normalize=True)

0    0.900065
1    0.099935
Name: Severity, dtype: float64

In [4]:
# Create a small working sample so the clustering experiments run quickly.
# This cell previously sampled from `fdf`, a name that is never defined in this
# notebook (NameError on a fresh kernel); `df` is the only frame that is loaded.
# NOTE(review): if `fdf` was meant to be a filtered variant of `df`, restore
# that filtering step before this cell.
# A fixed random_state makes the sample, and everything fitted on it, repeatable.
small_df = df.sample(25_000, random_state=42)
small_df['Severity'].value_counts(normalize=True)

0    0.90404
1    0.09596
Name: Severity, dtype: float64

In [5]:
# Standardise the predictors (every column except the Severity target)
# before clustering.
predictors = small_df.drop(columns=['Severity'])
ss = StandardScaler().fit(predictors)
sdf_ss = ss.transform(predictors)

In [6]:
# Coarse scan over the number of clusters: fit KMeans on the scaled sample for
# k = 10, 20, ..., 140 and report inertia together with the silhouette score.
# random_state is fixed so that the fits are repeatable between runs and the
# comparison across k is not blurred by different random initialisations.
scores = []
for k in range(10, 141, 10):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(sdf_ss)
    sil = silhouette_score(sdf_ss, km.labels_)
    # only k and inertia are kept; the silhouette score is printed, not stored
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {sil}')

50: inertia - 3313565.4725475227; sil. sc. - 0.12187353829774644
60: inertia - 2822306.054124759; sil. sc. - 0.1505045744513503
70: inertia - 2553737.0919758705; sil. sc. - 0.16056819273985284
80: inertia - 2347451.8677110146; sil. sc. - 0.15686078627119016
90: inertia - 2218720.9812716935; sil. sc. - 0.14847466114847252
100: inertia - 2136514.582605255; sil. sc. - 0.1432005112356197
110: inertia - 2082134.4059141988; sil. sc. - 0.13926533774490157
120: inertia - 2021257.4915352573; sil. sc. - 0.14491145257370014
130: inertia - 1969407.6656587443; sil. sc. - 0.14625027788689948
140: inertia - 1941038.43836347; sil. sc. - 0.14369975588249892


In [7]:
# around 90 seems most optimal -- fine-grained scan over k = 80..99.
# Results are appended to the existing `scores` list rather than a new one.
# NOTE(review): the saved output under this cell lists k = 60-75, which does
# not match this range; re-run the cell to refresh the output.
# random_state is fixed so that neighbouring k values are compared on equal
# footing instead of differing by random initialisation.
for k in range(80, 100):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(sdf_ss)
    sil = silhouette_score(sdf_ss, km.labels_)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {sil}')

60: inertia - 2836741.9398328224; sil. sc. - 0.16359112301804743
61: inertia - 2846061.4950390155; sil. sc. - 0.13674738874175973
62: inertia - 2837181.5864065653; sil. sc. - 0.15659902592540886
63: inertia - 2822829.0809917683; sil. sc. - 0.14122536569252692
64: inertia - 2734469.469527257; sil. sc. - 0.14848090649044662
65: inertia - 2733802.601371605; sil. sc. - 0.15779192847709697
66: inertia - 2698561.99453916; sil. sc. - 0.14621833837877457
67: inertia - 2635548.7531827064; sil. sc. - 0.14758102242966611
68: inertia - 2593762.6941094017; sil. sc. - 0.15798177517964965
69: inertia - 2616385.9888505824; sil. sc. - 0.14708664294381368
70: inertia - 2569636.579710776; sil. sc. - 0.12788002355258066
71: inertia - 2531651.519083031; sil. sc. - 0.16199632654497761
72: inertia - 2507076.1540872105; sil. sc. - 0.15324175305288557
73: inertia - 2502311.5736236777; sil. sc. - 0.1491149139030701
74: inertia - 2472112.075041918; sil. sc. - 0.1540437463168656
75: inertia - 2456603.4140201663; 

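The silhouette scores stay below about 0.17 for every k shown above, so KMeans on the full feature set is not giving good scores. Abandon this approach.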

## Try with fewer features

In [8]:
# Weather features only: two positional slices of the sample's columns
# (positions 1-6 and 20-28).
small_cols = small_df.columns
(small_cols[1:7], small_cols[20:29])

(Index(['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
        'Wind_Speed(mph)', 'Precipitation(in)'],
       dtype='object'),
 Index(['wc_clouds', 'wc_dust', 'wc_fair', 'wc_fog', 'wc_ice', 'wc_rain',
        'wc_snow', 'wc_thunder', 'wc_wind'],
       dtype='object'))

In [9]:
# Build the weather-only feature frame. `weather_df` was used here without ever
# being defined in this notebook (NameError on a fresh kernel). The columns are
# taken from the two positional slices of small_df displayed under
# "Weather features only": positions 1-6 and 20-28.
# NOTE(review): confirm this matches the column set originally used.
weather_cols = [*small_df.columns[1:7], *small_df.columns[20:29]]
weather_df = small_df[weather_cols]

# Fresh scaler fitted on the weather features only.
ss = StandardScaler()
wdf_ss = ss.fit_transform(weather_df)

In [10]:
# Scan k = 4..15 on the scaled weather-only features, reporting inertia and
# the silhouette score for each k. `scores` is reset for this experiment.
# NOTE(review): the saved output under this cell lists k = 70-74, which does
# not match this range; re-run the cell to refresh the output.
# random_state is fixed so the fits are repeatable and comparable across k.
scores = []
for k in range(4, 16):
    km = KMeans(n_clusters=k, random_state=42)
    km.fit(wdf_ss)
    sil = silhouette_score(wdf_ss, km.labels_)
    scores.append([k, km.inertia_])
    print(f'{k}: inertia - {km.inertia_}; sil. sc. - {sil}')

70: inertia - 15102963.923698124; sil. sc. - 0.12153038322906985
71: inertia - 14881413.903198672; sil. sc. - 0.12815005126500925
72: inertia - 14819377.30846994; sil. sc. - 0.13382105740883685
73: inertia - 14856922.529910833; sil. sc. - 0.11568998946338367
74: inertia - 14295920.398657776; sil. sc. - 0.12659499749994202


In [44]:
# 11 looks optimal on 25k points, not bad sil. score. Try on bigger set
# This cell previously relied on `fdf` and `weather_cols`, neither of which is
# defined anywhere in this notebook. It now samples from the loaded `df` and
# rebuilds the weather column list from the same positional slices
# (1-6 and 20-28) that the "Weather features only" cell displays.
# NOTE(review): confirm `fdf` was not a filtered variant of `df`, and that the
# column order of `df` matches the one those slices were chosen for.
fwdf = df.sample(100_000, random_state=42)
weather_cols = [*fwdf.columns[1:7], *fwdf.columns[20:29]]

# New scaler fitted on the larger sample.
ss = StandardScaler()
fwdf_ss = ss.fit_transform(fwdf[weather_cols])

# Fixed random_state keeps the fit repeatable between runs.
km = KMeans(n_clusters=11, random_state=42)
km.fit(fwdf_ss)

# silhouette_score works from pairwise distances, which gets expensive at this
# sample size; its `sample_size` argument can estimate the score from a
# subsample if runtime becomes the limiting factor.
print(f'inertia: {km.inertia_}; sil. sc.: {silhouette_score(fwdf_ss, km.labels_)}')

inertia: 431198.00098080083; sil. sc.: 0.35085061790367233


The above took quite a bit of time, and it still covers only a small subset of the data. As can be seen, the 0.35 silhouette score is a bit lower than the 0.41 obtained for 25,000 data points. The time and resource costs of running KMeans clustering on the full dataset are prohibitive. Abandon.