In [2]:
import pandas as pd 
import numpy as np 
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.preprocessing import TargetEncoder, StandardScaler

In [3]:
link = './accident dataset/new york accidents preprocessed.csv'
data = pd.read_csv(link)

In [3]:
xtrain, xtest, ytrain, ytest = train_test_split(
    data.drop('Severity',axis=1),
    data.Severity,
    test_size=0.3
)

In [3]:
data.columns

Index(['Severity', 'Start_Lat', 'Start_Lng', 'Distance(mi)', 'Street',
       'Zipcode', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
       'Visibility(mi)', 'Wind_Direction', 'Weather_Condition', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight',
       'Astronomical_Twilight', 'TimeTaken'],
      dtype='object')

In [4]:
minimeans = MiniBatchKMeans(compute_labels=True, n_clusters=150)
minimeans.fit(data[['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)','Wind_Direction', 'Weather_Condition']])

  super()._check_params_vs_input(X, default_n_init=3)


In [5]:
minimeans.inertia_

271133.7193536514

In [6]:
minimeans.labels_

array([ 18,  53, 104, ...,  66,  38,  67], dtype=int32)

In [7]:
data['tempClusters'] = minimeans.labels_
data.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Street,Zipcode,Temperature(F),Humidity(%),Pressure(in),Visibility(mi),...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,TimeTaken,tempClusters
0,2,-0.477042,0.206319,-0.364533,-0.004402,0.673132,-0.027788,1.331937,-0.190567,-2.348565,...,0,0,1,0,1,0,0,0,-0.550692,18
1,3,-0.113118,0.256402,-0.364533,0.527205,2.825723,-0.127081,1.331937,-0.134399,-2.894542,...,0,0,0,0,0,0,0,0,-0.549675,53
2,3,-0.133706,0.263927,0.951761,0.994872,0.853023,-0.127081,1.331937,-0.07823,-2.894542,...,0,0,0,0,0,0,0,0,-0.208382,104
3,3,-0.238482,0.592595,0.24138,0.002717,-0.190761,-0.184567,1.331937,-0.134399,-2.530557,...,0,0,0,0,0,0,0,0,-0.207111,87
4,3,0.138075,0.681182,-0.364533,0.605621,1.187742,-0.184567,1.137782,0.006022,-2.166572,...,0,0,0,0,0,0,0,0,-0.549675,87


In [8]:
data = data.drop(columns=['Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)','Wind_Direction', 'Weather_Condition'], axis=1)

In [9]:
te = TargetEncoder(target_type='continuous')
data[['tempClusters']] = te.fit_transform(data[['tempClusters']], data.Severity)
data.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Street,Zipcode,Amenity,Bump,Crossing,Give_Way,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,TimeTaken,tempClusters
0,2,-0.477042,0.206319,-0.364533,-0.004402,0.673132,0,0,0,0,...,0,0,1,0,1,0,0,0,-0.550692,2.250169
1,3,-0.113118,0.256402,-0.364533,0.527205,2.825723,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.549675,2.419803
2,3,-0.133706,0.263927,0.951761,0.994872,0.853023,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.208382,2.435781
3,3,-0.238482,0.592595,0.24138,0.002717,-0.190761,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.207111,2.421683
4,3,0.138075,0.681182,-0.364533,0.605621,1.187742,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.549675,2.406568


In [10]:
scaler = StandardScaler()
data[['tempClusters']] = scaler.fit_transform(data[['tempClusters']])
data.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Street,Zipcode,Amenity,Bump,Crossing,Give_Way,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,TimeTaken,tempClusters
0,2,-0.477042,0.206319,-0.364533,-0.004402,0.673132,0,0,0,0,...,0,0,1,0,1,0,0,0,-0.550692,-0.011049
1,3,-0.113118,0.256402,-0.364533,0.527205,2.825723,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.549675,1.473246
2,3,-0.133706,0.263927,0.951761,0.994872,0.853023,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.208382,1.613054
3,3,-0.238482,0.592595,0.24138,0.002717,-0.190761,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.207111,1.489703
4,3,0.138075,0.681182,-0.364533,0.605621,1.187742,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.549675,1.357445


In [11]:
link = './accident dataset/new york accident minimeans clustered.csv'
data.to_csv(path_or_buf=link, index=False)

In [11]:
data.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),Street,Zipcode,Amenity,Bump,Crossing,Give_Way,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,TimeTaken,tempClusters
0,2,-0.477042,0.206319,-0.364533,-0.004402,0.673132,0,0,0,0,...,0,0,1,0,1,0,0,0,-0.550692,78
1,3,-0.113118,0.256402,-0.364533,0.527205,2.825723,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.549675,62
2,3,-0.133706,0.263927,0.951761,0.994872,0.853023,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.208382,52
3,3,-0.238482,0.592595,0.24138,0.002717,-0.190761,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.207111,72
4,3,0.138075,0.681182,-0.364533,0.605621,1.187742,0,0,0,0,...,0,0,0,0,0,0,0,0,-0.549675,72
