Goal: create a model that predicts severity of a collision, based on the features of the collision

In [1]:
import pandas as pd
import numpy as np
import pylab as pl
%pylab inline
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

Populating the interactive namespace from numpy and matplotlib


1) Load data  
    - 2014 attendant, vehicle, and casualty datasets
    - Description of data: https://drive.google.com/drive/u/0/folders/1afX9SJzqmO6rQG6FDhpMlzOfoy_NwI7Q
    - Merge on AREFNO
2) Train, Test split  
3) Hyper Paramter tuning for Random Forest Classifier  
3) Feature Importance  
4) Next: Visualize a decision tree  

** * Load Data * **

In [2]:
df_att = pd.read_csv('./data/2014-gla-data-extract-attendant.csv')
df_att.head()

Unnamed: 0,AREFNO,Borough,Boro,Easting,Northing,Location,Accident Severity,No. of Casualties in Acc.,No. of Vehicles in Acc.,Accident Date,...,Junction Detail,Junction Control,Road Class 2,Road No. 2,Ped. Crossing Decoded,Light Conditions (Banded),Weather,Road Surface,Special Conditions,C/W Hazard
0,0114CP00001,CITY OF LONDON,0,533540,181230,ST BOTOLPH STREET J/W HOUNDSDITCH,3 Slight,1,1,01-Jan-14,...,3 T/Stag Jun,4 Give Way/Uncontrolled,A,1211,0 No Xing Facility In 50m,2 Dark,1 Fine,2 Road-Wet,0 None,0 None
1,0114CP00002,CITY OF LONDON,0,532680,181430,MOORGATE J/W GREAT SWAN ALLEY,3 Slight,1,1,08-Jan-14,...,3 T/Stag Jun,4 Give Way/Uncontrolled,C,0,0 No Xing Facility In 50m,1 Daylight,1 Fine,1 Road-Dry,0 None,0 None
2,0114CP00003,CITY OF LONDON,0,532090,181830,ALDERGATE STREET J/W LONG LANE,2 Serious,1,1,09-Jan-14,...,3 T/Stag Jun,2 Auto Sig,C,0,5 Pedn Phase At Ats,2 Dark,1 Fine,1 Road-Dry,0 None,0 None
3,0114CP00004,CITY OF LONDON,0,531770,180950,QUEEN VICTORIA STREET J/W PUDDLE DOCK,2 Serious,1,2,08-Jan-14,...,3 T/Stag Jun,2 Auto Sig,C,0,5 Pedn Phase At Ats,1 Daylight,1 Fine,1 Road-Dry,0 None,0 None
4,0114CP00005,CITY OF LONDON,0,533130,180920,FENCHURCH STREET J/W ROOD LANE,3 Slight,1,1,15-Jan-14,...,3 T/Stag Jun,4 Give Way/Uncontrolled,C,0,0 No Xing Facility In 50m,1 Daylight,1 Fine,1 Road-Dry,0 None,0 None


In [4]:
df_cas = pd.read_csv('./data/2014-gla-data-extract-casualty.csv')
df_cas.head()

Unnamed: 0,AREFNO,Borough,Boro,Easting,Northing,CREFNO,Casualty Class,Casualty Sex,Casualty Age (Banded),Casualty Age,No. of Casualties,Casualty Severity,Ped. Location,Ped. Movement,Mode of Travel,Unnamed: 16
0,0114CP00001,CITY OF LONDON,0,533540,181230,1,3 Pedestrian,1 Male,25-59,29,1,3 Slight,05 Crossing Road (Not On Xing),9 Unknown Or Other,1 Pedestrian,
1,0114CP00002,CITY OF LONDON,0,532680,181430,1,3 Pedestrian,1 Male,25-59,48,1,3 Slight,05 Crossing Road (Not On Xing),9 Unknown Or Other,1 Pedestrian,
2,0114CP00003,CITY OF LONDON,0,532090,181830,1,3 Pedestrian,1 Male,Unknown,0,1,2 Serious,01 Crossing Road On Ped Xing,3 From Drivers O/Side,1 Pedestrian,
3,0114CP00004,CITY OF LONDON,0,531770,180950,1,1 Driver/Rider,1 Male,25-59,33,1,2 Serious,-2 Unknown,-2 N/A,3 Powered 2 Wheeler,
4,0114CP00005,CITY OF LONDON,0,533130,180920,1,3 Pedestrian,1 Male,25-59,31,1,3 Slight,06 On Footpath - Verge,9 Unknown Or Other,1 Pedestrian,


In [7]:
df_veh = pd.read_csv('./data/2014-gla-data-extract-vehicle.csv')
df_veh.head()

Unnamed: 0,AREFNO,Borough,Boro,Easting,Northing,Vehicle Ref.,Vehicle Type,Vehicle Type (Banded),Vehicle Manoeuvres,Vehicle Skidding,...,Junction Location,Object in C/W,Veh. Leaving C/W,Veh. off C/W,Veh. Impact,VJNYPURP DECODED,Driver Sex,Driver Age,Driver Age (Banded),Unnamed: 21
0,0114CP00001,CITY OF LONDON,0,533540,181230,1,08 Taxi,4 Taxi,18 Going Ahead Other,0 No Skidding/Overturn,...,1 Jct App,00 None,0 Did Not Leave,00 None,1 Front Hit First,5 Other/Not Known,1 Male,55,35-64,
1,0114CP00002,CITY OF LONDON,0,532680,181430,1,21 Gds => 7.5t,7 Goods vehicles,01 Reversing,0 No Skidding/Overturn,...,8 Jct Mid,00 None,0 Did Not Leave,00 None,2 Back Hit First,5 Other/Not Known,3 Not Traced,0,Unknown,
2,0114CP00003,CITY OF LONDON,0,532090,181830,1,09 Car,3 Car,18 Going Ahead Other,0 No Skidding/Overturn,...,8 Jct Mid,00 None,0 Did Not Leave,00 None,1 Front Hit First,2 Comm To/From Work,1 Male,57,35-64,
3,0114CP00004,CITY OF LONDON,0,531770,180950,1,09 Car,3 Car,18 Going Ahead Other,0 No Skidding/Overturn,...,1 Jct App,00 None,0 Did Not Leave,00 None,3 O/S Hit First,1 Jny Part of Work,1 Male,45,35-64,
4,0114CP00004,CITY OF LONDON,0,531770,180950,2,03 M/C 50-125cc,2 Powered 2 wheeler,13 Overtake Move Veh O/S,0 No Skidding/Overturn,...,1 Jct App,00 None,0 Did Not Leave,00 None,4 N/S Hit First,2 Comm To/From Work,1 Male,33,25-34,


In [9]:
print ('len df_att: ', len(df_att))
print ('len df_cas: ', len(df_cas))
len(pd.merge(df_att, df_cas, on='AREFNO', how='inner'))

len df_att:  25992
len df_cas:  30785


30785

More casualities than attendants entries -- more than one casuality per accident

In [11]:
df_cas.AREFNO.value_counts().sort_values(ascending=False).head(10)

0114RG40676    12
0114XD80275    10
0114SX20453     8
0114VK39014     8
0114TX20539     7
0114YE80362     7
0114YE89013     7
0114PL60101     7
0114JC30829     7
0114GD10431     7
Name: AREFNO, dtype: int64

In [12]:
df_att.AREFNO.value_counts().sort_values(ascending=False).head(10)

0114GD10665    1
0114CP00075    1
0114RG40497    1
0114YR90912    1
0114CW11719    1
0114SX20340    1
0114TX20156    1
0114YE80559    1
0114JC30778    1
0114HT20550    1
Name: AREFNO, dtype: int64

Merge keeping the duplicates

In [13]:
df_crash = pd.merge(df_att, df_cas, on='AREFNO', how='inner')

In [14]:
print ('len df_crash: ', len(df_crash))
print ('len df_veh: ', len(df_veh))
len(pd.merge(df_crash, df_veh, on='AREFNO', how='inner'))

len df_crash:  30785
len df_veh:  46074


56406

Each accident has more than one vehicle. Merge to keep duplicates.

In [15]:
df_crash = pd.merge(df_crash, df_veh, on='AREFNO', how='inner')

In [19]:
df_crash.columns

Index(['AREFNO', 'Borough_x', 'Boro_x', 'Easting_x', 'Northing_x', 'Location',
       'Accident Severity', 'No. of Casualties in Acc.',
       'No. of Vehicles in Acc.', 'Accident Date', 'Day', 'Time', 'Highway',
       'Road Class 1', 'Road No. 1', 'Road Type', 'Speed Limit',
       'Junction Detail', 'Junction Control', 'Road Class 2', 'Road No. 2',
       'Ped. Crossing Decoded', 'Light Conditions (Banded)', 'Weather',
       'Road Surface', 'Special Conditions', 'C/W Hazard', 'Borough_y',
       'Boro_y', 'Easting_y', 'Northing_y', 'CREFNO', 'Casualty Class',
       'Casualty Sex', 'Casualty Age (Banded)', 'Casualty Age',
       'No. of Casualties', 'Casualty Severity', 'Ped. Location',
       'Ped. Movement', 'Mode of Travel', ' _x', 'Borough', 'Boro', 'Easting',
       'Northing', 'Vehicle Ref.', 'Vehicle Type', 'Vehicle Type (Banded)',
       'Vehicle Manoeuvres', 'Vehicle Skidding', 'Restricted Lane',
       'Junction Location', 'Object in C/W', 'Veh. Leaving C/W',
       'Veh.

***2) Train, Test split  ***

In [17]:
df_crash.isnull().sum().sum()

0

In [26]:
# Select X and y data

y = df_crash['Accident Severity']

X = df_crash[['No. of Casualties in Acc.',
       'No. of Vehicles in Acc.', 'Day', 'Time', 'Highway',
       'Road Class 1', 'Road No. 1', 'Road Type', 'Speed Limit',
       'Junction Detail', 'Junction Control', 'Road Class 2', 'Road No. 2',
       'Ped. Crossing Decoded', 'Light Conditions (Banded)', 'Weather',
       'Road Surface', 'Special Conditions', 'C/W Hazard', 'Casualty Class',
       'Casualty Sex', 'Casualty Age (Banded)', 'Casualty Age',
       'No. of Casualties', 'Casualty Severity', 'Ped. Location',
       'Ped. Movement', 'Mode of Travel', 'Vehicle Ref.', 'Vehicle Type',
        'Vehicle Type (Banded)',
       'Vehicle Manoeuvres', 'Vehicle Skidding', 'Restricted Lane',
       'Junction Location', 'Object in C/W', 'Veh. Leaving C/W',
       'Veh. off C/W', 'Veh. Impact', 'VJNYPURP DECODED', 'Driver Sex',
       'Driver Age', 'Driver Age (Banded)']]

In [30]:
print ('X shape: ', X.shape)
print ('y shape: ', y.shape)

X shape:  (56406, 43)
y shape:  (56406,)


In [42]:
X=pd.get_dummies(X)
print ('X shape: ', X.shape)

X shape:  (56406, 1630)


In [43]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0)

***3) Hyper Paramter tuning for Random Forest Classifier  ***

In [33]:
y.value_counts(dropna=False)

3 Slight     52331
2 Serious     3838
1 Fatal        237
Name: Accident Severity, dtype: int64

In [45]:
from sklearn.ensemble import RandomForestClassifier
param_grid ={'n_estimators':np.arange(1,52,3),
             'max_leaf_nodes':np.arange(2,30,3),
            'min_samples_leaf':np.arange(1,10,2),
            'min_samples_split':np.arange(2,20,2),
            'max_depth':[None, 2, 5, 10, 20]} #choose a grid of parameters

rf = RandomForestClassifier(n_jobs=-1)#don't specifcy n_est in the intialization
gr=GridSearchCV(rf,param_grid=param_grid, cv=10)#pass in predictor, and gridded parameters
rs=gr.fit(X_train,y_train)#run predictor with each parameter using training data
#pred=rs.predict_proba(X_test)[:,1]
#print (roc_auc_score(np.array(y_test), pred))
print (rs.best_params_) #will output the best parameter
print (rs.best_score_) #will output the best parameter

KeyboardInterrupt: 

***4) Feature Importance  ***

***5) Next: Visualize a decision tree  ***