In [54]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pylab as plt

!pip install dmba
from dmba import classificationSummary



In [55]:
# Load the KSI dataset from 'KSI.csv' (path is relative to the notebook's working
# directory) and print a summary of its columns, non-null counts and dtypes.
ksi = pd.read_csv('KSI.csv')
ksi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16860 entries, 0 to 16859
Data columns (total 57 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   X                16860 non-null  float64
 1   Y                16860 non-null  float64
 2   INDEX_           16860 non-null  int64  
 3   ACCNUM           16860 non-null  int64  
 4   YEAR             16860 non-null  int64  
 5   DATE             16860 non-null  object 
 6   TIME             16860 non-null  int64  
 7   HOUR             16860 non-null  int64  
 8   STREET1          16860 non-null  object 
 9   STREET2          16860 non-null  object 
 10  OFFSET           16860 non-null  object 
 11  ROAD_CLASS       16860 non-null  object 
 12  DISTRICT         16860 non-null  object 
 13  WARDNUM          16860 non-null  object 
 14  DIVISION         16860 non-null  object 
 15  LATITUDE         16860 non-null  float64
 16  LONGITUDE        16860 non-null  float64
 17  LOCCOORD    

In [56]:
# Parse DATE into datetimes, then expose its calendar components as separate columns.
# The components are taken from the timestamp exactly as parsed (no timezone conversion
# is applied here).
ksi['DATE'] = pd.to_datetime(ksi['DATE'])
date_parts = ksi['DATE'].dt
for part_col, part_name in (('YEAR_S', 'year'), ('MONTH_S', 'month'), ('DAY_S', 'day')):
    # e.g. ksi['YEAR_S'] = ksi['DATE'].dt.year
    ksi[part_col] = getattr(date_parts, part_name)

In [57]:
# Round the coordinates to two decimals and build a "lat, long" text key from them.
# The key is built from the float's default string form, so trailing zeros are
# dropped (43.80 becomes "43.8").
rounded_lat = ksi['LATITUDE'].round(2)
rounded_lon = ksi['LONGITUDE'].round(2)
ksi['NEW_LATITUDE'] = rounded_lat
ksi['NEW_LONGITUDE'] = rounded_lon
ksi['Combine_Lat_Long'] = rounded_lat.astype(str).str.cat(rounded_lon.astype(str), sep=", ")

In [58]:
# Reduce the LIGHT categories: fold every "<condition>, artificial" label into its
# base condition. Any value not listed in the mapping is left unchanged.
light_merge = {
    'Dark, artificial': 'Dark',
    'Dawn, artificial': 'Dawn',
    'Daylight, artificial': 'Daylight',
    'Dusk, artificial': 'Dusk',
}
for lit_label, base_label in light_merge.items():
    ksi['LIGHT'] = np.where(ksi['LIGHT'] == lit_label, base_label, ksi['LIGHT'])

In [59]:
# Binarise the accident class as string labels: '0' = non-fatal (property damage only
# or non-fatal injury), '1' = fatal. Any other value is carried through untouched.
acclass_raw = ksi['ACCLASS']
is_non_fatal = acclass_raw.isin(['Property Damage Only', 'Non-Fatal Injury'])
is_fatal = acclass_raw == 'Fatal'
ksi['ACCLASS'] = np.where(is_non_fatal, '0', np.where(is_fatal, '1', acclass_raw))

# Number of distinct accident numbers falling in each class
ksi.groupby('ACCLASS')['ACCNUM'].nunique()

ACCLASS
0    5196
1     806
Name: ACCNUM, dtype: int64

In [60]:
# Display the frame (pandas truncates the rows/columns shown) to eyeball the derived
# YEAR_S / MONTH_S / DAY_S and rounded-coordinate columns added above.
ksi

Unnamed: 0,X,Y,INDEX_,ACCNUM,YEAR,DATE,TIME,HOUR,STREET1,STREET2,...,POLICE_DIVISION,HOOD_ID,NEIGHBOURHOOD,ObjectId,YEAR_S,MONTH_S,DAY_S,NEW_LATITUDE,NEW_LONGITUDE,Combine_Lat_Long
0,-8.844611e+06,5.412414e+06,3387730,892658,2006,2006-03-11 05:00:00+00:00,852,8,BLOOR ST W,DUNDAS ST W,...,D11,88,High Park North (88),1,2006,3,11,43.66,-79.45,"43.66, -79.45"
1,-8.844611e+06,5.412414e+06,3387731,892658,2006,2006-03-11 05:00:00+00:00,852,8,BLOOR ST W,DUNDAS ST W,...,D11,88,High Park North (88),2,2006,3,11,43.66,-79.45,"43.66, -79.45"
2,-8.816480e+06,5.434843e+06,3388101,892810,2006,2006-03-11 05:00:00+00:00,915,9,MORNINGSIDE AVE,SHEPPARD AVE E,...,D42,131,Rouge (131),3,2006,3,11,43.80,-79.20,"43.8, -79.2"
3,-8.816480e+06,5.434843e+06,3388102,892810,2006,2006-03-11 05:00:00+00:00,915,9,MORNINGSIDE AVE,SHEPPARD AVE E,...,D42,131,Rouge (131),4,2006,3,11,43.80,-79.20,"43.8, -79.2"
4,-8.822759e+06,5.424516e+06,3387793,892682,2006,2006-03-12 05:00:00+00:00,240,2,EGLINTON AVE E,COMMONWEALTH AVE,...,D41,138,Eglinton East (138),5,2006,3,12,43.73,-79.26,"43.73, -79.26"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16855,-8.820837e+06,5.421411e+06,81509784,1636966,2020,2020-08-30 04:00:00+00:00,1340,13,BRIMLEY RD,BARKDENE HILLS,...,D41,123,Cliffcrest (123),16856,2020,8,30,43.71,-79.24,"43.71, -79.24"
16856,-8.820068e+06,5.425334e+06,81505452,1650701,2020,2020-09-01 04:00:00+00:00,1205,12,EGLINTON AVE E,BELLAMY RD N,...,D43,138,Eglinton East (138),16857,2020,9,1,43.74,-79.23,"43.74, -79.23"
16857,-8.820068e+06,5.425334e+06,81505453,1650701,2020,2020-09-01 04:00:00+00:00,1205,12,EGLINTON AVE E,BELLAMY RD N,...,D43,138,Eglinton East (138),16858,2020,9,1,43.74,-79.23,"43.74, -79.23"
16858,-8.820068e+06,5.425334e+06,81505454,1650701,2020,2020-09-01 04:00:00+00:00,1205,12,EGLINTON AVE E,BELLAMY RD N,...,D43,138,Eglinton East (138),16859,2020,9,1,43.74,-79.23,"43.74, -79.23"


In [61]:
# Recode the literal '<Null>' placeholder as 'Unknown' in each environmental field
for env_col in ('VISIBILITY', 'LIGHT', 'RDSFCOND'):
    ksi[env_col] = np.where(ksi[env_col] == '<Null>', 'Unknown', ksi[env_col])

In [62]:
# Cast the three environmental fields and the outcome to pandas' 'category' dtype
categorical_cols = ['VISIBILITY', 'LIGHT', 'RDSFCOND', 'ACCLASS']
for cat_col in categorical_cols:
    ksi[cat_col] = ksi[cat_col].astype('category')

In [63]:
# Keep just the accident number, the environmental fields and the outcome
environmental_cols = ['ACCNUM', 'VISIBILITY', 'LIGHT', 'RDSFCOND', 'ACCLASS']
ksi_environmental = ksi[environmental_cols]
ksi_environmental

Unnamed: 0,ACCNUM,VISIBILITY,LIGHT,RDSFCOND,ACCLASS
0,892658,Clear,Daylight,Dry,1
1,892658,Clear,Daylight,Dry,1
2,892810,Clear,Daylight,Dry,1
3,892810,Clear,Daylight,Dry,1
4,892682,Clear,Dark,Dry,1
...,...,...,...,...,...
16855,1636966,Clear,Daylight,Dry,0
16856,1650701,Clear,Daylight,Dry,0
16857,1650701,Clear,Daylight,Dry,0
16858,1650701,Clear,Daylight,Dry,0


In [64]:
# Remove the duplicate records so that repeated rows for the same accident collapse
# to a single row. The original row labels are kept (the index is not reset).
# NOTE(review): drop_duplicates() with no `subset` compares ALL five columns, so an
# ACCNUM whose rows disagree on VISIBILITY / LIGHT / RDSFCOND / ACCLASS would still
# appear more than once. If one-row-per-accident is required, confirm with
# ksi_environmental['ACCNUM'].is_unique.
ksi_environmental = ksi_environmental.drop_duplicates()
ksi_environmental

Unnamed: 0,ACCNUM,VISIBILITY,LIGHT,RDSFCOND,ACCLASS
0,892658,Clear,Daylight,Dry,1
2,892810,Clear,Daylight,Dry,1
4,892682,Clear,Dark,Dry,1
7,892913,Clear,Dark,Dry,0
9,893251,Clear,Daylight,Dry,1
...,...,...,...,...,...
16848,1603307,Clear,Daylight,Dry,0
16850,1604895,Clear,Dark,Dry,0
16852,1630486,Clear,Daylight,Dry,0
16854,1636966,Clear,Daylight,Dry,0


**LOGISTIC REGRESSION**

In [65]:
# Model specification: the environmental conditions are the inputs and the
# binarised accident class is the target.
outcome = 'ACCLASS'
predictors = [
    'VISIBILITY',
    'LIGHT',
    'RDSFCOND',
]

In [66]:
# One-hot encode the categorical predictors into "<column>_<level>" indicator columns.
# drop_first=True omits the first level of each predictor, which therefore acts as the
# reference category for the coefficients reported later.
X = pd.get_dummies(ksi_environmental[predictors], prefix_sep='_', drop_first=True)
# Target: the ACCLASS column ('0' / '1' labels), row-aligned with X.
y = ksi_environmental[outcome]

In [67]:
# Hold out 40% of the rows for validation; random_state=1 makes the shuffle reproducible.
# NOTE(review): no `stratify=` is passed, so the share of fatal ('1') cases in each
# partition is left to chance. Given the class imbalance seen above, consider stratify=y.
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)

In [68]:
# Fit a logistic regression on the training partition.
# C is the inverse of the regularization strength, so C=1e42 makes the penalty
# negligible and the fit is effectively an unpenalised maximum-likelihood model.
logit_reg = LogisticRegression(solver='liblinear', C=1e42, random_state=1)
logit_reg.fit(train_X, train_y)

LogisticRegression(C=1e+42, random_state=1, solver='liblinear')

In [69]:
# Score the validation partition.
# predict_proba returns one column per class, in the order of logit_reg.classes_.
logit_reg_prob = logit_reg.predict_proba(valid_X)
# Hard class labels for the same validation rows.
logit_reg_pred = logit_reg.predict(valid_X)

In [70]:
# Line up, for every validation row, the true class, the two class probabilities
# (first and second column of predict_proba) and the predicted label.
logit_result = pd.DataFrame(
    {
        'actual': valid_y,
        'p_0': logit_reg_prob[:, 0],
        'p_1': logit_reg_prob[:, 1],
        'Predicted': logit_reg_pred,
    }
)
logit_result

Unnamed: 0,actual,p_0,p_1,Predicted
3778,0,0.830237,0.169763,0
10731,0,0.885971,0.114029,0
6717,0,0.885971,0.114029,0
8239,0,0.885971,0.114029,0
9430,0,0.885971,0.114029,0
...,...,...,...,...
6274,0,0.858848,0.141152,0
6029,0,0.830237,0.169763,0
9009,0,0.885971,0.114029,0
12867,0,0.830237,0.169763,0


In [71]:
# Report the fitted intercept, then each coefficient alongside e**coef,
# sorted from the largest to the smallest coefficient.
print(logit_reg.intercept_)
coefficients = logit_reg.coef_[0]
coef_table = pd.DataFrame(
    {'coef': coefficients, 'odds': np.e**coefficients},
    index=X.columns,
)
print(coef_table.sort_values(by='coef', ascending=False))

[-1.58730643]
                                        coef          odds
RDSFCOND_Unknown                   10.573449  39083.247459
VISIBILITY_Unknown                  6.706615    817.797882
RDSFCOND_Packed Snow                1.630764      5.107774
VISIBILITY_Other                    1.147538      3.150428
RDSFCOND_Other                      1.050741      2.859769
VISIBILITY_Fog, Mist, Smoke, Dust   0.645024      1.906032
RDSFCOND_Wet                        0.244472      1.276947
LIGHT_Dawn                          0.011446      1.011512
RDSFCOND_Slush                      0.006782      1.006805
RDSFCOND_Spilled liquid             0.000000      1.000000
VISIBILITY_Strong wind              0.000000      1.000000
LIGHT_Dusk                         -0.118550      0.888207
RDSFCOND_Ice                       -0.216605      0.805248
VISIBILITY_Rain                    -0.402395      0.668716
VISIBILITY_Snow                    -0.441811      0.642871
LIGHT_Daylight                     -0.4629

In [72]:
# Confusion matrix and accuracy on the validation partition. Predictions are
# recomputed here with the same call that produced logit_reg_pred earlier.
classificationSummary(valid_y, logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.8742)

       Prediction
Actual    0    1
     0 2090    8
     1  294    9
