In [1]:
%matplotlib inline
import math
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import matplotlib.pylab as plt
from sklearn.ensemble import RandomForestClassifier

!pip install scikit-plot
import scikitplot as skplt
!pip install dmba
from dmba import plotDecisionTree, regressionSummary, classificationSummary, liftChart, gainsChart, adjusted_r2_score, exhaustive_search, backward_elimination, forward_selection, AIC_score, BIC_score

Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7
Collecting dmba
  Downloading dmba-0.0.19-py3-none-any.whl (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 25.7 MB/s 
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.0.19
no display found. Using non-interactive Agg backend


In [34]:
KSI = pd.read_csv('KSI.csv')

In [35]:
KSI_d = KSI.loc[:,['ACCLASS','INVTYPE','DRIVACT']]
KSI_d

Unnamed: 0,ACCLASS,INVTYPE,DRIVACT
0,Fatal,Driver,Failed to Yield Right of Way
1,Fatal,Pedestrian,<Null>
2,Fatal,Motorcycle Driver,Disobeyed Traffic Control
3,Fatal,Driver,Driving Properly
4,Fatal,Driver,Other
...,...,...,...
16855,Non-Fatal Injury,Cyclist,<Null>
16856,Non-Fatal Injury,Driver,Failed to Yield Right of Way
16857,Non-Fatal Injury,Driver,Driving Properly
16858,Non-Fatal Injury,Passenger,<Null>


In [36]:
objdtype_cols = KSI_d.select_dtypes(["object"]).columns
KSI_d[objdtype_cols] = KSI_d[objdtype_cols].astype('category')

In [37]:
driver = KSI_d[KSI_d['INVTYPE'].str.contains('Driver')]
driver

Unnamed: 0,ACCLASS,INVTYPE,DRIVACT
0,Fatal,Driver,Failed to Yield Right of Way
2,Fatal,Motorcycle Driver,Disobeyed Traffic Control
3,Fatal,Driver,Driving Properly
4,Fatal,Driver,Other
7,Non-Fatal Injury,Driver,Failed to Yield Right of Way
...,...,...,...
16853,Non-Fatal Injury,Motorcycle Driver,Driving Properly
16854,Non-Fatal Injury,Driver,Improper Passing
16856,Non-Fatal Injury,Driver,Failed to Yield Right of Way
16857,Non-Fatal Injury,Driver,Driving Properly


In [38]:
driver['ACCLASS'] = np.where(driver['ACCLASS'].str.contains('Non-Fatal'), 0, 1)
driver

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ACCLASS,INVTYPE,DRIVACT
0,1,Driver,Failed to Yield Right of Way
2,1,Motorcycle Driver,Disobeyed Traffic Control
3,1,Driver,Driving Properly
4,1,Driver,Other
7,0,Driver,Failed to Yield Right of Way
...,...,...,...
16853,0,Motorcycle Driver,Driving Properly
16854,0,Driver,Improper Passing
16856,0,Driver,Failed to Yield Right of Way
16857,0,Driver,Driving Properly


In [39]:
X = pd.get_dummies(driver.drop(columns=['ACCLASS','INVTYPE']))
y = driver['ACCLASS']

In [40]:
X

Unnamed: 0,DRIVACT_<Null>,DRIVACT_Disobeyed Traffic Control,DRIVACT_Driving Properly,DRIVACT_Exceeding Speed Limit,DRIVACT_Failed to Yield Right of Way,DRIVACT_Following too Close,DRIVACT_Improper Lane Change,DRIVACT_Improper Passing,DRIVACT_Improper Turn,DRIVACT_Lost control,DRIVACT_Other,DRIVACT_Speed too Fast For Condition,DRIVACT_Speed too Slow,DRIVACT_Wrong Way on One Way Road
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16853,0,0,1,0,0,0,0,0,0,0,0,0,0,0
16854,0,0,0,0,0,0,0,1,0,0,0,0,0,0
16856,0,0,0,0,1,0,0,0,0,0,0,0,0,0
16857,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [41]:
train_X, valid_X, train_y, valid_y = train_test_split(X,y, test_size=0.4, random_state=1)

In [42]:
fullClassTree = DecisionTreeClassifier(random_state=1)
fullClassTree.fit(train_X,train_y)

DecisionTreeClassifier(random_state=1)

In [43]:
plotDecisionTree(fullClassTree,feature_names=train_X.columns)

InvocationException: ignored

In [12]:
classificationSummary(valid_y,fullClassTree.predict(valid_X))

Confusion Matrix (Accuracy 0.8762)

       Prediction
Actual    0    1
     0 3009    0
     1  425    0


#Random Forest

In [44]:
rf = RandomForestClassifier(random_state=1, n_estimators=500)
rf.fit(train_X, train_y)

RandomForestClassifier(n_estimators=500, random_state=1)

In [45]:
importance = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

In [48]:
df = pd.DataFrame({'feature': train_X.columns,
                   'importance': importance,
                   'std':std})
print(df.sort_values('importance', ascending=False))

                                 feature  importance       std
3          DRIVACT_Exceeding Speed Limit    0.411521  0.143139
5            DRIVACT_Following too Close    0.154381  0.065801
9                   DRIVACT_Lost control    0.129367  0.088843
8                  DRIVACT_Improper Turn    0.086467  0.060996
2               DRIVACT_Driving Properly    0.039035  0.052323
4   DRIVACT_Failed to Yield Right of Way    0.038966  0.043604
0                         DRIVACT_<Null>    0.028757  0.031084
7               DRIVACT_Improper Passing    0.023441  0.023471
10                         DRIVACT_Other    0.020273  0.027762
13     DRIVACT_Wrong Way on One Way Road    0.020068  0.032346
1      DRIVACT_Disobeyed Traffic Control    0.017388  0.033547
6           DRIVACT_Improper Lane Change    0.015985  0.022457
11  DRIVACT_Speed too Fast For Condition    0.014350  0.028331
12                DRIVACT_Speed too Slow    0.000000  0.000000


In [49]:
tf = pd.DataFrame({'feature': train_X.columns,'importance': rf.feature_importances_,'std':std})
tf = tf.sort_values('importance',ascending = False)
sns.barplot(x = tf['importance'], y=feature_imp.index, xerr = tf['std'])
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title('Visualizing Importance Features')

ValueError: ignored

In [50]:
classificationSummary(valid_y, rf.predict(valid_X))

Confusion Matrix (Accuracy 0.8762)

       Prediction
Actual    0    1
     0 3009    0
     1  425    0


#Logistic Regression

In [51]:
logit_reg = LogisticRegression(solver='liblinear', C=1e42, random_state=1)
logit_reg.fit(train_X, train_y)

LogisticRegression(C=1e+42, random_state=1, solver='liblinear')

In [52]:
logit_reg.intercept_

array([-1.91760541])

In [53]:
logit_reg.coef_

array([[-0.67780243, -0.04087087,  0.0500161 ,  1.16481123, -0.18048538,
        -2.00789341, -0.31083572, -0.70237564, -0.6111081 ,  0.40369166,
         0.11301077,  0.05540441,  0.        ,  0.82683197]])

In [54]:
print(pd.DataFrame({'coef': logit_reg.coef_[0]}, index=X.columns))

                                          coef
DRIVACT_<Null>                       -0.677802
DRIVACT_Disobeyed Traffic Control    -0.040871
DRIVACT_Driving Properly              0.050016
DRIVACT_Exceeding Speed Limit         1.164811
DRIVACT_Failed to Yield Right of Way -0.180485
DRIVACT_Following too Close          -2.007893
DRIVACT_Improper Lane Change         -0.310836
DRIVACT_Improper Passing             -0.702376
DRIVACT_Improper Turn                -0.611108
DRIVACT_Lost control                  0.403692
DRIVACT_Other                         0.113011
DRIVACT_Speed too Fast For Condition  0.055404
DRIVACT_Speed too Slow                0.000000
DRIVACT_Wrong Way on One Way Road     0.826832


In [55]:
logit_reg_prob = logit_reg.predict_proba(valid_X)
logit_reg_pred = logit_reg.predict(valid_X)

In [56]:
logit_result = pd.DataFrame({'actual' : valid_y,
                             'p_0' : [p[0] for p in logit_reg_prob],
                             'p_1' : [p[1] for p in logit_reg_prob],
                             'predicted': logit_reg_pred})
logit_result

Unnamed: 0,actual,p_0,p_1,predicted
6798,0,0.876368,0.123632,0
6549,0,0.866179,0.133821,0
11766,0,0.876368,0.123632,0
14066,0,0.930565,0.069435,0
13096,0,0.866179,0.133821,0
...,...,...,...,...
10370,0,0.819641,0.180359,0
9824,0,0.876368,0.123632,0
3793,0,0.926130,0.073870,0
4604,0,0.819641,0.180359,0


In [57]:
classificationSummary(valid_y, logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.8762)

       Prediction
Actual    0    1
     0 3009    0
     1  425    0
