In [None]:
# Import numpy, pandas, matplotlib.pyplot, seaborn and the scikit-learn modules
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
plt.style.use('ggplot')
import seaborn as sns
# Import KNeighborsClassifier from sklearn.neighbors
from sklearn.neighbors import KNeighborsClassifier
# Import DecisionTreeClassifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier
# Import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import roc_curve, auc
# For Feature Explanations
import shap
# load JS visualization code to notebook
shap.initjs()

In [None]:
# Configuration: state code that is interpolated into the input CSV file name
STATE = "CA"

In [None]:
# Build the per-state input path first so it is easy to inspect or change
csv_path = f'../output/accident_data_{STATE}.csv'
df = pd.read_csv(csv_path)

# Preview the first rows of the loaded frame
df.head()

In [None]:
# Feature Extraction & Preprocessing

In [5]:
# Columns kept for machine learning ('Severity' is included because it is
# used as the prediction target further down the notebook)
feature_lst = [
    'Source', 'Severity',
    'Start_Lng', 'Start_Lat', 'Distance(mi)',
    'Side', 'City', 'County', 'State', 'Timezone',
    'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
    'Wind_Direction', 'Weather_Condition',
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
    'Railway', 'Roundabout', 'Station', 'Stop',
    'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
    'Sunrise_Sunset', 'Hour', 'Weekday',
    'previous_1', 'previous_2', 'previous_3',
]

In [6]:
# Restrict the data set to the selected features; .copy() gives an independent
# frame so later edits to df_sel do not touch df
df_sel = df[feature_lst].copy()

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line to the output)
df_sel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177180 entries, 0 to 177179
Data columns (total 35 columns):
Source               177180 non-null object
Severity             177180 non-null int64
Start_Lng            177180 non-null float64
Start_Lat            177180 non-null float64
Distance(mi)         177180 non-null float64
Side                 177180 non-null object
City                 177179 non-null object
County               177180 non-null object
State                177180 non-null object
Timezone             177114 non-null object
Temperature(F)       172080 non-null float64
Humidity(%)          171840 non-null float64
Pressure(in)         173590 non-null float64
Visibility(mi)       173103 non-null float64
Wind_Direction       172049 non-null object
Weather_Condition    173050 non-null object
Amenity              177180 non-null bool
Bump                 177180 non-null bool
Crossing             177180 non-null bool
Give_Way             177180 non-null bool
Junction   

In [7]:
# Fraction of missing entries per column (0.0 means the column is complete)
df_sel.isna().mean()

Source               0.000000
Severity             0.000000
Start_Lng            0.000000
Start_Lat            0.000000
Distance(mi)         0.000000
Side                 0.000000
City                 0.000006
County               0.000000
State                0.000000
Timezone             0.000373
Temperature(F)       0.028784
Humidity(%)          0.030139
Pressure(in)         0.020262
Visibility(mi)       0.023010
Wind_Direction       0.028959
Weather_Condition    0.023310
Amenity              0.000000
Bump                 0.000000
Crossing             0.000000
Give_Way             0.000000
Junction             0.000000
No_Exit              0.000000
Railway              0.000000
Roundabout           0.000000
Station              0.000000
Stop                 0.000000
Traffic_Calming      0.000000
Traffic_Signal       0.000000
Turning_Loop         0.000000
Sunrise_Sunset       0.000006
Hour                 0.000000
Weekday              0.000000
previous_1           0.000000
previous_2

In [8]:
# Drop every row that has at least one missing value.
# Restricting `subset` to the columns that contain nulls selects exactly the
# same rows as considering all columns, so the plain call is equivalent.
# Reassigning instead of inplace=True avoids mutating the frame in place.
df_sel = df_sel.dropna(how='any', axis=0)

# Remaining (rows, columns)
df_sel.shape

(169881, 35)

In [9]:
# Remove the 'State' column; the result is a new frame, df_sel is unchanged
df_state = df_sel.drop(columns=['State'])

# Column dtypes and non-null counts of the reduced frame
df_state.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169881 entries, 0 to 177179
Data columns (total 34 columns):
Source               169881 non-null object
Severity             169881 non-null int64
Start_Lng            169881 non-null float64
Start_Lat            169881 non-null float64
Distance(mi)         169881 non-null float64
Side                 169881 non-null object
City                 169881 non-null object
County               169881 non-null object
Timezone             169881 non-null object
Temperature(F)       169881 non-null float64
Humidity(%)          169881 non-null float64
Pressure(in)         169881 non-null float64
Visibility(mi)       169881 non-null float64
Wind_Direction       169881 non-null object
Weather_Condition    169881 non-null object
Amenity              169881 non-null bool
Bump                 169881 non-null bool
Crossing             169881 non-null bool
Give_Way             169881 non-null bool
Junction             169881 non-null bool
No_Exit      

In [10]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column
df_train = pd.get_dummies(data=df_state, drop_first=True)

# Summary of the encoded frame (size, dtypes, memory usage)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169881 entries, 0 to 177179
Columns: 1149 entries, Severity to Weekday_Wed
dtypes: bool(13), float64(10), int64(2), uint8(1124)
memory usage: 201.1 MB


In [11]:
# Name of the response column
target = 'Severity'

# Separate the predictors (X) from the response (y)
X = df_train.drop(columns=[target])
y = df_train[target]

# Hold out 20% of the rows for testing; stratify on y so both parts keep the
# same class proportions, and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

In [None]:
# Logistic regression with scikit-learn's default settings;
# fit() returns the fitted estimator, so creation and training are chained
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Share of test rows that were classified correctly
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("[Logistic regression algorithm] accuracy_score: {:.3f}.".format(acc))

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))

In [None]:
# Decision tree with default hyper-parameters; the seed is fixed (same value
# as the train/test split) so that re-running the cell gives the same tree
model = DecisionTreeClassifier(random_state=21)

model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)

# The label names the model actually evaluated in this cell
print("[Decision tree algorithm] accuracy_score: {:.3f}.".format(acc))

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))

In [12]:
# Random forest restricted to a single tree (n_estimators=1); the seed is
# fixed (same value as the train/test split) so the bootstrap sample and the
# feature sub-sampling are repeatable. This `model` is the one explained with
# SHAP in the cells below.
model = RandomForestClassifier(n_estimators=1, random_state=21)

model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Get the accuracy score
acc = accuracy_score(y_test, y_pred)

# The label names the model actually evaluated in this cell
print("[Random forest algorithm] accuracy_score: {:.3f}.".format(acc))

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))

[Logistic regression algorithm] accuracy_score: 0.848.
              precision    recall  f1-score   support

       False       0.90      0.90      0.90     26692
        True       0.65      0.65      0.65      7285

    accuracy                           0.85     33977
   macro avg       0.77      0.78      0.78     33977
weighted avg       0.85      0.85      0.85     33977



In [13]:
# Build a SHAP tree explainer for the fitted model; no background data is
# passed, so SHAP chooses the feature-perturbation mode itself
explainer = shap.TreeExplainer(model=model)

Setting feature_perturbation = "tree_path_dependent" because no background data was given.


In [14]:
# Explain a fixed random subset of 200 test rows; seeding the sample keeps the
# SHAP plots below identical across re-runs
top = X_test.sample(n=200, random_state=21)

In [15]:
# Compute SHAP values for the sampled test rows.
# NOTE(review): the recorded run of this cell raised SHAPError ("Additivity
# check failed in TreeExplainer"). The error text suggests retrying with
# feature_perturbation='interventional' or passing check_additivity=False;
# the root cause is not established here — TODO investigate before relying
# on the plots below.
shap_values = explainer.shap_values(top)

SHAPError: Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the explainer is the same shape that the model was trained on. If your data shape is correct then please report this on GitHub. Consider retrying with the feature_perturbation='interventional' option. This check failed because for one of the samples the sum of the SHAP values was 244585.705663, while the model output was 1.000000. If this difference is acceptable you can set check_additivity=False to disable this check.

In [None]:
# Summary plot of the SHAP values, using the sampled rows as feature values
shap.summary_plot(shap_values, features=top)

In [None]:
# Force plot built from the first entry of the explainer's expected values
# and the first entry of the SHAP values
base_value = explainer.expected_value[0]
first_shap = shap_values[0]
shap.force_plot(base_value, first_shap)