In [139]:
# Plotting groups of categorical variables is so far proving fruitless. Connections must be there (I hope), but how to find them is difficult. I think it is time to 
# just start forming a logistic regression model and see what happens. I am hoping it will give any insight into how different characteristics effect injury severity.
# This notebook will be a first attempt at a simple random forest classifier. 

In [140]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

sns.set_style("whitegrid")

In [141]:
data = '/home/wbrave1/Desktop/erdos/Bicycle-Safety/data/NCDOT_BikePedCrash.csv'

BikeCrash = pd.read_csv(data)

In [142]:
# I think for the sake of simplicity, I will take only a few features from the whole dataset. BikeInjury, Weather, CrashHour, CrashDay, and CrashMonth. This choice is made 
# because everythin besides BikeInjury can be encoded in a straightforward way. Weather will be one hot encoded, CrashMonth numbered 1 to 12.

features = ['BikeInjury', 'Weather', 'CrashHour', 'CrashDay', 'CrashMonth', 'RdConfig']

#features = ['BikeInjury', 'Weather', 'CrashHour', 'CrashDay', 'CrashMonth']

BikeCrashTrunc = BikeCrash[features]

In [143]:
WeekDays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

BikeCrashTrunc['CrashDay'] = pd.Categorical(BikeCrashTrunc['CrashDay'], categories=WeekDays, ordered=True)

BikeCrashTrunc = BikeCrashTrunc.sort_values(by='CrashDay')

cat_columns = BikeCrashTrunc.select_dtypes(['category']).columns

BikeCrashTrunc[cat_columns] = BikeCrashTrunc[cat_columns].apply(lambda x: x.cat.codes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BikeCrashTrunc['CrashDay'] = pd.Categorical(BikeCrashTrunc['CrashDay'], categories=WeekDays, ordered=True)


In [144]:
Months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

BikeCrashTrunc['CrashMonth'] = pd.Categorical(BikeCrashTrunc['CrashMonth'], categories=Months, ordered=True)

BikeCrashTrunc = BikeCrashTrunc.sort_values(by='CrashMonth')

cat_columns = BikeCrashTrunc.select_dtypes(['category']).columns

BikeCrashTrunc[cat_columns] = BikeCrashTrunc[cat_columns].apply(lambda x: x.cat.codes)

In [145]:
BikeCrashTrunc = BikeCrashTrunc.drop(BikeCrashTrunc[BikeCrashTrunc.BikeInjury == 'Unknown Injury'].index)
BikeCrashTrunc = BikeCrashTrunc.drop(BikeCrashTrunc[BikeCrashTrunc.RdConfig == 'Unknown'].index)

In [146]:
BikeCrashTrunc = pd.get_dummies(BikeCrashTrunc, columns = ['Weather'], dtype=float)
BikeCrashTrunc = pd.get_dummies(BikeCrashTrunc, columns = ['RdConfig'], dtype=float)

In [147]:
BikeCrashTrunc = BikeCrashTrunc.replace(['O: No Injury', 'B: Suspected Minor Injury', 'C: Possible Injury', 'A: Suspected Serious Injury', 'K: Killed'], ['Minor Injury', 'Minor Injury', 'Minor Injury', 'Serious Injury', 'Serious Injury'])

In [148]:
# Okay, I've got all the features encoded in some way.

from sklearn.model_selection import train_test_split
oversample = SMOTE()

In [149]:
X = BikeCrashTrunc.iloc[:, 1:]
y = BikeCrashTrunc.iloc[:, 0]

In [150]:
CrashTrainX, CrashTestX, CrashTrainY, CrashTestY = train_test_split(X, y,
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2)

In [151]:
CrashTrainX_res, CrashTrainY_res = oversample.fit_resample(CrashTrainX, CrashTrainY)

In [152]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [153]:
max_depths = range(1, 16)
n_trees = [100, 750]

grid_cv = GridSearchCV(RandomForestClassifier(), 
                          param_grid = {'max_depth':max_depths, 
                                        'n_estimators':n_trees}, 
                          scoring = 'accuracy', 
                          cv = 5) 

In [154]:

grid_cv.fit(CrashTrainX_res, CrashTrainY_res)

In [155]:
grid_cv.best_params_

{'max_depth': 10, 'n_estimators': 100}

In [156]:
grid_cv.best_score_

np.float64(0.7231714448862503)

In [157]:
grid_cv.best_estimator_.predict(CrashTrainX_res)

array(['Minor Injury', 'Minor Injury', 'Minor Injury', ...,
       'Minor Injury', 'Serious Injury', 'Minor Injury'],
      shape=(15934,), dtype=object)

In [158]:
grid_cv.cv_results_

{'mean_fit_time': array([0.15468936, 0.70807424, 0.16718135, 0.81490846, 0.19830513,
        0.91972413, 0.21315222, 1.02619638, 0.22200131, 1.10386572,
        0.25556221, 1.21856976, 0.2649169 , 1.30351663, 0.28614068,
        1.3899807 , 0.30539017, 1.48223238, 0.31570477, 1.57188292]),
 'std_fit_time': array([0.0167461 , 0.05298515, 0.00454148, 0.0079746 , 0.00106876,
        0.03149549, 0.00379832, 0.01232444, 0.00094583, 0.00377562,
        0.00233456, 0.03002836, 0.00348161, 0.01176479, 0.00227309,
        0.01072852, 0.00409126, 0.01119124, 0.00272364, 0.02166346]),
 'mean_score_time': array([0.00875149, 0.02843018, 0.00901704, 0.03293209, 0.01041121,
        0.03642006, 0.01106853, 0.04033666, 0.01144886, 0.04402895,
        0.01308074, 0.05000868, 0.01376233, 0.05510983, 0.01506171,
        0.06064067, 0.01693125, 0.06827292, 0.01764035, 0.07608442]),
 'std_score_time': array([0.00068523, 0.00142335, 0.0003078 , 0.00039884, 0.00019767,
        0.0009184 , 0.00026045, 0.000715

In [159]:
pd.DataFrame({'feature_importance_score':grid_cv.best_estimator_.feature_importances_},
                 index=CrashTrainX.columns).sort_values('feature_importance_score',
                                                ascending=False)

Unnamed: 0,feature_importance_score
"RdConfig_Two-Way, Not Divided",0.204877
"RdConfig_Two-Way, Divided, Unprotected Median",0.155217
CrashHour,0.123555
Weather_Clear,0.121867
Weather_Cloudy,0.098793
CrashMonth,0.07285
"RdConfig_Two-Way, Divided, Positive Median Barrier",0.059254
CrashDay,0.055873
Weather_Rain,0.054207
"RdConfig_One-Way, Not Divided",0.042472


In [160]:
# look into decision trees 

In [161]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

train_forest_pred = grid_cv.predict(CrashTrainX_res)
test_forest_pred = grid_cv.predict(CrashTestX)

print(f"Test accuracy:", accuracy_score(CrashTestY, test_forest_pred))
print(f"Training accuracy:", accuracy_score(CrashTrainY_res, train_forest_pred))

Test accuracy: 0.7708333333333334
Training accuracy: 0.7357851135935735


In [162]:
cv_scores = cross_val_score(grid_cv, CrashTestX, CrashTestY, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

KeyboardInterrupt: 