In [86]:
# Plotting groups of categorical variables is so far proving fruitless. Connections must be there (I hope), but how to find them is difficult. I think it is time to 
# just start forming a logistic regression model and see what happens. I am hoping it will give any insight into how different characteristics effect injury severity.
# This notebook will be a first attempt at a simple random forest classifier. 

In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

sns.set_style("whitegrid")

In [88]:
data = '/home/wbrave1/Desktop/erdos/Bicycle-Safety/data/NCDOT_BikePedCrash.csv'

BikeCrash = pd.read_csv(data)

In [89]:
# I think for the sake of simplicity, I will take only a few features from the whole dataset. BikeInjury, Weather, CrashHour, CrashDay, and CrashMonth. This choice is made 
# because everythin besides BikeInjury can be encoded in a straightforward way. Weather will be one hot encoded, CrashMonth numbered 1 to 12.

features = ['BikeInjury', 'Weather', 'CrashHour', 'CrashDay', 'CrashMonth', 'RdConfig']

#features = ['BikeInjury', 'Weather', 'CrashHour', 'CrashDay', 'CrashMonth']

BikeCrashTrunc = BikeCrash[features]

In [90]:
WeekDays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

BikeCrashTrunc['CrashDay'] = pd.Categorical(BikeCrashTrunc['CrashDay'], categories=WeekDays, ordered=True)

BikeCrashTrunc = BikeCrashTrunc.sort_values(by='CrashDay')

cat_columns = BikeCrashTrunc.select_dtypes(['category']).columns

BikeCrashTrunc[cat_columns] = BikeCrashTrunc[cat_columns].apply(lambda x: x.cat.codes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BikeCrashTrunc['CrashDay'] = pd.Categorical(BikeCrashTrunc['CrashDay'], categories=WeekDays, ordered=True)


In [91]:
Months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

BikeCrashTrunc['CrashMonth'] = pd.Categorical(BikeCrashTrunc['CrashMonth'], categories=Months, ordered=True)

BikeCrashTrunc = BikeCrashTrunc.sort_values(by='CrashMonth')

cat_columns = BikeCrashTrunc.select_dtypes(['category']).columns

BikeCrashTrunc[cat_columns] = BikeCrashTrunc[cat_columns].apply(lambda x: x.cat.codes)

In [92]:
BikeCrashTrunc = BikeCrashTrunc.drop(BikeCrashTrunc[BikeCrashTrunc.BikeInjury == 'Unknown Injury'].index)
BikeCrashTrunc = BikeCrashTrunc.drop(BikeCrashTrunc[BikeCrashTrunc.RdConfig == 'Unknown'].index)

In [93]:
BikeCrashTrunc = pd.get_dummies(BikeCrashTrunc, columns = ['Weather'], dtype=float)
BikeCrashTrunc = pd.get_dummies(BikeCrashTrunc, columns = ['RdConfig'], dtype=float)

In [94]:
BikeCrashTrunc = BikeCrashTrunc.replace(['O: No Injury', 'B: Suspected Minor Injury', 'C: Possible Injury', 'A: Suspected Serious Injury', 'K: Killed'], ['Minor Injury', 'Minor Injury', 'Minor Injury', 'Serious Injury', 'Serious Injury'])

In [95]:
# Okay, I've got all the features encoded in some way.

from sklearn.model_selection import train_test_split
oversample = SMOTE()

In [96]:
X = BikeCrashTrunc.iloc[:, 1:]
y = BikeCrashTrunc.iloc[:, 0]

In [97]:
CrashTrainX, CrashTestX, CrashTrainY, CrashTestY = train_test_split(X, y,
                                              shuffle=True,
                                              random_state=123,
                                              test_size=.2)

In [98]:
CrashTrainX_res, CrashTrainY_res = oversample.fit_resample(CrashTrainX, CrashTrainY)

In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [100]:
max_depths = range(1, 20)
n_trees = [100, 1000]

grid_cv = GridSearchCV(RandomForestClassifier(), 
                          param_grid = {'max_depth':max_depths, 
                                        'n_estimators':n_trees}, 
                          scoring = 'accuracy', 
                          cv = 5) 

In [101]:

grid_cv.fit(CrashTrainX_res, CrashTrainY_res)

In [102]:
grid_cv.best_params_

{'max_depth': 10, 'n_estimators': 500}

In [103]:
grid_cv.best_score_

np.float64(0.7191546361739892)

In [104]:
grid_cv.best_estimator_.predict(CrashTrainX_res)

array(['Serious Injury', 'Minor Injury', 'Minor Injury', ...,
       'Serious Injury', 'Serious Injury', 'Minor Injury'],
      shape=(15934,), dtype=object)

In [105]:
grid_cv.cv_results_

{'mean_fit_time': array([0.15121331, 0.68170757, 0.18319225, 0.80501471, 0.19293084,
        0.91829319, 0.21057439, 1.03206773, 0.2366159 , 1.15868692,
        0.26318989, 1.27116489, 0.27422781, 1.28939257, 0.30056181,
        1.48297138, 0.31680279, 1.50357146, 0.32746553, 1.62899323]),
 'std_fit_time': array([0.01299543, 0.01929323, 0.00820971, 0.01230208, 0.00494868,
        0.02873924, 0.01071541, 0.02877243, 0.00121004, 0.04503755,
        0.0062094 , 0.00974006, 0.00901876, 0.01152203, 0.01712433,
        0.04855606, 0.01221017, 0.02289813, 0.00654222, 0.04109485]),
 'mean_score_time': array([0.00857892, 0.02782416, 0.00968947, 0.0320591 , 0.01025729,
        0.03600292, 0.01056013, 0.04054317, 0.01195397, 0.04506922,
        0.01303062, 0.05180573, 0.01392388, 0.05583258, 0.01529698,
        0.06454167, 0.01696205, 0.06759052, 0.01837506, 0.07774887]),
 'std_score_time': array([2.59329834e-04, 5.04319376e-04, 5.17424097e-04, 9.01259676e-04,
        5.42883270e-04, 7.06467399e-

In [None]:
pd.DataFrame({'feature_importance_score':grid_cv.best_estimator_.feature_importances_},
                 index=CrashTrainX.columns[1:]).sort_values('feature_importance_score',
                                                ascending=False)

ValueError: Length of values (13) does not match length of index (6)

In [20]:
# look into decision trees 

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

train_forest_pred = grid_cv.predict(CrashTrainX_res)
test_forest_pred = grid_cv.predict(CrashTestX)

print(f"Test accuracy:", accuracy_score(CrashTestY, test_forest_pred))
print(f"Training accuracy:", accuracy_score(CrashTrainY_res, train_forest_pred))

Test accuracy: 0.7708142726440989
Training accuracy: 0.7329376854599406


In [85]:
cv_scores = cross_val_score(grid_cv, CrashTestX, CrashTestY, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

Cross-validation scores: [0.92009132 0.9221968  0.91990847 0.91990847 0.91990847]
Mean cross-validation score: 0.9204027041994504
