In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split

In [2]:
data = '../data/NCDOT_BikePedCrash.csv'

BikeCrash = pd.read_csv(data)

In [101]:
features = ['BikeInjury', 'Weather', 'CrashHour', 'CrashDay', 'CrashMonth']
BikeCrashTrunc = BikeCrash[features]

In [102]:
WeekDays = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

BikeCrashTrunc['CrashDay'] = pd.Categorical(BikeCrashTrunc['CrashDay'], categories=WeekDays, ordered=True)

BikeCrashTrunc = BikeCrashTrunc.sort_values(by='CrashDay')

cat_columns = BikeCrashTrunc.select_dtypes(['category']).columns

BikeCrashTrunc[cat_columns] = BikeCrashTrunc[cat_columns].apply(lambda x: x.cat.codes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BikeCrashTrunc['CrashDay'] = pd.Categorical(BikeCrashTrunc['CrashDay'], categories=WeekDays, ordered=True)


In [103]:
Months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

BikeCrashTrunc['CrashMonth'] = pd.Categorical(BikeCrashTrunc['CrashMonth'], categories=Months, ordered=True)

BikeCrashTrunc = BikeCrashTrunc.sort_values(by='CrashMonth')

cat_columns = BikeCrashTrunc.select_dtypes(['category']).columns

BikeCrashTrunc[cat_columns] = BikeCrashTrunc[cat_columns].apply(lambda x: x.cat.codes)


In [104]:
BikeCrashTrunc.BikeInjury.unique()

array(['B: Suspected Minor Injury', 'O: No Injury',
       'A: Suspected Serious Injury', 'C: Possible Injury', 'K: Killed',
       'Unknown Injury'], dtype=object)

In [105]:
BikeCrashTrunc = BikeCrashTrunc.drop(BikeCrashTrunc[BikeCrashTrunc.BikeInjury == 'Unknown Injury'].index)

In [106]:
BikeCrashTrunc = BikeCrashTrunc.replace(['O: No Injury', 'B: Suspected Minor Injury', 'C: Possible Injury', 'A: Suspected Serious Injury', 'K: Killed'], ['Minor Injury', 'Minor Injury', 'Minor Injury', 'Serious Injury', 'Serious Injury'])

In [107]:
BikeCrashTrunc = pd.get_dummies(BikeCrashTrunc, columns = ['Weather'], dtype=float)

In [108]:
BikeCrashTrunc

Unnamed: 0,BikeInjury,CrashHour,CrashDay,CrashMonth,RdConfig,Weather_Clear,Weather_Cloudy,"Weather_Fog, Smog, Smoke",Weather_Other,Weather_Rain,"Weather_Snow, Sleet, Hail, Freezing Rain/Drizzle"
1069,Minor Injury,11,3,0,"Two-Way, Divided, Unprotected Median",1.0,0.0,0.0,0.0,0.0,0.0
3888,Minor Injury,16,0,0,"Two-Way, Not Divided",1.0,0.0,0.0,0.0,0.0,0.0
10397,Serious Injury,16,4,0,"Two-Way, Not Divided",0.0,0.0,0.0,0.0,1.0,0.0
9831,Minor Injury,16,2,0,"Two-Way, Not Divided",1.0,0.0,0.0,0.0,0.0,0.0
2781,Minor Injury,18,6,0,"Two-Way, Not Divided",1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
9172,Minor Injury,17,2,11,"Two-Way, Divided, Unprotected Median",1.0,0.0,0.0,0.0,0.0,0.0
9179,Minor Injury,15,2,11,"Two-Way, Not Divided",1.0,0.0,0.0,0.0,0.0,0.0
9185,Minor Injury,15,2,11,"Two-Way, Not Divided",1.0,0.0,0.0,0.0,0.0,0.0
992,Minor Injury,9,2,11,"Two-Way, Not Divided",1.0,0.0,0.0,0.0,0.0,0.0


In [109]:
X = BikeCrashTrunc.iloc[:, 1:]
y = BikeCrashTrunc.iloc[:, 0]

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=216)

In [111]:
dtc = DecisionTreeClassifier()

In [112]:
dtc.fit(X_train, y_train)

ValueError: could not convert string to float: 'Two-Way, Not Divided'

In [87]:
y_pred = dtc.predict(X_test)

In [88]:
from sklearn.metrics import confusion_matrix

In [89]:
print(confusion_matrix(y_test, y_pred))
# I don't understand what this all means at the moment

[[1986   54]
 [ 140    6]]


In [90]:
from sklearn.metrics import classification_report

In [91]:
print(classification_report(y_test, y_pred))

                precision    recall  f1-score   support

  Minor Injury       0.93      0.97      0.95      2040
Serious Injury       0.10      0.04      0.06       146

      accuracy                           0.91      2186
     macro avg       0.52      0.51      0.51      2186
  weighted avg       0.88      0.91      0.89      2186



In [98]:
pd.DataFrame({'feature_importance_score':dtc.feature_importances_}, index = X.columns).sort_values('feature_importance_score',
                                                ascending=False)

Unnamed: 0,feature_importance_score
CrashMonth,0.314974
CrashDay,0.300857
CrashHour,0.291164
Weather_Clear,0.033969
Weather_Cloudy,0.029503
Weather_Rain,0.021324
"Weather_Fog, Smog, Smoke",0.006543
"Weather_Snow, Sleet, Hail, Freezing Rain/Drizzle",0.001569
Weather_Other,9.8e-05
