In [190]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [191]:
def col_val_to_int(col_val, true_val):
    if col_val in true_val:
        return 1
    else:
        return 0

def crash_day_to_int(day):
    if day == 'Monday':
        return 1
    elif day == 'Tuesday':
        return 2
    elif day == 'Wednesday':
        return 3
    elif day == 'Thursday':
        return 4
    elif day == 'Friday':
        return 5
    elif day == 'Saturday':
        return 6
    elif day == 'Sunday':
        return 7
    else:
        return -1
    
def crash_month_to_int(month):
    if month == 'January':
        return 1
    elif month == 'February':
        return 2
    elif month == 'March':
        return 3
    elif month == 'April':
        return 4
    elif month == 'May':
        return 5
    elif month == 'June':
        return 6
    elif month == 'July':
        return 7
    elif month == 'August':
        return 8
    elif month == 'September':
        return 9
    elif month == 'October':
        return 10
    elif month == 'November':
        return 11
    elif month == 'December':
        return 12
    else:
        return -1
    
def two_digit_to_int(two_digit):
    if two_digit == 'Unknown':
        return 0
    else:
        return int(two_digit[0 : 2])

In [192]:
columns = [
    'AmbulanceR',
    'BikeAge',
    'BikeDir',
    'BikeInjury',
    'BikePos',
    'BikeRace',
    'BikeSex',
    'CrashDay',
    'CrashHour',
    'CrashLoc',
    'CrashMonth',
    'DrvrVehTyp',
    'LightCond',
    'Locality',
    'NumLanes',
    'RdCharacte',
    'RdClass',
    'RdConditio',
    'RdConfig',
    'RdFeature',
    'RdSurface',
    'Region',
    'SpeedLimit',
    'TraffCntrl',
    'Weather'
    ]
true_val = [
    ['BikeDir', ['With Traffic']],
    ['BikePos', ['Travel Lane']],
    ['BikeRace', ['White']],
    ['BikeSex', ['Male']],
    ['CrashLoc', ['Non-Intersection']],
    ['CrashLoc', ['Intersection']],
    ['DrvrVehTyp', ['Passenger Car']],
    ['LightCond', ['Daylight']],
    ['Locality', ['Urban (>70% Developed)']],
    ['RdCharacte', ['Straight - Level']],
    ['RdClass', ['Local Street']],
    ['RdConditio', ['Dry']],
    ['RdConfig', ['Two-Way, Not Divided']],
    ['RdFeature', ['No Special Feature']],
    ['RdSurface', ['Smooth Asphalt']],
    ['Region', ['Piedmont']],
    ['TraffCntrl', ['No Control Present']],
    ['Weather', ['Clear']]
    ]

In [193]:
data = pd.read_csv('../data/NCDOT_BikePedCrash.csv')
data = data[columns]
data = data[data.BikeInjury != 'Unknown Injury']

In [194]:
for pair in true_val:
    data[pair[0] + '_' + pair[1][0]] = data[pair[0]].apply(col_val_to_int, true_val=pair[1])
data = data.drop(columns=list(set([pair[0] for pair in true_val])))
data['BikeAge'] = data['BikeAge'].apply(two_digit_to_int)
data['CrashDay'] = data['CrashDay'].apply(crash_day_to_int)
data['CrashMonth'] = data['CrashMonth'].apply(crash_month_to_int)
data['NumLanes'] = data['NumLanes'].apply(two_digit_to_int)
data['SpeedLimit'] = data['SpeedLimit'].apply(two_digit_to_int)

In [195]:
kfold = StratifiedKFold(n_splits=5,
                           shuffle=True,
                           random_state=21)

In [196]:
X = data.drop(columns=['AmbulanceR', 'BikeInjury'])
y = data['AmbulanceR']

log_reg = LogisticRegression(max_iter=10000)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for i, (train_index, test_index) in enumerate(kfold.split(X_train, y_train)):
    ## get the kfold training data
    X_train_train = X_train.iloc[train_index,:]
    y_train_train = y_train.iloc[train_index]
    
    ## get the holdout data
    X_holdout = X_train.iloc[test_index,:]
    y_holdout = y_train.iloc[test_index]

    log_reg.fit(X_train_train, y_train_train)
    y_pred = log_reg.predict(X_holdout)

    print('Accuracy = ' + str(accuracy_score(y_holdout, y_pred)))
    print('Classification Report:')
    print(classification_report(y_holdout, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_holdout, y_pred))

Accuracy = 0.6883933676386507
Classification Report:
              precision    recall  f1-score   support

          No       0.39      0.03      0.05       537
         Yes       0.69      0.98      0.81      1212

    accuracy                           0.69      1749
   macro avg       0.54      0.50      0.43      1749
weighted avg       0.60      0.69      0.58      1749

Confusion Matrix:
[[  14  523]
 [  22 1190]]
Accuracy = 0.6927917620137299
Classification Report:
              precision    recall  f1-score   support

          No       0.48      0.03      0.05       536
         Yes       0.70      0.99      0.82      1212

    accuracy                           0.69      1748
   macro avg       0.59      0.51      0.43      1748
weighted avg       0.63      0.69      0.58      1748

Confusion Matrix:
[[  14  522]
 [  15 1197]]
Accuracy = 0.698512585812357
Classification Report:
              precision    recall  f1-score   support

          No       0.70      0.03      0.06

In [197]:
X = data.drop(columns=['AmbulanceR', 'BikeInjury'])
y = data['BikeInjury']

log_reg = LogisticRegression(max_iter=10000)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for i, (train_index, test_index) in enumerate(kfold.split(X_train, y_train)):
    ## get the kfold training data
    X_train_train = X_train.iloc[train_index,:]
    y_train_train = y_train.iloc[train_index]
    
    ## get the holdout data
    X_holdout = X_train.iloc[test_index,:]
    y_holdout = y_train.iloc[test_index]

    log_reg.fit(X_train_train, y_train_train)
    y_pred = log_reg.predict(X_holdout)

    print('Accuracy = ' + str(accuracy_score(y_holdout, y_pred)))
    print('Classification Report:')
    print(classification_report(y_holdout, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_holdout, y_pred))

Accuracy = 0.47913093196112067
Classification Report:
                             precision    recall  f1-score   support

A: Suspected Serious Injury       0.00      0.00      0.00        90
  B: Suspected Minor Injury       0.48      0.65      0.55       753
         C: Possible Injury       0.48      0.50      0.49       690
                  K: Killed       0.00      0.00      0.00        41
               O: No Injury       0.00      0.00      0.00       175

                   accuracy                           0.48      1749
                  macro avg       0.19      0.23      0.21      1749
               weighted avg       0.40      0.48      0.43      1749

Confusion Matrix:
[[  0  67  23   0   0]
 [  0 491 262   0   0]
 [  0 343 347   0   0]
 [  0  35   6   0   0]
 [  0  88  87   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy = 0.4816933638443936
Classification Report:
                             precision    recall  f1-score   support

A: Suspected Serious Injury       0.00      0.00      0.00        90
  B: Suspected Minor Injury       0.48      0.66      0.56       752
         C: Possible Injury       0.49      0.50      0.49       691
                  K: Killed       0.00      0.00      0.00        40
               O: No Injury       0.00      0.00      0.00       175

                   accuracy                           0.48      1748
                  macro avg       0.19      0.23      0.21      1748
               weighted avg       0.40      0.48      0.43      1748

Confusion Matrix:
[[  0  70  20   0   0]
 [  0 497 255   0   0]
 [  0 345 345   1   0]
 [  0  38   2   0   0]
 [  0  86  89   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy = 0.4754004576659039
Classification Report:
                             precision    recall  f1-score   support

A: Suspected Serious Injury       0.00      0.00      0.00        90
  B: Suspected Minor Injury       0.47      0.66      0.55       752
         C: Possible Injury       0.48      0.49      0.49       691
                  K: Killed       0.00      0.00      0.00        40
               O: No Injury       0.00      0.00      0.00       175

                   accuracy                           0.48      1748
                  macro avg       0.19      0.23      0.21      1748
               weighted avg       0.39      0.48      0.43      1748

Confusion Matrix:
[[  0  72  18   0   0]
 [  0 495 256   1   0]
 [  0 355 336   0   0]
 [  0  32   8   0   0]
 [  0  99  76   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy = 0.4685354691075515
Classification Report:
                             precision    recall  f1-score   support

A: Suspected Serious Injury       0.00      0.00      0.00        90
  B: Suspected Minor Injury       0.48      0.63      0.54       752
         C: Possible Injury       0.46      0.49      0.47       691
                  K: Killed       1.00      0.05      0.10        40
               O: No Injury       0.00      0.00      0.00       175

                   accuracy                           0.47      1748
                  macro avg       0.39      0.24      0.22      1748
               weighted avg       0.41      0.47      0.42      1748

Confusion Matrix:
[[  0  64  26   0   0]
 [  0 477 275   0   0]
 [  0 351 340   0   0]
 [  0  28  10   2   0]
 [  0  79  96   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy = 0.46395881006864986
Classification Report:
                             precision    recall  f1-score   support

A: Suspected Serious Injury       0.00      0.00      0.00        91
  B: Suspected Minor Injury       0.47      0.62      0.54       752
         C: Possible Injury       0.46      0.50      0.47       690
                  K: Killed       0.00      0.00      0.00        41
               O: No Injury       0.00      0.00      0.00       174

                   accuracy                           0.46      1748
                  macro avg       0.19      0.22      0.20      1748
               weighted avg       0.38      0.46      0.42      1748

Confusion Matrix:
[[  0  63  28   0   0]
 [  0 469 281   2   0]
 [  0 347 342   1   0]
 [  0  31  10   0   0]
 [  0  85  89   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [198]:
X = data.drop(columns=['AmbulanceR', 'BikeInjury'])
y = data['BikeInjury'].apply(col_val_to_int, true_val=['A: Suspected Serious Injury', 'K: Killed'])

log_reg = LogisticRegression(max_iter=10000)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

for i, (train_index, test_index) in enumerate(kfold.split(X_train, y_train)):
    ## get the kfold training data
    X_train_train = X_train.iloc[train_index,:]
    y_train_train = y_train.iloc[train_index]
    
    ## get the holdout data
    X_holdout = X_train.iloc[test_index,:]
    y_holdout = y_train.iloc[test_index]

    log_reg.fit(X_train_train, y_train_train)
    y_pred = log_reg.predict(X_holdout)

    print('Accuracy = ' + str(accuracy_score(y_holdout, y_pred)))
    print('Classification Report:')
    print(classification_report(y_holdout, y_pred))
    print('Confusion Matrix:')
    print(confusion_matrix(y_holdout, y_pred))

Accuracy = 0.9251000571755289
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1618
           1       0.00      0.00      0.00       131

    accuracy                           0.93      1749
   macro avg       0.46      0.50      0.48      1749
weighted avg       0.86      0.93      0.89      1749

Confusion Matrix:
[[1618    0]
 [ 131    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy = 0.9250572082379863
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1618
           1       0.00      0.00      0.00       130

    accuracy                           0.93      1748
   macro avg       0.46      0.50      0.48      1748
weighted avg       0.86      0.93      0.89      1748

Confusion Matrix:
[[1617    1]
 [ 130    0]]
Accuracy = 0.9256292906178489
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1618
           1       0.00      0.00      0.00       130

    accuracy                           0.93      1748
   macro avg       0.46      0.50      0.48      1748
weighted avg       0.86      0.93      0.89      1748

Confusion Matrix:
[[1618    0]
 [ 130    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy = 0.9250572082379863
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1617
           1       0.00      0.00      0.00       131

    accuracy                           0.93      1748
   macro avg       0.46      0.50      0.48      1748
weighted avg       0.86      0.93      0.89      1748

Confusion Matrix:
[[1617    0]
 [ 131    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy = 0.9250572082379863
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1617
           1       0.00      0.00      0.00       131

    accuracy                           0.93      1748
   macro avg       0.46      0.50      0.48      1748
weighted avg       0.86      0.93      0.89      1748

Confusion Matrix:
[[1617    0]
 [ 131    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [205]:
grid_cv = GridSearchCV(RandomForestClassifier(), 
                          param_grid = {'max_depth':range(1, 11), 
                                        'n_estimators':[100, 500]}, 
                          scoring = 'average_precision', 
                          cv = 5)

In [206]:
X = data.drop(columns=['AmbulanceR', 'BikeInjury'])
y = data['AmbulanceR']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

grid_cv.fit(X_train, y_train)
y_pred = grid_cv.best_estimator_.predict(X_test)

print('Accuracy = ' + str(accuracy_score(y_test, y_pred)))
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print(pd.DataFrame({'feature_importance_score':grid_cv.best_estimator_.feature_importances_}, index=X_train.columns).sort_values('feature_importance_score', ascending=False))

Traceback (most recent call last):
  File "c:\Users\xgr6vm\.conda\envs\erdos_spring_2025\Lib\site-packages\sklearn\model_selection\_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\xgr6vm\.conda\envs\erdos_spring_2025\Lib\site-packages\sklearn\metrics\_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\xgr6vm\.conda\envs\erdos_spring_2025\Lib\site-packages\sklearn\metrics\_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\xgr6vm\.conda\envs\erdos_spring_2025\Lib\site-packages\sklearn\metrics\_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\xgr6vm\.conda\env

Accuracy = 0.6793229643183898
Classification Report:
              precision    recall  f1-score   support

          No       0.00      0.00      0.00       701
         Yes       0.68      1.00      0.81      1485

    accuracy                           0.68      2186
   macro avg       0.34      0.50      0.40      2186
weighted avg       0.46      0.68      0.55      2186

Confusion Matrix:
[[   0  701]
 [   0 1485]]
                                 feature_importance_score
SpeedLimit                                           0.17
Locality_Urban (>70% Developed)                      0.13
NumLanes                                             0.12
BikeAge                                              0.12
LightCond_Daylight                                   0.08
CrashHour                                            0.07
BikePos_Travel Lane                                  0.05
CrashMonth                                           0.05
BikeRace_White                                       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [204]:
X = data.drop(columns=['AmbulanceR', 'BikeInjury'])
y = data['BikeInjury']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

grid_cv.fit(X_train, y_train)
y_pred = grid_cv.best_estimator_.predict(X_test)

print('Accuracy = ' + str(accuracy_score(y_test, y_pred)))
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print(pd.DataFrame({'feature_importance_score':grid_cv.best_estimator_.feature_importances_}, index=X_train.columns).sort_values('feature_importance_score', ascending=False))

Traceback (most recent call last):
  File "c:\Users\xgr6vm\.conda\envs\erdos_spring_2025\Lib\site-packages\sklearn\model_selection\_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\xgr6vm\.conda\envs\erdos_spring_2025\Lib\site-packages\sklearn\metrics\_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\xgr6vm\.conda\envs\erdos_spring_2025\Lib\site-packages\sklearn\metrics\_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "c:\Users\xgr6vm\.conda\envs\erdos_spring_2025\Lib\site-packages\sklearn\metrics\_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\xgr6vm\.conda\env

KeyboardInterrupt: 

In [None]:
X = data.drop(columns=['AmbulanceR', 'BikeInjury'])
y = data['BikeInjury'].apply(col_val_to_int, true_val=['A: Suspected Serious Injury', 'K: Killed'])

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

grid_cv.fit(X_train, y_train)
y_pred = grid_cv.best_estimator_.predict(X_test)

print('Accuracy = ' + str(accuracy_score(y_test, y_pred)))
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print(pd.DataFrame({'feature_importance_score':grid_cv.best_estimator_.feature_importances_}, index=X_train.columns).sort_values('feature_importance_score', ascending=False))

Accuracy = 0.9251000571755289
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1618
           1       0.00      0.00      0.00       131

    accuracy                           0.93      1749
   macro avg       0.46      0.50      0.48      1749
weighted avg       0.86      0.93      0.89      1749

Confusion Matrix:
[[1618    0]
 [ 131    0]]
                                 feature_importance_score
SpeedLimit                                           0.19
RdClass_Local Street                                 0.16
Locality_Urban (>70% Developed)                      0.14
BikeAge                                              0.10
BikeDir_With Traffic                                 0.09
BikePos_Travel Lane                                  0.09
LightCond_Daylight                                   0.06
CrashLoc_Intersection                                0.04
CrashLoc_Non-Intersection                            

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy = 0.9256292906178489
Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1618
           1       0.00      0.00      0.00       130

    accuracy                           0.93      1748
   macro avg       0.46      0.50      0.48      1748
weighted avg       0.86      0.93      0.89      1748

Confusion Matrix:
[[1618    0]
 [ 130    0]]
                                 feature_importance_score
SpeedLimit                                           0.19
BikeAge                                              0.15
RdClass_Local Street                                 0.13
Locality_Urban (>70% Developed)                      0.12
CrashLoc_Non-Intersection                            0.08
BikePos_Travel Lane                                  0.07
CrashLoc_Intersection                                0.06
BikeDir_With Traffic                                 0.05
LightCond_Daylight                                   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 