In [2]:
# ! pip install imbalanced-learn
import pandas as pd
import sklearn
import numpy as np


In [3]:
df = pd.read_csv("../../data/processed/cleaned_data.csv",dtype="category")
df.head()

Unnamed: 0,Route Type,Collision Type,Weather,Surface Condition,Light,Traffic Control,Driver Substance Abuse,Driver At Fault,Injury Severity,Driver Distracted By,Speed Limit,Day of Week,Time of Day
0,County,OTHER,CLEAR,DRY,DAYLIGHT,NO CONTROLS,DETECTED,Yes,No Injury,NOT DISTRACTED,15-25,Sunday,afternoon
1,County,OTHER,CLOUDY,DRY,DAYLIGHT,NO CONTROLS,NONE DETECTED,Yes,Minor Injury,NOT DISTRACTED,15-25,Monday,morning
2,Municipality,SAME DIR REAR END,CLEAR,DRY,DAWN,TRAFFIC SIGNAL,NONE DETECTED,No,No Injury,NOT DISTRACTED,30-40,Tuesday,morning
3,County,SINGLE VEHICLE,CLOUDY,DRY,DAYLIGHT,NO CONTROLS,NONE DETECTED,No,No Injury,NOT DISTRACTED,30-40,Tuesday,morning
4,County,SINGLE VEHICLE,CLEAR,DRY,DARK LIGHTS ON,NO CONTROLS,DETECTED,No,No Injury,NOT DISTRACTED,30-40,Thursday,dawn


In [13]:
# Random Forest can be utilized for feature selection by leveraging the feature importance scores provided by the algorithm
# Before conducting a classification algorithm: Feature importance can help you with feature selection or 
# feature engineering before training a classification algorithm. By assessing the importance of different 
# features, you can prioritize and select the most relevant ones, potentially improving the efficiency and 
# accuracy of your classification algorithm. 

# After conducting a classification algorithm: Feature importance can also be analyzed after training a 
# classification algorithm using Random Forest. you can examine the feature importance to gain insights into 
# which features had the most influence on the model's predictions. This retrospective analysis can help you
# understand the key factors driving the classification outcomes and interpret the model's behavior.

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
df_numeric = df.apply(lambda x: x.cat.codes)
y_numeric=df_numeric["Injury Severity"]
x_numeric=df_numeric.drop(["Injury Severity"],axis=1)
rf.fit(x_numeric, y_numeric)


In [14]:
importance = rf.feature_importances_
indices = np.argsort(importance)[::-1]  # Sort indices in descending order
for i, feature_index in enumerate(indices):
    print(f"Feature {i+1}: {x_numeric.columns[feature_index]} ({importance[feature_index]})")

Feature 1: Day of Week (0.24150974858501528)
Feature 2: Collision Type (0.1696195607792837)
Feature 3: Route Type (0.09373584935280219)
Feature 4: Time of Day (0.0869449529879933)
Feature 5: Light (0.07894559692671745)
Feature 6: Traffic Control (0.07730874596578606)
Feature 7: Weather (0.07452115068470318)
Feature 8: Speed Limit (0.051921275845526686)
Feature 9: Surface Condition (0.036270814441793454)
Feature 10: Driver Substance Abuse (0.03421606441378519)
Feature 11: Driver At Fault (0.03059462871459191)
Feature 12: Driver Distracted By (0.024411611302001646)


In [5]:
y=df["Injury Severity"]
x=df.drop(["Injury Severity"],axis=1)

In [19]:
nrow=df.shape[0]
ncol=df.shape[1]
print(nrow, ncol)


105379 13


In [21]:
# balance class attribute data

from sklearn.datasets import make_classification


class_distribution = [84376, 19981, 1022]
total= sum(class_distribution)
class_weights = {0: class_distribution[0]/total,
                 1: class_distribution[1]/total,
                 2: class_distribution[2]/total}
x, y = make_classification(
    n_samples=nrow,  # the number of rows in clean dataset
    n_features=ncol-1,  # Total number of features excluding the class attribute
    n_informative=ncol-1,  # Number of informative features in your dataset
    n_redundant=0,  # Number of redundant features 
    n_repeated=0,  # Number of repeated features 
    n_classes=3,  # Number of classes in class attribute
    weights=class_weights,  # Class distribution of the target variable
    random_state=42)

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [7]:
df["Injury Severity"].value_counts()

No Injury         84376
Minor Injury      19981
Serious Injury     1022
Name: Injury Severity, dtype: int64

In [23]:
print("Generated class distribution:")
print(np.bincount(y))


Generated class distribution:
[83894 20148  1337]


In [24]:
# ROSE

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
x_r_resampled, y_r_resampled = ros.fit_resample(x, y)

print("Before ROSE:")
print(np.bincount(y))

print("After ROSE:")
print(np.bincount(y_r_resampled))

Before ROSE:
[83894 20148  1337]
After ROSE:
[83894 83894 83894]


In [25]:
# SMOTE

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x_s_resampled, y_s_resampled = smote.fit_resample(x_r_resampled, y_r_resampled)

print("Before SMOTE:")
print(np.bincount(y_r_resampled))

print("After SMOTE:")
print(np.bincount(y_s_resampled))

Before SMOTE:
[83894 83894 83894]
After SMOTE:
[83894 83894 83894]


In [26]:
# cross validation: Stratified K-fold Cross-Validation
# Similar to K-fold, but it ensures that each fold maintains the same class distribution as the original dataset.

from sklearn.model_selection import StratifiedKFold

k=10
stratified_kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
# Iterate over the folds
for train_index, test_index in stratified_kfold.split(x_s_resampled, y_s_resampled):
    # Obtain the training and testing sets for this fold
    x_train, x_test = x_s_resampled[train_index], x_s_resampled[test_index]
    y_train, y_test = y_s_resampled[train_index], y_s_resampled[test_index]

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [27]:
# for training model, there are 5 algorithms have been selected: logistic regression, Naive bayes, KNN, 
# decision tree and gradient boosting algorithm

# logistic regression

from sklearn.linear_model import LogisticRegression

lr_model=LogisticRegression(
    random_state=42,
    solver="newton-cg",
    warm_start=True).fit(x_train,y_train)
y_pred=lr_model.predict(x_test)
lr_accuracy=accuracy_score(y_test,y_pred)
f1=f1_score(y_test,y_pred,average="weighted")
print("Accuracy:", lr_accuracy)
print("F1 Score: %.2f" % f1)

Accuracy: 0.6397409408773045
F1 Score: 0.64


In [28]:
# Naive bayes

from sklearn.naive_bayes import GaussianNB

nb_model=GaussianNB().fit(x_train,y_train)
y_pred=nb_model.predict(x_test)
nb_accuracy=accuracy_score(y_test,y_pred)
f1=f1_score(y_test,y_pred,average="weighted")
print("Accuracy:", nb_accuracy)
print("F1 Score: %.2f" % f1)

Accuracy: 0.6505483153210426
F1 Score: 0.65


In [29]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

knn_model=KNeighborsClassifier(n_neighbors=300).fit(x_train,y_train)
y_pred=knn_model.predict(x_test)
knn_accuracy=accuracy_score(y_test,y_pred)
f1=f1_score(y_test,y_pred,average="weighted")
print("Accuracy:", knn_accuracy)
print("F1 Score: %.2f" % f1)

Accuracy: 0.8835823267641449
F1 Score: 0.88


In [30]:
# decision tree

from sklearn.tree import DecisionTreeClassifier

ct_model=DecisionTreeClassifier(max_depth=10).fit(x_train,y_train)
y_pred=ct_model.predict(x_test)
ct_accuracy=accuracy_score(y_test,y_pred)
f1=f1_score(y_test,y_pred,average="weighted")
print("Accuracy:", ct_accuracy)
print("F1 Score: %.2f" % f1)

Accuracy: 0.837730451366815
F1 Score: 0.84


In [31]:
# gradient boosting algorithm

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gb_model=GradientBoostingClassifier().fit(x_train,y_train)
y_pred=gb_model.predict(x_test)
gb_accuracy=accuracy_score(y_test,y_pred)
f1=f1_score(y_test,y_pred,average="weighted")
print("Accuracy:", gb_accuracy)
print("F1 Score: %.2f" % f1)


Accuracy: 0.8122218690400509
F1 Score: 0.81


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105379 entries, 0 to 105378
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   Route Type              105379 non-null  category
 1   Collision Type          105379 non-null  category
 2   Weather                 105379 non-null  category
 3   Surface Condition       105379 non-null  category
 4   Light                   105379 non-null  category
 5   Traffic Control         105379 non-null  category
 6   Driver Substance Abuse  105379 non-null  category
 7   Driver At Fault         105379 non-null  category
 8   Injury Severity         105379 non-null  category
 9   Driver Distracted By    105379 non-null  category
 10  Speed Limit             105379 non-null  category
 11  Day of Week             105379 non-null  category
 12  Time of Day             105379 non-null  category
dtypes: category(13)
memory usage: 1.3 MB
