In [54]:
##Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt 
import plotly.offline as py
from sklearn import tree,linear_model,neighbors, svm
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV, KFold
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,f1_score,precision_score
from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import time
import matplotlib.pyplot as plt                
import plotly.offline as py
from plotly import tools
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [55]:
## For ignoring warnings to view clean output
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [56]:
## Importing the dataset
df = pd.read_csv('Data/cleaned.csv',header=0)

In [58]:
df.sample(3)

Unnamed: 0,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Lanes_or_Medians,Types_of_Junction,Road_surface_type,Light_conditions,Weather_conditions,Type_of_collision,Vehicle_movement,Pedestrian_movement,Cause_of_accident,Accident_severity
11783,Over 51,Male,Junior high school,Employee,2-5yr,Two-way (divided with broken lines road marking),Y Shape,Asphalt roads,Daylight,Normal,Vehicle with vehicle collision,Getting off,Not a Pedestrian,Driving carelessly,2
8924,Under 18,Male,Elementary school,Owner,2-5yr,other,Y Shape,Other,Daylight,Normal,Vehicle with vehicle collision,Going straight,Not a Pedestrian,Moving Backward,1
4250,Over 51,Male,Junior high school,Employee,2-5yr,Two-way (divided with broken lines road marking),No junction,Asphalt roads,Daylight,Normal,Vehicle with vehicle collision,U-Turn,Unknown or other,No priority to vehicle,2


In [59]:
### Separating Independent and Dependent features
X = data.iloc[:,:-1]
y = data.iloc[:, 14]

### Data Transformation
#### Handling Categorical Variables - Creating Dummy Variables

In [22]:
# Shows the columns with their number of categories each variable is having
for col in data.columns:
    print(col, ':', len(data[col].unique()), 'categories')

Age_band_of_driver : 5 categories
Sex_of_driver : 3 categories
Educational_level : 7 categories
Vehicle_driver_relation : 4 categories
Driving_experience : 8 categories
Lanes_or_Medians : 7 categories
Types_of_Junction : 8 categories
Road_surface_type : 6 categories
Light_conditions : 4 categories
Weather_conditions : 9 categories
Type_of_collision : 10 categories
Vehicle_movement : 13 categories
Pedestrian_movement : 9 categories
Cause_of_accident : 20 categories
Accident_severity : 3 categories


In [10]:
pd.get_dummies(data,drop_first=True).shape

(12316, 101)

In [60]:
X = pd.get_dummies(X, drop_first=True)

In [61]:
X.shape

(12316, 99)

In [62]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [63]:
print("Mean of the dataset: ", np.mean(X).round(8))
print("Standard deviation of the dataset: ", np.std(X).round(8))

Mean of the dataset:  0.0
Standard deviation of the dataset:  1.0


### Handling Class imbalance

In [64]:
### Checking for data imbalance 
y.value_counts()

2    10415
1     1743
0      158
Name: Accident_severity, dtype: int64

In [65]:
print('Slight Injury: ' + str(round(data['Accident_severity'].value_counts()[2] / len(data) * 100, 2)) + '%\nSerious Injury: ' + 
      str(round(data['Accident_severity'].value_counts()[1] / len(data) * 100, 2))  + '%\nFatal Injury: ' + 
      str(round(data['Accident_severity'].value_counts()[0] / len(data) * 100, 2)) + '%')

Slight Injury: 84.56%
Serious Injury: 14.15%
Fatal Injury: 1.28%


#### Splitting the dataset into training and test data for normal distribution

In [66]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
#setting 20% aside as validation data for cross validation
x_train_t, x_train_v, y_train_t, y_train_v = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [67]:
# print the shapes of our training and test set 
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(9852, 99)
(2464, 99)
(9852,)
(2464,)


In [68]:
y_test.value_counts()

2    2064
1     363
0      37
Name: Accident_severity, dtype: int64

## Model training with normal class distribution

#### Model Evaluation : F1 Macro
**In this problem domain all classes should be treated equally. So Macro F1-score will give the same importance to each label/class. It will be low for models that only perform well on the common classes while performing poorly on the rare classes.***

### 1. Logistic Regression

In [69]:
start_time = time.time()
LR_model = linear_model.LogisticRegression(class_weight='balanced', multi_class='ovr')
# feeding the training data into the model
LR_model.fit(X_train, y_train)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.9414849281311035 sec


In [70]:
# predicting the values for x-test
y_pred = LR_model.predict(X_test)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
LR_r=recall_score(y_test,y_pred, average='macro')
LR_p=precision_score(y_test,y_pred, average='macro')
LR_f=f1_score(y_test,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred,target_names=name))
print("Recall:", LR_r)
print("Precision:", LR_r)
print("F1 score:", LR_r)

Confusion Matrix: - 
 [[  14   12   11]
 [ 107  112  144]
 [ 484  560 1020]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.02      0.38      0.04        37
     Serious       0.16      0.31      0.21       363
      Slight       0.87      0.49      0.63      2064

    accuracy                           0.47      2464
   macro avg       0.35      0.39      0.30      2464
weighted avg       0.75      0.47      0.56      2464

Recall: 0.39370145659786254
Precision: 0.39370145659786254
F1 score: 0.39370145659786254


### 2. Naive Bayes

In [71]:
start_time = time.time()
NB_model = GaussianNB()
# feeding the training data into the model
NB_model.fit(X_train, y_train)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.24031949043273926 sec


In [72]:
# predicting the values for x-test
y_pred = NB_model.predict(X_test)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
NB_r=recall_score(y_test,y_pred, average='macro')
NB_p=precision_score(y_test,y_pred, average='macro')
NB_f=f1_score(y_test,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred,target_names=name))
print("Recall:", NB_r)
print("Precision:", NB_p)
print("F1 score:", NB_f)

Confusion Matrix: - 
 [[  32    5    0]
 [ 306   53    4]
 [1677  326   61]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.02      0.86      0.03        37
     Serious       0.14      0.15      0.14       363
      Slight       0.94      0.03      0.06      2064

    accuracy                           0.06      2464
   macro avg       0.36      0.35      0.08      2464
weighted avg       0.81      0.06      0.07      2464

Recall: 0.3468082126908765
Precision: 0.3641210883650399
F1 score: 0.07679797314874669


### 3. Decision Tree

In [73]:
start_time = time.time()
DT_model = tree.DecisionTreeClassifier(class_weight='balanced', random_state=101)
# feeding the training data into the model
DT_model.fit(X_train, y_train)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.6552484035491943 sec


In [74]:
# predicting the values for x-test
y_pred = DT_model.predict(X_test)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
DT_r=recall_score(y_test,y_pred, average='macro')
DT_p=precision_score(y_test,y_pred, average='macro')
DT_f=f1_score(y_test,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred,target_names=name))
print("Recall:", DT_r)
print("Precision:", DT_p)
print("F1 score:", DT_f)

Confusion Matrix: - 
 [[   3    8   26]
 [   6   54  303]
 [  46  304 1714]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.05      0.08      0.07        37
     Serious       0.15      0.15      0.15       363
      Slight       0.84      0.83      0.83      2064

    accuracy                           0.72      2464
   macro avg       0.35      0.35      0.35      2464
weighted avg       0.73      0.72      0.72      2464

Recall: 0.3534225894162469
Precision: 0.3470162494933204
F1 score: 0.34934601660022735


### 4. K Nearest Neighbors

In [75]:
start_time = time.time()
knn_model = neighbors.KNeighborsClassifier()
# feeding the training data into the model
knn_model.fit(X_train, y_train)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.8317763805389404 sec


In [76]:
# predicting the values for x-test
y_pred = knn_model.predict(X_test)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
knn_r=recall_score(y_test,y_pred, average='macro')
knn_p=precision_score(y_test,y_pred, average='macro')
knn_f=f1_score(y_test,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred,target_names=name))
print("Recall:", knn_r)
print("Precision:", knn_p)
print("F1 score:", knn_f)

Confusion Matrix: - 
 [[   0    1   36]
 [   1   10  352]
 [   1   64 1999]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.00      0.00      0.00        37
     Serious       0.13      0.03      0.05       363
      Slight       0.84      0.97      0.90      2064

    accuracy                           0.82      2464
   macro avg       0.32      0.33      0.31      2464
weighted avg       0.72      0.82      0.76      2464

Recall: 0.3320186537681253
Precision: 0.32359540101475587
F1 score: 0.31462907280254776


### 5. Support Vector Machine 

In [77]:
start_time = time.time()
svm_model = svm.SVC(class_weight='balanced',decision_function_shape='ovr')
# feeding the training data into the model
svm_model.fit(X_train, y_train)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 29.80839729309082 sec


In [78]:
# predicting the values for x-test
y_pred = svm_model.predict(X_test)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
svm_r=recall_score(y_test,y_pred, average='macro')
svm_p=precision_score(y_test,y_pred, average='macro')
svm_f=f1_score(y_test,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred,target_names=name))
print("Recall:", svm_r)
print("Precision:", svm_p)
print("F1 score:", svm_f)

Confusion Matrix: - 
 [[   4   12   21]
 [  21  125  217]
 [ 115  551 1398]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.03      0.11      0.05        37
     Serious       0.18      0.34      0.24       363
      Slight       0.85      0.68      0.76      2064

    accuracy                           0.62      2464
   macro avg       0.35      0.38      0.35      2464
weighted avg       0.74      0.62      0.67      2464

Recall: 0.37659543552778224
Precision: 0.35492690082230655
F1 score: 0.34624737075607115


### 6. AdaBoost

In [79]:
start_time = time.time()
adb_model = AdaBoostClassifier()
# feeding the training data into the model
adb_model.fit(X_train, y_train)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 1.9867198467254639 sec


In [80]:
# predicting the values for x-test
y_pred = adb_model.predict(X_test)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
adb_r=recall_score(y_test,y_pred, average='macro')
adb_p=precision_score(y_test,y_pred, average='macro')
adb_f=f1_score(y_test,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred,target_names=name))
print("Recall:", adb_r)
print("Precision:", adb_p)
print("F1 score:", adb_f)

Confusion Matrix: - 
 [[   0    0   37]
 [   0    0  363]
 [   0    0 2064]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.00      0.00      0.00        37
     Serious       0.00      0.00      0.00       363
      Slight       0.84      1.00      0.91      2064

    accuracy                           0.84      2464
   macro avg       0.28      0.33      0.30      2464
weighted avg       0.70      0.84      0.76      2464

Recall: 0.3333333333333333
Precision: 0.2792207792207792
F1 score: 0.303886925795053



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### 7. Random Forest

In [81]:
start_time = time.time()
RF_model = RandomForestClassifier(class_weight='balanced', max_depth=2, random_state=42)
# feeding the training data into the model
RF_model.fit(X_train, y_train)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.6981334686279297 sec


In [82]:
# predicting the values for x-test
y_pred = RF_model.predict(X_test)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
RF_r=recall_score(y_test,y_pred, average='macro')
RF_p=precision_score(y_test,y_pred, average='macro')
RF_f=f1_score(y_test,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred,target_names=name))
print("Recall:", RF_r)
print("Precision:", RF_p)
print("F1 score:", RF_f)

Confusion Matrix: - 
 [[ 14  12  11]
 [111 109 143]
 [561 586 917]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.02      0.38      0.04        37
     Serious       0.15      0.30      0.20       363
      Slight       0.86      0.44      0.59      2064

    accuracy                           0.42      2464
   macro avg       0.34      0.37      0.28      2464
weighted avg       0.74      0.42      0.52      2464

Recall: 0.3743122687361588
Precision: 0.3435966245684192
F1 score: 0.2758246054811236


In [83]:
%matplotlib inline
fig = plt.figure(figsize=(5, 10))
models=['Logistic Regression','Naive Bayes Classifier','Decision Tree Classifier','K-Nearest Neighbor','Support Vector Machine','AdaBoost','Random Forest Classifier']
fig = go.Figure(data=[go.Bar(name='recall_score', x=models, y=[LR_f,NB_f,DT_f,knn_f,svm_f,adb_f,RF_f])])
fig.show()

<Figure size 360x720 with 0 Axes>