In [1]:
##Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt 
import plotly.offline as py
from sklearn import tree,linear_model,neighbors, svm
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV, KFold
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,f1_score,precision_score
from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import time            
import plotly.offline as py
from plotly import tools
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [2]:
## For ignoring warnings to view clean output
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
## Importing the dataset
data = pd.read_csv('Data/cleaned.csv',header=0)

In [4]:
### Separating Independent and Dependent features
X = data.iloc[:,:-1]
y = data.iloc[:, 14]

### Data Transformation
#### Handling Categorical Variables - Creating Dummy Variables

In [5]:
# Shows the columns with their number of categories each variable is having
for col in data.columns:
    print(col, ':', len(data[col].unique()), 'categories')

Age_band_of_driver : 5 categories
Sex_of_driver : 3 categories
Educational_level : 7 categories
Vehicle_driver_relation : 4 categories
Driving_experience : 8 categories
Lanes_or_Medians : 7 categories
Types_of_Junction : 8 categories
Road_surface_type : 6 categories
Light_conditions : 4 categories
Weather_conditions : 9 categories
Type_of_collision : 10 categories
Vehicle_movement : 13 categories
Pedestrian_movement : 9 categories
Cause_of_accident : 20 categories
Accident_severity : 3 categories


In [6]:
pd.get_dummies(data,drop_first=True).shape

(12316, 100)

In [7]:
X = pd.get_dummies(X, drop_first=True)

In [8]:
X.shape

(12316, 99)

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [10]:
print("Mean of the dataset: ", np.mean(X).round(8))
print("Standard deviation of the dataset: ", np.std(X).round(8))

Mean of the dataset:  0.0
Standard deviation of the dataset:  1.0


### Handling Class imbalance

In [11]:
### Checking for data imbalance 
y.value_counts()

2    10415
1     1743
0      158
Name: Accident_severity, dtype: int64

In [12]:
print('Slight Injury: ' + str(round(data['Accident_severity'].value_counts()[2] / len(data) * 100, 2)) + '%\nSerious Injury: ' + 
      str(round(data['Accident_severity'].value_counts()[1] / len(data) * 100, 2))  + '%\nFatal Injury: ' + 
      str(round(data['Accident_severity'].value_counts()[0] / len(data) * 100, 2)) + '%')

Slight Injury: 84.56%
Serious Injury: 14.15%
Fatal Injury: 1.28%


#### SMOTE Oversampling Techniques for handling imbalanced dataset

In [33]:
# Oversampling
sm = SMOTE(random_state=0)
X_over, y_over = sm.fit_sample(X, y)
## train test split
X_train_over,X_test_over,y_train_over,y_test_over = train_test_split(X_over,y_over,test_size=0.2,random_state=42)
#setting 20% aside as validation data for cross validation
x_train_t, x_train_v, y_train_t, y_train_v = train_test_split(X_train_over, y_train_over, test_size = 0.2, random_state = 42)

In [34]:
# Print class frequencies 
pd.Series(y_over).value_counts()

2    10415
1    10415
0    10415
Name: Accident_severity, dtype: int64

In [35]:
# print the shapes of our training and test set 
print(X_train_over.shape)
print(X_test_over.shape)
print(y_train_over.shape)
print(y_test_over.shape)

(24996, 99)
(6249, 99)
(24996,)
(6249,)


In [36]:
y_test_over.value_counts()

1    2100
0    2085
2    2064
Name: Accident_severity, dtype: int64

## Model training : SMOTE Oversampling Technique

#### Model Evaluation : F1 Macro
**In this problem domain all classes should be treated equally. So Macro F1-score will give the same importance to each label/
class. It will be low for models that only perform well on the common classes while performing poorly on the rare classes.***

### 1. Logistic Regression

In [37]:
start_time = time.time()
LR_model = linear_model.LogisticRegression()
# feeding the training data into the model
LR_model.fit(X_train_over, y_train_over)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 1.8251173496246338 sec


In [38]:
# predicting the values for x-test
y_pred = LR_model.predict(X_test_over)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
LR_r=recall_score(y_test_over,y_pred, average='macro')
LR_p=precision_score(y_test_over,y_pred, average='macro')
LR_f=f1_score(y_test_over,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_over, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_over, y_pred,target_names=name))
print("Recall:", LR_r)
print("Precision:", LR_p)
print("F1 score:", LR_f)

Confusion Matrix: - 
 [[1603  258  224]
 [ 542  935  623]
 [ 545  658  861]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.60      0.77      0.67      2085
     Serious       0.51      0.45      0.47      2100
      Slight       0.50      0.42      0.46      2064

    accuracy                           0.54      6249
   macro avg       0.54      0.54      0.53      6249
weighted avg       0.54      0.54      0.53      6249

Recall: 0.5437380660255848
Precision: 0.53504716740363
F1 score: 0.5337444169872961


### 2. Naive Bayes

In [39]:
start_time = time.time()
NB_model = GaussianNB()
# feeding the training data into the model
NB_model.fit(X_train_over, y_train_over)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.08576846122741699 sec


In [40]:
# predicting the values for x-test
y_pred = NB_model.predict(X_test_over)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
NB_r=recall_score(y_test_over,y_pred, average='macro')
NB_p=precision_score(y_test_over,y_pred, average='macro')
NB_f=f1_score(y_test_over,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_over, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_over, y_pred,target_names=name))
print("Recall:", NB_r)
print("Precision:", NB_p)
print("F1 score:", NB_f)

Confusion Matrix: - 
 [[2078    6    1]
 [1785  294   21]
 [1697  288   79]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.37      1.00      0.54      2085
     Serious       0.50      0.14      0.22      2100
      Slight       0.78      0.04      0.07      2064

    accuracy                           0.39      6249
   macro avg       0.55      0.39      0.28      6249
weighted avg       0.55      0.39      0.28      6249

Recall: 0.3916392932165895
Precision: 0.5519730750053423
F1 score: 0.27845083265740966


### 3. Decision Tree

In [41]:
start_time = time.time()
DT_model = tree.DecisionTreeClassifier()
# feeding the training data into the model
DT_model.fit(X_train_over, y_train_over)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.6382935047149658 sec


In [42]:
# predicting the values for x-test
y_pred = DT_model.predict(X_test_over)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
DT_r=recall_score(y_test_over,y_pred, average='macro')
DT_p=precision_score(y_test_over,y_pred, average='macro')
DT_f=f1_score(y_test_over,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_over, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_over, y_pred,target_names=name))
print("Recall:", DT_r)
print("Precision:", DT_p)
print("F1 score:", DT_f)

Confusion Matrix: - 
 [[2004   49   32]
 [  77 1702  321]
 [  31  347 1686]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.95      0.96      0.95      2085
     Serious       0.81      0.81      0.81      2100
      Slight       0.83      0.82      0.82      2064

    accuracy                           0.86      6249
   macro avg       0.86      0.86      0.86      6249
weighted avg       0.86      0.86      0.86      6249

Recall: 0.86282924490972
Precision: 0.8623294547736647
F1 score: 0.8625559431006898


### 4. K Nearest Neighbors

In [43]:
start_time = time.time()
knn_model = neighbors.KNeighborsClassifier()
# feeding the training data into the model
knn_model.fit(X_train_over, y_train_over)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 3.204432249069214 sec


In [44]:
# predicting the values for x-test
y_pred = knn_model.predict(X_test_over)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
knn_r=recall_score(y_test_over,y_pred, average='macro')
knn_p=precision_score(y_test_over,y_pred, average='macro')
knn_f=f1_score(y_test_over,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_over, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_over, y_pred,target_names=name))
print("Recall:", knn_r)
print("Precision:", knn_p)
print("F1 score:", knn_f)

Confusion Matrix: - 
 [[2085    0    0]
 [  16 2054   30]
 [ 159  909  996]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.92      1.00      0.96      2085
     Serious       0.69      0.98      0.81      2100
      Slight       0.97      0.48      0.64      2064

    accuracy                           0.82      6249
   macro avg       0.86      0.82      0.81      6249
weighted avg       0.86      0.82      0.81      6249

Recall: 0.820217792543374
Precision: 0.8621809801317865
F1 score: 0.8052535562718952


### 5. Support Vector Machine 

In [45]:
start_time = time.time()
svm_model = svm.SVC()
# feeding the training data into the model
svm_model.fit(X_train_over, y_train_over)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 91.62406516075134 sec


In [46]:
# predicting the values for x-test
y_pred = svm_model.predict(X_test_over)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
svm_r=recall_score(y_test_over,y_pred, average='macro')
svm_p=precision_score(y_test_over,y_pred, average='macro')
svm_f=f1_score(y_test_over,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_over, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_over, y_pred,target_names=name))
print("Recall:", svm_r)
print("Precision:", svm_p)
print("F1 score:", svm_f)

Confusion Matrix: - 
 [[2085    0    0]
 [  46 1774  280]
 [  70  383 1611]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.95      1.00      0.97      2085
     Serious       0.82      0.84      0.83      2100
      Slight       0.85      0.78      0.81      2064

    accuracy                           0.88      6249
   macro avg       0.87      0.88      0.87      6249
weighted avg       0.87      0.88      0.87      6249

Recall: 0.875095053525286
Precision: 0.8738884836934325
F1 score: 0.8736836352111036


### 6. AdaBoost

In [47]:
start_time = time.time()
adb_model = AdaBoostClassifier()
# feeding the training data into the model
adb_model.fit(X_train_over, y_train_over)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 5.280881643295288 sec


In [48]:
# predicting the values for x-test
y_pred = adb_model.predict(X_test_over)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
adb_r=recall_score(y_test_over,y_pred, average='macro')
adb_p=precision_score(y_test_over,y_pred, average='macro')
adb_f=f1_score(y_test_over,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_over, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_over, y_pred,target_names=name))
print("Recall:", adb_r)
print("Precision:", adb_p)
print("F1 score:", adb_f)

Confusion Matrix: - 
 [[1638  397   50]
 [ 536 1151  413]
 [   9  236 1819]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.75      0.79      0.77      2085
     Serious       0.65      0.55      0.59      2100
      Slight       0.80      0.88      0.84      2064

    accuracy                           0.74      6249
   macro avg       0.73      0.74      0.73      6249
weighted avg       0.73      0.74      0.73      6249

Recall: 0.738335066166336
Precision: 0.7308769120918267
F1 score: 0.7324507208604653


### 7. Random Forest

In [49]:
start_time = time.time()
RF_model = RandomForestClassifier()
# feeding the training data into the model
RF_model.fit(X_train_over, y_train_over)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 5.965054750442505 sec


In [50]:
# predicting the values for x-test
y_pred = RF_model.predict(X_test_over)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
RF_r=recall_score(y_test_over,y_pred, average='macro')
RF_p=precision_score(y_test_over,y_pred, average='macro')
RF_f=f1_score(y_test_over,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_over, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_over, y_pred,target_names=name))
print("Recall:", RF_r)
print("Precision:", RF_p)
print("F1 score:", RF_f)

Confusion Matrix: - 
 [[2056    4   25]
 [   5 1729  366]
 [   0   38 2026]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       1.00      0.99      0.99      2085
     Serious       0.98      0.82      0.89      2100
      Slight       0.84      0.98      0.90      2064

    accuracy                           0.93      6249
   macro avg       0.94      0.93      0.93      6249
weighted avg       0.94      0.93      0.93      6249

Recall: 0.9303378692394921
Precision: 0.937362595983863
F1 score: 0.9297903294975792


In [51]:
%matplotlib inline
fig = plt.figure(figsize=(5, 10))
models=['Logistic Regression','Naive Bayes Classifier','Decision Tree Classifier','K-Nearest Neighbor','Support Vector Machine','AdaBoost','Random Forest Classifier']
fig = go.Figure(data=[go.Bar(name='F1_score', x=models, y=[LR_f,NB_f,DT_f,knn_f,svm_f,adb_f,RF_f])])
fig.show()

<Figure size 360x720 with 0 Axes>