In [1]:
##Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt 
import plotly.offline as py
from sklearn import tree,linear_model,neighbors, svm
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV, KFold
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,recall_score,f1_score,precision_score
from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import time
import matplotlib.pyplot as plt                
import plotly.offline as py
from plotly import tools
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

In [2]:
## For ignoring warnings to view clean output
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
## Importing the dataset
data = pd.read_csv('Data/cleaned.csv',header=0)

In [4]:
### Separating Independent and Dependent features
X = data.iloc[:,:-1]
y = data.iloc[:, 14]

### Data Transformation
#### Handling Categorical Variables - Creating Dummy Variables

In [5]:
# Shows the columns with their number of categories each variable is having
for col in data.columns:
    print(col, ':', len(data[col].unique()), 'categories')

Age_band_of_driver : 5 categories
Sex_of_driver : 3 categories
Educational_level : 7 categories
Vehicle_driver_relation : 4 categories
Driving_experience : 8 categories
Lanes_or_Medians : 7 categories
Types_of_Junction : 8 categories
Road_surface_type : 6 categories
Light_conditions : 4 categories
Weather_conditions : 9 categories
Type_of_collision : 10 categories
Vehicle_movement : 13 categories
Pedestrian_movement : 9 categories
Cause_of_accident : 20 categories
Accident_severity : 3 categories


In [6]:
pd.get_dummies(data,drop_first=True).shape

(12316, 100)

In [7]:
X = pd.get_dummies(X, drop_first=True)

In [8]:
X.shape

(12316, 99)

In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [10]:
print("Mean of the dataset: ", np.mean(X).round(8))
print("Standard deviation of the dataset: ", np.std(X).round(8))

Mean of the dataset:  0.0
Standard deviation of the dataset:  1.0


### Handling Class imbalance

In [11]:
### Checking for data imbalance 
y.value_counts()

2    10415
1     1743
0      158
Name: Accident_severity, dtype: int64

In [12]:
print('Slight Injury: ' + str(round(data['Accident_severity'].value_counts()[2] / len(data) * 100, 2)) + '%\nSerious Injury: ' + 
      str(round(data['Accident_severity'].value_counts()[1] / len(data) * 100, 2))  + '%\nFatal Injury: ' + 
      str(round(data['Accident_severity'].value_counts()[0] / len(data) * 100, 2)) + '%')

Slight Injury: 84.56%
Serious Injury: 14.15%
Fatal Injury: 1.28%


####  Random Undersampling Techniques for handling imbalanced dataset

In [17]:
# Undersampling 
rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_sample(X, y)
## train test split
X_train_under,X_test_under,y_train_under,y_test_under = train_test_split(X_under,y_under,test_size=0.2,random_state=42)
#setting 20% aside as validation data for cross validation
x_train_t, x_train_v, y_train_t, y_train_v = train_test_split(X_train_under, y_train_under, test_size = 0.2, random_state = 42)

In [18]:
# Print class frequencies 
pd.Series(y_under).value_counts()

2    158
1    158
0    158
Name: Accident_severity, dtype: int64

In [19]:
y_test_under.value_counts()

0    35
2    33
1    27
Name: Accident_severity, dtype: int64

## Model training : Undersampling Technique

#### Model Evaluation : F1 Macro
**In this problem domain all classes should be treated equally. So Macro F1-score will give the same importance to each label/class. It will be low for models that only perform well on the common classes while performing poorly on the rare classes.***

### 1. Logistic Regression

In [20]:
start_time = time.time()
LR_model = linear_model.LogisticRegression()
# feeding the training data into the model
LR_model.fit(X_train_under, y_train_under)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.5046496391296387 sec


In [21]:
# predicting the values for x-test
y_pred = LR_model.predict(X_test_under)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
LR_r=recall_score(y_test_under,y_pred, average='macro')
LR_p=precision_score(y_test_under,y_pred, average='macro')
LR_f=f1_score(y_test_under,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_under, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_under, y_pred,target_names=name))
print("Recall:", LR_r)
print("Precision:", LR_r)
print("F1 score:", LR_r)

Confusion Matrix: - 
 [[16  9 10]
 [10 11  6]
 [11  4 18]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.43      0.46      0.44        35
     Serious       0.46      0.41      0.43        27
      Slight       0.53      0.55      0.54        33

    accuracy                           0.47        95
   macro avg       0.47      0.47      0.47        95
weighted avg       0.47      0.47      0.47        95

Recall: 0.4700016033349366
Precision: 0.4700016033349366
F1 score: 0.4700016033349366


### 2. Naive Bayes

In [22]:
start_time = time.time()
NB_model = GaussianNB()
# feeding the training data into the model
NB_model.fit(X_train_under, y_train_under)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.002994060516357422 sec


In [23]:
# predicting the values for x-test
y_pred = NB_model.predict(X_test_under)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
NB_r=recall_score(y_test_under,y_pred, average='macro')
NB_p=precision_score(y_test_under,y_pred, average='macro')
NB_f=f1_score(y_test_under,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_under, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_under, y_pred,target_names=name))
print("Recall:", NB_r)
print("Precision:", NB_p)
print("F1 score:", NB_f)

Confusion Matrix: - 
 [[25  6  4]
 [18  1  8]
 [25  0  8]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.37      0.71      0.49        35
     Serious       0.14      0.04      0.06        27
      Slight       0.40      0.24      0.30        33

    accuracy                           0.36        95
   macro avg       0.30      0.33      0.28        95
weighted avg       0.31      0.36      0.30        95

Recall: 0.3312489979156646
Precision: 0.30350140056022407
F1 score: 0.2820490716894928


### 3. Decision Tree

In [24]:
start_time = time.time()
DT_model = tree.DecisionTreeClassifier()
# feeding the training data into the model
DT_model.fit(X_train_under, y_train_under)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.006983280181884766 sec


In [25]:
# predicting the values for x-test
y_pred = DT_model.predict(X_test_under)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
DT_r=recall_score(y_test_under,y_pred, average='macro')
DT_p=precision_score(y_test_under,y_pred, average='macro')
DT_f=f1_score(y_test_under,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_under, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_under, y_pred,target_names=name))
print("Recall:", DT_r)
print("Precision:", DT_p)
print("F1 score:", DT_f)

Confusion Matrix: - 
 [[20  6  9]
 [11 12  4]
 [10 12 11]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.49      0.57      0.53        35
     Serious       0.40      0.44      0.42        27
      Slight       0.46      0.33      0.39        33

    accuracy                           0.45        95
   macro avg       0.45      0.45      0.44        95
weighted avg       0.45      0.45      0.45        95

Recall: 0.4497354497354497
Precision: 0.4487127371273713
F1 score: 0.4444444444444445


### 4. K Nearest Neighbors

In [26]:
start_time = time.time()
knn_model = neighbors.KNeighborsClassifier()
# feeding the training data into the model
knn_model.fit(X_train_under, y_train_under)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.007979631423950195 sec


In [27]:
# predicting the values for x-test
y_pred = knn_model.predict(X_test_under)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
knn_r=recall_score(y_test_under,y_pred, average='macro')
knn_p=precision_score(y_test_under,y_pred, average='macro')
knn_f=f1_score(y_test_under,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_under, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_under, y_pred,target_names=name))
print("Recall:", knn_r)
print("Precision:", knn_p)
print("F1 score:", knn_f)

Confusion Matrix: - 
 [[20 10  5]
 [11 10  6]
 [12  9 12]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.47      0.57      0.51        35
     Serious       0.34      0.37      0.36        27
      Slight       0.52      0.36      0.43        33

    accuracy                           0.44        95
   macro avg       0.44      0.44      0.43        95
weighted avg       0.45      0.44      0.44        95

Recall: 0.43514510181176846
Precision: 0.4438943319038155
F1 score: 0.43284493284493286


### 5. Support Vector Machine 

In [28]:
start_time = time.time()
svm_model = svm.SVC()
# feeding the training data into the model
svm_model.fit(X_train_under, y_train_under)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.09773993492126465 sec


In [29]:
# predicting the values for x-test
y_pred = svm_model.predict(X_test_under)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
svm_r=recall_score(y_test_under,y_pred, average='macro')
svm_p=precision_score(y_test_under,y_pred, average='macro')
svm_f=f1_score(y_test_under,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_under, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_under, y_pred,target_names=name))
print("Recall:", svm_r)
print("Precision:", svm_p)
print("F1 score:", svm_f)

Confusion Matrix: - 
 [[17 11  7]
 [10 10  7]
 [13  8 12]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.42      0.49      0.45        35
     Serious       0.34      0.37      0.36        27
      Slight       0.46      0.36      0.41        33

    accuracy                           0.41        95
   macro avg       0.41      0.41      0.41        95
weighted avg       0.41      0.41      0.41        95

Recall: 0.40657367324033994
Precision: 0.4104553492484528
F1 score: 0.40575195049771323


### 6. AdaBoost

In [30]:
start_time = time.time()
adb_model = AdaBoostClassifier()
# feeding the training data into the model
adb_model.fit(X_train_under, y_train_under)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.17851829528808594 sec


In [31]:
# predicting the values for x-test
y_pred = adb_model.predict(X_test_under)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
adb_r=recall_score(y_test_under,y_pred, average='macro')
adb_p=precision_score(y_test_under,y_pred, average='macro')
adb_f=f1_score(y_test_under,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_under, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_under, y_pred,target_names=name))
print("Recall:", adb_r)
print("Precision:", adb_p)
print("F1 score:", adb_f)

Confusion Matrix: - 
 [[21  8  6]
 [12  9  6]
 [14  4 15]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.45      0.60      0.51        35
     Serious       0.43      0.33      0.38        27
      Slight       0.56      0.45      0.50        33

    accuracy                           0.47        95
   macro avg       0.48      0.46      0.46        95
weighted avg       0.48      0.47      0.47        95

Recall: 0.46262626262626266
Precision: 0.476978498255094
F1 score: 0.46239837398373984


### 7. Random Forest

In [32]:
start_time = time.time()
RF_model = RandomForestClassifier()
# feeding the training data into the model
RF_model.fit(X_train_under, y_train_under)
print("Execution time: " + str((time.time() - start_time)) + ' sec')

Execution time: 0.34012389183044434 sec


In [33]:
# predicting the values for x-test
y_pred = RF_model.predict(X_test_under)
# finding the training and testing accuracy
name = ['Fatal','Serious','Slight']
RF_r=recall_score(y_test_under,y_pred, average='macro')
RF_p=precision_score(y_test_under,y_pred, average='macro')
RF_f=f1_score(y_test_under,y_pred, average='macro')
print("Confusion Matrix: - \n",confusion_matrix(y_test_under, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test_under, y_pred,target_names=name))
print("Recall:", RF_r)
print("Precision:", RF_p)
print("F1 score:", RF_f)

Confusion Matrix: - 
 [[18  7 10]
 [11 11  5]
 [12  9 12]]

Classification Report: - 
               precision    recall  f1-score   support

       Fatal       0.44      0.51      0.47        35
     Serious       0.41      0.41      0.41        27
      Slight       0.44      0.36      0.40        33

    accuracy                           0.43        95
   macro avg       0.43      0.43      0.43        95
weighted avg       0.43      0.43      0.43        95

Recall: 0.42844316177649516
Precision: 0.43029208069858477
F1 score: 0.427030539311241


In [34]:
%matplotlib inline
fig = plt.figure(figsize=(5, 10))
models=['Logistic Regression','Naive Bayes Classifier','Decision Tree Classifier','K-Nearest Neighbor','Support Vector Machine','AdaBoost','Random Forest Classifier']
fig = go.Figure(data=[go.Bar(name='F1_score', x=models, y=[LR_f,NB_f,DT_f,knn_f,svm_f,adb_r,RF_f])])
fig.show()

<Figure size 360x720 with 0 Axes>