## Data Preprocessing

In [1]:
##Importing Libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn import tree,linear_model,neighbors, datasets
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV, KFold
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.utils.multiclass import unique_labels
from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [2]:
## For ignoring warnings to view clean output
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
## Importing the dataset
df = pd.read_csv('Dataset.csv',header=0)

In [5]:
df.sample(2)

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
10092,20:01:00,Sunday,18-30,Male,,,,Lorry (41?100Q),Owner,5-10yrs,...,Other,Driver or rider,Male,Over 51,3,Driver,Normal,Not a Pedestrian,Moving Backward,Serious Injury
8231,16:15:00,Thursday,18-30,Male,Junior high school,Employee,1-2yr,Automobile,Owner,Unknown,...,,na,na,na,na,Employee,Normal,na,Improper parking,Slight Injury


In [6]:
print("The Dataset has %d rows and %d columns" % df.shape)

The Dataset has 12316 rows and 32 columns


In [7]:
df.duplicated().sum()

0

### Data Cleansing
* Handling Missing Values

In [8]:
print('The dataset has total of',df.isnull().sum().sum(),'Missing Values')

The dataset has total of 20057 Missing Values


In [9]:
df.isnull().mean().sort_values(ascending=False)*100

Defect_of_vehicle              35.945112
Service_year_of_vehicle        31.893472
Work_of_casuality              25.966223
Fitness_of_casuality           21.394933
Type_of_vehicle                 7.713543
Types_of_Junction               7.202014
Driving_experience              6.731082
Educational_level               6.016564
Vehicle_driver_relation         4.701202
Owner_of_vehicle                3.913608
Lanes_or_Medians                3.126015
Vehicle_movement                2.500812
Area_accident_occured           1.940565
Road_surface_type               1.396557
Type_of_collision               1.258525
Road_allignment                 1.152972
Sex_of_driver                   0.000000
Age_band_of_driver              0.000000
Day_of_week                     0.000000
Accident_severity               0.000000
Cause_of_accident               0.000000
Road_surface_conditions         0.000000
Light_conditions                0.000000
Weather_conditions              0.000000
Number_of_vehicl

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Time                         12316 non-null  object
 1   Day_of_week                  12316 non-null  object
 2   Age_band_of_driver           12316 non-null  object
 3   Sex_of_driver                12316 non-null  object
 4   Educational_level            11575 non-null  object
 5   Vehicle_driver_relation      11737 non-null  object
 6   Driving_experience           11487 non-null  object
 7   Type_of_vehicle              11366 non-null  object
 8   Owner_of_vehicle             11834 non-null  object
 9   Service_year_of_vehicle      8388 non-null   object
 10  Defect_of_vehicle            7889 non-null   object
 11  Area_accident_occured        12077 non-null  object
 12  Lanes_or_Medians             11931 non-null  object
 13  Road_allignment              12

#### Handling Categorical Missing values 
 We can see from the above info all the instances having missing values are Categorical.  

In [11]:
# Shows the columns with their number of catagories each variable is having
for col in df.columns:
    print(col, ':', len(df[col].unique()), 'catagories')

Time : 1074 catagories
Day_of_week : 7 catagories
Age_band_of_driver : 5 catagories
Sex_of_driver : 3 catagories
Educational_level : 8 catagories
Vehicle_driver_relation : 5 catagories
Driving_experience : 8 catagories
Type_of_vehicle : 18 catagories
Owner_of_vehicle : 5 catagories
Service_year_of_vehicle : 7 catagories
Defect_of_vehicle : 4 catagories
Area_accident_occured : 15 catagories
Lanes_or_Medians : 8 catagories
Road_allignment : 10 catagories
Types_of_Junction : 8 catagories
Road_surface_type : 6 catagories
Road_surface_conditions : 4 catagories
Light_conditions : 4 catagories
Weather_conditions : 9 catagories
Type_of_collision : 11 catagories
Number_of_vehicles_involved : 6 catagories
Number_of_casualties : 8 catagories
Vehicle_movement : 15 catagories
Casualty_class : 4 catagories
Sex_of_casualty : 3 catagories
Age_band_of_casualty : 6 catagories
Casualty_severity : 4 catagories
Work_of_casuality : 8 catagories
Fitness_of_casuality : 6 catagories
Pedestrian_movement : 11 ca

In [12]:
### We will handle the catagorical missing value by replacing NaN values with a new catagory
def impute_nan(df,variable):
    df[variable]=np.where(df[variable].isnull(),"Unknown",df[variable])

In [13]:
### Select all the attributes that have missing value and replace with "Unknown" Category
for cat in ['Educational_level','Vehicle_driver_relation','Driving_experience','Type_of_vehicle','Owner_of_vehicle','Service_year_of_vehicle','Defect_of_vehicle','Area_accident_occured',
            'Lanes_or_Medians','Road_allignment','Types_of_Junction','Road_surface_type','Type_of_collision','Vehicle_movement',
            'Work_of_casuality','Fitness_of_casuality','Pedestrian_movement']: impute_nan(df,cat)

In [14]:
print(df.isnull().sum().sum(),"Missing value: All the missing values are handled")

0 Missing value: All the missing values are handled


### Feature Selection
From the total 32 features 16 features are selected by discussing with domain experts and understanding the domain.

In [16]:
data=df.copy()

In [17]:
### Dropping the remaining variables
data.drop(['Time','Day_of_week','Type_of_vehicle','Owner_of_vehicle','Service_year_of_vehicle','Defect_of_vehicle','Area_accident_occured','Road_allignment',
        'Road_surface_conditions','Number_of_vehicles_involved','Number_of_casualties','Casualty_class','Sex_of_casualty','Age_band_of_casualty',
        'Work_of_casuality','Fitness_of_casuality'], axis=1, inplace=True)

In [18]:
data.head(3)

Unnamed: 0,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Lanes_or_Medians,Types_of_Junction,Road_surface_type,Light_conditions,Weather_conditions,Type_of_collision,Vehicle_movement,Casualty_severity,Pedestrian_movement,Cause_of_accident,Accident_severity
0,18-30,Male,Above high school,Employee,1-2yr,Unknown,No junction,Asphalt roads,Daylight,Normal,Collision with roadside-parked vehicles,Going straight,na,na,Moving Backward,Slight Injury
1,31-50,Male,Junior high school,Employee,Above 10yr,Undivided Two way,No junction,Asphalt roads,Daylight,Normal,Vehicle with vehicle collision,Going straight,na,na,Overtaking,Slight Injury
2,18-30,Male,Junior high school,Employee,1-2yr,other,No junction,Asphalt roads,Daylight,Normal,Collision with roadside objects,Going straight,3,Not a Pedestrian,Changing lane to the left,Serious Injury


In [19]:
### The selected features are:
data.columns

Index(['Age_band_of_driver', 'Sex_of_driver', 'Educational_level',
       'Vehicle_driver_relation', 'Driving_experience', 'Lanes_or_Medians',
       'Types_of_Junction', 'Road_surface_type', 'Light_conditions',
       'Weather_conditions', 'Type_of_collision', 'Vehicle_movement',
       'Casualty_severity', 'Pedestrian_movement', 'Cause_of_accident',
       'Accident_severity'],
      dtype='object')

In [20]:
data.shape

(12316, 16)

In [21]:
### Separating Independent and Dependent features
X = data.iloc[:,:-1]
y = data.iloc[:, 15]

### Data Transformation
#### Handling Categorical Variables - Creating Dummy Variables

In [22]:
# Shows the columns with their number of categories each variable is having
for col in data.columns:
    print(col, ':', len(data[col].unique()), 'categories')

Age_band_of_driver : 5 categories
Sex_of_driver : 3 categories
Educational_level : 7 categories
Vehicle_driver_relation : 4 categories
Driving_experience : 8 categories
Lanes_or_Medians : 7 categories
Types_of_Junction : 7 categories
Road_surface_type : 6 categories
Light_conditions : 4 categories
Weather_conditions : 9 categories
Type_of_collision : 10 categories
Vehicle_movement : 14 categories
Casualty_severity : 4 categories
Pedestrian_movement : 11 categories
Cause_of_accident : 20 categories
Accident_severity : 3 categories


In [23]:
pd.get_dummies(data,drop_first=True).shape

(12316, 106)

In [24]:
data['Accident_severity'].replace(('Slight Injury', 'Serious Injury', 'Fatal injury'), (0, 1, 2), inplace = True)

In [25]:
X = pd.get_dummies(X, drop_first=True)

In [26]:
X.shape

(12316, 104)

In [33]:
y.head(2)

0    0
1    0
Name: Accident_severity, dtype: object

In [38]:
y = y.astype('int')

## Model training 

### Splitting the dataset into training and test data

In [29]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=5)
#setting 20% aside as validation data for cross validation
x_train_t, x_train_v, y_train_t, y_train_v = train_test_split(X_train, y_train, test_size = 0.2, random_state = 5)

In [30]:
X_train

Unnamed: 0,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown,Sex_of_driver_Male,Sex_of_driver_Unknown,Educational_level_Elementary school,Educational_level_High school,Educational_level_Illiterate,Educational_level_Junior high school,...,Cause_of_accident_No distancing,Cause_of_accident_No priority to pedestrian,Cause_of_accident_No priority to vehicle,Cause_of_accident_Other,Cause_of_accident_Overloading,Cause_of_accident_Overspeed,Cause_of_accident_Overtaking,Cause_of_accident_Overturning,Cause_of_accident_Turnover,Cause_of_accident_Unknown
11743,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7945,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
7964,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9434,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7691,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3046,0,1,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
9917,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4079,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2254,0,0,0,1,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [31]:
X_test

Unnamed: 0,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown,Sex_of_driver_Male,Sex_of_driver_Unknown,Educational_level_Elementary school,Educational_level_High school,Educational_level_Illiterate,Educational_level_Junior high school,...,Cause_of_accident_No distancing,Cause_of_accident_No priority to pedestrian,Cause_of_accident_No priority to vehicle,Cause_of_accident_Other,Cause_of_accident_Overloading,Cause_of_accident_Overspeed,Cause_of_accident_Overtaking,Cause_of_accident_Overturning,Cause_of_accident_Turnover,Cause_of_accident_Unknown
175,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
636,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7064,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4365,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1790,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11266,1,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5543,0,0,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
11984,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6947,0,0,1,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Naive Bayesian

In [47]:
model = GaussianNB()
# feeding the training data into the model
y_train =  y_train.astype('int')
y_test =  y_test.astype('int')
model.fit(X_train, y_train)
# predicting the values for x-test
y_pred = model.predict(X_test)
# finding the training and testing accuracy
print("Training Accuracy: ",model.score(X_train, y_train))
print("Testing Accuracy: ", model.score(X_test, y_test))
print()
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred))

Training Accuracy:  0.0626268777913114
Testing Accuracy:  0.07061688311688312

Confusion Matrix: - 
 [[ 110  246 1732]
 [   9   40  300]
 [   1    2   24]]

Classification Report: - 
               precision    recall  f1-score   support

           0       0.92      0.05      0.10      2088
           1       0.14      0.11      0.13       349
           2       0.01      0.89      0.02        27

    accuracy                           0.07      2464
   macro avg       0.36      0.35      0.08      2464
weighted avg       0.80      0.07      0.10      2464



### Logistic Regression

In [45]:
model = linear_model.LogisticRegression()
# feeding the training data into the model
y_train =  y_train.astype('int')
y_test =  y_test.astype('int')
model.fit(X_train, y_train)
# predicting the values for x-test
y_pred = model.predict(X_test)
# finding the training and testing accuracy
print("Training Accuracy: ",model.score(X_train, y_train))
print("Testing Accuracy: ", model.score(X_test, y_test))
print()
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred))

Training Accuracy:  0.8452090946000812
Testing Accuracy:  0.8474025974025974

Confusion Matrix: - 
 [[2088    0    0]
 [ 349    0    0]
 [  27    0    0]]

Classification Report: - 
               precision    recall  f1-score   support

           0       0.85      1.00      0.92      2088
           1       0.00      0.00      0.00       349
           2       0.00      0.00      0.00        27

    accuracy                           0.85      2464
   macro avg       0.28      0.33      0.31      2464
weighted avg       0.72      0.85      0.78      2464



  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree

In [44]:
model = tree.DecisionTreeClassifier()
# feeding the training data into the model
y_train =  y_train.astype('int')
y_test =  y_test.astype('int')
model.fit(X_train, y_train)
# predicting the values for x-test
y_pred = model.predict(X_test)
# finding the training and testing accuracy
print("Training Accuracy: ",model.score(X_train, y_train))
print("Testing Accuracy: ", model.score(X_test, y_test))
print()
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()

print("Classification Report: - \n",classification_report(y_test, y_pred))

Training Accuracy:  0.9960414129110841
Testing Accuracy:  0.7309253246753247

Confusion Matrix: - 
 [[1738  319   31]
 [ 282   63    4]
 [  21    6    0]]

Classification Report: - 
               precision    recall  f1-score   support

           0       0.85      0.83      0.84      2088
           1       0.16      0.18      0.17       349
           2       0.00      0.00      0.00        27

    accuracy                           0.73      2464
   macro avg       0.34      0.34      0.34      2464
weighted avg       0.74      0.73      0.74      2464



### K Nearest Neighbors

In [48]:
model = neighbors.KNeighborsClassifier()
# feeding the training data into the model
y_train =  y_train.astype('int')
y_test =  y_test.astype('int')
model.fit(X_train, y_train)
# predicting the values for x-test
y_pred = model.predict(X_test)
# finding the training and testing accuracy
print("Training Accuracy: ",model.score(X_train, y_train))
print("Testing Accuracy: ", model.score(X_test, y_test))
print()
print("Confusion Matrix: - \n",confusion_matrix(y_test, y_pred))
print()
print("Classification Report: - \n",classification_report(y_test, y_pred))

Training Accuracy:  0.8514007308160779
Testing Accuracy:  0.8348214285714286

Confusion Matrix: - 
 [[2051   37    0]
 [ 343    6    0]
 [  24    3    0]]

Classification Report: - 
               precision    recall  f1-score   support

           0       0.85      0.98      0.91      2088
           1       0.13      0.02      0.03       349
           2       0.00      0.00      0.00        27

    accuracy                           0.83      2464
   macro avg       0.33      0.33      0.31      2464
weighted avg       0.74      0.83      0.78      2464



  _warn_prf(average, modifier, msg_start, len(result))
