## Data Preprocessing

In [2]:
## Importing Libraries
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

In [3]:
# Load the dataset; row 0 of the CSV supplies the column names.
df = pd.read_csv("Dataset.csv", header=0)

In [4]:
# Preview five random rows. The seed is pinned so the preview is the same on
# every Restart & Run All instead of changing with each execution.
df.sample(5, random_state=1)

Unnamed: 0,Time,Day_of_week,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Type_of_vehicle,Owner_of_vehicle,Service_year_of_vehicle,...,Vehicle_movement,Casualty_class,Sex_of_casualty,Age_band_of_casualty,Casualty_severity,Work_of_casuality,Fitness_of_casuality,Pedestrian_movement,Cause_of_accident,Accident_severity
8953,14:15:00,Wednesday,Unknown,Female,Junior high school,Employee,2-5yr,,Owner,1-2yr,...,Going straight,Driver or rider,Female,31-50,3,Driver,Normal,Not a Pedestrian,No distancing,Slight Injury
4906,11:15:00,Monday,Unknown,Female,Junior high school,Employee,5-10yr,Automobile,Owner,Unknown,...,Going straight,na,na,na,na,Driver,Normal,na,Changing lane to the left,Slight Injury
5532,14:33:00,Sunday,31-50,Male,Junior high school,Employee,2-5yr,Automobile,Organization,5-10yrs,...,Going straight,Passenger,Female,Under 18,3,Driver,Normal,Not a Pedestrian,Moving Backward,Slight Injury
801,17:45:00,Wednesday,18-30,Male,Junior high school,Employee,5-10yr,Automobile,Owner,Above 10yr,...,Going straight,na,na,na,na,,,na,No distancing,Slight Injury
6804,15:36:00,Saturday,18-30,Male,Elementary school,Employee,5-10yr,Public (13?45 seats),Owner,Unknown,...,Reversing,Driver or rider,Male,18-30,2,Driver,Normal,Not a Pedestrian,Changing lane to the right,Slight Injury


In [5]:
# df.shape is (rows, columns); unpack it straight into the message.
print("The Dataset has {} rows and {} columns".format(*df.shape))

The Dataset has 12316 rows and 32 columns


In [6]:
# Number of rows that exactly repeat an earlier row (first occurrence not counted).
df.duplicated(keep="first").sum()

0

### Data Cleansing
* Handling Missing Values

In [7]:
# Total count of null cells across the whole frame.
print(f"The dataset has total of {df.isnull().sum().sum()} Missing Values")

The dataset has total of 20057 Missing Values


In [8]:
# Share of missing values per column, largest first, expressed as a percentage.
df.isna().mean().sort_values(ascending=False).mul(100)

Defect_of_vehicle              35.945112
Service_year_of_vehicle        31.893472
Work_of_casuality              25.966223
Fitness_of_casuality           21.394933
Type_of_vehicle                 7.713543
Types_of_Junction               7.202014
Driving_experience              6.731082
Educational_level               6.016564
Vehicle_driver_relation         4.701202
Owner_of_vehicle                3.913608
Lanes_or_Medians                3.126015
Vehicle_movement                2.500812
Area_accident_occured           1.940565
Road_surface_type               1.396557
Type_of_collision               1.258525
Road_allignment                 1.152972
Sex_of_driver                   0.000000
Age_band_of_driver              0.000000
Day_of_week                     0.000000
Accident_severity               0.000000
Cause_of_accident               0.000000
Road_surface_conditions         0.000000
Light_conditions                0.000000
Weather_conditions              0.000000
Number_of_vehicl

In [25]:
# Summarise each column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12316 entries, 0 to 12315
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Time                         12316 non-null  object
 1   Day_of_week                  12316 non-null  object
 2   Age_band_of_driver           12316 non-null  object
 3   Sex_of_driver                12316 non-null  object
 4   Educational_level            11575 non-null  object
 5   Vehicle_driver_relation      11737 non-null  object
 6   Driving_experience           11487 non-null  object
 7   Type_of_vehicle              11366 non-null  object
 8   Owner_of_vehicle             11834 non-null  object
 9   Service_year_of_vehicle      8388 non-null   object
 10  Defect_of_vehicle            7889 non-null   object
 11  Area_accident_occured        12077 non-null  object
 12  Lanes_or_Medians             11931 non-null  object
 13  Road_allignment              12

#### Handling Categorical Missing Values
As the `df.info()` output above shows, all of the columns that contain missing values are categorical.

In [9]:
# Print, for every column, how many distinct values it holds.
# NaN is counted as its own value here (dropna=False).
for col in df.columns:
    print(col, ':', df[col].nunique(dropna=False), 'catagories')

Time : 1074 catagories
Day_of_week : 7 catagories
Age_band_of_driver : 5 catagories
Sex_of_driver : 3 catagories
Educational_level : 8 catagories
Vehicle_driver_relation : 5 catagories
Driving_experience : 8 catagories
Type_of_vehicle : 18 catagories
Owner_of_vehicle : 5 catagories
Service_year_of_vehicle : 7 catagories
Defect_of_vehicle : 4 catagories
Area_accident_occured : 15 catagories
Lanes_or_Medians : 8 catagories
Road_allignment : 10 catagories
Types_of_Junction : 8 catagories
Road_surface_type : 6 catagories
Road_surface_conditions : 4 catagories
Light_conditions : 4 catagories
Weather_conditions : 9 catagories
Type_of_collision : 11 catagories
Number_of_vehicles_involved : 6 catagories
Number_of_casualties : 8 catagories
Vehicle_movement : 15 catagories
Casualty_class : 4 catagories
Sex_of_casualty : 3 catagories
Age_band_of_casualty : 6 catagories
Casualty_severity : 4 catagories
Work_of_casuality : 8 catagories
Fitness_of_casuality : 6 catagories
Pedestrian_movement : 11 ca

In [10]:
### Categorical missing values are handled by replacing NaN with a new category
def impute_nan(df, variable, fill_value="Unknown"):
    """Replace the missing values of one column with a placeholder, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is overwritten on this object.
    variable : str
        Name of the column to impute. An unknown name raises ``KeyError``.
    fill_value : object, default "Unknown"
        Value written into every null cell of the column.

    Returns
    -------
    None
        ``df`` is mutated; nothing is returned.
    """
    column = df[variable]
    missing = column.isnull()
    # Leave a complete column untouched so its dtype and values are preserved.
    if not missing.any():
        return
    # Cast to object first so the placeholder can sit next to the existing
    # values without converting them (np.where would turn a numeric column
    # into strings wholesale), then overwrite only the null cells.
    df[variable] = column.astype(object).where(~missing, fill_value)

In [11]:
### Replace the missing values of each listed attribute with the "Unknown" category
for cat in (
    'Educational_level',
    'Vehicle_driver_relation',
    'Driving_experience',
    'Type_of_vehicle',
    'Owner_of_vehicle',
    'Service_year_of_vehicle',
    'Defect_of_vehicle',
    'Area_accident_occured',
    'Lanes_or_Medians',
    'Road_allignment',
    'Types_of_Junction',
    'Road_surface_type',
    'Type_of_collision',
    'Vehicle_movement',
    'Work_of_casuality',
    'Fitness_of_casuality',
    'Pedestrian_movement',
):
    impute_nan(df, cat)

In [12]:
# Report how many null cells remain after imputation.
print(f"{df.isnull().sum().sum()} Missing value: All the missing values are handled")

0 Missing value: All the missing values are handled


### Feature Selection
Of the 32 columns in the dataset, 16 are kept: 15 predictor features plus the target, `Accident_severity`. They were selected through discussion with domain experts and an understanding of the domain.

In [13]:
# Work on an independent copy so feature selection leaves `df` intact.
data = df.copy(deep=True)

In [14]:
### Dropping the remaining variables
# Build `data` from `df` with a non-mutating drop instead of `inplace=True`,
# so re-running this cell yields the same frame rather than raising KeyError
# on columns that an earlier run already removed.
data = df.drop(columns=[
    'Time', 'Day_of_week', 'Type_of_vehicle', 'Owner_of_vehicle',
    'Service_year_of_vehicle', 'Defect_of_vehicle', 'Area_accident_occured',
    'Road_allignment', 'Road_surface_conditions', 'Number_of_vehicles_involved',
    'Number_of_casualties', 'Casualty_class', 'Sex_of_casualty',
    'Age_band_of_casualty', 'Work_of_casuality', 'Fitness_of_casuality',
])

In [11]:
# Show the first five rows of the reduced frame.
data.head(n=5)

Unnamed: 0,Age_band_of_driver,Sex_of_driver,Educational_level,Vehicle_driver_relation,Driving_experience,Lanes_or_Medians,Types_of_Junction,Road_surface_type,Light_conditions,Weather_conditions,Type_of_collision,Vehicle_movement,Casualty_severity,Pedestrian_movement,Cause_of_accident,Accident_severity
0,18-30,Male,Above high school,Employee,1-2yr,,No junction,Asphalt roads,Daylight,Normal,Collision with roadside-parked vehicles,Going straight,na,na,Moving Backward,Slight Injury
1,31-50,Male,Junior high school,Employee,Above 10yr,Undivided Two way,No junction,Asphalt roads,Daylight,Normal,Vehicle with vehicle collision,Going straight,na,na,Overtaking,Slight Injury
2,18-30,Male,Junior high school,Employee,1-2yr,other,No junction,Asphalt roads,Daylight,Normal,Collision with roadside objects,Going straight,3,Not a Pedestrian,Changing lane to the left,Serious Injury
3,18-30,Male,Junior high school,Employee,5-10yr,other,Y Shape,Earth roads,Darkness - lights lit,Normal,Vehicle with vehicle collision,Going straight,3,1,Changing lane to the right,Slight Injury
4,18-30,Male,Junior high school,Employee,2-5yr,other,Y Shape,Asphalt roads,Darkness - lights lit,Normal,Vehicle with vehicle collision,Going straight,na,na,Overtaking,Slight Injury


In [15]:
### Columns kept after the drop: the selected features plus the target, Accident_severity
data.columns

Index(['Age_band_of_driver', 'Sex_of_driver', 'Educational_level',
       'Vehicle_driver_relation', 'Driving_experience', 'Lanes_or_Medians',
       'Types_of_Junction', 'Road_surface_type', 'Light_conditions',
       'Weather_conditions', 'Type_of_collision', 'Vehicle_movement',
       'Casualty_severity', 'Pedestrian_movement', 'Cause_of_accident',
       'Accident_severity'],
      dtype='object')

In [16]:
# (rows, columns) of the reduced frame.
data.shape

(12316, 16)

In [17]:
### Separating Independent and Dependent features
# Pick the target by name rather than by position: a hard-coded column index
# silently selects the wrong column if the set of kept features ever changes.
X = data.drop(columns='Accident_severity')
y = data['Accident_severity']

### Data Transformation
#### Handling Categorical Variables - Creating Dummy Variables

In [18]:
# Print how many distinct values each remaining column holds
# (NaN, if present, would be counted as its own value).
for col in data.columns:
    print(col, ':', data[col].nunique(dropna=False), 'categories')

Age_band_of_driver : 5 categories
Sex_of_driver : 3 categories
Educational_level : 7 categories
Vehicle_driver_relation : 4 categories
Driving_experience : 8 categories
Lanes_or_Medians : 7 categories
Types_of_Junction : 7 categories
Road_surface_type : 6 categories
Light_conditions : 4 categories
Weather_conditions : 9 categories
Type_of_collision : 10 categories
Vehicle_movement : 14 categories
Casualty_severity : 4 categories
Pedestrian_movement : 11 categories
Cause_of_accident : 20 categories
Accident_severity : 3 categories


In [19]:
# Preview the frame size after one-hot encoding every column (target included),
# with the first level of each variable dropped.
pd.get_dummies(data=data, drop_first=True).shape

(12316, 106)

In [20]:
# One-hot encode the predictors, dropping the first level of each variable.
X = pd.get_dummies(data=X, drop_first=True)
# The target is encoded the same way, which leaves one indicator column per
# remaining severity level.
# NOTE(review): this makes `y` a multi-column indicator frame rather than a
# single label column; confirm the downstream models expect that shape.
y = pd.get_dummies(data=y, drop_first=True)

In [21]:
# (rows, columns) of the one-hot encoded predictors.
X.shape

(12316, 104)

In [22]:
# Show the first two rows of the encoded target.
y.head(n=2)

Unnamed: 0,Serious Injury,Slight Injury
0,0,1
1,0,1


### Splitting the dataset into training and test data

In [25]:
# `train_test_split` is imported in the notebook's import cell with the other
# libraries, so dependencies are declared in one place.
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [27]:
# Display the training predictors (pandas abbreviates the view of a large frame).
X_train

Unnamed: 0,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown,Sex_of_driver_Male,Sex_of_driver_Unknown,Educational_level_Elementary school,Educational_level_High school,Educational_level_Illiterate,Educational_level_Junior high school,...,Cause_of_accident_No distancing,Cause_of_accident_No priority to pedestrian,Cause_of_accident_No priority to vehicle,Cause_of_accident_Other,Cause_of_accident_Overloading,Cause_of_accident_Overspeed,Cause_of_accident_Overtaking,Cause_of_accident_Overturning,Cause_of_accident_Turnover,Cause_of_accident_Unknown
616,0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
3689,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6986,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2081,1,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
676,0,0,1,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
905,0,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
5192,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12172,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Display the test predictors (pandas abbreviates the view of a large frame).
X_test

Unnamed: 0,Age_band_of_driver_31-50,Age_band_of_driver_Over 51,Age_band_of_driver_Under 18,Age_band_of_driver_Unknown,Sex_of_driver_Male,Sex_of_driver_Unknown,Educational_level_Elementary school,Educational_level_High school,Educational_level_Illiterate,Educational_level_Junior high school,...,Cause_of_accident_No distancing,Cause_of_accident_No priority to pedestrian,Cause_of_accident_No priority to vehicle,Cause_of_accident_Other,Cause_of_accident_Overloading,Cause_of_accident_Overspeed,Cause_of_accident_Overtaking,Cause_of_accident_Overturning,Cause_of_accident_Turnover,Cause_of_accident_Unknown
10974,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5748,0,0,0,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4941,0,0,1,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
11832,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8173,0,1,0,0,1,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6337,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8044,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4465,0,0,0,1,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
5581,0,0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
