# Imports

In [34]:
import numpy as np
import pandas as pd
import re

#Visualisation Libraries
import matplotlib.pyplot as plt

#Training and Preprocessing Libraries
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

# sampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression


from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image

# Preprocessing

In [35]:
def convert_date_to_day_of_year(dt):
    result = re.findall(r'(\d{2})/(\d{2})/(\d{4})',dt)
    return result[0][0]

def convert_date_to_month(dt):
    result = re.findall(r'(\d{2})/(\d{2})/(\d{4})',dt)
    return result[0][1]

In [36]:
def preprocess_datetime(data):
    data['day_of_year'] = data.Date.apply(lambda x: convert_date_to_day_of_year(x))
    data['month'] = data.Date.apply(lambda x: convert_date_to_month(x))
    return data

In [37]:
def preprocess_speed(x):
    x = int(int(x)/10)
    return x

In [38]:
def preprocess_data(temp_data):
    temp_data["Pedestrian_Crossing-Physical_Facilities"]= le_Pedestrian_Crossing_Physical_Facilities.transform(temp_data["Pedestrian_Crossing-Physical_Facilities"])
    temp_data["Light_Conditions"]= le_Light_Conditions.transform(temp_data["Light_Conditions"])
    temp_data["Weather_Conditions"] = le_Weather_Conditions.transform(temp_data["Weather_Conditions"])
    temp_data["Road_Surface_Conditions"] = le_Road_Surface_Conditions.transform(temp_data["Road_Surface_Conditions"])
    temp_data["Pedestrian_Crossing-Human_Control"] = le_Pedestrian_Crossing_Human_Control.transform(temp_data["Pedestrian_Crossing-Human_Control"])
    temp_data["Road_Type"] = le_Road_Type.transform(temp_data["Road_Type"])
    temp_data["Time"] = temp_data["Time"].astype(str)
    temp_data['Time'] = temp_data['Time'].str.slice(0, 2, 1)
    temp_data["Time"] = temp_data["Time"].astype(int)
    temp_data['Speed_limit'] = temp_data.Speed_limit.apply(lambda x: preprocess_speed(x))
    return temp_data

# utility to build, predict and print score

In [39]:
def build_model(clf, X_train, Y_train):
    print(clf)
    clf.fit(X_train, Y_train)

In [40]:
def predict(clf, X_test):
    pred = clf.predict(X_test)
    return pred

In [41]:
def print_score(y_test, pred, average="macro"):
    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, average=average)
    recall = recall_score(y_test, pred, average=average)
    f2 = f1_score(y_test, pred, average=average)
    print("Accuracy Score: {}".format(acc))
    print("Precision Score: {}".format(prec))
    print("Recall Score: {}".format(recall))
    print("F1 Score: {}".format(f2))
    return acc, prec, recall, f2

## Target Variable

In [42]:
class_dict = {1: 'Fatal', 2: 'Severe', 3: 'Slight'}

class_labels = ['Fatal', 'Severe', 'Slight']

## Read data to train

In [43]:
data1 = pd.read_csv("./data/accidents_2005_to_2007.csv", low_memory=False)
data2 = pd.read_csv("./data/accidents_2009_to_2011.csv", low_memory=False)
data3 = pd.read_csv("./data/accidents_2012_to_2014.csv", low_memory=False)

data = pd.concat([data1, data2, data3])

In [11]:
drop_columns = ['Date', 'Accident_Index', 'Number_of_Casualties', 'Police_Force', 'Junction_Detail',
                'Junction_Control', 'Special_Conditions_at_Site', 'Carriageway_Hazards',
                'Did_Police_Officer_Attend_Scene_of_Accident', 'LSOA_of_Accident_Location',
                'Local_Authority_(District)', 'Local_Authority_(Highway)']

In [12]:
data = preprocess_datetime(data)
data1 = data.drop(drop_columns, axis=1, inplace=False)
data1.dropna(inplace=True)

In [13]:
le_Pedestrian_Crossing_Physical_Facilities = LabelEncoder()
le_Light_Conditions = LabelEncoder()
le_Weather_Conditions = LabelEncoder()
le_Road_Surface_Conditions = LabelEncoder()
le_Pedestrian_Crossing_Human_Control = LabelEncoder()
le_Road_Type = LabelEncoder()

le_Pedestrian_Crossing_Physical_Facilities.fit(data1["Pedestrian_Crossing-Physical_Facilities"])
le_Light_Conditions.fit(data1["Light_Conditions"])
le_Weather_Conditions.fit(data1["Weather_Conditions"])
le_Road_Surface_Conditions.fit(data1["Road_Surface_Conditions"])
le_Pedestrian_Crossing_Human_Control.fit(data1["Pedestrian_Crossing-Human_Control"])
le_Road_Type.fit(data1["Road_Type"])

LabelEncoder()

In [14]:
data1 = data1[data1['Weather_Conditions'] != 'Unknown']
data1 = data1[data1['Road_Type'] != 'Unknown']

In [15]:
data1 = preprocess_data(data1)

In [16]:
# data1

In [17]:
train, test = train_test_split(data1, test_size=.20)

In [18]:
# features = ["Number_of_Vehicles", "Day_of_Week", "Time", "Road_Type", "Speed_limit",
#                          "Pedestrian_Crossing-Human_Control", "Pedestrian_Crossing-Physical_Facilities",
#                          "Light_Conditions", "Weather_Conditions", "Road_Surface_Conditions",
#                          "month"]
features = ["Number_of_Vehicles", "Time", "Road_Type", "Speed_limit",
                         "Pedestrian_Crossing-Human_Control", "Pedestrian_Crossing-Physical_Facilities",
                         "Light_Conditions", "Weather_Conditions", "Road_Surface_Conditions"]

In [19]:
y_train = train['Accident_Severity']
x_train = train[features]

y_test = test['Accident_Severity']
x_test = test[features]

## OverSampling

In [20]:
ros = RandomOverSampler(random_state=0)
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
# x_resampled, y_resampled = SMOTE().fit_resample(x_train, y_train)
# x_resampled, y_resampled = ADASYN().fit_resample(x_train, y_train)


In [21]:
print(test.loc[:, x_train.columns != 'Accident_Severity'].head(0))

Empty DataFrame
Columns: [Location_Easting_OSGR, Location_Northing_OSGR, Longitude, Latitude, Accident_Severity, Number_of_Vehicles, Day_of_Week, Time, 1st_Road_Class]
Index: []


In [22]:
print("x_train: ", x_train.shape)
print("x_test: ", x_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

x_train:  (1174393, 9)
x_test:  (293599, 9)
y_train:  (1174393,)
y_test:  (293599,)


In [23]:
clf = RandomForestClassifier(n_estimators=10)

In [33]:
# build_model(clf, x_train, y_train)
clf = RandomForestClassifier(n_estimators=10)
build_model(clf, x_train, y_train)
pred = predict(clf, x_test)
print_score(y_test, pred)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Accuracy Score: 0.8470498877720973
Precision Score: 0.3905754917892872
Recall Score: 0.33584963796223993
F1 Score: 0.31412463324299966


(0.8470498877720973,
 0.3905754917892872,
 0.33584963796223993,
 0.31412463324299966)

In [25]:
clf1 = MultinomialNB()
clf2 = GaussianNB()

In [26]:
build_model(clf1, x_train, y_train)
pred = predict(clf1, x_test)
print_score(y_test, pred)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy Score: 0.8505819161509406
Precision Score: 0.28352730538364684
Recall Score: 0.3333333333333333
F1 Score: 0.30641962175158455


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


(0.8505819161509406,
 0.28352730538364684,
 0.3333333333333333,
 0.30641962175158455)

In [27]:
build_model(clf2, x_train, y_train)
pred = predict(clf2, x_test)
print_score(y_test, pred)

GaussianNB(priors=None, var_smoothing=1e-09)
Accuracy Score: 0.8319408444851651
Precision Score: 0.37182476403806985
Recall Score: 0.35796435012739664
F1 Score: 0.3337591416560202


(0.8319408444851651,
 0.37182476403806985,
 0.35796435012739664,
 0.3337591416560202)

In [28]:
# y_test.shape
print(x_test.shape)
print(len(pred))

(293599, 9)
293599


In [29]:
def get_console(pred, class_label):
    for i in range(len(pred)):
        if pred[i] == class_label:
            return i

In [30]:
def print_df_row(ind):
    df = pd.DataFrame(columns=features)
    df.loc[1] = x_test.iloc[ind]
    print("print_df: ")
    print(df)
    return df

In [31]:
index = get_console(pred, 2)
temp_df = print_df_row(index)
temp_df

print_df: 
  Number_of_Vehicles Time Road_Type Speed_limit  \
1                  1   23         3           6   

  Pedestrian_Crossing-Human_Control Pedestrian_Crossing-Physical_Facilities  \
1                                 2                                       2   

  Light_Conditions Weather_Conditions Road_Surface_Conditions  
1                0                  3                       2  


Unnamed: 0,Number_of_Vehicles,Time,Road_Type,Speed_limit,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions
1,1,23,3,6,2,2,0,3,2


In [32]:
df1 = pd.DataFrame(columns=features)
df1.loc[1] = x_test.iloc[index]
# temp_df

In [40]:
input_dict = {}
# input_data =  {
#     'Pedestrian_Crossing-Physical_Facilities': 'No physical crossing within 50 meters', 
#       'Light_Conditions': 'Daylight: Street light present', 
#     'Weather_Conditions': 'Fine without high winds', 
#     'Road_Surface_Conditions': 'Dry', 
#     'Pedestrian_Crossing-Human_Control': 'None within 50 metres', 
#     'Road_Type': 'Single carriageway', 
#     'Number_of_Vehicles': '4', 
#     'Day_of_Week': '1', 
#     'Time': '14:00', 
#     'Speed_limit': '50', 
#     'month': '4'}
input_data = {'Pedestrian_Crossing-Physical_Facilities': 'No physical crossing within 50 meters', 'Light_Conditions': 'Darkness: Street lights present and lit', 'Weather_Conditions': 'Fine without high winds', 'Road_Surface_Conditions': 'Wet/Damp', 'Pedestrian_Crossing-Human_Control': 'None within 50 metres', 'Road_Type': 'Single carriageway', 'Number_of_Vehicles': '2', 'Day_of_Week': '2', 'Time': '20:00', 'Speed_limit': '30', 'month': '5'}

In [41]:
for key, value in input_data.items():
    input_dict[key] = [value]
input_dict

{'Pedestrian_Crossing-Physical_Facilities': ['No physical crossing within 50 meters'],
 'Light_Conditions': ['Darkness: Street lights present and lit'],
 'Weather_Conditions': ['Fine without high winds'],
 'Road_Surface_Conditions': ['Wet/Damp'],
 'Pedestrian_Crossing-Human_Control': ['None within 50 metres'],
 'Road_Type': ['Single carriageway'],
 'Number_of_Vehicles': ['2'],
 'Day_of_Week': ['2'],
 'Time': ['20:00'],
 'Speed_limit': ['30'],
 'month': ['5']}

In [42]:
print(x_test.iloc[index])

Number_of_Vehicles                          1
Time                                        8
Road_Type                                   3
Speed_limit                                30
Pedestrian_Crossing-Human_Control           2
Pedestrian_Crossing-Physical_Facilities     5
Light_Conditions                            4
Weather_Conditions                          1
Road_Surface_Conditions                     0
Name: 348947, dtype: int64


In [43]:
df = pd.DataFrame.from_dict(input_dict)
df = preprocess_data(df)
df = df[features]
print(df)

  Number_of_Vehicles  Time  Road_Type Speed_limit  \
0                  2    20          3          30   

   Pedestrian_Crossing-Human_Control  Pedestrian_Crossing-Physical_Facilities  \
0                                  2                                        2   

   Light_Conditions  Weather_Conditions  Road_Surface_Conditions  
0                 2                   1                        4  


In [44]:
predict(clf, df)


array([3])

In [45]:
predict(clf, df1)

array([2])

In [46]:
df

Unnamed: 0,Number_of_Vehicles,Time,Road_Type,Speed_limit,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions
0,2,20,3,30,2,2,2,1,4


In [71]:
df1

Unnamed: 0,Number_of_Vehicles,Day_of_Week,Time,Road_Type,Speed_limit,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,month
1,2,2,20,3,30,2,2,2,1,4,5


32


In [73]:
# a = pd.DataFrame(columns=["Footbridge or subway"])
print(le_Pedestrian_Crossing_Physical_Facilities.classes_)
print(le_Pedestrian_Crossing_Physical_Facilities.classes_)
print(le_Light_Conditions.classes_)
print(le_Weather_Conditions.classes_)
print(le_Road_Surface_Conditions.classes_)
print(le_Pedestrian_Crossing_Human_Control.classes_)
print(le_Road_Type.classes_)

['Central refuge' 'Footbridge or subway'
 'No physical crossing within 50 meters'
 'Pedestrian phase at traffic signal junction' 'Zebra crossing'
 'non-junction pedestrian crossing']
['Central refuge' 'Footbridge or subway'
 'No physical crossing within 50 meters'
 'Pedestrian phase at traffic signal junction' 'Zebra crossing'
 'non-junction pedestrian crossing']
['Darkeness: No street lighting' 'Darkness: Street lighting unknown'
 'Darkness: Street lights present and lit'
 'Darkness: Street lights present but unlit'
 'Daylight: Street light present']
['Fine with high winds' 'Fine without high winds' 'Fog or mist' 'Other'
 'Raining with high winds' 'Raining without high winds'
 'Snowing with high winds' 'Snowing without high winds' 'Unknown']
['Dry' 'Flood (Over 3cm of water)' 'Frost/Ice' 'Snow' 'Wet/Damp']
['Control by other authorised person' 'Control by school crossing patrol'
 'None within 50 metres']
['Dual carriageway' 'One way street' 'Roundabout' 'Single carriageway'
 'Slip roa