In [1]:
import numpy as np
import pandas as pd
import re

#Visualisation Libraries
import matplotlib.pyplot as plt

#Training and Preprocessing Libraries
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

# sampling
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image

In [2]:
def convert_date_to_day_of_year(dt):
    result = re.findall(r'(\d{2})/(\d{2})/(\d{4})',dt)
    return result[0][0]

def convert_date_to_month(dt):
    result = re.findall(r'(\d{2})/(\d{2})/(\d{4})',dt)
    return result[0][1]

In [3]:
def preprocess_datetime(data):
    data['day_of_year'] = data.Date.apply(lambda x: convert_date_to_day_of_year(x))
    data['month'] = data.Date.apply(lambda x: convert_date_to_month(x))
    return data

In [4]:
def preprocess_data(temp_data):
    temp_data["Pedestrian_Crossing-Physical_Facilities"]= le_Pedestrian_Crossing_Physical_Facilities.transform(temp_data["Pedestrian_Crossing-Physical_Facilities"])
    temp_data["Light_Conditions"]= le_Light_Conditions.transform(temp_data["Light_Conditions"])
    temp_data["Weather_Conditions"] = le_Weather_Conditions.transform(temp_data["Weather_Conditions"])
    temp_data["Road_Surface_Conditions"] = le_Road_Surface_Conditions.transform(temp_data["Road_Surface_Conditions"])
    temp_data["Pedestrian_Crossing-Human_Control"] = le_Pedestrian_Crossing_Human_Control.transform(temp_data["Pedestrian_Crossing-Human_Control"])
    temp_data["Road_Type"] = le_Road_Type.transform(temp_data["Road_Type"])
    temp_data["Time"] = temp_data["Time"].astype(str)
    temp_data['Time'] = temp_data['Time'].str.slice(0, 2, 1)
    temp_data["Time"] = temp_data["Time"].astype(int)
    return temp_data

In [5]:
def build_model(clf, X_train, Y_train):
    print(clf)
    clf.fit(X_train, Y_train)

In [6]:
def predict(clf, X_test):
    pred = clf.predict(X_test)
    return pred

In [7]:
def print_score(y_test, pred, average="macro"):
    acc = accuracy_score(y_test, pred)
    prec = precision_score(y_test, pred, average=average)
    recall = recall_score(y_test, pred, average=average)
    f2 = f1_score(y_test, pred, average=average)
    print("Accuracy Score: {}".format(acc))
    print("Precision Score: {}".format(prec))
    print("Recall Score: {}".format(recall))
    print("F1 Score: {}".format(f2))
    return acc, prec, recall, f2

In [8]:
class_dict = {1: 'Fatal', 2: 'Severe', 3: 'Slight'}

class_labels = ['Fatal', 'Severe', 'Slight']

In [9]:
data1 = pd.read_csv("./data/accidents_2005_to_2007.csv")
data2 = pd.read_csv("./data/accidents_2009_to_2011.csv")
data3 = pd.read_csv("./data/accidents_2012_to_2014.csv")

data = pd.concat([data1, data2, data3])

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
drop_columns = ['Date', 'Accident_Index', 'Number_of_Casualties', 'Police_Force', 'Junction_Detail',
                'Junction_Control', 'Special_Conditions_at_Site', 'Carriageway_Hazards',
                'Did_Police_Officer_Attend_Scene_of_Accident', 'LSOA_of_Accident_Location',
                'Local_Authority_(District)', 'Local_Authority_(Highway)']

In [11]:
data = preprocess_datetime(data)
data1 = data.drop(drop_columns, axis=1, inplace=False)
data1.dropna(inplace=True)

In [12]:
le_Pedestrian_Crossing_Physical_Facilities = LabelEncoder()
le_Light_Conditions = LabelEncoder()
le_Weather_Conditions = LabelEncoder()
le_Road_Surface_Conditions = LabelEncoder()
le_Pedestrian_Crossing_Human_Control = LabelEncoder()
le_Road_Type = LabelEncoder()

le_Pedestrian_Crossing_Physical_Facilities.fit(data1["Pedestrian_Crossing-Physical_Facilities"])
le_Light_Conditions.fit(data1["Light_Conditions"])
le_Weather_Conditions.fit(data1["Weather_Conditions"])
le_Road_Surface_Conditions.fit(data1["Road_Surface_Conditions"])
le_Pedestrian_Crossing_Human_Control.fit(data1["Pedestrian_Crossing-Human_Control"])
le_Road_Type.fit(data1["Road_Type"])

LabelEncoder()

In [13]:
data1 = data1[data1['Weather_Conditions'] != 'Unknown']
data1 = data1[data1['Road_Type'] != 'Unknown']

In [14]:
data1 = preprocess_data(data1)

In [15]:
data1

Unnamed: 0,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Day_of_Week,Time,1st_Road_Class,1st_Road_Number,...,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Urban_or_Rural_Area,Year,day_of_year,month
0,525680.0,178240.0,-0.191170,51.489096,2,1,3,17,3,3218,...,0,2,4,4,5,4,1,2005,04,01
1,524170.0,181650.0,-0.211708,51.520075,3,1,4,17,4,450,...,0,2,3,2,1,0,1,2005,05,01
2,524520.0,182240.0,-0.206458,51.525301,3,2,5,0,5,0,...,0,2,2,2,1,0,1,2005,06,01
3,526900.0,177530.0,-0.173862,51.482442,3,1,6,10,3,3220,...,0,2,2,4,1,0,1,2005,07,01
4,528060.0,179040.0,-0.156618,51.495752,3,1,2,21,6,0,...,0,2,2,1,1,4,1,2005,10,01
5,524770.0,181160.0,-0.203238,51.515540,3,2,3,12,6,0,...,0,2,2,4,5,4,1,2005,11,01
6,524220.0,180830.0,-0.211277,51.512695,3,2,5,20,5,0,...,0,2,2,2,1,0,1,2005,13,01
7,525890.0,179710.0,-0.187623,51.502260,3,1,6,17,3,315,...,0,2,2,4,1,0,1,2005,14,01
8,527350.0,177650.0,-0.167342,51.483420,3,2,7,22,3,3212,...,304,2,3,2,1,0,1,2005,15,01
9,524550.0,180810.0,-0.206531,51.512443,3,2,7,16,4,450,...,0,2,0,4,1,0,1,2005,15,01


In [16]:
train, test = train_test_split(data1, test_size=.20)

In [17]:
features = ["Number_of_Vehicles", "Day_of_Week", "Time", "Road_Type", "Speed_limit",
                         "Pedestrian_Crossing-Human_Control", "Pedestrian_Crossing-Physical_Facilities",
                         "Light_Conditions", "Weather_Conditions", "Road_Surface_Conditions",
                         "month"]

In [18]:
y_train = train['Accident_Severity']
x_train = train[features]

y_test = test['Accident_Severity']
x_test = test[features]

In [19]:
print(test.loc[:, x_train.columns != 'Accident_Severity'].head(0))

Empty DataFrame
Columns: [Location_Easting_OSGR, Location_Northing_OSGR, Longitude, Latitude, Accident_Severity, Number_of_Vehicles, Day_of_Week, Time, 1st_Road_Class, 1st_Road_Number, Road_Type]
Index: []


In [20]:
print("x_train: ", x_train.shape)
print("x_test: ", x_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

x_train:  (1174393, 11)
x_test:  (293599, 11)
y_train:  (1174393,)
y_test:  (293599,)


In [77]:
clf = RandomForestClassifier(n_estimators=30)

In [78]:
build_model(clf, x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [79]:
pred = predict(clf, x_test)

In [80]:
print_score(y_test, pred)

Accuracy Score: 0.8331261346257991
Precision Score: 0.38912664346671755
Recall Score: 0.3455300151372192
F1 Score: 0.3386668804948941


(0.8331261346257991,
 0.38912664346671755,
 0.3455300151372192,
 0.3386668804948941)

In [25]:
# y_test.shape
print(x_test.shape)
print(len(pred))

(293599, 11)
293599


In [None]:
def get_console(pred, class=1):
    for i in range(len(pred)):
        if pred[i] == class:
            return i

In [52]:
get_console

2
Number_of_Vehicles                          2
Day_of_Week                                 2
Time                                       20
Road_Type                                   3
Speed_limit                                30
Pedestrian_Crossing-Human_Control           2
Pedestrian_Crossing-Physical_Facilities     2
Light_Conditions                            2
Weather_Conditions                          1
Road_Surface_Conditions                     4
month                                      05
Name: 67521, dtype: object


In [58]:
df1 = pd.DataFrame(columns=features)
df1.loc[1] = x_test.iloc[index]
df1

Unnamed: 0,Number_of_Vehicles,Day_of_Week,Time,Road_Type,Speed_limit,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,month
1,2,2,20,3,30,2,2,2,1,4,5


In [72]:
input_dict = {}
# input_data =  {
#     'Pedestrian_Crossing-Physical_Facilities': 'No physical crossing within 50 meters', 
#       'Light_Conditions': 'Daylight: Street light present', 
#     'Weather_Conditions': 'Fine without high winds', 
#     'Road_Surface_Conditions': 'Dry', 
#     'Pedestrian_Crossing-Human_Control': 'None within 50 metres', 
#     'Road_Type': 'Single carriageway', 
#     'Number_of_Vehicles': '4', 
#     'Day_of_Week': '1', 
#     'Time': '14:00', 
#     'Speed_limit': '50', 
#     'month': '4'}
input_data = {'Pedestrian_Crossing-Physical_Facilities': 'No physical crossing within 50 meters', 'Light_Conditions': 'Darkness: Street lights present and lit', 'Weather_Conditions': 'Fine without high winds', 'Road_Surface_Conditions': 'Wet/Damp', 'Pedestrian_Crossing-Human_Control': 'None within 50 metres', 'Road_Type': 'Single carriageway', 'Number_of_Vehicles': '2', 'Day_of_Week': '2', 'Time': '20:00', 'Speed_limit': '30', 'month': '5'}

In [73]:
for key, value in input_data.items():
    input_dict[key] = [value]
input_dict

{'Pedestrian_Crossing-Physical_Facilities': ['No physical crossing within 50 meters'],
 'Light_Conditions': ['Darkness: Street lights present and lit'],
 'Weather_Conditions': ['Fine without high winds'],
 'Road_Surface_Conditions': ['Wet/Damp'],
 'Pedestrian_Crossing-Human_Control': ['None within 50 metres'],
 'Road_Type': ['Single carriageway'],
 'Number_of_Vehicles': ['2'],
 'Day_of_Week': ['2'],
 'Time': ['20:00'],
 'Speed_limit': ['30'],
 'month': ['5']}

In [74]:
print(x_test.iloc[index])

Number_of_Vehicles                          2
Day_of_Week                                 2
Time                                       20
Road_Type                                   3
Speed_limit                                30
Pedestrian_Crossing-Human_Control           2
Pedestrian_Crossing-Physical_Facilities     2
Light_Conditions                            2
Weather_Conditions                          1
Road_Surface_Conditions                     4
month                                      05
Name: 67521, dtype: object


In [75]:
df = pd.DataFrame.from_dict(input_dict)
df = preprocess_data(df)
df = df[features]
print(df)

  Number_of_Vehicles Day_of_Week  Time  Road_Type Speed_limit  \
0                  2           2    20          3          30   

   Pedestrian_Crossing-Human_Control  Pedestrian_Crossing-Physical_Facilities  \
0                                  2                                        2   

   Light_Conditions  Weather_Conditions  Road_Surface_Conditions month  
0                 2                   1                        4     5  


In [76]:
predict(clf, df)


array([2])

In [69]:
predict(clf, df1)

array([2])

In [70]:
df

Unnamed: 0,Number_of_Vehicles,Day_of_Week,Time,Road_Type,Speed_limit,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,month
0,2,2,22,3,30,2,2,2,1,4,5


In [71]:
df1

Unnamed: 0,Number_of_Vehicles,Day_of_Week,Time,Road_Type,Speed_limit,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,month
1,2,2,20,3,30,2,2,2,1,4,5


32


In [73]:
# a = pd.DataFrame(columns=["Footbridge or subway"])
print(le_Pedestrian_Crossing_Physical_Facilities.classes_)
print(le_Pedestrian_Crossing_Physical_Facilities.classes_)
print(le_Light_Conditions.classes_)
print(le_Weather_Conditions.classes_)
print(le_Road_Surface_Conditions.classes_)
print(le_Pedestrian_Crossing_Human_Control.classes_)
print(le_Road_Type.classes_)

['Central refuge' 'Footbridge or subway'
 'No physical crossing within 50 meters'
 'Pedestrian phase at traffic signal junction' 'Zebra crossing'
 'non-junction pedestrian crossing']
['Central refuge' 'Footbridge or subway'
 'No physical crossing within 50 meters'
 'Pedestrian phase at traffic signal junction' 'Zebra crossing'
 'non-junction pedestrian crossing']
['Darkeness: No street lighting' 'Darkness: Street lighting unknown'
 'Darkness: Street lights present and lit'
 'Darkness: Street lights present but unlit'
 'Daylight: Street light present']
['Fine with high winds' 'Fine without high winds' 'Fog or mist' 'Other'
 'Raining with high winds' 'Raining without high winds'
 'Snowing with high winds' 'Snowing without high winds' 'Unknown']
['Dry' 'Flood (Over 3cm of water)' 'Frost/Ice' 'Snow' 'Wet/Damp']
['Control by other authorised person' 'Control by school crossing patrol'
 'None within 50 metres']
['Dual carriageway' 'One way street' 'Roundabout' 'Single carriageway'
 'Slip roa