In [1]:
import numpy as np
import pandas as pd
import re

#Visualisation Libraries
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
from pandas.plotting import scatter_matrix

#Training and Preprocessing Libraries
from xgboost import XGBClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [2]:
class_names = ['Fatal', 'Severe', 'Slight']


In [3]:
data1 = pd.read_csv("./data/accidents_2005_to_2007.csv")
data2 = pd.read_csv("./data/accidents_2009_to_2011.csv")
data3 = pd.read_csv("./data/accidents_2012_to_2014.csv")

In [4]:
data = pd.concat([data1, data2, data3])


In [5]:
type(data)
print(data.head(0))

Empty DataFrame
Columns: [Accident_Index, Location_Easting_OSGR, Location_Northing_OSGR, Longitude, Latitude, Police_Force, Accident_Severity, Number_of_Vehicles, Number_of_Casualties, Date, Day_of_Week, Time, Local_Authority_(District), Local_Authority_(Highway), 1st_Road_Class, 1st_Road_Number, Road_Type, Speed_limit, Junction_Detail, Junction_Control, 2nd_Road_Class, 2nd_Road_Number, Pedestrian_Crossing-Human_Control, Pedestrian_Crossing-Physical_Facilities, Light_Conditions, Weather_Conditions, Road_Surface_Conditions, Special_Conditions_at_Site, Carriageway_Hazards, Urban_or_Rural_Area, Did_Police_Officer_Attend_Scene_of_Accident, LSOA_of_Accident_Location, Year]
Index: []

[0 rows x 33 columns]


In [6]:
def max_val(s):
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

year_wise_casualties = data.groupby(['Year'])['Number_of_Casualties'].sum()
year_wise_casualties = year_wise_casualties.reset_index()
year_wise_casualties = year_wise_casualties.style.apply(max_val, axis=0)
year_wise_casualties

Unnamed: 0,Year,Number_of_Casualties
0,2005,271017
1,2006,258404
2,2007,247780
3,2009,222146
4,2010,208648
5,2011,203950
6,2012,241954
7,2013,183670
8,2014,194477


In [7]:
cas_table = data.groupby(['Day_of_Week']).agg({'Number_of_Casualties':['sum'],'Speed_limit':['min','max']})
cas_table = cas_table.sort_values([('Number_of_Casualties','sum')],ascending=False)
cas_table = cas_table.reset_index()
cas_table.style.apply(max_val)

Unnamed: 0_level_0,Day_of_Week,Number_of_Casualties,Speed_limit,Speed_limit
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,min,max
0,6,331934,10,70
1,5,299044,10,70
2,4,297756,20,70
3,3,294476,10,70
4,7,285261,10,70
5,2,284043,10,70
6,1,239532,10,70


In [8]:
corr_matrix = data.corr()
corr_matrix["Accident_Severity"].sort_values(ascending=False)

Accident_Severity             1.000000
Number_of_Vehicles            0.075976
2nd_Road_Class                0.064321
2nd_Road_Number               0.026128
Longitude                     0.013946
Location_Easting_OSGR         0.013465
1st_Road_Class                0.012754
Day_of_Week                   0.002961
1st_Road_Number              -0.004072
Year                         -0.005907
Latitude                     -0.031357
Location_Northing_OSGR       -0.031395
Police_Force                 -0.031712
Local_Authority_(District)   -0.031995
Speed_limit                  -0.079108
Urban_or_Rural_Area          -0.082169
Number_of_Casualties         -0.083544
Junction_Detail                    NaN
Name: Accident_Severity, dtype: float64

In [9]:
# 30/11/2007
def convert_date_to_day_of_year(dt):
    result = re.findall(r'(\d{2})/(\d{2})/(\d{4})',dt)
    return result[0][0]
   
def convert_date_to_month(dt):
    result = re.findall(r'(\d{2})/(\d{2})/(\d{4})',dt)
    return result[0][1]

In [10]:
data['day_of_year'] = data.Date.apply(lambda x: convert_date_to_day_of_year(x))
data['month'] = data.Date.apply(lambda x: convert_date_to_month(x))


In [11]:
# data.hist(bins=50, figsize=(20,15))
# plt.show()

In [12]:
# fig = data.plot(kind="scatter", x="Longitude", y="Latitude", alpha=0.6,
#                    figsize=(18,11),c="Accident_Severity", cmap=plt.get_cmap("inferno"), 
#                    colorbar=True,)

In [13]:
# attributes = ["Number_of_Vehicles", "Time", "Road_Type", "Pedestrian_Crossing-Human_Control", "Pedestrian_Crossing-Physical_Facilities", "Light_Conditions", "Weather_Conditions", "Road_Surface_Conditions","Accident_Severity"]
# scatter_matrix(data[attributes], figsize=(10, 10))




In [14]:
drop_columns = ['Date', 'Accident_Index', 'Number_of_Casualties', 'Police_Force', 'Junction_Detail', 'Junction_Control', 'Special_Conditions_at_Site', 'Carriageway_Hazards', 'Did_Police_Officer_Attend_Scene_of_Accident', 'LSOA_of_Accident_Location', 'Local_Authority_(District)', 'Local_Authority_(Highway)']

In [15]:
data1 = data.drop(drop_columns, axis=1, inplace=False)
data1.dropna(inplace=True)

In [16]:
#Drop rows with 'Unknown' values
data1 = data1[data1.Weather_Conditions!='Unknown']
data1 = data1[data1.Road_Type!='Unknown']

In [17]:
#Encode "String" Labels into "Int" Labels for easy training
le = LabelEncoder()
data1["Pedestrian_Crossing-Physical_Facilities"]= le.fit_transform(data1["Pedestrian_Crossing-Physical_Facilities"])
data1["Light_Conditions"]= le.fit_transform(data1["Light_Conditions"])
data1["Weather_Conditions"] = le.fit_transform(data1["Weather_Conditions"])
data1["Road_Surface_Conditions"] = le.fit_transform(data1["Road_Surface_Conditions"])
data1["Pedestrian_Crossing-Human_Control"] = le.fit_transform(data1["Pedestrian_Crossing-Human_Control"])
data1["Road_Type"] = le.fit_transform(data1["Road_Type"])

#Converting Time into Int for easy training
data1["Time"]= data1["Time"].astype(str)
data1['Time']= data1['Time'].str.slice(0,2, 1)
data1["Time"]= data1["Time"].astype(int)

In [18]:
print(data1[:5])

   Location_Easting_OSGR  Location_Northing_OSGR  Longitude   Latitude  \
0               525680.0                178240.0  -0.191170  51.489096   
1               524170.0                181650.0  -0.211708  51.520075   
2               524520.0                182240.0  -0.206458  51.525301   
3               526900.0                177530.0  -0.173862  51.482442   
4               528060.0                179040.0  -0.156618  51.495752   

   Accident_Severity  Number_of_Vehicles  Day_of_Week  Time  1st_Road_Class  \
0                  2                   1            3    17               3   
1                  3                   1            4    17               4   
2                  3                   2            5     0               5   
3                  3                   1            6    10               3   
4                  3                   1            2    21               6   

   1st_Road_Number  ...  2nd_Road_Number  Pedestrian_Crossing-Human_Control  \
0

In [19]:
print(data1["Accident_Severity"][10:20])
# print(le.fit_transform(data["Accident_Severity"])[100:200])

10    3
11    3
12    3
13    3
14    3
15    3
16    2
17    3
18    3
19    2
Name: Accident_Severity, dtype: int64


In [20]:
# def preprocessing(data):
#     #Drop useless columns and nan values
#     data.drop(drop_columns, axis=1, inplace=True)
#     data.dropna(inplace=True)
    
#     #Drop rows with 'Unknown' values
#     data = data[data.Weather_Conditions!='Unknown']
#     data = data[data.Road_Type!='Unknown']
    
#     #Encode "String" Labels into "Int" Labels for easy training
# le = LabelEncoder()
# data1["Pedestrian_Crossing-Physical_Facilities"]= le.fit_transform(data1["Pedestrian_Crossing-Physical_Facilities"])
# data1["Light_Conditions"]= le.fit_transform(data1["Light_Conditions"])
# data1["Weather_Conditions"] = le.fit_transform(data1["Weather_Conditions"])
# data1["Road_Surface_Conditions"] = le.fit_transform(data1["Road_Surface_Conditions"])
# data1["Pedestrian_Crossing-Human_Control"] = le.fit_transform(data1["Pedestrian_Crossing-Human_Control"])
# data1["Road_Type"] = le.fit_transform(data1["Road_Type"])
    
# #     #Converting Time into Int for easy training
# data1["Time"]= data1["Time"].astype(str)
# data1['Time']=data1['Time'].str.slice(0,2, 1)
# data1["Time"]= data1["Time"].astype(int)
    
#     #Creating 3 additional columns, one each for each class we need to classify into
# #     onehot = pd.get_dummies(data.Accident_Severity,prefix=['Severity'])
    
# #     data["Fatal"] = onehot["['Severity']_1"]
# #     data["Severe"] = onehot["['Severity']_2"]
# #     data["Slight"] = onehot["['Severity']_3"]
    
#     #Finally splitting the data into train and test
#     train,test = train_test_split(data,test_size=.25)
    
#     return (train,test)

In [21]:
# train,test = preprocessing(data1)
train,test = train_test_split(data1,test_size=.25)

In [22]:
train_features = train[["Longitude", "Latitude", "Number_of_Vehicles", "Day_of_Week", "Time", "Road_Type", "Speed_limit", "Pedestrian_Crossing-Human_Control", "Pedestrian_Crossing-Physical_Facilities", "Light_Conditions", "Weather_Conditions", "Road_Surface_Conditions","Year", "day_of_year", "month", "Urban_or_Rural_Area"]]
test_features =test[["Longitude", "Latitude", "Number_of_Vehicles", "Day_of_Week", "Time", "Road_Type", "Speed_limit", "Pedestrian_Crossing-Human_Control", "Pedestrian_Crossing-Physical_Facilities", "Light_Conditions", "Weather_Conditions", "Road_Surface_Conditions","Year", "day_of_year", "month", "Urban_or_Rural_Area"]]

In [23]:
predicted_classes = []

In [24]:
print(train.shape)
train.head(0)

(1100994, 23)


Unnamed: 0,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Accident_Severity,Number_of_Vehicles,Day_of_Week,Time,1st_Road_Class,1st_Road_Number,...,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Urban_or_Rural_Area,Year,day_of_year,month


In [25]:
y_train = train['Accident_Severity']
# x_train = train.loc[:, train.columns != 'Accident_Severity']
x_train = train_features


In [26]:
y_test = test['Accident_Severity'] 
# x_test = test.loc[:, train.columns != 'Accident_Severity']
x_test = test_features

In [27]:
print(test.loc[:, train.columns != 'Accident_Severity'].head(0))

Empty DataFrame
Columns: [Location_Easting_OSGR, Location_Northing_OSGR, Longitude, Latitude, Number_of_Vehicles, Day_of_Week, Time, 1st_Road_Class, 1st_Road_Number, Road_Type, Speed_limit, 2nd_Road_Class, 2nd_Road_Number, Pedestrian_Crossing-Human_Control, Pedestrian_Crossing-Physical_Facilities, Light_Conditions, Weather_Conditions, Road_Surface_Conditions, Urban_or_Rural_Area, Year, day_of_year, month]
Index: []

[0 rows x 22 columns]


In [28]:
print("x_train: ", x_train.shape)
print("x_test: ", x_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)
x_train.head(0)

x_train:  (1100994, 16)
x_test:  (366998, 16)
y_train:  (1100994,)
y_test:  (366998,)


Unnamed: 0,Longitude,Latitude,Number_of_Vehicles,Day_of_Week,Time,Road_Type,Speed_limit,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Year,day_of_year,month,Urban_or_Rural_Area


In [29]:
print(y_test[1:10], y_train[1:10])

127827    3
418014    3
15627     3
318900    3
344358    2
180955    2
443208    3
11018     3
549829    3
Name: Accident_Severity, dtype: int64 212529    2
99562     3
50878     2
253115    3
225893    3
90599     3
339703    3
519837    3
115786    3
Name: Accident_Severity, dtype: int64


In [30]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE, ADASYN

In [31]:
ros = RandomOverSampler(random_state=0)
x_resampled, y_resampled = ros.fit_resample(x_train, y_train)
# x_resampled, y_resampled = SMOTE().fit_resample(x_train, y_train)


In [32]:
print(x_resampled.shape, y_resampled.shape)

(2807646, 16) (2807646,)


In [33]:
def classify(clf, X_train, Y_train, X_test):
    print('_' * 80)
    print("Training: ")
    print(clf)
#     t0 = time()
    clf.fit(X_train, Y_train)
#     train_time = time() - t0
#     print("train time: %0.3fs" % train_time)
    
#     t0 = time()
    pred = clf.predict(X_test)
#     test_time = time() - t0
#     print("test time:  %0.3fs" % test_time)
    print(type(pred))
    return pred

In [34]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score

In [35]:
def print_score(y_test, pred, average="macro"):
    print("Accuracy Score: {}".format(accuracy_score(y_test, pred)))
    print("Precision Score: {}".format(precision_score(y_test, pred, average=average)))
    print("Recall Score: {}".format(recall_score(y_test, pred, average=average)))
    print("F1 Score: {}".format(f1_score(y_test, pred, average=average)))
    

In [48]:
# pred = classify(RandomForestClassifier(), x_train, y_train, x_test)
pred = classify(RandomForestClassifier(), x_resampled, y_resampled, x_test)
print_score(y_test, pred)

________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
<class 'numpy.ndarray'>
Accuracy Score: 0.8144921770690848
Precision Score: 0.4697043143079214
Recall Score: 0.3733066884543182
F1 Score: 0.3839710714477565


In [None]:
pred2 = classify(GradientBoostingClassifier(n_estimators=100), x_train, y_train, x_test)
# pred2 = classify(GradientBoostingClassifier(n_estimators=100), x_resampled, y_resampled, x_test)
print_score(y_test, pred2)

________________________________________________________________________________
Training: 
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)


In [47]:
print_score(y_test, pred2)

Accuracy Score: 0.8496585812456744
Precision Score: 0.7276652418780083
Recall Score: 0.33340725685163214
F1 Score: 0.30639166558277237


In [47]:
print_score(y_test, pred)

Accuracy Score: 0.8381081095809786
Precision Score: 0.5477209102513553
Recall Score: 0.3646011905460511
F1 Score: 0.37076327352975985


In [None]:
f1_score(y_test, pred1, average='macro')

In [36]:
def model(x_train, y_train, x_test):
    classifier = EasyEnsembleClassifier(n_estimators=12, base_estimator=XGBClassifier(max_depth=4, learning_rate=0.2, n_estimators=600, silent=True,
                        subsample = 0.8,
                        gamma=0.5,
                        min_child_weight=10,
#                         objective='binary:logistic',
                        colsample_bytree = 0.6,
                        max_delta_step = 1,
                        nthreads=1,
                        n_jobs=1))
    

    cv_score = np.mean(cross_val_score(
        classifier, x_train, y_train, cv=3, scoring='roc_auc'))
    print('CV score: {}'.format(cv_score))

    classifier.fit(x_train, y_train)
    pred = classifier.predict(x_test)
    acc = roc_auc_score(x_test, y_test)
    print('acc score : {}'.format(acc))
    return pred
    

In [37]:
# pred = model(x_resampled, y_resampled, x_test)
pred = model(x_test, y_test, x_test)

ValueError: multiclass format is not supported

In [None]:
f1_score(y_test, pred1, average='macro')