Using CART to solve accident problem.

Import all required libraries

Using decision trees

In [1]:
#import algorithm implementation
from sklearn import tree
#import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score

from sklearn.model_selection import KFold,GridSearchCV
import pandas as pd
import numpy as np
#import class imbalance library
from imblearn.over_sampling import SMOTE # deal with oversampling
from imblearn.combine import SMOTETomek
#import plotting libraries
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from sklearn import preprocessing

Get input and output

In [2]:
df = pd.read_csv('mydata.csv')


In [3]:
y=df.Accident_Severity
df = df.drop(['Accident_Severity'], axis=1)


Class weights

In [4]:
unique, counts = np.unique(y, return_counts=True)
dict(zip(unique, counts))

{1: 63, 2: 745, 3: 3763}

In [5]:
X = df.values

In [6]:
df.head(0).transpose()

Police_Force
Number_of_Vehicles
Number_of_Casualties
Day_of_Week
Location
Speed_limit
Light_Conditions
Weather_Conditions
Urban_or_Rural_Area


Create splits for cross validation

In [7]:
kf = KFold(n_splits=5) #test

In [8]:
def normalize(z):
    x = z.values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(np.vstack(x))
    return x_scaled

In [9]:
for i in range(0,9):
    df.iloc[:,i]=normalize(df.iloc[:,i])



Using decision tree for classification

In [10]:
clf = tree.DecisionTreeClassifier()

print("Train \t\t\t Test")
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    clf = clf.fit(X_train, y_train)
    y_ = clf.predict(X_test)
    y_pred = clf.predict(X_train)
    
    print(accuracy_score(y_train,y_pred),"\t",accuracy_score(y_test,y_))

Train 			 Test
0.981673960613 	 0.689617486339
0.984960350014 	 0.703501094092
0.985233798195 	 0.668490153173
0.984140005469 	 0.652078774617
0.981952420016 	 0.678336980306


In [11]:
confusion_matrix(y_test,y_,labels=[1,2,3])

array([[  0,   4,   8],
       [  1,  35, 104],
       [ 22, 155, 585]])

There is overfitting in the dataset and decision trees are notorious for overfitting.

Using Random Forests

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
clf = RandomForestClassifier(n_estimators=100,max_features='auto',max_depth=100,criterion='entropy')

sm = SMOTETomek(random_state=4)
sm = SMOTE(random_state = 3)

print("Train \t\t\t Test")
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_res,y_train_res = sm.fit_sample(X_train,y_train)
    clf = clf.fit(X_train_res, y_train_res)
    #clf = clf.fit(X_train, y_train)
    y_ = clf.predict(X_test)
    y_pred = clf.predict(X_train)
    
    print(accuracy_score(y_train,y_pred),"\t",accuracy_score(y_test,y_))



Train 			 Test
0.966903719912 	 0.681967213115
0.974022422751 	 0.68818380744
0.974022422751 	 0.699124726477
0.969920700027 	 0.655361050328
0.97046759639 	 0.701312910284


In [14]:
clf = RandomForestClassifier(max_features='auto',criterion='entropy',class_weight="balanced")

In [15]:
grid={"n_estimators":[1,100],"max_depth":[1,100]}

clf_cv=GridSearchCV(clf,grid,cv=5)
clf_cv.fit(X,y)

clf_cv.best_params_



{'max_depth': 100, 'n_estimators': 100}

In [16]:
confusion_matrix(y_test,y_,labels=[1,2,3])

array([[  1,   1,  10],
       [  5,  34, 101],
       [ 16, 140, 606]])