# This notebook is the starting point of the Capstone Project



In [1]:
import datetime as dt
import os
import random
from datetime import date, datetime
from pprint import pprint
from textwrap import wrap

# The star import stays ahead of the explicit third-party imports so that the
# explicit names below win if rfpimp happens to export a clashing name.
from rfpimp import *
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import researchpy as rp
import seaborn as sns
from geopy.distance import geodesic
from geopy.distance import great_circle
from imblearn.under_sampling import RandomUnderSampler
from IPython.display import display
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier
%matplotlib
# Directory (relative to the working directory) that the plotting helpers save figures into.
FIGS_DIR = 'figs/'
# Name of the target column; the helpers below treat its values 1 and 2 as the two classes.
LabelName='SEVERITYCODE'



Using matplotlib backend: MacOSX


In [2]:
def train_test(df, test_size=0.10, random_state=None):
    """Split ``df`` into stratified, shuffled train and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns plus the ``LabelName`` column.
    test_size : float, default 0.10
        Fraction of the rows held out for testing (the default keeps the
        original 90% / 10% split).
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the original,
        non-reproducible behaviour; pass an integer to get the same split on
        every run.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    # Separate the independent features from the dependent label.
    X = df.drop([LabelName], axis=1)
    y = df[LabelName]

    # Stratify on the label so both partitions keep the same class proportions.
    return train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y,
                            random_state=random_state)
def XGB(df):
    """Grid-search an XGBoost classifier on ``df`` and print its hold-out scores.

    The frame is split with ``train_test``; a 10-fold ``GridSearchCV`` scored
    by F1 is fitted on the training part, and the best estimator is then
    evaluated on the test part. The best parameters, best cross-validation
    score, confusion matrix and F1 score are printed; nothing is returned.
    """
    # Stratified and shuffled 90% / 10% split (see train_test).
    X_train, X_test, y_train, y_test = train_test(df)
    print("Features:\n" + str(df.dtypes))
    print("Start Training...")

    # Candidate values for every tuned hyper-parameter.
    # NOTE(review): [3, 10, 2] and [1, 6, 2] read like range(start, stop, step)
    # arguments -- confirm that these literal lists are what was intended.
    search_grid = dict(
        eta=[0.1],
        n_estimators=[128],
        min_child_weight=[1, 6, 2],
        gamma=[0, 0.3],
        colsample_bytree=[0.8],
        scale_pos_weight=[1],
        max_depth=[3, 10, 2],
        subsample=[0.8],
    )
    pprint(search_grid)

    tuner = GridSearchCV(estimator=XGBClassifier(), param_grid=search_grid,
                         cv=10, verbose=2, n_jobs=-3, scoring='f1')
    tuner.fit(X_train, y_train)

    pprint(tuner.best_params_)
    print(tuner.best_score_)

    # Score the refitted best model on the held-out rows.
    predictions = tuner.best_estimator_.predict(X_test)
    print(confusion_matrix(y_test, predictions))
    print(f1_score(y_test, predictions))
def randomForest(df):
    """Grid-search a random forest on ``df``, print its scores and save a confusion matrix.

    The frame is split with ``train_test``; a 10-fold ``GridSearchCV`` scored
    by F1 is fitted on the training part and the best estimator is evaluated
    on the test part. The best parameters, best cross-validation score, F1
    score, accuracy and confusion matrix are printed, and the confusion-matrix
    plot is saved as ``FIGS_DIR + 'cm.png'``. Nothing is returned.
    """
    # Stratified and shuffled 90% / 10% split (see train_test).
    X_train, X_test, y_train, y_test = train_test(df)
    print("Features:\n" + str(df.dtypes))
    print("Start Training...")

    # Each list currently holds a single value, so the "search" fits one
    # configuration. Wider ranges tried earlier:
    #   n_estimators = [int(x) for x in np.linspace(start=64, stop=1024, num=10)]
    #   max_depth    = [int(x) for x in np.linspace(10, 110, num=11)] + [None]
    # NOTE(review): max_features='auto' and plot_confusion_matrix are believed
    # to be removed in recent scikit-learn releases -- confirm against the
    # installed version before upgrading.
    search_grid = {
        'n_estimators': [64],         # number of trees in the forest
        'max_features': ['auto'],     # features considered at every split
        'max_depth': [40],            # maximum number of levels in a tree
        'min_samples_split': [10],    # minimum samples required to split a node
        'min_samples_leaf': [65],     # minimum samples required at a leaf
        'bootstrap': [True],          # sample rows with replacement per tree
    }
    pprint(search_grid)

    rf = RandomForestClassifier()
    grid_search = GridSearchCV(estimator=rf, param_grid=search_grid, cv=10, verbose=2,
                               n_jobs=-3, scoring='f1')
    grid_search.fit(X_train, y_train)

    pprint(grid_search.best_params_)
    print(grid_search.best_score_)

    # Evaluate the refitted best model on the held-out rows.
    best_grid = grid_search.best_estimator_
    res = best_grid.predict(X_test)
    cm = confusion_matrix(y_test, res)
    print(f1_score(y_test, res))
    print(accuracy_score(y_test, res))
    print(cm)

    plot_confusion_matrix(best_grid, X_test, y_test)
    plt.title('Confusion Matrix of the Classifier')
    plt.xlabel('Predicted')
    plt.ylabel('True')

    # savefig does not create missing directories, so make sure the target exists.
    if FIGS_DIR:
        os.makedirs(FIGS_DIR, exist_ok=True)
    # pad_inches is only honoured together with bbox_inches='tight' (where 50
    # inches of padding would be enormous), so it is no longer passed.
    plt.savefig(FIGS_DIR + 'cm' + '.png')


In [3]:
def underSample(df):
    """Return a copy of ``df`` in which the majority class has been randomly under-sampled.

    The ``LabelName`` column is separated from the features, both are
    resampled with ``RandomUnderSampler(sampling_strategy='majority')``, and
    the resampled label is attached back as the ``LabelName`` column of the
    returned frame.
    """
    features = df.drop([LabelName], axis=1)
    target = df[LabelName]

    sampler = RandomUnderSampler(sampling_strategy='majority')
    features_balanced, target_balanced = sampler.fit_resample(features, target)

    # Re-attach the resampled label so callers get one frame back.
    features_balanced[LabelName] = target_balanced.to_frame(name=LabelName)
    return features_balanced
    
    
def buildFreqChart(df, colName):
    """Draw a stacked bar chart of the severity-code split for each value of ``colName``.

    For every distinct value of ``df[colName]`` the chart shows the percentage
    of rows whose ``LabelName`` is 1 (green) and 2 (red), and annotates each
    bar with the share of all rows that fall into that value. Missing values
    (NaN / None) are gathered into a single ``'#NAN'`` bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``colName`` and the ``LabelName`` column.
    colName : str
        Column whose values become the bars.

    Raises
    ------
    AssertionError
        If some row has a ``LabelName`` value other than 1 or 2.
    """
    column = df[colName]
    numTotal = df.shape[0]
    raw_values = column.unique()

    # Build one (label, row mask) pair per bar. pd.isna replaces the former
    # float()/bare-except probe, which silently failed on numeric columns and
    # then divided by a zero row count.
    categories = []
    has_missing = False
    for value in raw_values:
        if pd.isna(value):
            if has_missing:
                continue  # NaN and None share the single '#NAN' bar
            has_missing = True
            categories.append(('#NAN', column.isnull()))
        else:
            categories.append((value, column == value))

    numValues = len(categories)
    arr1 = np.empty(numValues, dtype=float)      # % of code 1 within each value
    arr2 = np.empty(numValues, dtype=float)      # % of code 2 within each value
    totalArr = np.empty(numValues, dtype=float)  # % of all rows having the value

    for index, (_, mask) in enumerate(categories):
        labels_here = df.loc[mask, LabelName]
        # Each value comes from unique(), so at least one row matches it.
        numOfRows = labels_here.shape[0]
        count1 = int((labels_here == 1).sum())
        count2 = int((labels_here == 2).sum())
        assert numOfRows == (count1 + count2), "Sum is not equal!!"
        arr1[index] = (count1 / numOfRows) * 100
        arr2[index] = (count2 / numOfRows) * 100
        totalArr[index] = (numOfRows / numTotal) * 100

    plt.figure(figsize=(2 + numValues, 3))

    if df.dtypes[colName] == object or has_missing:
        # Text labels: str() guards against non-string objects, and wrapping
        # keeps long category names from overlapping. A numeric column with a
        # '#NAN' bar also needs uniform string labels.
        x = ['\n'.join(wrap(str(label), 12)) for label, _ in categories]
    else:
        x = raw_values

    # stack bars
    plt.bar(x, arr1, label='Code 1', color='Green', width=0.3)
    plt.bar(x, arr2, bottom=arr1, label='Code 2', color='Red', width=0.3)
    plt.xticks(fontsize=8)

    # add text annotation corresponding to the percentage of each data.
    for xpos, ypos, yval in zip(x, arr1 / 2, arr1):
        plt.text(xpos, ypos, "%.1f" % yval, ha="center", va="center")
    for xpos, ypos, yval in zip(x, arr1 + arr2 / 2, arr2):
        plt.text(xpos, ypos, "%.1f" % yval, ha="center", va="center")

    # add text annotation corresponding to the "total" value of each bar
    for xpos, ypos, yval in zip(x, arr1 + arr2, totalArr):
        plt.text(xpos, ypos, "%.1f%s" % (yval, "%"), ha="center", va="bottom")

    plt.ylim(0, 110)
    plt.legend(bbox_to_anchor=(1.01, 0.5), loc='center left')
    plt.xlabel(colName, labelpad=5)
    plt.ylabel("Percentage", labelpad=5)
    plt.title("Severity Code Grouped by "+colName, y=1.02)

def buildFreqChart1(df, colName):
    """Draw a bar chart with the number of rows for each distinct value of ``colName``.

    The distinct values are sorted, counted one by one, and plotted with the
    count written above every bar. Axis labels and title are fixed to the
    severity-code wording regardless of ``colName``.
    """
    categories = df[colName].unique()
    categories.sort()

    counts = np.empty(categories.shape[0], dtype=float)
    for position, category in enumerate(categories):
        counts[position] = df.loc[df[colName] == category].shape[0]

    # String labels make matplotlib space the bars evenly as categories.
    tick_labels = categories.astype(str)
    plt.figure()
    plt.bar(tick_labels, counts, width=0.2, align='center')
    for position, count in enumerate(counts):
        plt.text(position - 0.04, count, f"{count:,.0f}")

    # Thousands separators on the y axis.
    y_axis = plt.gca().get_yaxis()
    y_axis.set_major_formatter(plt.FuncFormatter(lambda tick, loc: "{:,}".format(int(tick))))
    plt.xlabel("Severity Code", labelpad=5)
    plt.ylabel("Count of Accidents", labelpad=5)
    plt.title("Accidents Grouped by Severity Code ", y=1.02)
    #plt.savefig(FIGS_DIR + colName + str(random.randint(0,99))+ '.png', bbox_inches='tight', pad_inches=0.02)
  
def buildBoxPlot(df, colName):
    """Draw a box plot of ``df[colName]`` and save it as ``FIGS_DIR + colName + '.png'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the numeric column ``colName``.
    colName : str
        Column to plot; also used as the y-axis label and the file name.
    """
    plt.figure(figsize=(10, 7))
    # Missing values are dropped first: NaNs in the input turn the box
    # statistics into NaN and leave the plot empty.
    plt.boxplot(df[colName].dropna())
    plt.ylabel(colName, labelpad=5)
    plt.title("Distribution of " + colName, y=1.02)

    # savefig does not create missing directories, so make sure the target exists.
    if FIGS_DIR:
        os.makedirs(FIGS_DIR, exist_ok=True)
    # pad_inches is only honoured together with bbox_inches='tight' (where 50
    # inches of padding would be enormous), so it is no longer passed.
    plt.savefig(FIGS_DIR + colName + '.png')

def _parse_hour(value):
    """Return the hour (0-23) found in an ``INCDTTM`` value, or -1 when there is none."""
    if not isinstance(value, str):
        return -1
    # Everything after the first space is the time part. Without a space
    # find() returns -1, so the whole string is tried (and normally rejected).
    time_part = value[value.find(' ') + 1:]
    # 12-hour clock with AM/PM first, then 24-hour clock.
    for time_format in ('%I:%M:%S %p', '%H:%M:%S'):
        try:
            return datetime.strptime(time_part, time_format).time().hour
        except ValueError:
            continue
    return -1


def fixDate(df):
    """Derive ``Month``, ``Day`` and ``Hour`` columns from the incident date and time.

    ``INCDATE`` is converted to datetimes in place (format ``%Y/%m/%d``).
    ``Month`` receives the full month name and ``Day`` the abbreviated weekday
    name of that date. ``Hour`` is parsed from the time part of ``INCDTTM``;
    rows whose time cannot be parsed get hour -1, and their number is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``INCDATE`` and ``INCDTTM`` columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added.
    """
    dateCol = 'INCDATE'
    timeCol = 'INCDTTM'

    df[dateCol] = pd.to_datetime(df[dateCol], format='%Y/%m/%d')
    # Column-wise formatting replaces the former per-row df.at loop.
    df['Month'] = df[dateCol].dt.strftime("%B")
    df['Day'] = df[dateCol].dt.strftime("%a")

    hours = df[timeCol].map(_parse_hour)
    invalidCount = int((hours == -1).sum())
    print("Accidents with Invalid Time="+str(invalidCount))

    # The hour has always been stored as the string form of a float
    # ('14.0', '-1.0'); keep that format so category labels stay the same.
    df['Hour'] = hours.astype(float).astype(str)
    return df

def computeCorCat(df, colName):
    """Print the chi-square test results between ``colName`` and ``SEVERITYCODE``.

    The results table comes from ``researchpy.crosstab`` with column
    proportions and no continuity correction; the cross-tabulation itself is
    not shown.
    """
    _crosstab, test_results = rp.crosstab(
        df[colName],
        df['SEVERITYCODE'],
        prop='col',
        test='chi-square',
        correction=False,
    )
    print("Correlation with "+ colName)
    print(test_results)

In [4]:
# Greeting cell: prints a fixed message and touches no notebook state.
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


In [5]:
# Load the collision records; the path is relative to the working directory.
# NOTE(review): the saved output of this cell looks like the tail of a warning
# raised by read_csv -- presumably about mixed column dtypes; confirm and
# consider passing explicit dtypes.
INPUT_FILE='Data-Collisions.csv'
df=pd.read_csv(INPUT_FILE)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Class counts of the raw data.
buildFreqChart1(df,LabelName)
# Replace df with its under-sampled version. The raw frame is overwritten, so
# re-running this cell alone resamples the already-resampled data.
df= underSample(df)
#After Under-sampling
print("After Performing Under-sampling:")
# Class counts again, to compare with the chart above.
buildFreqChart1(df,LabelName)

After Performing Under-sampling:


In [7]:
# Add the Month, Day and Hour columns derived from INCDATE / INCDTTM.
# Rows whose time cannot be parsed get hour -1; fixDate prints how many.
df= fixDate(df)

Accidents with Invalid Time=48022


In [8]:
# Keep important features
catCols = ['WEATHER', 'COLLISIONTYPE', "ADDRTYPE", 'ROADCOND', 'LIGHTCOND', 'Hour', 'Day', 'Month', 'UNDERINFL']
numCols = ["PERSONCOUNT", "VEHCOUNT", "PEDCOUNT", "PEDCYLCOUNT"]
df = df.filter(catCols + numCols + [LabelName])

# One stacked-percentage chart and one chi-square test per categorical feature.
for col in catCols:
    buildFreqChart(df, col)
    computeCorCat(df, col)

# Only the last expression of a cell is rendered, so the summary tables are
# displayed explicitly. The correlation is limited to the numeric columns
# because corr() on a frame with text columns fails on recent pandas versions.
display(df.describe())
display(df[numCols + [LabelName]].corr())

# One box plot per numeric feature (also saved under FIGS_DIR).
for col in numCols:
    buildBoxPlot(df, col)
    






Correlation with WEATHER
                 Chi-square test    results
0  Pearson Chi-square ( 10.0) =   4650.2591
1                     p-value =      0.0000
2                  Cramer's V =      0.2023
Correlation with COLLISIONTYPE
                Chi-square test     results
0  Pearson Chi-square ( 9.0) =   27468.1108
1                    p-value =       0.0000
2                 Cramer's V =       0.4915
Correlation with ADDRTYPE
                Chi-square test    results
0  Pearson Chi-square ( 2.0) =   5141.7811
1                    p-value =      0.0000
2                 Cramer's V =      0.2110
Correlation with ROADCOND
                Chi-square test    results
0  Pearson Chi-square ( 8.0) =   4768.6999
1                    p-value =      0.0000
2                 Cramer's V =      0.2049
Correlation with LIGHTCOND
                Chi-square test    results
0  Pearson Chi-square ( 8.0) =   4535.6323
1                    p-value =      0.0000
2                 Cramer's V =      0.19

In [9]:
# Map the '1' / '0' spellings of UNDERINFL onto 'Y' / 'N'.
df['UNDERINFL'] = df['UNDERINFL'].replace(['1'], 'Y')
df['UNDERINFL'] = df['UNDERINFL'].replace(['0'], 'N')

# Fill missing values
df["UNDERINFL"] = df["UNDERINFL"].fillna('N')
df["ROADCOND"] = df["ROADCOND"].fillna('Dry')
df["LIGHTCOND"] = df["LIGHTCOND"].fillna('Daylight')
df["ADDRTYPE"] = df["ADDRTYPE"].fillna('Block')
df = df.dropna(subset=['COLLISIONTYPE'])

# Mean-impute the numeric columns. The result is assigned back instead of
# using df[col].fillna(..., inplace=True): that chained in-place form warns on
# pandas 2.x and does not update df under copy-on-write.
df = df.fillna(df[numCols].mean())

# Remove Outliers
df = df[
    (df.PERSONCOUNT < 9)
    & (df.VEHCOUNT > 0) & (df.VEHCOUNT < 5)
    & (df.PEDCOUNT < 3)
    & (df.PEDCYLCOUNT < 2)
]

# Columns that are not used as model features.
df = df.drop(['WEATHER', 'Hour', 'Day', 'Month', 'UNDERINFL'], axis=1)


In [10]:
# One-hot encode the remaining categorical columns.
df = pd.get_dummies(df)
# Grid-search, evaluate and plot the random forest (see randomForest).
randomForest(df)
print ("DONE")





    




















Features:
PERSONCOUNT                           int64
VEHCOUNT                              int64
PEDCOUNT                              int64
PEDCYLCOUNT                           int64
SEVERITYCODE                          int64
COLLISIONTYPE_Angles                  uint8
COLLISIONTYPE_Cycles                  uint8
COLLISIONTYPE_Head On                 uint8
COLLISIONTYPE_Left Turn               uint8
COLLISIONTYPE_Other                   uint8
COLLISIONTYPE_Parked Car              uint8
COLLISIONTYPE_Pedestrian              uint8
COLLISIONTYPE_Rear Ended              uint8
COLLISIONTYPE_Right Turn              uint8
COLLISIONTYPE_Sideswipe               uint8
ADDRTYPE_Alley                        uint8
ADDRTYPE_Block                        uint8
ADDRTYPE_Intersection                 uint8
ROADCOND_Dry                          uint8
ROADCOND_Ice                          uint8
ROADCOND_Oil                          uint8
ROADCOND_Other                        uint8
ROADCOND_Sand/Mud/Dirt

[Parallel(n_jobs=-3)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-3)]: Done   5 out of  10 | elapsed:    7.4s remaining:    7.4s
[Parallel(n_jobs=-3)]: Done  10 out of  10 | elapsed:   11.8s finished
