# This notebook trains a Gradient Boosting Classifier to distinguish Crash Service, Traffic Hazard, and Loose Livestock incidents

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.cluster
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, fbeta_score
from sklearn.metrics import confusion_matrix, balanced_accuracy_score

Import data of clean features

In [2]:
# Location of the feature-engineered CSV that the next cell loads.
# NOTE(review): absolute path under one user's home directory — it will not resolve on
# another machine; consider a path relative to a configurable data directory.
file = '/Users/tshields/Documents/GitHub/Real-Time_Traffic_Incident_Reports/FeatureEngineering/feature_engineering4.csv'

In [3]:
# Load the CSV, using its first column as the DataFrame index, then preview the first rows.
df = pd.read_csv(file, index_col=0)
df.head()

Unnamed: 0,published_date,issue_reported,Latitude,Longitude,issue_timespan,geometry,day,hour,month,region,quadrant,dist_cntr_km
0,2018-06-13 06:35:59+00:00,Crash Service,30.283797,-97.741906,144.066667,POINT (-97.741906 30.283797),Wednesday,6,June,cnw,ne,1.849061
1,2018-06-13 10:15:36+00:00,Traffic Hazard,30.339593,-97.700963,64.45,POINT (-97.700963 30.339593),Wednesday,10,June,vne,ne,9.009002
2,2020-04-17 21:25:03+00:00,Crash Service,30.329455,-97.638105,30.0,POINT (-97.638105 30.329455),Friday,21,April,vce,ne,12.228301
3,2020-04-17 21:40:52+00:00,Traffic Hazard,30.202806,-97.760701,14.183333,POINT (-97.760701 30.202806),Friday,21,April,vsw,sw,7.357232
4,2020-04-17 21:00:55+00:00,Crash Service,30.184265,-97.687339,149.15,POINT (-97.68733899999999 30.184265),Friday,21,April,vse,se,10.66518


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Latitude,Longitude,issue_timespan,hour,dist_cntr_km
count,243114.0,243114.0,243114.0,243114.0,243114.0
mean,30.301716,-97.733273,72.641459,13.659287,11.826093
std,0.093666,0.086736,90.826579,7.055001,7.320069
min,30.004413,-98.226709,0.333333,0.0,0.007638
25%,30.232036,-97.776903,20.766667,9.0,6.040102
50%,30.294241,-97.73439,41.316667,15.0,10.96141
75%,30.373598,-97.687501,81.166667,19.0,16.95976
max,30.798642,-97.29332,480.0,23.0,60.669983


In [5]:
# Column dtypes, non-null counts and memory usage of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243114 entries, 0 to 243113
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   published_date  243114 non-null  object 
 1   issue_reported  243114 non-null  object 
 2   Latitude        243114 non-null  float64
 3   Longitude       243114 non-null  float64
 4   issue_timespan  243114 non-null  float64
 5   geometry        243114 non-null  object 
 6   day             243114 non-null  object 
 7   hour            243114 non-null  int64  
 8   month           243114 non-null  object 
 9   region          243114 non-null  object 
 10  quadrant        243114 non-null  object 
 11  dist_cntr_km    243114 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 24.1+ MB


In [6]:
# Distinct incident types present in the target column.
df['issue_reported'].unique()

array(['Crash Service', 'Traffic Hazard', 'Collision', 'Stalled Vehicle',
       'Loose Livestock', 'Traffic Impediment'], dtype=object)

In [7]:
# Number of rows per incident type, most frequent first.
df.issue_reported.value_counts()

Crash Service         93808
Traffic Hazard        93012
Collision             32536
Stalled Vehicle       11965
Loose Livestock        5914
Traffic Impediment     5879
Name: issue_reported, dtype: int64

In [8]:
# Drop every row whose incident type matches one of the excluded labels. The pattern is a
# regex alternation, so a label is excluded when it contains any of these substrings.
# na=True counts a missing label as a match, so such rows are dropped as well — the same
# outcome the previous `== False` comparison produced.
# .copy() makes the result an independent frame, so the column assignments in later cells
# do not raise SettingWithCopyWarning.
df = df[~df["issue_reported"].str.contains("Collision|Stalled Vehicle|Injury/Fatality|Traffic Impediment|Vehicle Fire", na=True)].copy()

In [9]:
# Confirm which incident types remain after filtering, and how many rows each has.
df.issue_reported.value_counts()

Crash Service      93808
Traffic Hazard     93012
Loose Livestock     5914
Name: issue_reported, dtype: int64

#  Use Label Encoder for categorical data we are trying to predict
Trying to predict the "issue_reported" involves transforming the column using LabelEncoder(). Then a quick check to ensure the numbers line up like before.

In [10]:
# Replace the text labels of the target column with integer codes.
# In the recorded output: 0 = Crash Service, 1 = Loose Livestock, 2 = Traffic Hazard.
# `le` stays available afterwards for mapping codes back to label names.
le = LabelEncoder()
le.fit(df['issue_reported'])
df['issue_reported'] = le.transform(df['issue_reported'])
df.head()

Unnamed: 0,published_date,issue_reported,Latitude,Longitude,issue_timespan,geometry,day,hour,month,region,quadrant,dist_cntr_km
0,2018-06-13 06:35:59+00:00,0,30.283797,-97.741906,144.066667,POINT (-97.741906 30.283797),Wednesday,6,June,cnw,ne,1.849061
1,2018-06-13 10:15:36+00:00,2,30.339593,-97.700963,64.45,POINT (-97.700963 30.339593),Wednesday,10,June,vne,ne,9.009002
2,2020-04-17 21:25:03+00:00,0,30.329455,-97.638105,30.0,POINT (-97.638105 30.329455),Friday,21,April,vce,ne,12.228301
3,2020-04-17 21:40:52+00:00,2,30.202806,-97.760701,14.183333,POINT (-97.760701 30.202806),Friday,21,April,vsw,sw,7.357232
4,2020-04-17 21:00:55+00:00,0,30.184265,-97.687339,149.15,POINT (-97.68733899999999 30.184265),Friday,21,April,vse,se,10.66518


In [11]:
# Re-check the frame after filtering and encoding: fewer rows, and the target column is now integer.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192734 entries, 0 to 243113
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   published_date  192734 non-null  object 
 1   issue_reported  192734 non-null  int64  
 2   Latitude        192734 non-null  float64
 3   Longitude       192734 non-null  float64
 4   issue_timespan  192734 non-null  float64
 5   geometry        192734 non-null  object 
 6   day             192734 non-null  object 
 7   hour            192734 non-null  int64  
 8   month           192734 non-null  object 
 9   region          192734 non-null  object 
 10  quadrant        192734 non-null  object 
 11  dist_cntr_km    192734 non-null  float64
dtypes: float64(4), int64(2), object(6)
memory usage: 19.1+ MB


In [12]:
# Distinct integer codes now stored in the target column.
df['issue_reported'].unique()

array([0, 2, 1])

In [13]:
# Row count per encoded class; these should match the per-label counts seen before encoding.
df['issue_reported'].value_counts()

0    93808
2    93012
1     5914
Name: issue_reported, dtype: int64

## Apply LabelEncoder to the categorical feature columns 'region', 'day', 'month' and 'quadrant'.

In [14]:
# Integer-encode each categorical feature with its own encoder, so every mapping
# (le2: region, le3: day, le4: month, le5: quadrant) stays recoverable afterwards.
le2 = LabelEncoder().fit(df['region'])
df['region'] = le2.transform(df['region'])

le3 = LabelEncoder().fit(df['day'])
df['day'] = le3.transform(df['day'])

le4 = LabelEncoder().fit(df['month'])
df['month'] = le4.transform(df['month'])

le5 = LabelEncoder().fit(df['quadrant'])
df['quadrant'] = le5.transform(df['quadrant'])

df.head()

Unnamed: 0,published_date,issue_reported,Latitude,Longitude,issue_timespan,geometry,day,hour,month,region,quadrant,dist_cntr_km
0,2018-06-13 06:35:59+00:00,0,30.283797,-97.741906,144.066667,POINT (-97.741906 30.283797),6,6,6,2,0,1.849061
1,2018-06-13 10:15:36+00:00,2,30.339593,-97.700963,64.45,POINT (-97.700963 30.339593),6,10,6,8,0,9.009002
2,2020-04-17 21:25:03+00:00,0,30.329455,-97.638105,30.0,POINT (-97.638105 30.329455),0,21,0,6,0,12.228301
3,2020-04-17 21:40:52+00:00,2,30.202806,-97.760701,14.183333,POINT (-97.760701 30.202806),0,21,0,11,3,7.357232
4,2020-04-17 21:00:55+00:00,0,30.184265,-97.687339,149.15,POINT (-97.68733899999999 30.184265),0,21,0,10,2,10.66518


## Scaling the Timespan and Distance-to-Center Features
First, extract each column as a NumPy array using .values

In [15]:
# Pull the two continuous features out of the frame as plain NumPy arrays.
series1 = df['issue_timespan'].values
series2 = df['dist_cntr_km'].values

Then reshape each array into a single-column 2D array, which is the shape the scaler expects

In [16]:
# Turn the timespan values into a single-column 2D array (one row per observation).
series1 = series1.reshape(-1,1)

In [17]:
# Turn the distance values into a single-column 2D array (one row per observation).
series2 = series2.reshape(-1,1)

In [18]:
# One StandardScaler instance; the cells below re-fit this same object several times.
scaler = StandardScaler()

Then create a new column for each of the scaled arrays.

In [19]:
# Standardize the timespan values and store them as a new column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split further down,
# so rows that later land in the test set contribute to the scaling statistics.
df['scaled_timespan'] = scaler.fit_transform(series1)

In [20]:
# Standardize the distance values and store them as a new column.
# fit_transform re-fits the shared scaler, so the statistics from its previous fit are overwritten.
# NOTE(review): fitted on all rows before the train/test split (same caveat as scaled_timespan).
df['scaled_dist_km'] = scaler.fit_transform(series2)

In [21]:

# Preview the frame with the two new scaled columns appended.
df.head()

Unnamed: 0,published_date,issue_reported,Latitude,Longitude,issue_timespan,geometry,day,hour,month,region,quadrant,dist_cntr_km,scaled_timespan,scaled_dist_km
0,2018-06-13 06:35:59+00:00,0,30.283797,-97.741906,144.066667,POINT (-97.741906 30.283797),6,6,6,2,0,1.849061,0.765269,-1.30895
1,2018-06-13 10:15:36+00:00,2,30.339593,-97.700963,64.45,POINT (-97.700963 30.339593),6,10,6,8,0,9.009002,-0.111498,-0.267119
2,2020-04-17 21:25:03+00:00,0,30.329455,-97.638105,30.0,POINT (-97.638105 30.329455),0,21,0,6,0,12.228301,-0.490873,0.201317
3,2020-04-17 21:40:52+00:00,2,30.202806,-97.760701,14.183333,POINT (-97.760701 30.202806),0,21,0,11,3,7.357232,-0.665052,-0.507465
4,2020-04-17 21:00:55+00:00,0,30.184265,-97.687339,149.15,POINT (-97.68733899999999 30.184265),0,21,0,10,2,10.66518,0.821249,-0.026131


## One Hot Encoding the Categorical Variables

In [22]:
# One-hot encode the integer-coded categorical columns; 'hour' is treated as categorical too,
# so each hour value gets its own indicator column. All other columns pass through unchanged.
dummies = pd.get_dummies(df, columns=['day','hour','month','region', 'quadrant'])

In [23]:
# Remove everything that must not be a model input: the target itself, the unscaled
# originals of the two scaled features, and the raw location / timestamp columns.
dummies = dummies.drop(
    [
        'issue_reported',
        'issue_timespan',
        'Latitude',
        'Longitude',
        'geometry',
        'published_date',
        'dist_cntr_km',
    ],
    axis=1,
)

In [24]:
# Preview the final feature matrix: two scaled columns plus the one-hot indicators.
dummies.head()

Unnamed: 0,scaled_timespan,scaled_dist_km,day_0,day_1,day_2,day_3,day_4,day_5,day_6,hour_0,...,region_6,region_7,region_8,region_9,region_10,region_11,quadrant_0,quadrant_1,quadrant_2,quadrant_3
0,0.765269,-1.30895,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,-0.111498,-0.267119,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
2,-0.490873,0.201317,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,-0.665052,-0.507465,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
4,0.821249,-0.026131,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0


In [25]:
# Feature matrix. This binds a second name to the same DataFrame object; it is not a copy.
X = dummies

In [26]:

# Target vector: the integer-encoded incident type.
y = df['issue_reported']

In [27]:
# (rows, feature columns) of the feature matrix.
X.shape

(192734, 61)

In [28]:
# Length of the target vector; it should equal the row count of X.
y.shape

(192734,)

## Train Test Split and Scale

In [29]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
# NOTE(review): the split is not stratified, although one class is far smaller than the
# other two — consider stratify=y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [30]:
# Standardize every feature column: fit on the training split only, then apply the same
# statistics to the test split. This re-fits the shared scaler once more, and the one-hot
# indicator columns are standardized along with the continuous ones.
# Both names are rebound to the scaler's output, so the original DataFrames are no longer
# reachable through them; re-running this cell alone would scale already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Gradient Boosting Accuracy and F1 Scores

In [31]:
# Fit a Gradient Boosting classifier on the training split and score it on the held-out split.
gbc = GradientBoostingClassifier(random_state = 1)
model_res = gbc.fit(X_train, y_train)
y_pred = model_res.predict(X_test)
y_pred_prob = model_res.predict_proba(X_test)
# Column 1 of the probability matrix, i.e. the second class in gbc.classes_.
# Not used by the metrics below.
lr_probs = y_pred_prob[:,1]
ac = accuracy_score(y_test, y_pred)

# 'weighted' averages the per-class F1 scores in proportion to each class's support.
f1 = f1_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)
# Balanced accuracy is the mean of per-class recall, so the small class counts as much as the large ones.
ba = balanced_accuracy_score(y_test, y_pred)

# The labels name the model that was actually fitted (they previously read "Naive Bayes").
print('Gradient Boosting: Accuracy=%.3f' % (ac))

print('Gradient Boosting: f1-score=%.3f' % (f1))

print('Gradient Boosting: Balanced Accuracy=%.3f' % (ba))

Naive Bayes: Accuracy=0.687
Naive Bayes: f1-score=0.686
Naive Bayes: Balanced Accuracy=0.605


def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5, target='Disbursed'):
    """Fit a classifier on a training frame and print a short performance report.

    Parameters
    ----------
    alg : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``; it also needs
        ``feature_importances_`` when ``printFeatureImportance`` is true.
    dtrain : pandas.DataFrame
        Training data holding both the predictor columns and the target column.
    predictors : list of str
        Names of the columns in ``dtrain`` used as model inputs.
    performCV : bool, default True
        Also compute and report cross-validated AUC.
    printFeatureImportance : bool, default True
        Also draw a bar chart of the fitted model's feature importances.
    cv_folds : int, default 5
        Number of cross-validation folds.
    target : str, default 'Disbursed'
        Name of the target column in ``dtrain``. The default keeps the column name
        that was previously hard-coded.

    Returns
    -------
    None
        The report is printed and, optionally, plotted. ``alg`` is fitted in place.
    """
    features = dtrain[predictors]
    labels = dtrain[target]

    # Fit the algorithm on the data
    alg.fit(features, labels)

    # Predict training set
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)

    # AUC needs the positive-class column for a binary problem, but the full
    # probability matrix with a one-vs-rest scheme when there are more than two classes.
    if dtrain_predprob.shape[1] == 2:
        train_auc = roc_auc_score(labels, dtrain_predprob[:, 1])
        cv_scoring = 'roc_auc'
    else:
        train_auc = roc_auc_score(labels, dtrain_predprob, multi_class='ovr')
        cv_scoring = 'roc_auc_ovr'

    # Perform cross-validation with the cross_val_score imported at the top of the
    # notebook (the old sklearn.cross_validation module is never imported here).
    if performCV:
        cv_score = cross_val_score(alg, features, labels, cv=cv_folds, scoring=cv_scoring)

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % train_auc)

    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score)))

    # Plot feature importance, largest first
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')