# Predicting Revenue from Parking Citations in Baltimore
Capstone Project for Springboard Data Science Bootcamp

Tamara Monge

### Section 4: Prediction


In [1]:
# Import standard libraries
# analysis
import pandas as pd
import numpy as np
from datetime import datetime, date
from time import time
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# plotting
import matplotlib.pyplot as plt
%matplotlib inline 

# supervised learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import scale
from sklearn.externals import joblib
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score   
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier


In [2]:
# Import prepared feature array; the `date` column is dropped and not used as a feature.
# NOTE(review): the features export is dated 2018-05-25 while the target export is dated
# 2018-05-22 (a 2018-05-22 features export was also referenced here originally) --
# confirm that both files describe the same citations in the same row order.
X = pd.read_csv('persistence/features_2018-05-25.csv').drop(['date'], axis=1)

# Import prepared target vector (headerless CSV; the labels are read from the second column)
y = pd.read_csv('persistence/target_2018-05-22.csv', header=None).iloc[:,1]

# Fail fast with a clear message if the two exports are out of sync, rather than
# letting a later train_test_split call fail on inconsistent lengths.
if len(X) != len(y):
    raise ValueError('Feature array has %d rows but target vector has %d rows; '
                     'the persisted files are out of sync.' % (len(X), len(y)))

# Baseline: share of citations that were paid (target == 1)
print('Guessing all ones (all citations were paid) results in a ', round(100*y.sum()/y.count()),'% success rate.')
print('Can models do better?')

Guessing all ones (all citations were paid) results in a  67.0 % success rate.
Can models do better?


 We must first normalize the data and split it into training and testing sets.

In [3]:
# Standardise every feature column.
# NOTE(review): scale() is applied to the full dataset, i.e. before the splits below, so
# held-out rows contribute to the scaling statistics. The persisted estimators loaded
# later in this notebook were presumably trained on this scaling -- confirm before changing it.
Xscaled = scale(X)

# Both splitting levels share the same settings: keep 70%, hold out 30%, fixed seed.
split_opts = dict(train_size=0.7, random_state=42)

# Level 1: full dataset -> training set / test set
Xtraining, Xtest, ytraining, ytest = train_test_split(Xscaled, y, **split_opts)

# Level 2: training set -> train set / evaluation set
Xtrain, Xeval, ytrain, yeval = train_test_split(Xtraining, ytraining, **split_opts)

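Let's begin by building a dummy classifier to use as a benchmark for the other classifiers. It is fit on the `fine` feature alone, but note that scikit-learn's `DummyClassifier` ignores the feature values: with the `stratified` strategy it draws its predictions at random according to the class proportions seen in the training labels.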

In [4]:
# Load the single-feature array used for the dummy benchmark
# (headerless CSV; the second column is kept as a one-column DataFrame holding `fine`)
fine_csv = pd.read_csv('persistence/fine_2018-05-25.csv', header=None)
F = fine_csv.iloc[:,1:2]

# Standardise it, the same way as the full feature array
Fscaled = scale(F)

# Same two-level 70/30 split, with the same seed, as for the full feature array
fine_split_opts = dict(train_size=0.7, random_state=42)

# Level 1: full dataset -> training set / test set
F_Xtraining, F_Xtest, F_ytraining, F_ytest = train_test_split(Fscaled, y, **fine_split_opts)

# Level 2: training set -> train set / evaluation set
F_Xtrain, F_Xeval, F_ytrain, F_yeval = train_test_split(F_Xtraining, F_ytraining, **fine_split_opts)

In [5]:
# Build dummy classifier.
# The strategy is pinned explicitly: the recorded run used 'stratified' (the default at
# the time), but newer scikit-learn releases default to 'prior', which would silently
# change this benchmark. 'stratified' draws random predictions that follow the class
# distribution of the training labels; the feature values themselves are ignored.
dummy = DummyClassifier(strategy='stratified', random_state=42)

# Train (only the class distribution of F_ytrain is learned)
dummy.fit(F_Xtrain, F_ytrain)

# Predict on the eval set
ypred_dummy = dummy.predict(F_Xeval)

# Performance Metrics
print('Dummy Classifier')
print('=====================')
print('Accuracy:',accuracy_score(F_yeval, ypred_dummy))
print('Precision:',precision_score(F_yeval, ypred_dummy))
print('Recall:', recall_score(F_yeval, ypred_dummy))
print('F1:', f1_score(F_yeval, ypred_dummy))

DummyClassifier(constant=None, random_state=42, strategy='stratified')

Dummy Classifier
Accuracy: 0.557796048107
Precision: 0.669984546859
Recall: 0.671625300423
F1: 0.670803920342


Here we see the dummy classifier predicts with 56% accuracy, which is worse than the 67% baseline obtained by simply guessing that every citation is paid.

Now let's move on to more complex classifiers.

### A) Logistic Regression

In [6]:
# #### Uncomment this cell to train gridsearchCV estimator from scratch ####
# # Range of parameters to gridsearch
# logreg_params = {'C': [0.01, 0.1, 1, 10, 100]}

# # Build GridSearch Cross Validation estimator to find best hyperparameters
# logreg_gridsearch = GridSearchCV(LogisticRegression(), logreg_params, cv=5, scoring=make_scorer(accuracy_score))

# # Train estimator
# start = time()
# logreg_gridsearch.fit(Xtrain, ytrain)  
# print('Training took ', time()-start, 'seconds.')

# # Save the trained estimator 
# filename = 'persistence/logreg_gridsearch_' + str(date.today()) + '.sav'
# joblib.dump(logreg_gridsearch, filename)

In [7]:
# #### Uncomment this cell to load trained gridsearch estimator from disk ####
# filename = 'persistence/logreg_gridsearch_2018-05-25.sav'
# logreg_gridsearch = joblib.load(filename)
# print('Best parameters:',logreg_gridsearch.best_params_)
# print('Best score:', logreg_gridsearch.best_score_)

In [8]:
# #### Uncomment this cell to train Logistic Regression estimator with best hyperparameter, so can evaluate performance metrics ####
# Build Logistic Regression estimator
# logreg = LogisticRegression(C=1)

# # Train estimator
# logreg.fit(Xtrain, ytrain)

# # Save trained estimator
# filename = 'persistence/logreg_' + str(date.today()) + '.sav'
# joblib.dump(logreg, filename)

In [9]:
# Load the previously trained Logistic Regression estimator from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
filename = 'persistence/logreg_2018-05-31.sav'
logreg = joblib.load(filename)

# Predict on the eval set
ypred_logreg = logreg.predict(Xeval)

# Report the eval-set performance metrics
print('Logistic Regression')
print('====================')
for metric_label, metric_fn in [('Accuracy:', accuracy_score), ('Precision:', precision_score),
                                ('Recall:', recall_score), ('F1:', f1_score)]:
    print(metric_label, metric_fn(yeval, ypred_logreg))

Logistic Regression
Accuracy: 0.748231123065
Precision: 0.788936304726
Recall: 0.852847037919
F1: 0.81964772466


### B) Linear SVC

In [10]:
# #### Uncomment this cell to train gridsearchCV estimator from scratch ####
# lsvc_params =  {"C":[0.01, 0.1, 1, 10, 100]}
# lsvc_gridsearch = GridSearchCV(LinearSVC(), lsvc_params, n_jobs=4, cv=5, scoring=make_scorer(accuracy_score))
# start = time()
# lsvc_gridsearch.fit(Xtrain, ytrain)  
# print('Training took ', time()-start, 'seconds.')

# # Save the trained model 
# filename = 'persistence/lsvc_gridsearch_'+ str(date.today()) + '.sav'
# joblib.dump(lsvc_gridsearch, filename)

In [11]:
# #### Uncomment this cell to load trained gridsearch estimator from disk ####
# filename = 'persistence/lsvc_gridsearch_2018-05-13.sav'
# lsvc_gridsearch = joblib.load(filename)
# print('Best parameters:', lsvc_gridsearch.best_params_)
# print('Best score:', lsvc_gridsearch.best_score_)

In [12]:
# #### Uncomment this cell to train Linear SVC estimator with best hyperparameter, so can evaluate performance metrics ####
# Build Linear SVC estimator
# lsvc = LinearSVC(C=0.1)

# # Train estimator
# lsvc.fit(Xtrain, ytrain)

# # Save trained estimator
# filename = 'persistence/lsvc_' + str(date.today()) + '.sav'
# joblib.dump(lsvc, filename)

In [13]:
# Load the previously trained Linear SVC estimator from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
filename = 'persistence/lsvc_2018-05-31.sav'
lsvc = joblib.load(filename)

# Predict on the eval set
ypred_lsvc = lsvc.predict(Xeval)

# Report the eval-set performance metrics
print('Linear SVC')
print('============')
for metric_label, metric_fn in [('Accuracy:', accuracy_score), ('Precision:', precision_score),
                                ('Recall:', recall_score), ('F1:', f1_score)]:
    print(metric_label, metric_fn(yeval, ypred_lsvc))

Linear SVC
Accuracy: 0.745610281645
Precision: 0.786552536384
Recall: 0.851981365257
F1: 0.817960620923


### C) SVC with RBF Kernel


Fitting the RBF-SVC on the entire feature array is untenable (it didn't finish in 14 days). 
To work around this, let's select a manageable number (20-30) of the most important features based on the coefficients returned from the logistic regression classifier. 

In [14]:
# Pair every feature name with its fitted Logistic Regression coefficient
coef_by_feature = {'features':np.array(X.columns), 'coefficients':logreg.coef_[0,:]}
df_features = pd.DataFrame(coef_by_feature)
df_features.sort_values('coefficients', ascending=False)

# Count the features whose |coefficient| reaches each candidate threshold,
# to find a cut-off that leaves a manageable number of features (20-30)
abs_coef = df_features['coefficients'].abs()
(abs_coef >= 0.5).sum()
(abs_coef >= 0.2).sum()
(abs_coef >= 0.1).sum()
(abs_coef >= 0.07).sum()
(abs_coef >= 0.05).sum()

Unnamed: 0,coefficients,features
318,0.720146,desc_ALL OTHER PARKING METER VIOLATIONS
336,0.513078,desc_NO STOPPING/STANDING NOT TOW-AWAY ZONE
326,0.452920,desc_FIXED SPEED CAMERA
337,0.444737,desc_NO STOPPING/STANDING TOW AWAY ZONE
341,0.439693,desc_RESIDENTIAL PARKING PERMIT ONLY
332,0.406354,desc_NO STOP/PARK STREET CLEANING
319,0.284872,desc_ALL OTHER STOPPING OR PARKING VIOLATIONS
339,0.229094,desc_OBSTRUCT/IMPEDING MOVEMENT OF PEDESTRIAN
328,0.218887,desc_IN TRANSIT ZONE/STOP
1,0.215252,instate


4

17

29

37

44

It seems like a threshold of >= 0.1 will suffice.

In [15]:
# Keep the features whose |coefficient| is at least 0.1 (29 features for this dataset)
is_selected = df_features['coefficients'].abs() >= 0.1
df_select29 = df_features[is_selected]
df_select29

# Restrict the feature array to the selected columns
X29 = X[df_select29['features']]
X29.shape

# Standardise the reduced feature array
X29scaled = scale(X29)

# Same two-level 70/30 split, with the same seed, as for the full feature array
split29_opts = dict(train_size=0.7, random_state=42)

# Level 1: full dataset -> training set / test set
X29training, X29test, y29training, y29test = train_test_split(X29scaled, y, **split29_opts)

# Level 2: training set -> train set / evaluation set
X29train, X29eval, y29train, y29eval = train_test_split(X29training, y29training, **split29_opts)

Unnamed: 0,coefficients,features
0,-0.129364,fine
1,0.215252,instate
281,-0.164091,make_TRA
318,0.720146,desc_ALL OTHER PARKING METER VIOLATIONS
319,0.284872,desc_ALL OTHER STOPPING OR PARKING VIOLATIONS
324,0.119293,desc_EXPIRED TAGS
326,0.45292,desc_FIXED SPEED CAMERA
328,0.218887,desc_IN TRANSIT ZONE/STOP
330,0.162749,desc_LESS THAN 15 FEET FROM FIRE HYDRANT
331,0.164994,desc_NO STOP/PARK HANDICAP


(623208, 29)

Now that we have the pared-down feature array, let's train the svc.

In [16]:
# #### Uncomment this cell to train gridsearchCV estimator from scratch ####
# svc_params =  {"C":[0.01, 0.1, 1, 10, 100], "gamma":[0.25, 0.5, 0.75]}
# svc29_gridsearch = GridSearchCV(SVC(), svc_params, n_jobs=4, cv=5, scoring=make_scorer(accuracy_score))

# # Train
# start = time()
# svc29_gridsearch.fit(X29train, y29train)
# print(time()-start)
# print('Training took ', time()-start, 'seconds.')

# # Save the trained estimator 
# filename = 'persistence/svc29_gridsearch_'+ str(date.today()) + '.sav'
# joblib.dump(svc29_gridsearch, filename)

In [17]:
# #### Uncomment this cell to load trained gridsearch estimator from disk ####
# filename = 'persistence/svc29_gridsearch_2018-05-17.sav'
# svc29_gridsearch = joblib.load(filename)
# print('Best parameters:', svc29_gridsearch.best_params_)
# print('Best score:', svc29_gridsearch.best_score_)

In [18]:
# #### Uncomment this cell to train RBF SVC estimator with best hyperparameter, so can evaluate performance metrics ####
# # Build
# svc29 = SVC(C=1, gamma=0.25)

# # Train
# svc29.fit(X29train, y29train)

# # Save trained estimator
# filename = 'persistence/svc29_' + str(date.today()) + '.sav'
# joblib.dump(svc29, filename)

In [19]:
# Load the previously trained RBF SVC estimator (fit on the 29 selected features) from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
filename = 'persistence/svc29_2018-06-01.sav'
svc29 = joblib.load(filename)

# Predict on the eval set of the reduced feature array
ypred_svc29 = svc29.predict(X29eval)

# Report the eval-set performance metrics
print('RBF SVC')
print('===========')
for metric_label, metric_fn in [('Accuracy:', accuracy_score), ('Precision:', precision_score),
                                ('Recall:', recall_score), ('F1:', f1_score)]:
    print(metric_label, metric_fn(y29eval, ypred_svc29))

RBF SVC
Accuracy: 0.752028668796
Precision: 0.795083715474
Recall: 0.849213490825
F1: 0.821257635091


###  D) Decision Tree

In [20]:
# #### Uncomment this cell to train gridsearchCV estimator from scratch ####
# dtc_params =  {"max_depth":[5, 10, 30, 50, 100]}
# dtc_gridsearch = GridSearchCV(DecisionTreeClassifier(), dtc_params, n_jobs=4, cv=5, scoring=make_scorer(accuracy_score))
# start = time()
# dtc_gridsearch.fit(Xtrain, ytrain)  
# print('Training took ', time()-start, 'seconds.')

# # Save the trained estimator 
# filename = 'persistence/dtc_gridsearch_'+ str(date.today()) + '.sav'
# joblib.dump(dtc_gridsearch, filename)

In [21]:
# # #### Uncomment this cell to load trained gridsearch estimator from disk ####
# filename = 'persistence/dtc_gridsearch_2018-05-20.sav'
# dtc_gridsearch = joblib.load(filename)
# print('Best parameters:', dtc_gridsearch.best_params_)
# print('Best score:', dtc_gridsearch.best_score_)

In [22]:
# #### Uncomment this cell to train Decision Tree estimator with best hyperparameter, so can evaluate performance metrics ####
# # Build
# dtc = DecisionTreeClassifier(max_depth=10)

# # Train
# dtc.fit(Xtrain, ytrain)

# # Save trained estimator
# filename = 'persistence/dtc_' + str(date.today()) + '.sav'
# joblib.dump(dtc, filename)

In [23]:
# Load the previously trained Decision Tree estimator from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
filename = 'persistence/dtc_2018-05-31.sav'
dtc = joblib.load(filename)

# Predict on the eval set
ypred_dtc = dtc.predict(Xeval)

# Report the eval-set performance metrics
print('Decision Tree')
print('==============')
for metric_label, metric_fn in [('Accuracy:', accuracy_score), ('Precision:', precision_score),
                                ('Recall:', recall_score), ('F1:', f1_score)]:
    print(metric_label, metric_fn(yeval, ypred_dtc))

Decision Tree
Accuracy: 0.754886379266
Precision: 0.773924244808
Recall: 0.896483774333
F1: 0.830707851115


### E) Random Forest

In [24]:
# #### Uncomment this cell to train gridsearchCV estimator from scratch ####
# rfc_params =  {"max_depth":[5, 10, 30, 50, 100], "n_estimators":[5, 10, 25, 50], "min_samples_leaf":[1, 2, 4, 7, 10]}
# rfc_gridsearch = GridSearchCV(RandomForestClassifier(random_state=42), rfc_params, n_jobs=4, cv=5, scoring=make_scorer(accuracy_score))
# start = time()
# rfc_gridsearch.fit(Xtrain, ytrain)  
# print('Training took ', time()-start, 'seconds.')

# # Save the trained estimator
# filename = 'persistence/rfc_gridsearch_'+ str(date.today()) + '.sav'
# joblib.dump(rfc_gridsearch, filename)

In [25]:
# # #### Uncomment this cell to load trained gridsearch estimator from disk ####
# filename = 'persistence/rfc_gridsearch_2018-05-20.sav'
# rfc_gridsearch = joblib.load(filename)
# print('Best parameters:', rfc_gridsearch.best_params_)
# print('Best score:', rfc_gridsearch.best_score_)

In [26]:
# #### Uncomment this cell to train Random Forest estimator with best hyperparameter, so can evaluate performance metrics ####
# Build
# rfc = RandomForestClassifier(random_state=42, max_depth=100, min_samples_leaf=2, n_estimators=50)

# # Train
# rfc.fit(Xtrain, ytrain)

# # Save trained estimator
# filename = 'persistence/rfc_' + str(date.today()) + '.sav'
# joblib.dump(rfc, filename)

In [27]:
# Load the previously trained Random Forest estimator from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
filename = 'persistence/rfc_2018-05-31.sav'
rfc = joblib.load(filename)

# Predict on the eval set
ypred_rfc = rfc.predict(Xeval)

# Report the eval-set performance metrics
print('Random Forest')
print('==============')
for metric_label, metric_fn in [('Accuracy:', accuracy_score), ('Precision:', precision_score),
                                ('Recall:', recall_score), ('F1:', f1_score)]:
    print(metric_label, metric_fn(yeval, ypred_rfc))

Random Forest
Accuracy: 0.766057429283
Precision: 0.782680061702
Recall: 0.901598077295
F1: 0.837940981871


### F) Naive Bayes


In [28]:
# #### Uncomment this cell to train Naive Bayes estimator from scratch ####
# # Build
# gnb = GaussianNB()

# # Train
# start = time()
# gnb.fit(Xtrain, ytrain)  
# print('Training took ', time()-start, 'seconds.')

# # Save the trained model 
# filename = 'persistence/gnb_'+ str(date.today()) + '.sav'
# joblib.dump(gnb, filename)

In [29]:
# Load the previously trained Naive Bayes estimator from disk.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
filename = 'persistence/gnb_2018-05-31.sav'
gnb = joblib.load(filename)

# Predict on the eval set
ypred_gnb = gnb.predict(Xeval)

# Report the eval-set performance metrics
print('Naive Bayes')
print('==============')
for metric_label, metric_fn in [('Accuracy:', accuracy_score), ('Precision:', precision_score),
                                ('Recall:', recall_score), ('F1:', f1_score)]:
    print(metric_label, metric_fn(yeval, ypred_gnb))

Naive Bayes
Accuracy: 0.332396045051
Precision: 0.767471410419
Recall: 0.00687981957559
F1: 0.01363738993


### Section 5. Model Evaluation


In [31]:
# Collect each classifier's accuracy on the eval set in a series, ranked best to worst.
# (These are eval-set accuracies of the final estimators, not GridSearchCV best_score_ values.)
# Each prediction vector is scored against the eval labels produced by its own split:
# the dummy classifier and the RBF-SVC were split separately (F_yeval, y29eval), so they
# are paired with those label vectors rather than relying on the splits being identical.
eval_results = [
    ('Dummy Classifier', F_yeval, ypred_dummy),
    ('Logistic Regression', yeval, ypred_logreg),
    ('Linear SVC', yeval, ypred_lsvc),
    ('RBF-SVC', y29eval, ypred_svc29),
    ('Decision Tree', yeval, ypred_dtc),
    ('Random Forest', yeval, ypred_rfc),
    ('Naive Bayes', yeval, ypred_gnb),
]
model_names = [name for name, y_true, y_pred in eval_results]
model_scores = [accuracy_score(y_true, y_pred) for name, y_true, y_pred in eval_results]
model_eval = pd.Series(data = model_scores, index = model_names).sort_values(ascending=False)
model_eval

Random Forest          0.766057
Decision Tree          0.754886
RBF-SVC                0.752029
Logistic Regression    0.748231
Linear SVC             0.745610
Dummy Classifier       0.557796
Naive Bayes            0.332396
dtype: float64

The best classifier, based on accuracy, is the Random Forest. 

### Section 6. Model Results

Now let's evaluate the Random Forest on the test data to see how well we can expect it to generalize to unseen data.

In [32]:
# Predict on the held-out test set and time the call
start = time()
ypred_rfc_finaltest = rfc.predict(Xtest)
elapsed = time()-start
print('Predicting took ', elapsed, 'seconds.')
print('')

# Scalar performance metrics on the test set
print('Random Forest')
print('====================')
for metric_label, metric_fn in [('Accuracy:', accuracy_score), ('Precision:', precision_score),
                                ('Recall:', recall_score), ('F1:', f1_score)]:
    print(metric_label, metric_fn(ytest, ypred_rfc_finaltest))

# Per-class breakdowns on the test set
for report_title, report_fn in [('Confusion Matrix:', confusion_matrix),
                                ('Classification Report:', classification_report)]:
    print(report_title)
    print(report_fn(ytest, ypred_rfc_finaltest))

Predicting took  4.240575075149536 seconds.

Random Forest
Accuracy: 0.767916646609
Precision: 0.785720221607
Recall: 0.901125433852
F1: 0.839475115332
Confusion Matrix:
[[ 30114  30942]
 [ 12449 113458]]
Classification Report:
             precision    recall  f1-score   support

          0       0.71      0.49      0.58     61056
          1       0.79      0.90      0.84    125907

avg / total       0.76      0.77      0.76    186963



**Based on the test metrics above, we can expect the random forest classifier to predict which citations will be paid with 77% accuracy. When it predicts a ticket will be paid, that ticket is in fact paid 79% of the time. Additionally, the random forest correctly anticipates 90% of all paid tickets.**

Finally, let's see which features of a citation are most important in determining whether it will be paid.

In [33]:
# Pair every feature name with its Random Forest importance, most important first
imp_feats = (
    pd.DataFrame({'features':np.array(X.columns), 'importance':rfc.feature_importances_})
    .sort_values('importance', ascending=False)
)

In [34]:
# Display the ten features with the highest Random Forest importance
# (imp_feats was sorted by importance in descending order in the previous cell)
imp_feats.head(10)

Unnamed: 0,features,importance
344,yr_2017,0.136559
326,desc_FIXED SPEED CAMERA,0.075206
343,yr_2016,0.070462
354,mo_11,0.048725
0,fine,0.039258
324,desc_EXPIRED TAGS,0.032805
352,mo_9,0.029025
410,quad_SOUTHEAST,0.024903
318,desc_ALL OTHER PARKING METER VIOLATIONS,0.023458
1,instate,0.020739


According to the random forest model, the most important features for determining whether a citation will be paid are the **year** of the citation, the **type of violation** that occurred, the **month** of the citation, and the **fine**.
