# Predicting Revenue from Parking Citations in Baltimore
Capstone Project for Springboard Data Science Bootcamp

Tamara Monge

### Section 4: Prediction


In [1]:
# Import standard libraries
# analysis
import pandas as pd
import numpy as np
from datetime import datetime, date
from time import time
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

# plotting
import matplotlib.pyplot as plt
%matplotlib inline 

# supervised learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import scale
from sklearn.externals import joblib
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score   
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Import prepared feature array
X = pd.read_csv('persistence/features_2018-05-25.csv') #('persistence/features_2018-05-22.csv')
X.drop(['date'], axis=1, inplace=True)

# Import prepared target vector
y = pd.read_csv('persistence/target_2018-05-22.csv', header=None).iloc[:,1]

print('Guessing all ones (all citations were paid) results in a ', round(100*y.sum()/y.count()),'% success rate.')
print('Can models do better?')

Guessing all ones (all citations were paid) results in a  67.0 % success rate.
Can models do better?


**Let's start with a dummy classifier that predicts using only the `fine` feature.**

In [4]:
# Import prepared dummy feature array (contains only `fine`)
F = pd.read_csv('persistence/fine_' + str(date.today()) + '.csv', header=None).iloc[:,1:2]
F.shape

# Scale it and split it
Fscaled = scale(F)
Ftrain, Ftest, ytrain, ytest = train_test_split(Fscaled, y, train_size=0.7, random_state=42)

(623208, 1)

In [5]:
# Build dummy classifier
dummy = DecisionTreeClassifier() #LogisticRegression(C = 1)
# Fit and test 
dummy.fit(Ftrain, ytrain).score(Ftest, ytest)

0.71368666527601721

Here we see the dummy classifier predicts with 71.4% accuracy, which is better than the null hypothesis.

Now let's move on to more complex classifiers. We must first normalize the data and split it into training and testing sets.

In [6]:
# Normalize/scale data
Xscaled = scale(X)

# Split data for training and testing 70%/30%
Xtrain, Xtest, ytrain, ytest = train_test_split(Xscaled, y, train_size=0.7, random_state=42)


### A) Logistic Regression

In [10]:
### Uncomment this section to train model from scratch ####
# GridSearch Cross Validation to find best hyperparameters
logreg_params = {'C': [0.01, 0.1, 1, 10, 100]}
logreg = GridSearchCV(LogisticRegression(), logreg_params, cv=5, scoring=make_scorer(accuracy_score))
start = time()
logreg.fit(Xtrain, ytrain)  
print('Training took ', time()-start, 'seconds.')
print('Best parameters:',logreg.best_params_)
print('Best score:', logreg.best_score_)

# Save the trained model 
filename = 'persistence/logreg_' + str(date.today()) + '.sav'
joblib.dump(logreg, filename)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True,
       scoring=make_scorer(accuracy_score), verbose=0)

Training took  1708.6171028614044 seconds.
Best parameters: {'C': 1}
Best score: 0.748036080643


['persistence/logreg_2018-05-25.sav']

In [None]:
# # Load trained model from disk
# filename = 'persistence/logreg_2018-05-13.sav'
# logreg = joblib.load(filename)

### B) Linear SVC

In [12]:
### Uncomment this section to train model from scratch ####
# GridSearch Cross Validation to find best hyperparameters
lsvc_params =  {"C":[0.01, 0.1, 1, 10, 100]}
lsvc = GridSearchCV(LinearSVC(), lsvc_params, n_jobs=4, cv=5, scoring=make_scorer(accuracy_score))
start = time()
lsvc.fit(Xtrain, ytrain)  
print('Training took ', time()-start, 'seconds.')
print('Best parameters:', lsvc.best_params_)
print('Best score:', lsvc.best_score_)

# Save the trained model 
filename = 'persistence/lsvc_'+ str(date.today()) + '.sav'
joblib.dump(lsvc, filename)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'C': [0.01, 0.1, 1, 10, 100]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True,
       scoring=make_scorer(accuracy_score), verbose=0)

Training took  4203.443771839142 seconds.
Best parameters: {'C': 0.01}
Best score: 0.745615422526


['persistence/lsvc_2018-05-25.sav']

In [None]:
# # Load trained model from disk
# filename = 'persistence/lsvc_2018-05-13.sav'
# lsvc = joblib.load(filename)

### C) SVC with RBF Kernel


Fitting the RBF-SVC on the entire feature array is untennable (didn't finish in 14 days). 
To work around this, let's select a manageable number (20-30) of the most important features based on the coefficients returned from the logistic regression classifier. 

In [14]:
# Train the logreg    
clf = LogisticRegression(C=1)
clf.fit(Xtrain, ytrain)

# Put the coefficients in a dataframe
df_features = pd.DataFrame({'features':np.array(X.columns), 'coefficients':clf.coef_[0,:]})
df_features.sort_values('coefficients', ascending=False)

# Determine which threshold to use to get a manageable number of features (20-30)
(abs(df_features.coefficients) >= 0.5).sum()
(abs(df_features.coefficients) >= 0.2).sum()
(abs(df_features.coefficients) >= 0.1).sum()
(abs(df_features.coefficients) >= 0.07).sum()
(abs(df_features.coefficients) >= 0.05).sum()

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Unnamed: 0,coefficients,features
318,0.595356,desc_ALL OTHER PARKING METER VIOLATIONS
336,0.422591,desc_NO STOPPING/STANDING NOT TOW-AWAY ZONE
337,0.368754,desc_NO STOPPING/STANDING TOW AWAY ZONE
341,0.368252,desc_RESIDENTIAL PARKING PERMIT ONLY
326,0.342040,desc_FIXED SPEED CAMERA
332,0.317259,desc_NO STOP/PARK STREET CLEANING
319,0.230306,desc_ALL OTHER STOPPING OR PARKING VIOLATIONS
1,0.214621,instate
339,0.195964,desc_OBSTRUCT/IMPEDING MOVEMENT OF PEDESTRIAN
328,0.187244,desc_IN TRANSIT ZONE/STOP


3

15

28

33

39

It seems like a threshold of >= 0.2 will suffice.

In [16]:
# Identify the top 28 features from LogReg #21
df_select28 = df_features[abs(df_features.coefficients) >= 0.1] #>=0.2
df_select28

# Extract only the 28 identified features
X28 = X[df_select28.features]
X28.shape

# Normalize/scale 
X28scaled = scale(X28)

# Split data for training-testing 70%-30%
X28train, X28test, y28train, y28test = train_test_split(X28scaled, y, train_size=0.7, random_state=42)

Unnamed: 0,coefficients,features
0,-0.172743,fine
1,0.214621,instate
280,0.100978,make_TOY
281,-0.161277,make_TRA
318,0.595356,desc_ALL OTHER PARKING METER VIOLATIONS
319,0.230306,desc_ALL OTHER STOPPING OR PARKING VIOLATIONS
326,0.34204,desc_FIXED SPEED CAMERA
328,0.187244,desc_IN TRANSIT ZONE/STOP
330,0.137461,desc_LESS THAN 15 FEET FROM FIRE HYDRANT
331,0.177881,desc_NO STOP/PARK HANDICAP


(623208, 28)

Now that we have the pared-down feature array, let's train the svc.

In [17]:
### Uncomment this section to train model from scratch ####
# GridSearch Cross Validation to find best hyperparameters
svc_params =  {"C":[0.01, 0.1, 1, 10, 100], "gamma":[0.25, 0.5, 0.75]}
svc28 = GridSearchCV(SVC(), svc_params, n_jobs=4, cv=5, scoring=make_scorer(accuracy_score))
start = time()
svc28.fit(X28train, y28train)
print(time()-start)
print('Training took ', time()-start, 'seconds.')
print('Best parameters:', svc28.best_params_)
print('Best score:', svc28.best_score_)

# Save the trained model 
filename = 'persistence/svc28_'+ str(date.today()) + '.sav'
joblib.dump(svc28, filename)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'C': [0.01, 0.1, 1, 10, 100], 'gamma': [0.25, 0.5, 0.75]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(accuracy_score), verbose=0)

227735.89946818352
Training took  227735.89978194237 seconds.
Best parameters: {'C': 1, 'gamma': 0.25}
Best score: 0.752675675366


['persistence/svc28_2018-05-28.sav']

In [None]:
# # Load trained model from disk
# filename = 'persistence/svc21_2018-05-17.sav'
# svc21 = joblib.load(filename)

###  D) Decision Tree

In [7]:
### Uncomment this section to train model from scratch ####
# GridSearch Cross Validation to find best hyperparameters
dtc_params =  {"max_depth":[5, 10, 30, 50, 100]}
dtc = GridSearchCV(DecisionTreeClassifier(), dtc_params, n_jobs=4, cv=5, scoring=make_scorer(accuracy_score))
start = time()
dtc.fit(Xtrain, ytrain)  
print('Training took ', time()-start, 'seconds.')
print('Best parameters:', dtc.best_params_)
print('Best score:', dtc.best_score_)

# Save the trained model 
filename = 'persistence/dtc_'+ str(date.today()) + '.sav'
joblib.dump(dtc, filename)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'max_depth': [5, 10, 30, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(accuracy_score), verbose=0)

Training took  273.9086368083954 seconds.
Best parameters: {'max_depth': 10}
Best score: 0.755270547513


['persistence/dtc_2018-05-25.sav']

In [None]:
# # Load trained model from disk
# filename = 'persistence/dtc_2018-05-20.sav'
# dtc = joblib.load(filename)

### E) Random Forest

In [13]:
### Uncomment this section to train model from scratch ####
# GridSearch Cross Validation to find best hyperparameters
rfc_params =  {"max_depth":[5, 10, 30, 50, 100], "n_estimators":[5, 10, 25, 50], "min_samples_leaf":[1, 2, 4, 7, 10]}
# (max_depth=10, n_estimators=10, max_features=100)
rfc = GridSearchCV(RandomForestClassifier(random_state=42), rfc_params, n_jobs=4, cv=5, scoring=make_scorer(accuracy_score))
start = time()
rfc.fit(Xtrain, ytrain)  
print('Training took ', time()-start, 'seconds.')
print('Best parameters:', rfc.best_params_)
print('Best score:', rfc.best_score_)

# Save the trained model
filename = 'persistence/rfc_'+ str(date.today()) + '.sav'
joblib.dump(rfc, filename)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'max_depth': [5, 10, 30, 50, 100], 'n_estimators': [5, 10, 25, 50], 'min_samples_leaf': [1, 2, 4, 7, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(accuracy_score), verbose=0)

Training took  6050.578860044479 seconds.
Best parameters: {'max_depth': 100, 'min_samples_leaf': 2, 'n_estimators': 50}
Best score: 0.766085571181


['persistence/rfc_2018-05-26.sav']

In [None]:
# # Load trained model from disk
# filename = 'persistence/rfc_2018-05-20.sav'
# rfc = joblib.load(filename)

### F) Naive Bayes


In [9]:
### Uncomment this section to train model from scratch ####
# Fit the model
gnb = GaussianNB()
start = time()
gnb.fit(Xtrain, ytrain)  
print('Training took ', time()-start, 'seconds.')

# Save the trained model 
filename = 'persistence/gnb_'+ str(date.today()) + '.sav'
joblib.dump(gnb, filename)

gnb.score(Xtest, ytest)

GaussianNB(priors=None)

Training took  3.030113935470581 seconds.


['persistence/gnb_2018-05-25.sav']

0.32820932483967413

In [None]:
# # Load trained model from disk
# filename = 'persistence/gnb_2018-05-20.sav'
# gnb = joblib.load(filename)
# gnb.get_params()

### Section 5. Model Evaluation


In [22]:
# Collect the best_score_ from each cross-validated algorithm in a series
model_names = ['Dummy Classifier','Logistic Regression', 'Linear SVC', 'RBF-SVC', 'Decision Tree', 'Random Forest', 'Naive Bayes']
model_scores = [dummy.fit(Ftrain, ytrain).score(Ftest, ytest), logreg.best_score_, lsvc.best_score_, svc28.best_score_, dtc.best_score_, rfc.best_score_, gnb.score(Xtest, ytest)]
model_eval = pd.Series(data = model_scores, index = model_names).sort_values(ascending=False)

In [23]:
model_eval

Random Forest          0.766086
Decision Tree          0.755271
RBF-SVC                0.752676
Logistic Regression    0.748036
Linear SVC             0.745615
Dummy Classifier       0.713687
Naive Bayes            0.328209
dtype: float64

The best classifier, based on accuracy, is the Random Forest. 

Now let's predict the Random Forest on the test data to see how well we can expect it to generalize to unseen data.

### Section 6. Model Results

In [24]:
# Predict
start = time()
ypred_rfc = rfc.predict(Xtest)
print('Predicting took ', time()-start, 'seconds.')
print('')

# Performance Metrics
print('Random Forest')
print('====================')
print('Accuracy:',accuracy_score(ytest, ypred_rfc))
print('Precision:',precision_score(ytest, ypred_rfc))
print('Recall:', recall_score(ytest, ypred_rfc))
print('F1:', f1_score(ytest, ypred_rfc))
print('Confusion Matrix:')
print(confusion_matrix(ytest, ypred_rfc))
print('Classification Report:')
print(classification_report(ytest, ypred_rfc))

Predicting took  2.728756904602051 seconds.

Random Forest
Accuracy: 0.769114744629
Precision: 0.786472038335
Recall: 0.902062633531
F1: 0.840310891946
Confusion Matrix:
[[ 30220  30836]
 [ 12331 113576]]
Classification Report:
             precision    recall  f1-score   support

          0       0.71      0.49      0.58     61056
          1       0.79      0.90      0.84    125907

avg / total       0.76      0.77      0.76    186963



**Based on the test metrics above, we can expect the random forest classifier to predict which citations will be paid with 76.8% accuracy. When it predicts a ticket will be paid, that ticket is in fact paid 78.6% of the time. Additionally, the random forest correctly anticipates 90.2% of all paid tickets.**

Finally, let's see which features of a citation are most important in determining whether it will be paid?

In [25]:
# Build the Random Forest (can't use rfc from above because it has a gridsearch estimator wrapped around it)
model = RandomForestClassifier(max_depth=50, n_estimators= 50, min_samples_leaf=2, random_state=42)

# Train it
model.fit(Xtrain, ytrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=2,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [26]:
# Put the feature_importances_ in a dataframe
imp_feats = pd.DataFrame({'features':np.array(X.columns), 'importance':model.feature_importances_})
imp_feats.sort_values('importance', ascending=False, inplace=True)

In [27]:
imp_feats.head(10)

Unnamed: 0,features,importance
344,yr_2017,0.147952
326,desc_FIXED SPEED CAMERA,0.074493
343,yr_2016,0.070097
354,mo_11,0.055478
0,fine,0.043222
318,desc_ALL OTHER PARKING METER VIOLATIONS,0.031214
324,desc_EXPIRED TAGS,0.030861
352,mo_9,0.02973
410,quad_SOUTHEAST,0.02419
351,mo_8,0.021665


The most important features for determining whether a citation will be paid according to the random forest model are the: **year** of citation, the **type of violation** that occured, the **month** of citation, and the **fine**.
