In [78]:
import pandas as pd
import numpy as np

### Load data files, inspect variables

In [79]:
# Read the four CSV inputs; every file is decoded with the same encoding.
csv_encoding = 'ISO-8859-1'

df_test = pd.read_csv('readonly/test.csv', encoding=csv_encoding)
# train.csv is additionally read with low_memory=False.
df_train = pd.read_csv('readonly/train.csv', encoding=csv_encoding, low_memory=False)
df_latlons = pd.read_csv('readonly/latlons.csv', encoding=csv_encoding)
df_addresses = pd.read_csv('readonly/addresses.csv', encoding=csv_encoding)

# Training rows without a value in the outcome column ('compliance') are discarded.
df_train = df_train.dropna(subset=['compliance'])

In [80]:
# Quick look at candidate predictors: summary statistics for the numeric columns,
# value counts for the categorical ones, then a missing-value tally.
numeric = [
    'ticket_id', 'fine_amount', 'admin_fee', 'state_fee',
    'late_fee', 'judgment_amount', 'discount_amount', 'clean_up_cost',
]
categorical = ['disposition', 'agency_name']

print(df_train[numeric].describe())

for item in categorical:
    # blank line, column name, then the frequency table
    print('', item, df_train[item].value_counts(), sep='\n')

print('')
# Total NaN counts, in the order: numeric/train, numeric/test, categorical/train, categorical/test.
null_counts = [
    frame[cols].isnull().sum().sum()
    for cols in (numeric, categorical)
    for frame in (df_train, df_test)
]
print('Any NaNs anywhere?', *null_counts)

           ticket_id    fine_amount  admin_fee  state_fee       late_fee  \
count  159880.000000  159880.000000   159880.0   159880.0  159880.000000   
mean   150453.858794     357.035295       20.0       10.0      33.651512   
std     77224.650876     675.655580        0.0        0.0      67.692916   
min     18645.000000       0.000000       20.0       10.0       0.000000   
25%     83370.750000     200.000000       20.0       10.0      10.000000   
50%    149777.500000     250.000000       20.0       10.0      25.000000   
75%    217480.250000     250.000000       20.0       10.0      25.000000   
max    299363.000000   10000.000000       20.0       10.0    1000.000000   

       judgment_amount  discount_amount  clean_up_cost  
count    159880.000000    159880.000000       159880.0  
mean        420.650218         0.195959            0.0  
std         742.555062         4.290344            0.0  
min           0.000000         0.000000            0.0  
25%         250.000000        

### Choose predictor variables, choose model

In [81]:
# Predictor selection, grouped by how each column is prepared before modelling.

# Numeric columns, used as they are.
predictors = [
    'late_fee',
    'judgment_amount',
    'fine_amount',
]

# Geographic columns; an empty list skips the lat/lon merge (candidates: 'lat', 'lon').
geo_predictors = []

# Categorical columns, converted to integer codes before training
# (other candidates: 'inspector_name', 'violation_code', 'violation_street_name').
cat_predictors = [
    'disposition',
    'agency_name',
]

In [82]:
# Model selector read by the training cell:
#   'gbtc'  -> gradient boosted tree classifier
#   'logit' -> logistic regression
whichmodel = 'gbtc'

### Assemble data sets

In [83]:
# Flag every row with its origin so the stacked frame can be split back into
# train and test rows after preprocessing.
df_test['is_in_test'] = 1
df_train['is_in_test'] = 0

# Columns carried forward: the numeric and categorical predictors plus the
# bookkeeping columns (ticket id, origin flag, outcome).
templist = predictors + cat_predictors + ['ticket_id', 'is_in_test', 'compliance']

# Stack train and test, keep only the selected columns and index the result by
# ticket_id. Built as one expression so no intermediate frame is mutated in place.
df = pd.concat([df_train, df_test])[templist].set_index('ticket_id')

#### Encode categorical variables, if any

In [84]:
# Encode the categorical predictors, if any were selected.
if cat_predictors:
    # Each categorical column is replaced by its integer category code
    # (a label encoding rather than one-hot dummies).
    # NOTE(review): the original comment says this encoding is intended for the
    # tree model -- confirm it is acceptable when whichmodel is 'logit'.
    df[cat_predictors] = df[cat_predictors].apply(
        lambda column: column.astype('category').cat.codes
    )

#### Deal with geographical predictors, if any

In [85]:
if geo_predictors:
    # Attach the geographic predictors to df by chaining
    #   ticket_id -> address (df_addresses) -> lat/lon (df_latlons).
    # At this point df is indexed by ticket_id and holds only the modelling
    # columns, so the lookup is built from df_addresses / df_latlons alone and
    # joined onto df by index.

    # --- 1. repair coordinates in df_latlons ---------------------------------
    # Each address on the left carries a zip code typed with a capital letter 'O';
    # it takes over the coordinates of the same street address listed without a zip.
    # NOTE(review): presumably the left-hand rows have no lat/lon of their own -- confirm.
    address_donors = [
        ('8325 joy rd, Detroit MI 482O4', '8325 joy rd, Detroit MI'),
        ('12038 prairie, Detroit MI 482O4', '12038 prairie, Detroit MI'),
        ('1201 elijah mccoy dr, Detroit MI 482O8', '1201 elijah mccoy dr, Detroit MI'),
        ('6200 16th st, Detroit MI 482O8', '6200 16th st, Detroit MI'),
    ]
    for broken_address, donor_address in address_donors:
        is_broken = df_latlons['address'] == broken_address
        is_donor = df_latlons['address'] == donor_address
        for coord in ('lat', 'lon'):
            df_latlons.loc[is_broken, coord] = df_latlons.loc[is_donor, coord].values[0]

    # Remaining gaps (the original note names a "bramford" address) are filled from
    # the next row in address order. This runs last so the explicit fixes above are
    # not pre-empted by the backfill.
    df_latlons = df_latlons.sort_values(by=['address']).bfill()

    # --- 2. one coordinate row per ticket ------------------------------------
    # Left merge keeps every ticket listed in df_addresses; duplicates are dropped
    # so the join below cannot multiply rows of df.
    ticket_coords = (
        df_addresses[['ticket_id', 'address']]
        .merge(df_latlons[['address', 'lat', 'lon']], on='address', how='left')
        .drop_duplicates(subset='ticket_id')
        .set_index('ticket_id')
    )

    # --- 3. join onto df ------------------------------------------------------
    # Dropping any existing geo columns first makes the cell safe to re-run.
    # A left join on the ticket_id index keeps every row of df and its order.
    df = (
        df.drop(geo_predictors, axis=1, errors='ignore')
        .join(ticket_coords[geo_predictors], how='left')
    )

    # Surface tickets that could not be matched instead of silently dropping them.
    n_missing = int(df[geo_predictors].isnull().any(axis=1).sum())
    if n_missing:
        print('Warning: {} rows have no value for {}'.format(n_missing, geo_predictors))

#### Create train test split

In [86]:
# Test rows are the ones flagged is_in_test == 1; the origin flag and the
# outcome column are removed so only predictors remain.
is_test_row = df['is_in_test'] == 1
X_test = df.loc[is_test_row].drop(['is_in_test', 'compliance'], axis='columns')
X_test.head()

Unnamed: 0_level_0,late_fee,judgment_amount,fine_amount,disposition,agency_name
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
284932,20.0,250.0,200.0,5,1
285362,100.0,1130.0,1000.0,5,1
285361,10.0,140.0,100.0,5,1
285338,20.0,250.0,200.0,5,1
285346,10.0,140.0,100.0,5,1


In [87]:
# Training rows are the ones flagged is_in_test == 0; only the origin flag is
# removed here, the outcome column stays for the next step.
is_train_row = df['is_in_test'] == 0
Xy = df.loc[is_train_row].drop('is_in_test', axis='columns')
Xy.head()

Unnamed: 0_level_0,late_fee,judgment_amount,fine_amount,disposition,agency_name,compliance
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22056,25.0,305.0,250.0,5,0,0.0
27586,75.0,855.0,750.0,6,0,1.0
22046,25.0,305.0,250.0,5,0,0.0
18738,75.0,855.0,750.0,5,0,0.0
18735,10.0,140.0,100.0,5,0,0.0


In [88]:
from sklearn.model_selection import train_test_split

# Separate the outcome column from the predictor matrix.
X = Xy.drop('compliance', axis='columns')
y = Xy['compliance']

# Hold out part of the labelled rows for validation, using train_test_split's
# default split size; random_state fixes the shuffle.
X_train, X_crossval, y_train, y_crossval = train_test_split(X, y, random_state=1)

### Model training

In [90]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Fit the classifier selected by `whichmodel` on the training split.
#
# Tuning note: a GridSearchCV over learning_rate in [0.01, 0.1, 1] and max_depth in
# [3, 4, 5] with scoring='roc_auc' is a possible extension; it is not run here and
# the estimator defaults are used.
if whichmodel == 'gbtc':
    # random_state pins the estimator's internal randomness so re-runs give the same model.
    clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
elif whichmodel == 'logit':
    clf = LogisticRegression().fit(X_train, y_train)
else:
    # A mistyped selector used to fall through to logistic regression silently.
    raise ValueError("whichmodel must be 'gbtc' or 'logit', got {!r}".format(whichmodel))

### Evaluation

In [91]:
# Hard class predictions for the training and cross-validation splits.
y_train_predicted, y_crossval_predicted = (
    clf.predict(split) for split in (X_train, X_crossval)
)

# Continuous decision scores for the same two splits (used for AUC).
y_train_decisionfcn, y_crossval_decisionfcn = (
    clf.decision_function(split) for split in (X_train, X_crossval)
)

In [97]:
if whichmodel == 'gbtc':
    # feature_importances_ is only read for the gradient-boosting model.
    # Pair each training column name with its importance and print the table.
    zippedlist = [
        (column_name, importance)
        for column_name, importance in zip(X_train.columns, clf.feature_importances_)
    ]
    importance_table = pd.DataFrame(zippedlist, columns=['predictor', 'coefficient'])
    print(importance_table)

         predictor  coefficient
0         late_fee     0.321074
1  judgment_amount     0.171196
2      fine_amount     0.108940
3      disposition     0.246381
4      agency_name     0.152408


In [100]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Accuracy  = (TP + TN) / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)   also known as sensitivity, or true positive rate
# F1        = 2 * Precision * Recall / (Precision + Recall)


def print_scores(y_true, y_predicted, y_decisionfcn):
    """Print accuracy, precision, recall, F1, AUC and the confusion matrix for one split.

    y_true        -- true labels of the split
    y_predicted   -- predicted class labels for the same rows
    y_decisionfcn -- continuous decision scores for the same rows (used for AUC only)
    """
    labelled_scores = [
        ('Accuracy', accuracy_score(y_true, y_predicted)),
        ('Precision', precision_score(y_true, y_predicted)),
        ('Recall', recall_score(y_true, y_predicted)),
        ('F1', f1_score(y_true, y_predicted)),
        ('AUC', roc_auc_score(y_true, y_decisionfcn)),
    ]
    for label, value in labelled_scores:
        print('{}: {:.2f}'.format(label, value))
    print('Confusion matrix:', confusion_matrix(y_true, y_predicted))


# Training split first, then a blank line, then the cross-validation split.
print_scores(y_train, y_train_predicted, y_train_decisionfcn)
print('')
print_scores(y_crossval, y_crossval_predicted, y_crossval_decisionfcn)

Accuracy: 0.94
Precision: 0.89
Recall: 0.17
F1: 0.29
AUC: 0.79
Confusion matrix: [[111010    183]
 [  7229   1488]]

Accuracy: 0.94
Precision: 0.88
Recall: 0.17
F1: 0.28
AUC: 0.79
Confusion matrix: [[37027    63]
 [ 2399   481]]


### Output: predicted probability of outcome on test set

In [101]:
# Class probabilities for every test ticket, stored as float32.
out = clf.predict_proba(X_test).astype(np.float32)

# Keep column 1 of predict_proba (the second class in clf.classes_), keyed by ticket id.
# The index is built from a plain list to match the index type the autograder expects.
probs = pd.Series(out[:, 1], index=list(X_test.index))
probs.head()

284932    0.048627
285362    0.016184
285361    0.058392
285338    0.048627
285346    0.058392
dtype: float32