In [24]:
import pandas as pd
import os
import sklearn
import numpy as np
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import json
from sklearn.metrics import roc_auc_score
import category_encoders
from sklearn.preprocessing import  FunctionTransformer, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import category_encoders as ce
from sklearn.impute import SimpleImputer 


pd.set_option('display.max_columns', 100) # pandas option to display up until n columns

In [27]:
import seaborn as sns

In [15]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_curve,
    auc,
    roc_auc_score,
    accuracy_score,
    average_precision_score,

    precision_recall_curve,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix
)

In [4]:
figsize = (16,8)

In [5]:
df = pd.read_csv('train.csv')

In [6]:
df['InterventionLocationName'] = df['InterventionLocationName'].str.strip()
df['InterventionLocationName'] = df['InterventionLocationName'].str.lower()
df['Department Name'] = df['Department Name'].str.strip()
df['Department Name'] = df['Department Name'].str.lower()
df = df.drop(columns = ['ReportingOfficerIdentificationID','InterventionDateTime'])
df.SubjectAge = df.SubjectAge.astype(int)
     

In [7]:
df = df.drop_duplicates()
df['SubjectAge'] = df['SubjectAge'].astype(int)

In [9]:
df = df.dropna()

In [11]:
df_ = df.set_index('VehicleSearchedIndicator').filter(like='True', axis=0).reset_index()

In [None]:
#df_ = df_.drop(columns = ['SubjectEthnicityCode', 'SubjectRaceCode', 'SubjectSexCode'])

In [12]:
df_ = df_.drop(columns = 'VehicleSearchedIndicator')

In [19]:
#df_ = df_.drop(columns = 'ResidentIndicator')

In [20]:
#df_ = df_.drop(columns = 'StatuteReason')

In [13]:
seed = 42
X, y = df_.drop('ContrabandIndicator', axis=1), df_.ContrabandIndicator

In [22]:
X.shape

(64293, 11)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [24]:
#sensitive_class = ['SubjectEthnicityCode', 'SubjectRaceCode', 'SubjectSexCode']

In [19]:
pipeline = make_pipeline(
    category_encoders.OneHotEncoder(handle_unknown='impute'),
    #SimpleImputer(strategy='mean'),
    LogisticRegression(),
)
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(cols=['Department Name',
                                     'InterventionLocationName',
                                     'InterventionReasonCode',
                                     'SearchAuthorizationCode', 'StatuteReason',
                                     'SubjectEthnicityCode', 'SubjectRaceCode',
                                     'SubjectSexCode'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='impute', return_df=True,
                               use_cat_names=False, verbose=0)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
     

In [None]:
new_obs_str = '{"Department Name": "new Haven",  "SearchAuthorizationCode": "C", "InterventionReasonCode": "V", "SubjectRaceCode":"B", "SubjectSexCode":"M", "SubjectEthnicityCode":"H" ,"SubjectAge": 26, "InterventionLocationName": "New Haven", "StatuteReason":"Other","TownResidentIndicator":true, "ResidentIndicator":true }'

In [None]:
new_obs_dict = json.loads(new_obs_str)
print('type {}'.format(type(new_obs_dict)))

In [None]:
new_obs_dict

In [None]:
# First step is to create a dataframe with the columns in the correct
# order. You can get the correct order by getting the columns from
# the X_train dataframe with which the model was trained. Doing this
# will preserve the correct order.

# Also note that that you must pass the dictionary as an entry
# in an array, even if there is only a single one... scikit models
# always assume things are being processed in batches.
obs = pd.DataFrame([new_obs_dict], columns=X_train.columns.tolist())

# Now you need to make sure that the types are correct so that the
# pipeline steps will have things as expected.
obs = obs.astype(X_train.dtypes)

In [None]:
obs.head()

In [None]:
pipeline.predict(obs)[0]

In [None]:
with open('columns.json', 'w') as fh:
    json.dump(X_train.columns.tolist(), fh)

In [None]:
with open('dtypes.pickle', 'wb') as fh:
    pickle.dump(X_train.dtypes, fh)

In [None]:
from sklearn.externals import joblib
joblib.dump(pipeline, 'pipeline.pickle')

In [None]:
#with open('columns.json', 'r') as fh:
    #columns = json.load(fh)

In [None]:
#pipeline = joblib.load('pipeline.pickle')

In [None]:
new_obs_dict = json.loads(new_obs_str)
obs = pd.DataFrame([new_obs_dict], columns = X_train.columns)
obs = obs.astype(X_train.dtypes)

In [None]:
outcome = pipeline.predict(obs)
outcome

In [None]:
# there's only a single observation... so yeah
observation_index = 0
# This is the trick, go for the the positive class index
positive_class_index = 1
# You do indexing of numpy arrays a bit different than normal
# python arrays:
contraband_probabilty = outcome
print('Contraband: {} '.format(contraband_probabilty))

In [None]:
# curl -X POST http://localhost:5000/predict -d "{ "ResidentIndicator": true,\"Department Name": "new Haven", "SearchAuthorizationCode": "C", "StatuteReason": "Speed Related", "SubjectRaceCode": "B","InterventionReasonCode": "V",   "SubjectSexCode": "F", "SubjectEthnicityCode": "M", "SubjectAge": 26, "InterventionLocationName": "New Haven","TownResidentIndicator":true }"

In [20]:
pred_y_4 = pipeline.predict(X_train)

In [None]:
print( accuracy_score(y_train, pred_y_4) )

In [None]:
# What about AUROC?
prob_y_4 = pipeline.predict_proba(X_train)
prob_y_4 = [p[1] for p in prob_y_4]
print( roc_auc_score(y_train, prob_y_4) )

In [None]:
#X_test.drop(sensitive_class

In [35]:
y_pred = pipeline.predict(X_test.drop(sensitive_class, axis=1))
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7203657444502051


In [None]:
print("F1-score:", f1_score(y_test, y_pred, average="macro"))
print("Precision:", precision_score(y_test, y_pred, average="macro"))
print("Recall:", recall_score(y_test, y_pred, average="macro"))

In [37]:
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

       False       0.76      0.85      0.80     14109
        True       0.61      0.46      0.53      7108

    accuracy                           0.72     21217
   macro avg       0.68      0.66      0.66     21217
weighted avg       0.71      0.72      0.71     21217



In [28]:
cm = confusion_matrix(y_test, y_pred)


ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax); #annot=True to annotate cells

# labels, title and ticks
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix'); 
ax.xaxis.set_ticklabels(['0', '1']); ax.yaxis.set_ticklabels(['0', '1']);
confusion_

NameError: name 'y_pred' is not defined

In [None]:
def compute_outcome_stats(X_test_new):
        
        
        contraband = X_test_new['pred_y_4']
        n_hits  = sum(contraband)
        
        hit_rate = n_hits/len(contraband)
        return(pd.Series(data = { 
            'n_hits': n_hits,
            'hit_rate': hit_rate
        }))

In [None]:
X_test_new = X_test.copy()

In [None]:
X_test_new['pred_y_4'] = y_pred

In [None]:
X_test_new.groupby('SubjectEthnicityCode').apply(compute_outcome_stats)



In [None]:
df_.groupby('SubjectEth')

In [29]:
X_test_old =  X_test.copy()
X_test_old['pred_y_4'] = y_test


In [31]:
X_test_old.groupby('SubjectRaceCode').apply(compute_outcome_stats)

NameError: name 'compute_outcome_stats' is not defined

In [None]:
X_test_new.groupby('SubjectRaceCode').apply(compute_outcome_stats)

In [None]:

X_test_new.SubjectRaceCode.value_counts(normalize=True)