In [47]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.dummy import DummyClassifier
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
import statsmodels.api as sm
import seaborn as sns
from collections import Counter
import pickle as pkl

=

In the code below, I build a simple logistic regression to predict which demographic group is most likely to have their car searched following a traffic stop in San Francisco. The data is from the Stanford Policing Project. I believe this code sample demonstrates the important skill of data manipulation and wrangling. I am happy to provide more samples upon request.  

=


In [48]:
df = pd.read_csv('sf.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
conditions = [
    (df.subject_race == 'white') & (df.subject_sex == 'female'),
    (df.subject_race == 'hispanic') & (df.subject_sex == 'female'),
    (df.subject_race == 'black') & (df.subject_sex == 'female'),
    (df.subject_race == 'asian/pacific islander') & (df.subject_sex == 'female'),
    (df.subject_race == 'white') & (df.subject_sex == 'male'),
    (df.subject_race == 'hispanic') & (df.subject_sex == 'male'),
    (df.subject_race == 'black') & (df.subject_sex == 'male'),
    (df.subject_race == 'asian/pacific islander') & (df.subject_sex == 'male')]
choices = ['white_female', 'hispanic_female', 'black_female','asian_female',
          'white_male', 'hispanic_male', 'black_male','asian_male']
df['demographic'] = np.select(conditions, choices, default='other')

In [6]:
df.columns

Index(['raw_row_number', 'date', 'time', 'location', 'lat', 'lng', 'district',
       'subject_age', 'subject_race', 'subject_sex', 'type', 'arrest_made',
       'search_conducted', 'search_vehicle', 'search_basis', 'reason_for_stop',
       'raw_search_vehicle_description', 'raw_result_of_contact_description',
       'demographic'],
      dtype='object')

In [7]:
df2 = df[['subject_age','demographic']]

In [8]:
df_to_mod_dummies = pd.get_dummies(df2.demographic)

In [9]:
df_to_mod_dummies['age'] = df2.subject_age

In [10]:
bin_ranges = [0, 18, 35, 45, 60, 75, 100]
bin_names = [1, 2, 3, 4, 5, 6]

In [11]:
df_to_mod_dummies['age_bin_custom_range'] = pd.cut(np.array(df_to_mod_dummies.age),bins=bin_ranges)
df_to_mod_dummies['age_bin_custom_label'] = pd.cut(np.array(df_to_mod_dummies.age),bins=bin_ranges,labels=bin_names)

In [12]:
df_to_mod_dummies.columns

Index(['asian_female', 'asian_male', 'black_female', 'black_male',
       'hispanic_female', 'hispanic_male', 'other', 'white_female',
       'white_male', 'age', 'age_bin_custom_range', 'age_bin_custom_label'],
      dtype='object')

In [34]:
X = df_to_mod_dummies.dropna()
X = X.drop('age',axis=1)
X = X.drop('age_bin_custom_range',axis=1)
y = df.search_conducted
X = X.iloc[:5000,]
y = y[:5000]

In [35]:
print('Original Target Variable Distribution:', Counter(y))

Original Target Variable Distribution: Counter({False: 4813, True: 187})


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=46)

In [36]:
ada = ADASYN(sampling_strategy='minority', random_state= 46 , n_neighbors = 4)

In [37]:
X_res, y_res = ada.fit_resample(X,y)

In [38]:
print('Oversampled Target Variable Distribution:', Counter(y_res))

Oversampled Target Variable Distribution: Counter({False: 4813, True: 4741})


In [39]:
dummy_clf = DummyClassifier(strategy="prior")
dummy_clf.fit(X_res, y_res)

DummyClassifier(strategy='prior')

In [41]:
dummy_clf.score(X_res,y_res)

0.5037680552648105

In [50]:
imputer = KNNImputer(n_neighbors=3, weights="uniform")
imputer.fit_transform(X_res)

array([[0., 0., 0., ..., 0., 1., 4.],
       [0., 0., 0., ..., 0., 1., 2.],
       [1., 0., 0., ..., 0., 0., 4.],
       ...,
       [0., 0., 0., ..., 0., 1., 2.],
       [0., 0., 0., ..., 0., 1., 2.],
       [0., 0., 0., ..., 0., 1., 2.]])

In [42]:
clf_rf = RandomForestClassifier(max_depth=2, random_state=46)

In [43]:
clf_rf.fit(X_res, y_res)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
clf = LogisticRegression(C=1000, class_weight={0:0.05,1:1},random_state=42, solver='liblinear').fit(X_train, y_train)

In [None]:
clf.score(X_test,y_test)

In [None]:
np.exp(clf.coef_)

        p       odds     logodds  
      .001    .001001  -6.906755
       .01    .010101   -4.59512
       .15   .1764706  -1.734601
        .2        .25  -1.386294
       .25   .3333333  -1.098612
        .3   .4285714  -.8472978
       .35   .5384616  -.6190392
        .4   .6666667  -.4054651
       .45   .8181818  -.2006707
        .5          1          0
       .55   1.222222   .2006707
        .6        1.5   .4054651
       .65   1.857143   .6190392
        .7   2.333333   .8472978
       .75          3   1.098612
        .8          4   1.386294
       .85   5.666667   1.734601
        .9          9   2.197225
      .999        999   6.906755
     .9999       9999    9.21024

In [None]:
original_variables = list(X)
zipped_together = list(zip(original_variables, clf.coef_[0]))
coefs = [list(x) for x in zipped_together]
coefs = pd.DataFrame(coefs, columns=["Variable", "Coefficient"])
coefs.sort_values(by=["Coefficient"], axis=0, inplace=True, ascending=False)
coefs.drop_duplicates(keep=False)
coefs

In [None]:
coefs.sort_index()

In [None]:
confusion_matrix(y_test,clf.predict(X_test))

In [None]:
sns.distplot()

In [None]:
sm_log_reg.cov_params()

In [None]:
sm_log_reg.summary()