In [32]:
import pandas as pd
import datetime
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import ComplementNB
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.inspection import plot_partial_dependence
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import seaborn as sns
import pickle as pkl

In [33]:
# Load the raw stop records from the local CSV.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The truncated warning recorded under this cell
# looks like pandas' mixed-type DtypeWarning, which recommends exactly this.
df = pd.read_csv('sf.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Derive a combined race/sex label for every stop.
# (race value as stored in the data, short prefix used in the label)
race_prefixes = [
    ('white', 'white'),
    ('hispanic', 'hispanic'),
    ('black', 'black'),
    ('asian/pacific islander', 'asian'),
]

conditions = []
choices = []
# All female groups first, then all male groups.
for sex_value in ('female', 'male'):
    for race_value, prefix in race_prefixes:
        conditions.append(
            (df.subject_race == race_value) & (df.subject_sex == sex_value)
        )
        choices.append(prefix + '_' + sex_value)

# Rows matching none of the eight race/sex pairs fall into 'other'.
df['demographic'] = np.select(conditions, choices, default='other')

In [4]:
# Inspect the available columns, including the newly added 'demographic'.
df.columns

Index(['raw_row_number', 'date', 'time', 'location', 'lat', 'lng', 'district',
       'subject_age', 'subject_race', 'subject_sex', 'type', 'arrest_made',
       'search_conducted', 'search_vehicle', 'search_basis', 'reason_for_stop',
       'raw_search_vehicle_description', 'raw_result_of_contact_description',
       'demographic'],
      dtype='object')

In [5]:
# Keep only the two columns used to build the model features.
model_columns = ['subject_age', 'demographic']
df2 = df.loc[:, model_columns]

In [6]:
# One indicator column per demographic label.
df_to_mod_dummies = pd.get_dummies(df2['demographic'])

In [8]:
# Carry the subject's age alongside the demographic indicators.
df_to_mod_dummies['age'] = df2['subject_age']

In [10]:
# Age bin edges passed to pd.cut, and one integer label per resulting
# interval (there is always one fewer interval than there are edges).
bin_ranges = [0, 18, 35, 45, 60, 75, 100]
bin_names = list(range(1, len(bin_ranges)))

In [11]:
# Bin the ages twice from the same array: once keeping the interval itself,
# once mapping each interval to its integer label from bin_names.
ages = np.array(df_to_mod_dummies['age'])
df_to_mod_dummies['age_bin_custom_range'] = pd.cut(ages, bins=bin_ranges)
df_to_mod_dummies['age_bin_custom_label'] = pd.cut(
    ages, bins=bin_ranges, labels=bin_names
)

In [12]:
# Confirm the dummy columns plus the age and age-bin columns are present.
df_to_mod_dummies.columns

Index(['asian_female', 'asian_male', 'black_female', 'black_male',
       'hispanic_female', 'hispanic_male', 'other', 'white_female',
       'white_male', 'age', 'age_bin_custom_range', 'age_bin_custom_label'],
      dtype='object')

In [110]:
# Build the design matrix X and target y for the search_conducted model.
N_MODEL_ROWS = 5000  # cap on the number of rows used for fitting

# Drop rows with any missing value, then remove the raw age and the
# interval-valued bin column; the demographic dummies and the integer
# age-bin label remain as features.
X = (
    df_to_mod_dummies
    .dropna()
    .drop(columns=['age', 'age_bin_custom_range'])
    .iloc[:N_MODEL_ROWS]
)

# Select the target by X's index labels. Slicing y positionally from the
# unfiltered frame (y[:5000]) pairs features with the wrong stops as soon as
# dropna() has removed any earlier row.
y = df.loc[X.index, 'search_conducted']

In [111]:
# Class balance of the held-out target.
# NOTE(review): y_test is only assigned by the train_test_split cell that
# follows, so this cell fails on a fresh Restart & Run All. The recorded
# output below is also named 'arrest_made' while y is currently built from
# search_conducted, so that output appears to come from an earlier run.
y_test.value_counts()

False    1240
True       10
Name: arrest_made, dtype: int64

In [112]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [113]:
# Logistic regression with class 0 weighted at 0.05 relative to class 1,
# fitted on the training split.
log_reg = LogisticRegression(
    C=1000,
    class_weight={0: 0.05, 1: 1},
    random_state=42,
    solver='liblinear',
)
clf = log_reg.fit(X_train, y_train)

In [114]:
# Mean accuracy on the held-out split.
# NOTE(review): per the confusion matrix recorded further down, 1204 of the
# 1250 test rows are negative, so always predicting "no search" would score
# about 0.963 -- accuracy alone is not informative for this target.
clf.score(X_test,y_test)

0.9264

In [115]:
# Exponentiate the fitted coefficients (log-odds scale) to read them as
# odds ratios, in the same order as the columns of X.
np.exp(clf.coef_)

array([[1.21852611, 0.78595396, 0.84645542, 0.59295682, 1.76922872,
        0.86718357, 1.09800604, 0.95114528, 1.12261599, 0.97747859]])

        p       odds     logodds  
      .001    .001001  -6.906755
       .01    .010101   -4.59512
       .15   .1764706  -1.734601
        .2        .25  -1.386294
       .25   .3333333  -1.098612
        .3   .4285714  -.8472978
       .35   .5384616  -.6190392
        .4   .6666667  -.4054651
       .45   .8181818  -.2006707
        .5          1          0
       .55   1.222222   .2006707
        .6        1.5   .4054651
       .65   1.857143   .6190392
        .7   2.333333   .8472978
       .75          3   1.098612
        .8          4   1.386294
       .85   5.666667   1.734601
        .9          9   2.197225
      .999        999   6.906755
     .9999       9999    9.21024

In [116]:
# Tabulate each feature with its fitted coefficient, largest first.
# Built as a single non-mutating expression so the cell is idempotent; the
# original index (feature position in X) is kept so sort_index() can restore
# column order later.
# The previous `coefs.drop_duplicates(keep=False)` call discarded its result
# and therefore did nothing; it has been removed rather than silently
# changing which rows are shown.
original_variables = list(X)
coefs = pd.DataFrame(
    {"Variable": original_variables, "Coefficient": clf.coef_[0]}
).sort_values(by="Coefficient", ascending=False)
coefs

Unnamed: 0,Variable,Coefficient
4,hispanic_female,0.570544
0,asian_female,0.197642
8,white_male,0.115662
6,other,0.093496
9,age_bin_custom_label,-0.022779
7,white_female,-0.050088
5,hispanic_male,-0.142505
2,black_female,-0.166698
1,asian_male,-0.240857
3,black_male,-0.522634


In [117]:
# Same table, back in the original feature (column) order.
coefs.sort_index()

Unnamed: 0,Variable,Coefficient
0,asian_female,0.197642
1,asian_male,-0.240857
2,black_female,-0.166698
3,black_male,-0.522634
4,hispanic_female,0.570544
5,hispanic_male,-0.142505
6,other,0.093496
7,white_female,-0.050088
8,white_male,0.115662
9,age_bin_custom_label,-0.022779


In [118]:
# Confusion matrix on the held-out split: rows are the true classes,
# columns are the predicted classes.
y_pred = clf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[1154,   50],
       [  42,    4]])

In [None]:
# NOTE(review): unexecuted placeholder -- distplot is called without any
# data, so there is nothing to plot. TODO: pass the intended series or
# delete this cell.
sns.distplot()

In [44]:
# Covariance matrix of the fitted parameter estimates.
# NOTE(review): sm_log_reg is not defined in any cell visible in this
# notebook; the fitting cell must be restored for a fresh run to work.
sm_log_reg.cov_params()

Unnamed: 0,asian_female,asian_male,black_female,black_male,hispanic_female,hispanic_male,other,white_female,white_male
asian_female,167728500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
asian_male,0.0,0.507194,0.0,0.0,0.0,0.0,0.0,0.0,0.0
black_female,0.0,0.0,1.02,0.0,0.0,0.0,0.0,0.0,0.0
black_male,0.0,0.0,0.0,0.25813,0.0,0.0,0.0,0.0,0.0
hispanic_female,0.0,0.0,0.0,0.0,2460292000000000.0,0.0,0.0,0.0,0.0
hispanic_male,0.0,0.0,0.0,0.0,0.0,0.342593,0.0,0.0,0.0
other,0.0,0.0,0.0,0.0,0.0,0.0,1.007813,0.0,0.0
white_female,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.009524,0.0
white_male,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.004065


In [45]:
# Full regression summary for sm_log_reg.
# NOTE(review): the recorded summary reports `converged: False`, with
# standard errors of 1.3e+04 and 4.96e+07 for asian_female and
# hispanic_female, so those two estimates should not be interpreted.
sm_log_reg.summary()

0,1,2,3
Dep. Variable:,arrest_made,No. Observations:,1000.0
Model:,Logit,Df Residuals:,991.0
Method:,MLE,Df Model:,8.0
Date:,"Thu, 18 Jun 2020",Pseudo R-squ.:,0.06301
Time:,11:17:38,Log-Likelihood:,-65.001
converged:,False,LL-Null:,-69.372
Covariance Type:,nonrobust,LLR p-value:,0.3645

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
asian_female,-22.8697,1.3e+04,-0.002,0.999,-2.54e+04,2.54e+04
asian_male,-4.2413,0.712,-5.955,0.000,-5.637,-2.845
black_female,-3.9120,1.010,-3.873,0.000,-5.891,-1.933
black_male,-3.4259,0.508,-6.743,0.000,-4.422,-2.430
hispanic_female,-39.0500,4.96e+07,-7.87e-07,1.000,-9.72e+07,9.72e+07
hispanic_male,-3.5835,0.585,-6.122,0.000,-4.731,-2.436
other,-4.8520,1.004,-4.833,0.000,-6.820,-2.884
white_female,-4.6540,1.005,-4.632,0.000,-6.623,-2.685
white_male,-5.5053,1.002,-5.494,0.000,-7.469,-3.541


In [50]:
# Prediction table for sm_log_reg.
# NOTE(review): in the recorded output the second column is all zeros,
# i.e. the model never predicts the positive class.
sm_log_reg.pred_table()

array([[987.,   0.],
       [ 13.,   0.]])

In [None]:
# Outcome counts for the blk_searched frame.
# NOTE(review): blk_searched is not defined in any visible cell -- TODO add
# the cell that builds it.
blk_searched.outcome.value_counts()

In [None]:
# Outcome counts for the wht_searched frame.
# NOTE(review): wht_searched is not defined in any visible cell -- TODO add
# the cell that builds it.
wht_searched.outcome.value_counts()

In [None]:
# NOTE(review): hard-coded counts with no visible source; presumably copied
# from an earlier value_counts() output -- confirm and compute the ratio
# from the data instead.
2071 / 11707

In [None]:
# NOTE(review): hard-coded counts with no visible source; presumably copied
# from an earlier value_counts() output -- confirm and compute the ratio
# from the data instead.
2573 / 23622

In [None]:
# 2x2 matrix used below as a transition matrix; each row sums to 1.
P = np.array([
    [.16, .84],
    [.11, .89],
])

In [None]:
# Starting vector: all weight on the first state (index 0).
step_one = [1,0]

In [None]:
# Advance the starting vector by one step: row vector times P.
step_two = np.asarray(step_one) @ P

In [None]:
# Show the vector after one step.
step_two

In [None]:
# Apply P one hundred times to the starting vector.
P_pow_100 = np.linalg.matrix_power(P, 100)
steady = np.dot(step_one, P_pow_100)

In [None]:
# Show the vector after 100 steps of P.
steady

In [None]:
# Second 2x2 transition matrix. Each row must sum to 1 for the repeated
# multiplication below to be meaningful; the first row previously read
# [.8, .92] (sum 1.72), which is corrected here to [.08, .92].
# NOTE(review): confirm .08 against the source counts.
# NOTE(review): this rebinds `X`, which earlier held the model's feature
# matrix; cells above will not see the features again if re-run after this.
X = np.array([
    [.08, .92],
    [.18, .82],
])

In [None]:
# Starting vector for the second chain: all weight on the first state.
first_step = [1,0]

In [None]:
# Apply X one hundred times to the starting vector.
X_pow_100 = np.linalg.matrix_power(X, 100)
steady_w = np.dot(first_step, X_pow_100)

In [None]:
# Show the vector after 100 steps of X.
steady_w