In [1]:
import sys
sys.path.append("../../src")
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from project import rf_models, preprocessing

df = pd.read_csv("../../data/initial_custody_2017_gsprs.csv", low_memory=False)

df['high_ic_instit_adj'] = np.where(df['ic_institut_adj']>2, 1, 0)
df['ic_override_up'] = np.where(df['ic_ovride_cust_lvl']-df['ic_custdy_level']>0,1,0)
df['high_ic'] = np.where(df['ic_custdy_level']>3, 1, 0)


# preprocessing
data = preprocessing.preprocess_input_vars(df)
data = data[
    [
        "gender_female",
        "age_gt_45",
        "age_lt_25",
        "race_B",
        "race_A",
        "race_H",
        "race_I",
        "race_O",
        "off_1_prs_max",
        "off_1_gs_max",
        "ic_custdy_level",
        "prior_commits",
        "ic_institut_adj",
        "escape_hist_1",
        "escape_hist_2",
        "escape_hist_3",
        "escape_hist_4",
        "escape_hist_5",
        "mrt_stat_DIV",
        "mrt_stat_SEP",
        "mrt_stat_MAR",
        "mrt_stat_WID",
        "employed",
        "high_ic_instit_adj",
        "re_discip_reports",
        "ic_ovride_cust_lvl",
        "ic_override_up",
        "high_ic"
    ]
]
data = data.dropna()




## Calculating independence

$P(D = 1\mid A=a) \stackrel{?}{=} P(D=1\mid A=a^\prime)$

### First for high institutional adjustment or not

In [2]:
pd.crosstab(data["high_ic"], data["race_B"], margins=True, normalize="columns")

race_B,0,1,All
high_ic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.715152,0.438298,0.6
1,0.284848,0.561702,0.4


In [3]:
pd.crosstab(data["high_ic"], data["race_H"], margins=True, normalize="columns")

race_H,0,1,All
high_ic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.612205,0.491228,0.6
1,0.387795,0.508772,0.4


In [4]:
pd.crosstab(data["high_ic"], data["age_gt_45"], margins=True, normalize="columns")

age_gt_45,0,1,All
high_ic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.579681,0.761905,0.6
1,0.420319,0.238095,0.4


In [5]:
pd.crosstab(data["high_ic"], data["gender_female"], margins=True, normalize="columns")

gender_female,0,1,All
high_ic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.588126,0.846154,0.6
1,0.411874,0.153846,0.4


In [6]:
pd.crosstab(data["high_ic_instit_adj"], data["race_B"], margins=True, normalize="columns")

race_B,0,1,All
high_ic_instit_adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.690909,0.548936,0.631858
1,0.309091,0.451064,0.368142


In [7]:
# just checking that I calculated things correctly
(data.groupby(['high_ic_instit_adj', 'race_B']).count() / data.groupby(['race_B']).count())["age_gt_45"]
# I'll use Crosstab from now on

high_ic_instit_adj  race_B
0                   0         0.690909
                    1         0.548936
1                   0         0.309091
                    1         0.451064
Name: age_gt_45, dtype: float64

In [8]:
pd.crosstab(data["high_ic_instit_adj"], data["race_H"], margins=True, normalize="columns")

race_H,0,1,All
high_ic_instit_adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.641732,0.54386,0.631858
1,0.358268,0.45614,0.368142


In [9]:
pd.crosstab(data["high_ic_instit_adj"], data["age_gt_45"], margins=True, normalize="columns")

age_gt_45,0,1,All
high_ic_instit_adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.621514,0.714286,0.631858
1,0.378486,0.285714,0.368142


In [10]:
pd.crosstab(data["high_ic_instit_adj"], data["gender_female"], margins=True, normalize="columns")

gender_female,0,1,All
high_ic_instit_adj,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.628942,0.692308,0.631858
1,0.371058,0.307692,0.368142


Summary:  We observe that these probablities of assigning an incarcerated person a high institutional adjustment score (defined to be > 2) conditioned on whether the person is female (0.476) or male (0.454) are relatively similar as they are when condition on whether a person is Hispanic (0.516) or not (0.471).  We find that there is a larger difference when the probability is conditioned on the person being Black (0.541) or not (0.402) or when the person is younger than 45 years old (0.518) or not (0.255). 

### Now for override to a higher custody level

In [11]:
pd.crosstab(data["ic_override_up"], data["race_B"], margins=True, normalize="columns")

race_B,0,1,All
ic_override_up,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.633333,0.455319,0.559292
1,0.366667,0.544681,0.440708


In [12]:
pd.crosstab(data["ic_override_up"], data["race_H"], margins=True, normalize="columns")

race_H,0,1,All
ic_override_up,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.557087,0.578947,0.559292
1,0.442913,0.421053,0.440708


In [13]:
pd.crosstab(data["ic_override_up"], data["age_gt_45"], margins=True, normalize="columns")

age_gt_45,0,1,All
ic_override_up,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.567729,0.492063,0.559292
1,0.432271,0.507937,0.440708


In [14]:
pd.crosstab(data["ic_override_up"], data["gender_female"], margins=True, normalize="columns")

gender_female,0,1,All
ic_override_up,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.55102,0.730769,0.559292
1,0.44898,0.269231,0.440708


Summary: We observe that these probablities of overriding an incarcerated person's custody level to a higher custody level (defined to be > 2) conditioned on whether the person is Hispanic (0.421) or not (0.443) are not that different. On the other hand, we find that there is a larger difference when the probability is conditioned on the person being Black (0.545) or not (0.367), when the person is younger than 45 years old (0.432) or not (0.508), or when the person is male (0.449) or not (0.269). 

### Do overrides happen at the same rate for people by race and similar institutional adjustments

$P(D = 1\mid A=a\, \&\, B=1) \stackrel{?}{=} P(D=1\mid A=a^\prime\, \& \,B=1)$, where $B$ is high custody level and $D$ is whether or not an higher override was given

In [15]:
data_high_instit_adj = data[data["ic_institut_adj"]>2]

In [16]:
pd.crosstab(data_high_instit_adj["ic_override_up"],data_high_instit_adj["race_B"],margins=True, normalize="columns")

race_B,0,1,All
ic_override_up,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.421569,0.254717,0.336538
1,0.578431,0.745283,0.663462


In [17]:
pd.crosstab(data_high_instit_adj["ic_override_up"],data_high_instit_adj["age_gt_45"],margins=True, normalize="columns")

age_gt_45,0,1,All
ic_override_up,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.352632,0.166667,0.336538
1,0.647368,0.833333,0.663462


In [18]:
pd.crosstab(data_high_instit_adj["ic_override_up"],data_high_instit_adj["gender_female"],margins=True, normalize="columns")

gender_female,0,1,All
ic_override_up,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.33,0.5,0.336538
1,0.67,0.5,0.663462


In [19]:
pd.crosstab(data_high_instit_adj["ic_override_up"],data_high_instit_adj["race_H"],margins=True, normalize="columns")

race_H,0,1,All
ic_override_up,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.340659,0.307692,0.336538
1,0.659341,0.692308,0.663462


## Algorithmic fairness.  Same kinds of calculations but for the prediction from the model

In [30]:
X = data.drop("ic_custdy_level", axis=1)
Y = data["ic_custdy_level"]

data_array = X
ground_truth = Y

#Fit the classifier
rf = rf_models.BasicRFModel(data_array, ground_truth)
rf.fit()

In [37]:
data["pred_rf"] = rf.predict(data_array)
data["hi_pred_rf"] = np.where(data['pred_rf']>3, 1, 0)
data_high_instit_adj = data[data["ic_institut_adj"]>2]

### Are the predictions independent of the protected variables
$P(\hat{Y} = 1\mid A=a) \stackrel{?}{=} P(\hat{Y}=1\mid A=a^\prime)$


In [38]:
pd.crosstab(data["hi_pred_rf"], data["race_B"], margins=True, normalize="columns")

race_B,0,1,All
hi_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.715152,0.438298,0.6
1,0.284848,0.561702,0.4


In [39]:
pd.crosstab(data["hi_pred_rf"], data["race_H"], margins=True, normalize="columns")

race_H,0,1,All
hi_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.612205,0.491228,0.6
1,0.387795,0.508772,0.4


In [40]:
pd.crosstab(data["hi_pred_rf"], data["age_gt_45"], margins=True, normalize="columns")

age_gt_45,0,1,All
hi_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.579681,0.761905,0.6
1,0.420319,0.238095,0.4


In [41]:
pd.crosstab(data["hi_pred_rf"], data["gender_female"], margins=True, normalize="columns")

gender_female,0,1,All
hi_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.588126,0.846154,0.6
1,0.411874,0.153846,0.4


### What if we control for institutional adjustment

$P(\hat{Y} = 1\mid A=a\, \&\, B=1) \stackrel{?}{=} P(\hat{Y}=1\mid A=a^\prime\, \& \,B=1)$, where $B$ is high custody level and $D$ is whether or not an higher override was given

In [46]:
pd.crosstab(data_high_instit_adj["hi_pred_rf"], data_high_instit_adj["race_B"], margins=True, normalize="columns")

race_B,0,1,All
hi_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.460784,0.113208,0.283654
1,0.539216,0.886792,0.716346


In [47]:
pd.crosstab(data_high_instit_adj["hi_pred_rf"], data_high_instit_adj["race_H"], margins=True, normalize="columns")

race_H,0,1,All
hi_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.296703,0.192308,0.283654
1,0.703297,0.807692,0.716346


In [48]:
pd.crosstab(data_high_instit_adj["hi_pred_rf"], data_high_instit_adj["age_gt_45"], margins=True, normalize="columns")

age_gt_45,0,1,All
hi_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.268421,0.444444,0.283654
1,0.731579,0.555556,0.716346


In [49]:
pd.crosstab(data_high_instit_adj["hi_pred_rf"], data_high_instit_adj["gender_female"], margins=True, normalize="columns")

gender_female,0,1,All
hi_pred_rf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.265,0.75,0.283654
1,0.735,0.25,0.716346
