In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import csv
from numpy import std, mean, sqrt
from scipy import stats

In [2]:
# performance gain: share of the baseline-to-model gap closed by a treatment
def treatment_gain(treatment_avg, baseline_avg, ra_average):
    """Return the treatment's improvement over baseline, normalised by the model's improvement.

    Args:
        treatment_avg: average participant score in the treatment of interest.
        baseline_avg: average participant score in the baseline treatment.
        ra_average: average risk assessment model score used as the reference.

    Returns:
        (treatment_avg - baseline_avg) / (ra_average - baseline_avg).

    Note:
        No guard is applied when ra_average equals baseline_avg; the division
        is performed as-is (plain Python floats raise ZeroDivisionError).
    """
    improvement_over_baseline = treatment_avg - baseline_avg
    model_gap_over_baseline = ra_average - baseline_avg
    return improvement_over_baseline / model_gap_over_baseline

In [3]:
# Load the study's CSV exports into DataFrames (paths are relative to the
# notebook's working directory).
# `workers` drives the analysis cells below, which read its 'treatment',
# 'participant_brier_score', 'ra_brier_score' and 'false_positive_*' columns.
# NOTE(review): presumably one row per participant -- confirm against the CSV.
workers = pd.read_csv('datasets/participants.csv')
# `results` and `sample` are loaded here but are not referenced by the cells
# visible in this part of the notebook.
results = pd.read_csv('datasets/results.csv')
sample = pd.read_csv('datasets/defendants.csv')

### Performance Summary - Across Treatments
Average performance of the risk assessment model and of the participants, pooled across all treatments.

In [4]:
# Average of the risk assessment model's score over every row of `workers`,
# regardless of treatment.
ra_score_total = sum(workers['ra_brier_score'])
ra_score_avg_performance = ra_score_total / len(workers)
print("risk assessment model performance average:", ra_score_avg_performance)

risk assessment model performance average: 0.7954048896833629


In [5]:
# Average of the participants' score over every row of `workers`,
# regardless of treatment.
participant_score_total = sum(workers['participant_brier_score'])
participant_score_avg_performance = participant_score_total / len(workers)
print("participant performance average:", participant_score_avg_performance)

participant performance average: 0.7443864042915929


### Treatment 0: Baseline
#### Performance

In [6]:
# Rows for treatment 0 (the baseline arm) and their average participant score.
# `baseline_treatment` and `baseline_avg` are reused by later cells.
is_baseline = workers['treatment'] == 0
baseline_treatment = workers.loc[is_baseline]
baseline_score_total = sum(baseline_treatment['participant_brier_score'])
baseline_avg = baseline_score_total / len(baseline_treatment)
print("average participant prediction score: ", baseline_avg)
# The baseline is compared with itself, so the numerator is zero; 0 is passed
# as a placeholder model average, which makes the denominator negative and the
# result print as -0.0.
baseline_gain = treatment_gain(baseline_avg, baseline_avg, 0)
print("performance gain: ", baseline_gain)

average participant prediction score:  0.7108422939569895
performance gain:  -0.0


#### False Positive Rate

In [7]:
# Mean false-positive figures for the baseline arm (treatment 0).
# NOTE(review): the recorded averages in this notebook exceed 1, so these
# columns look like per-participant counts rather than rates -- confirm.
fp_baseline_mask = workers['treatment'] == 0
part_false_positive_avg = np.mean(workers.loc[fp_baseline_mask, 'false_positive_participant'])
# Model false positives are stored per group and summed row-wise before averaging.
# NOTE(review): the recorded output is nan for this arm, presumably because the
# model columns are empty for baseline rows -- confirm.
ra_false_positive_total = (
    workers.loc[fp_baseline_mask, 'false_positive_ra_black']
    + workers.loc[fp_baseline_mask, 'false_positive_ra_white']
)
ra_false_positive_avg = np.mean(ra_false_positive_total)
print('average participant false positive rate: ', part_false_positive_avg)
print('average risk assessment model false positive rate: ', ra_false_positive_avg)

average participant false positive rate:  5.408602150537634
average risk assessment model false positive rate:  nan


### Treatment 1: Risk assessment model only (unexplained)
#### Performance

In [8]:
# Rows for treatment 1 (the "risk assessment model only" arm per the heading).
# `ra_treatment` is reused by later comparison cells.
is_ra_only = workers['treatment'] == 1
ra_treatment = workers.loc[is_ra_only]
n_ra_only = len(ra_treatment)
part_ra_avg = sum(ra_treatment['participant_brier_score']) / n_ra_only
ra_avg = sum(ra_treatment['ra_brier_score']) / n_ra_only
print("average participant prediction score: ", part_ra_avg)
print("average risk assessment model prediction score: ", ra_avg)
# Gain is measured against the baseline average computed in the baseline cell.
ra_only_gain = treatment_gain(part_ra_avg, baseline_avg, ra_avg)
print("performance gain ", ra_only_gain)

average participant prediction score:  0.7552974910215053
average risk assessment model prediction score:  0.7907383512795702
performance gain  0.556412901390463


In [9]:
print('significance of performance difference between participants in treatment and baseline')
# Independent-samples t-test (scipy defaults): treatment 1 vs baseline participants.
t, p = stats.ttest_ind(
    ra_treatment['participant_brier_score'].tolist(),
    baseline_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of performance difference between participants in treatment and baseline
t = 4.952181204461114
p = 1.654033953927632e-06


In [10]:
print('significance of performance difference between participants and risk assessment model in this treatment')
# Independent-samples t-test between the two score columns of treatment 1.
# NOTE(review): both samples come from the same rows of `ra_treatment`, so the
# observations look paired; a paired test (stats.ttest_rel) may be the better
# fit -- confirm before changing, since it would alter the reported statistics.
t, p = stats.ttest_ind(
    ra_treatment['participant_brier_score'].tolist(),
    ra_treatment['ra_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of performance difference between participants and risk assessment model in this treatment
t = -6.134892509090908
p = 5.097641234515415e-09


#### False Positive Rate

In [11]:
# Mean false-positive figures for treatment 1.
fp_ra_only_mask = workers['treatment'] == 1
part_false_positive_avg = np.mean(workers.loc[fp_ra_only_mask, 'false_positive_participant'])
# Model false positives are stored per group and summed row-wise before averaging.
ra_false_positive_total = (
    workers.loc[fp_ra_only_mask, 'false_positive_ra_black']
    + workers.loc[fp_ra_only_mask, 'false_positive_ra_white']
)
ra_false_positive_avg = np.mean(ra_false_positive_total)
print('average participant false positive rate: ', part_false_positive_avg)
print('average risk assessment model false positive rate: ', ra_false_positive_avg)

average participant false positive rate:  4.548387096774194
average risk assessment model false positive rate:  3.774193548387097


In [12]:
print('significance of false positive rate difference between participants in treatment and baseline')
# Independent-samples t-test on participant false positives: treatment 1 vs treatment 0.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 1, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 0, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in treatment and baseline
t = -1.4621305912777358
p = 0.14541127040561125


### Treatment 2: Diverse Counterfactual
#### Performance

In [13]:
# Rows for treatment 2 (the "Diverse Counterfactual" arm per the heading).
is_dcf = workers['treatment'] == 2
dcf_treatment = workers.loc[is_dcf]
n_dcf = len(dcf_treatment)
dcf_avg = sum(dcf_treatment['participant_brier_score']) / n_dcf
dcf_ra_avg = sum(dcf_treatment['ra_brier_score']) / n_dcf
print("average participant prediction score: ", dcf_avg)
print("average risk assessment model prediction score: ", dcf_ra_avg)
# Gain is measured against the baseline average computed in the baseline cell.
dcf_gain = treatment_gain(dcf_avg, baseline_avg, dcf_ra_avg)
print("performance gain ", dcf_gain)

average participant prediction score:  0.7498539326179776
average risk assessment model prediction score:  0.7974719100898878
performance gain  0.450326809726831


In [14]:
print('significance of performance difference between participants in treatment and baseline')
# Independent-samples t-test (scipy defaults): treatment 2 vs baseline participants.
t, p = stats.ttest_ind(
    dcf_treatment['participant_brier_score'].tolist(),
    baseline_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of performance difference between participants in treatment and baseline
t = 4.019157106510655
p = 8.576195940785335e-05


In [15]:
print('significance of performance difference between participants in this treatment and risk assessment model only treatment')
# Independent-samples t-test (scipy defaults): treatment 2 vs treatment 1 participants.
t, p = stats.ttest_ind(
    dcf_treatment['participant_brier_score'].tolist(),
    ra_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of performance difference between participants in this treatment and risk assessment model only treatment
t = -0.7325351291591108
p = 0.46479480836807663


In [16]:
print('significance of performance difference between participants and risk assessment model in this treatment')
# Independent-samples t-test between the two score columns of treatment 2.
# NOTE(review): both samples come from the same rows of `dcf_treatment`, so the
# observations look paired; a paired test (stats.ttest_rel) may be the better
# fit -- confirm before changing, since it would alter the reported statistics.
t, p = stats.ttest_ind(
    dcf_treatment['participant_brier_score'].tolist(),
    dcf_treatment['ra_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of performance difference between participants and risk assessment model in this treatment
t = -7.123841474982188
p = 2.5860581750219422e-11


#### False Positive Rate

In [17]:
# Mean false-positive figures for treatment 2.
fp_dcf_mask = workers['treatment'] == 2
part_false_positive_avg = np.mean(workers.loc[fp_dcf_mask, 'false_positive_participant'])
# Model false positives are stored per group and summed row-wise before averaging.
ra_false_positive_total = (
    workers.loc[fp_dcf_mask, 'false_positive_ra_black']
    + workers.loc[fp_dcf_mask, 'false_positive_ra_white']
)
ra_false_positive_avg = np.mean(ra_false_positive_total)
print('average participant false positive rate: ', part_false_positive_avg)
print('average RA false positive rate: ', ra_false_positive_avg)

average participant false positive rate:  5.269662921348314
average RA false positive rate:  3.2134831460674156


In [18]:
print('significance of false positive rate difference between participants in treatment and baseline')
# Independent-samples t-test on participant false positives: treatment 2 vs treatment 0.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 2, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 0, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in treatment and baseline
t = -0.21039128190777795
p = 0.8336002654028125


In [19]:
print('significance of false positive rate difference between participants in this treatment and risk assessment model only treatment')
# Independent-samples t-test on participant false positives: treatment 2 vs treatment 1.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 2, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 1, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in this treatment and risk assessment model only treatment
t = 1.3994348334758961
p = 0.16340359157624668


### Treatment 3: Selective Counterfactual
#### Performance

In [20]:
# Rows for treatment 3 (the "Selective Counterfactual" arm per the heading).
is_scf = workers['treatment'] == 3
scf_treatment = workers.loc[is_scf]
n_scf = len(scf_treatment)
scf_avg = sum(scf_treatment['participant_brier_score']) / n_scf
scf_ra_avg = sum(scf_treatment['ra_brier_score']) / n_scf
print("participant prediction score average ", scf_avg)
print("risk assessment model prediction score average ", scf_ra_avg)
# Gain is measured against the baseline average computed in the baseline cell.
scf_gain = treatment_gain(scf_avg, baseline_avg, scf_ra_avg)
print("performance gain ", scf_gain)

participant prediction score average  0.7432187499895831
risk assessment model prediction score average  0.7964548610833333
performance gain  0.3781741059675701


In [21]:
print('significance of performance difference between participants in treatment and baseline')
# Independent-samples t-test (scipy defaults): treatment 3 vs baseline participants.
t, p = stats.ttest_ind(
    scf_treatment['participant_brier_score'].tolist(),
    baseline_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of performance difference between participants in treatment and baseline
t = 2.9663952574239496
p = 0.0034064770032793856


In [22]:
print('significance of performance difference between participants in this treatment and risk assessment model only treatment')
# Independent-samples t-test (scipy defaults): treatment 3 vs treatment 1 participants.
t, p = stats.ttest_ind(
    scf_treatment['participant_brier_score'].tolist(),
    ra_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of performance difference between participants in this treatment and risk assessment model only treatment
t = -1.3258986373886041
p = 0.1864905941081249


In [23]:
print('significance of performance difference between participants and risk assessment model in this treatment')
# Independent-samples t-test between the two score columns of treatment 3.
# NOTE(review): both samples come from the same rows of `scf_treatment`, so the
# observations look paired; a paired test (stats.ttest_rel) may be the better
# fit -- confirm before changing, since it would alter the reported statistics.
t, p = stats.ttest_ind(
    scf_treatment['participant_brier_score'].tolist(),
    scf_treatment['ra_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of performance difference between participants and risk assessment model in this treatment
t = -6.343535926106329
p = 1.603214359018354e-09


#### False Positive Rates

In [24]:
# Mean false-positive figures for treatment 3.
fp_scf_mask = workers['treatment'] == 3
part_false_positive_avg = np.mean(workers.loc[fp_scf_mask, 'false_positive_participant'])
# Model false positives are stored per group and summed row-wise before averaging.
ra_false_positive_total = (
    workers.loc[fp_scf_mask, 'false_positive_ra_black']
    + workers.loc[fp_scf_mask, 'false_positive_ra_white']
)
ra_false_positive_avg = np.mean(ra_false_positive_total)
print('average participant false positive rate: ', part_false_positive_avg)
print('average risk assessment model false positive rate: ', ra_false_positive_avg)

average participant false positive rate:  4.291666666666667
average risk assessment model false positive rate:  3.2395833333333335


In [25]:
print('significance of false positive rate difference between participants in treatment and baseline')
# Independent-samples t-test on participant false positives: treatment 3 vs treatment 0.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 3, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 0, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in treatment and baseline
t = -1.7866538118944646
p = 0.07561343888338606


In [26]:
print('significance of false positive rate difference between participants in this treatment and risk assessment model only treatment')
# Independent-samples t-test on participant false positives: treatment 3 vs treatment 1.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 3, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 1, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in this treatment and risk assessment model only treatment
t = -0.5320157226035148
p = 0.5953466012283722


### Treatment 4: Complete Feature Attribution
#### Performance

In [27]:
# Rows for treatment 4 (the "Complete Feature Attribution" arm per the heading).
is_cpr = workers['treatment'] == 4
cpr_treatment = workers.loc[is_cpr]
n_cpr = len(cpr_treatment)
cpr_avg = sum(cpr_treatment['participant_brier_score']) / n_cpr
cpr_ra_avg = sum(cpr_treatment['ra_brier_score']) / n_cpr
print("participant prediction score average ", cpr_avg)
print("risk assessment prediction score average ", cpr_ra_avg)
# Gain is measured against the baseline average computed in the baseline cell.
cpr_gain = treatment_gain(cpr_avg, baseline_avg, cpr_ra_avg)
print("performance gain ", cpr_gain)

participant prediction score average  0.7566140350736843
risk assessment prediction score average  0.7958947368631579
performance gain  0.538159041089403


In [28]:
print('significance of peformance difference between participants in treatment and baseline')
# Independent-samples t-test (scipy defaults): treatment 4 vs baseline participants.
t, p = stats.ttest_ind(
    cpr_treatment['participant_brier_score'].tolist(),
    baseline_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of peformance difference between participants in treatment and baseline
t = 4.945501763909773
p = 1.6912520066908718e-06


In [29]:
print('significance of peformance difference between participants in this treatment and risk assessment model only treatment')
# Independent-samples t-test (scipy defaults): treatment 4 vs treatment 1 participants.
t, p = stats.ttest_ind(
    cpr_treatment['participant_brier_score'].tolist(),
    ra_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of peformance difference between participants in this treatment and risk assessment model only treatment
t = 0.18780611120063762
p = 0.8512333698443775


In [30]:
print('significance of peformance difference between participants and risk assessment model in this treatment')
# Independent-samples t-test between the two score columns of treatment 4.
# NOTE(review): both samples come from the same rows of `cpr_treatment`, so the
# observations look paired; a paired test (stats.ttest_rel) may be the better
# fit -- confirm before changing, since it would alter the reported statistics.
t, p = stats.ttest_ind(
    cpr_treatment['participant_brier_score'].tolist(),
    cpr_treatment['ra_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of peformance difference between participants and risk assessment model in this treatment
t = -6.372814874373279
p = 1.3960016989729182e-09


#### False Positive Rates

In [31]:
# Mean false-positive figures for treatment 4.
fp_cpr_mask = workers['treatment'] == 4
part_false_positive_avg = np.mean(workers.loc[fp_cpr_mask, 'false_positive_participant'])
# Model false positives are stored per group and summed row-wise before averaging.
ra_false_positive_total = (
    workers.loc[fp_cpr_mask, 'false_positive_ra_black']
    + workers.loc[fp_cpr_mask, 'false_positive_ra_white']
)
ra_false_positive_avg = np.mean(ra_false_positive_total)
print('average participant false positive rate: ', part_false_positive_avg)
print('average risk assessment model false positive rate: ', ra_false_positive_avg)

average participant false positive rate:  4.105263157894737
average risk assessment model false positive rate:  3.231578947368421


In [32]:
print('significance of false positive rate difference between participants in treatment and baseline')
# Independent-samples t-test on participant false positives: treatment 4 vs treatment 0.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 4, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 0, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in treatment and baseline
t = -2.113075082680535
p = 0.03592811927903101


In [33]:
print('significance of false positive rate difference between participants in this treatment and risk assessment model only treatment')
# Independent-samples t-test on participant false positives: treatment 4 vs treatment 1.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 4, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 1, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in this treatment and risk assessment model only treatment
t = -0.9430390457509231
p = 0.34688421776220957


### Treatment 5: Selective Feature Attribution
#### Performance 

In [34]:
# Rows for treatment 5 (the "Selective Feature Attribution" arm per the heading).
is_spr = workers['treatment'] == 5
spr_treatment = workers.loc[is_spr]
n_spr = len(spr_treatment)
spr_avg = sum(spr_treatment['participant_brier_score']) / n_spr
spr_ra_avg = sum(spr_treatment['ra_brier_score']) / n_spr
print("participant prediction score average: ", spr_avg)
print("risk assessment model prediction score average: ", spr_ra_avg)
# Gain is measured against the baseline average computed in the baseline cell.
spr_gain = treatment_gain(spr_avg, baseline_avg, spr_ra_avg)
print("performance gain: ", spr_gain)

participant prediction score average:  0.7505017920645162
risk assessment model prediction score average:  0.7954372759892476
performance gain:  0.46881620108865957


In [35]:
print('significance of peformance difference between participants in treatment and baseline')
# Independent-samples t-test (scipy defaults): treatment 5 vs baseline participants.
t, p = stats.ttest_ind(
    spr_treatment['participant_brier_score'].tolist(),
    baseline_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of peformance difference between participants in treatment and baseline
t = 3.662455693920708
p = 0.0003264363998128492


In [36]:
print('significance of peformance difference between participants in this treatment and risk assessment model only treatment')
# Independent-samples t-test (scipy defaults): treatment 5 vs treatment 1 participants.
t, p = stats.ttest_ind(
    spr_treatment['participant_brier_score'].tolist(),
    ra_treatment['participant_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of peformance difference between participants in this treatment and risk assessment model only treatment
t = -0.5363333032454781
p = 0.5923760364424032


In [37]:
print('significance of peformance difference between participants and risk assessment model in this treatment')
# Independent-samples t-test between the two score columns of treatment 5.
# NOTE(review): both samples come from the same rows of `spr_treatment`, so the
# observations look paired; a paired test (stats.ttest_rel) may be the better
# fit -- confirm before changing, since it would alter the reported statistics.
t, p = stats.ttest_ind(
    spr_treatment['participant_brier_score'].tolist(),
    spr_treatment['ra_brier_score'].tolist(),
)
print("t =", t)
print("p =", p)

significance of peformance difference between participants and risk assessment model in this treatment
t = -5.417239936006993
p = 1.8771365535670148e-07


#### False Positive Rates

In [38]:
# Mean false-positive figures for treatment 5.
fp_spr_mask = workers['treatment'] == 5
part_false_positive_avg = np.mean(workers.loc[fp_spr_mask, 'false_positive_participant'])
# Model false positives are stored per group and summed row-wise before averaging.
ra_false_positive_total = (
    workers.loc[fp_spr_mask, 'false_positive_ra_black']
    + workers.loc[fp_spr_mask, 'false_positive_ra_white']
)
ra_false_positive_avg = np.mean(ra_false_positive_total)
print('average participant false positive rate: ', part_false_positive_avg)
print('average risk assessment model false positive rate: ', ra_false_positive_avg)

average participant false positive rate:  3.870967741935484
average risk assessment model false positive rate:  3.3225806451612905


In [39]:
print('significance of false positive rate difference between participants in treatment and baseline')
# Independent-samples t-test on participant false positives: treatment 5 vs treatment 0.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 5, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 0, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in treatment and baseline
t = -2.5524834718418834
p = 0.011507876812880645


In [40]:
print('significance of false positive rate difference between participants in this treatment and risk assessment model only treatment')
# Independent-samples t-test on participant false positives: treatment 5 vs treatment 1.
t, p = stats.ttest_ind(
    workers.loc[workers['treatment'] == 5, 'false_positive_participant'].tolist(),
    workers.loc[workers['treatment'] == 1, 'false_positive_participant'].tolist(),
)
print("t =", t)
print("p =", p)

significance of false positive rate difference between participants in this treatment and risk assessment model only treatment
t = -1.5155158579369528
p = 0.1313578300829341
