In [1]:
from pathlib import Path

In [2]:
import ethicml
from ethicml.algorithms.inprocess import GPyT, GPyTDemPar, GPyTEqOdds, LR, SVM, Agarwal, Kamiran, Kamishima, LRCV, ZafarEqOpp
from ethicml.evaluators import evaluate_models, CrossValidator, run_metrics
from ethicml.data import Compas, Adult, load_data
from ethicml.metrics import Accuracy, ProbPos, TPR, TNR, AbsCV
from ethicml.preprocessing import train_test_split
from ethicml.visualisation.plot import plot_mean_std_box

In [3]:
# Boilerplate cell -- safe to skip when reading the notebook.
# `code_dir` is the directory handed to every GP wrapper defined in this cell
# (per the original note, the place where the GP "run.py" lives). Only the
# code directory is configured here; no Python executable is set in this cell.
code_dir = Path("..")
def gp(**model_kwargs):
    """Construct a ``GPyT`` model.

    All keyword arguments are forwarded unchanged to ``GPyT``; the
    notebook-level ``code_dir`` is supplied as its ``code_dir`` argument.
    """
    return GPyT(code_dir=code_dir, **model_kwargs)
def gp_dp(**model_kwargs):
    """Construct a ``GPyTDemPar`` model.

    All keyword arguments are forwarded unchanged to ``GPyTDemPar``; the
    notebook-level ``code_dir`` is supplied as its ``code_dir`` argument.
    """
    return GPyTDemPar(code_dir=code_dir, **model_kwargs)
def gp_eo(**model_kwargs):
    """Construct a ``GPyTEqOdds`` model.

    All keyword arguments are forwarded unchanged to ``GPyTEqOdds``; the
    notebook-level ``code_dir`` is supplied as its ``code_dir`` argument.
    """
    return GPyTEqOdds(code_dir=code_dir, **model_kwargs)

In [4]:
# True-negative-rate (TNR) targets for the equalized-odds GP models.
# Naming scheme: tnr_in_<value of s_as_input>_<attribute>.
#
# Reference values kept from the original notes (presumably measured TNRs of
# the unconstrained models -- TODO confirm):
#   tnr_race_False = 0.724    tnr_race_True = 0.702
#   tnr_sex_True   = 0.724    tnr_sex_False = 0.744
#
# NOTE(review): the two *_sex targets are not referenced by the cells visible
# in this notebook.
tnr_in_true_race, tnr_in_false_race = 0.71, 0.74
tnr_in_true_sex, tnr_in_false_sex = 0.72, 0.77

In [9]:
# Flags shared by all GP models; passed to each one through its `flags` argument.
gp_flags = {
    "epochs": 70,
    "length_scale": 1.2,
    "use_loo": False,
    "iso": False,
}

In [10]:
# Unconstrained GP models: one with `s_as_input=True`, one with `s_as_input=False`.
algos = [
    gp(s_as_input=True, flags=gp_flags),
    gp(s_as_input=False, flags=gp_flags),
]
# Disabled: demographic-parity GP.
# algos += [gp_dp(epochs=70, s_as_input=True)]

# Equalized-odds GP models: for each target TPR, add one model per
# `s_as_input` setting. Both groups (0 and 1) get the same TPR and TNR target.
# NOTE(review): the *_race TNR targets are used here regardless of dataset, so
# they also apply when these models are run on a non-race dataset -- confirm
# this is intended, or build a separate model list using the *_sex targets.
for tpr in (0.6, 0.7, 0.8, 0.9):
    algos.append(
        gp_eo(
            s_as_input=True,
            tnr1=tnr_in_true_race,
            tnr0=tnr_in_true_race,
            tpr0=tpr,
            tpr1=tpr,
            flags=gp_flags,
        )
    )
    algos.append(
        gp_eo(
            s_as_input=False,
            tnr1=tnr_in_false_race,
            tnr0=tnr_in_false_race,
            tpr0=tpr,
            tpr1=tpr,
            flags=gp_flags,
        )
    )

# Non-GP baselines, appended after all GP variants. The commented entries
# are currently disabled.
baselines = [
    LR(),
    SVM(),
    # Agarwal(fairness="EqOd"),
    # Kamiran(),
    # Kamishima(),
    ZafarEqOpp(),
]
algos.extend(baselines)

In [6]:
# Datasets to evaluate on: COMPAS configured with "Race" and with "Sex"
# (presumably the sensitive attribute used for the per-group metrics).
data = [Compas("Race"), Compas("Sex")]
# Disabled: Adult("Race"), Adult("Sex")

In [7]:
# Evaluate every model in `algos` on every dataset in `data`, 5 repeats each
# (2 datasets x 13 models x 5 repeats = the 130 steps in the saved progress bar).
results = evaluate_models(
    datasets=data,
    inprocess_models=algos,
    repeats=5,
    delete_prev=True,  # delete previous results
    # Metrics computed over the whole test set.
    metrics=[Accuracy(), ProbPos(), TPR(), TNR()],
    # Metrics additionally reported per group; the saved output also shows
    # their between-group difference and ratio columns.
    per_sens_metrics=[ProbPos(), TPR(), TNR()],
)
# Last expression of the cell: display the results table.
results

100%|██████████| 130/130 [38:30<00:00, 11.53s/it, model=ZafarEqOpp, τ=5.0, μ=1.2, dataset=Compas Sex, transform=no_transform, repeat=4]                                   


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Accuracy,TNR,TNR_race_0,TNR_race_0-race_1,TNR_race_0/race_1,TNR_race_1,TNR_sex_0,TNR_sex_0-sex_1,TNR_sex_0/sex_1,TNR_sex_1,...,TPR_sex_1,prob_pos,prob_pos_race_0,prob_pos_race_0-race_1,prob_pos_race_0/race_1,prob_pos_race_1,prob_pos_sex_0,prob_pos_sex_0-sex_1,prob_pos_sex_0/sex_1,prob_pos_sex_1
dataset,transform,model,repeat,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Compas Sex,no_transform,GPyT_in_True,0-14460,0.682334,0.703170,,,,,0.884615,0.234058,0.735413,0.650558,...,0.680761,0.453809,,,,,0.224215,0.280236,0.444474,0.504451
Compas Sex,no_transform,GPyT_in_False,0-14460,0.686386,0.749280,,,,,0.826923,0.100157,0.878880,0.726766,...,0.617336,0.405997,,,,,0.278027,0.156197,0.640285,0.434224
Compas Sex,no_transform,GPyT_eq_odds_in_True_0tnr_0.71_1tnr_0.71_0tpr_0.6_1tpr_0.6,0-14460,0.679092,0.693084,,,,,0.756410,0.081689,0.892004,0.674721,...,0.676533,0.461912,,,,,0.336323,0.153291,0.686914,0.489614
Compas Sex,no_transform,GPyT_eq_odds_in_False_0tnr_0.74_1tnr_0.74_0tpr_0.6_1tpr_0.6,0-14460,0.682334,0.746398,,,,,0.826923,0.103875,0.874384,0.723048,...,0.615222,0.405186,,,,,0.269058,0.166154,0.618223,0.435213
Compas Sex,no_transform,GPyT_eq_odds_in_True_0tnr_0.71_1tnr_0.71_0tpr_0.7_1tpr_0.7,0-14460,0.683955,0.704611,,,,,0.788462,0.108164,0.862816,0.680297,...,0.678647,0.453809,,,,,0.300448,0.187188,0.616133,0.487636
Compas Sex,no_transform,GPyT_eq_odds_in_False_0tnr_0.74_1tnr_0.74_0tpr_0.7_1tpr_0.7,0-14460,0.681524,0.670029,,,,,0.743590,0.094891,0.872388,0.648699,...,0.714588,0.490276,,,,,0.349776,0.171490,0.671012,0.521266
Compas Sex,no_transform,GPyT_eq_odds_in_True_0tnr_0.71_1tnr_0.71_0tpr_0.8_1tpr_0.8,0-14460,0.669368,0.623919,,,,,0.685897,0.079949,0.883438,0.605948,...,0.742072,0.529984,,,,,0.408072,0.148803,0.732790,0.556874
Compas Sex,no_transform,GPyT_eq_odds_in_False_0tnr_0.74_1tnr_0.74_0tpr_0.8_1tpr_0.8,0-14460,0.667747,0.623919,,,,,0.705128,0.104756,0.851436,0.600372,...,0.737844,0.528363,,,,,0.394619,0.163245,0.707375,0.557864
Compas Sex,no_transform,GPyT_eq_odds_in_True_0tnr_0.71_1tnr_0.71_0tpr_0.9_1tpr_0.9,0-14460,0.635332,0.512968,,,,,0.557692,0.057692,0.896552,0.500000,...,0.801268,0.620746,,,,,0.529148,0.111802,0.825569,0.640950
Compas Sex,no_transform,GPyT_eq_odds_in_False_0tnr_0.74_1tnr_0.74_0tpr_0.9_1tpr_0.9,0-14460,0.669368,0.618156,,,,,0.698718,0.103922,0.851267,0.594796,...,0.750529,0.536467,,,,,0.399103,0.167662,0.704177,0.566766


In [15]:
# Disabled alternative: reload previously saved results from CSV, rebuilding
# the (dataset, transform, model, repeat) index, instead of using the
# in-memory `results`.
# import pandas as pd
# results = pd.read_csv("results/Compas Race_no_transform.csv").set_index(["dataset", "transform", "model", "repeat"])

# Plot `results` using the Accuracy and TPR metrics. The return value is the
# cell output: a list of (Figure, Axes) pairs -- two pairs in the saved run,
# presumably one per dataset.
plot_mean_std_box(results, Accuracy(), TPR())
# Disabled variant: select the columns by name (here a per-group TPR ratio)
# and keep the returned list so that a single figure can be displayed.
# figs_plots = plot_mean_std_box(results, "Accuracy", "TPR_race_0/race_1")
# figs_plots[0][0]

[(<Figure size 1800x1200 with 1 Axes>,
  <matplotlib.axes._subplots.AxesSubplot at 0x7f82cc336630>),
 (<Figure size 1800x1200 with 1 Axes>,
  <matplotlib.axes._subplots.AxesSubplot at 0x7f82cc269518>)]

In [10]:
# Load the Adult dataset and split it into two parts, using the default
# arguments of `train_test_split`.
# NOTE(review): only `train` is used by the cells visible after this one.
train, test = train_test_split(load_data(Adult()))

In [None]:
# Fair grid search: 5-fold cross-validation of LR over a grid of `C` values,
# scoring every candidate with both an accuracy and a fairness measure.
hyperparams = {"C": [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]}
primary = Accuracy()
fair_measure = AbsCV()

lr_cv = CrossValidator(LR, hyperparams, folds=5)
lr_cv.run(train, measures=[primary, fair_measure])

# Cell output: the best hyper-parameters according to the primary measure.
lr_cv.best_hyper_params(primary)

In [None]:
# Select two configurations from the cross-validation results:
#   * the best one according to the primary measure, and
#   * the one returned by `get_best_in_top_k` (presumably the best
#     `fair_measure` among the top 3 by `primary` -- TODO confirm).
lr_best_acc = lr_cv.results.get_best_result(primary)
lr_best_fair = lr_cv.results.get_best_in_top_k(
    primary,
    fair_measure,
    top_k=3,
)

# Report both selections.
print("best accuracy:", lr_best_acc)
print("best fair(+accuracy):", lr_best_fair)

In [None]:
# Compare two logistic-regression models on Adult:
#   * LR built from the hyper-parameters of `lr_best_fair`, and
#   * LRCV with its default arguments.
# NOTE(review): this rebinds `results`, the same name the earlier
# evaluate_models cell assigns, so the previous results table is replaced.
lr_best_acc_model = LRCV()
results = evaluate_models(
    datasets=[Adult()],
    inprocess_models=[
        LR(**lr_best_fair.params),
        lr_best_acc_model,
    ],
    repeats=3,
    delete_prev=True,  # delete previous results
    metrics=[Accuracy(), ProbPos(), TPR(), TNR()],
    per_sens_metrics=[ProbPos(), TPR(), TNR()],
)

In [None]:
# Plot the current `results` using the Accuracy and ProbPos metrics; the
# call's return value is displayed as the cell output.
plot_mean_std_box(results, Accuracy(), ProbPos())