In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import random
import warnings
import numpy as np
import itertools as iter
from copy import deepcopy
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

# silent warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=FutureWarning)
rng = np.random.default_rng(1234)
random.seed()

from utils import *
from algorithms import *

### Fetch dataset - either Adult Income or German Credit

In [2]:
## Specify dataset - choose either "adult" or "german" - and protected attribute - "race" or "gender" 
## For German, we only use "gender" as protected attribute

dataset = "adult"
protected_attribute = "race"

In [3]:
if dataset == "adult":
    dataset_all, PROT_GRP_INDEX = get_adult_dataset(protected_attribute)
elif dataset == "german":
    dataset_all, PROT_GRP_INDEX = get_german_dataset()    

### Specify parameters for the algorithms

In [4]:
# FDR constraint parameter
ALPHA = 0.15

# initial exploit FDR parameter
EXPLOIT_ALPHA = 0.075

# initial explore FDR parameter
EXPLORE_ALPHA = ALPHA - EXPLOIT_ALPHA

# total number of online iterations
ITERS = 40

# total number of repetitions using random dataset splits
reps = 50

# whether results for each iteration should be printed or not
verbose = False

In [5]:
X, y = dataset_all[0], dataset_all[1]
# X = np.hstack((X, np.ones((len(X),1))))

target_clf = get_rev_max_classifier([X, y], np.ones(len(dataset_all[0][1])), ALPHA, True, PROT_GRP_INDEX, C=0)


### Execute our algorithm and baselines using above parameters

In [138]:
def execute_explore_exploit_all_variants(data_l, data_s, initial_clf_cons, PROT_GRP_INDEX):
    sample_weights = np.ones(len(data_s[0][1]))
    
    ## Our explore-exploit algorithm
    exp_exp = Exp_Exp_Algorithm(exploit_eps = EXPLOIT_ALPHA, explore_eps = EXPLORE_ALPHA,
                               data_l = data_l,
                               clf_f = initial_clf_cons)
    
    results = [[] for _ in range(7)]

    explore_eps = EXPLORE_ALPHA
    exploit_eps = EXPLOIT_ALPHA
    fdr_exp = []

    def execute_variant(exp_exp, data_l, data_s, results, i, exploit_eps, explore_eps, PROT_GRP_INDEX, exploit_fair, explore_fair):
        if verbose:
            print ("-------------")
            print ("Executing variant with exploit_fairness = " + str(exploit_fair) + " and exploire_fairness = " + str(explore_fair))
            
        for t in (range(1,ITERS)):
            a = exp_exp
            X_t = data_s[t][0]
            y_t = data_s[t][1]

            preds, weights, explore_indices = a._predict(X_t, y_t, PROT_GRP_INDEX, exploit_eps, explore_eps, exploit_fair=exploit_fair, explore_fair=explore_fair)

            a._update_datasets(preds, data_s[t], weights)
            stats = get_stats(preds, data_s[t], PROT_GRP_INDEX)
            fdr_inc = max(EXPLOIT_ALPHA + EXPLORE_ALPHA - stats['fdr'], 0)
            exploit_eps = EXPLOIT_ALPHA * (t**0.2) # +fdr_inc
            explore_eps = max(0, ALPHA - exploit_eps)

            results[i].append(get_stats(preds, data_s[t], PROT_GRP_INDEX))        
            if verbose:
                print(f"Iteration {t}")
                print ("FDR", results[i][-1]["fdr"])
                print ("Revenue", results[i][-1]["revenue"])
                print ("Statistical rate", results[i][-1]["stat_rate"])
                print ("Group-0 TPR", results[i][-1]["tp-0"])
                print ("Group-1 TPR", results[i][-1]["tp-1"], "\n")
                
    
    ## No fairness constraint
    execute_variant(exp_exp, data_l, data_s, results, 0, exploit_eps, explore_eps, PROT_GRP_INDEX, False, False)
    
    ## Only exploit fairness constraint
    execute_variant(exp_exp, data_l, data_s, results, 1, exploit_eps, explore_eps, PROT_GRP_INDEX, True, False)
    
    ## Only explore fairness constraint
    execute_variant(exp_exp, data_l, data_s, results, 2, exploit_eps, explore_eps, PROT_GRP_INDEX, False, True)
    
    ## Both explore and exploit fairness constraint
    execute_variant(exp_exp, data_l, data_s, results, 3, exploit_eps, explore_eps, PROT_GRP_INDEX, True, True)
    
    ## Baselines
    ## Only-exploit algorithm with no exploration - FAIR_CLF
    no_exp_fair = No_Exploration_Fair_Algorithm(eps = ALPHA,\
                               data_l = data_l,
                               clf_f = initial_clf_cons)


    ## OPT-OFFLINE baseline
#     target_clf = get_rev_max_classifier(data_s[0], sample_weights, ALPHA+0.01, True, PROT_GRP_INDEX, C=0)
    target = Target(eps = ALPHA,\
                               data_l = data_l,
                               clf_f = target_clf)
        
    def execute_baseline(algo, data_l, data_s, results, i, PROT_GRP_INDEX):
        if verbose:
            print ("-------------")
            print ("Executing baseline", algo._name)
        for t in (range(1,ITERS)):
            a = algo
            X_t = data_s[t][0]
            y_t = data_s[t][1]

            preds, weights, frac_exp = a._predict(X = X_t, eps = ALPHA, PROT_GRP_INDEX = PROT_GRP_INDEX)
            a._update_datasets(preds, data_s[t], weights)

            results[i].append(get_stats(preds, data_s[t], PROT_GRP_INDEX))        

            if verbose:
                print(f"Iteration {t}")
                print ("FDR", results[i][-1]["fdr"])
                print ("Revenue", results[i][-1]["revenue"])
                print ("Statistical rate", results[i][-1]["stat_rate"])
                print ("Group-0 TPR", results[i][-1]["tp-0"])
                print ("Group-1 TPR", results[i][-1]["tp-1"], "\n")


    ## OPT-OFFLINE
    execute_baseline(target, data_l, data_s, results, 4, PROT_GRP_INDEX)
    
    ## FAIR-CLF
#     execute_baseline(no_exp_fair, data_l, data_s, results, 5, PROT_GRP_INDEX)
        
    return results

In [119]:
results_all = []

for r in (range(reps)):
    print ("rep", r)
    
    if dataset == "adult":
        data_l, data_s, initial_clf = get_adult_data_splits_and_initial_clf(dataset_all, ITERS)        
    elif dataset == "german":
        data_l, data_s, initial_clf = get_german_data_splits_and_initial_clf(dataset_all, ITERS)        

    results = execute_explore_exploit_all_variants(data_l, data_s, initial_clf, PROT_GRP_INDEX)
    results_all.append(list(results))
    

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
rep 15
rep 16
rep 17
rep 18
rep 19
rep 20
rep 21
rep 22
rep 23
rep 24
rep 25
rep 26
rep 27
rep 28
rep 29
rep 30
rep 31
rep 32
rep 33
rep 34
rep 35
rep 36
rep 37
rep 38
rep 39
rep 40
rep 41
rep 42
rep 43
rep 44
rep 45
rep 46
rep 47
rep 48
rep 49


In [139]:
results_all_2 = []

for r in (range(reps)):
    print ("rep", r)
    
    if dataset == "adult":
        data_l, data_s, initial_clf = get_adult_data_splits_and_initial_clf(dataset_all, ITERS)        
    elif dataset == "german":
        data_l, data_s, initial_clf = get_german_data_splits_and_initial_clf(dataset_all, ITERS)        

    results = execute_explore_exploit_all_variants(data_l, data_s, initial_clf, PROT_GRP_INDEX)
    results_all_2.append(list(results))
    

rep 0
rep 1
rep 2
rep 3
rep 4
rep 5
rep 6
rep 7
rep 8
rep 9
rep 10
rep 11
rep 12
rep 13
rep 14
rep 15
rep 16
rep 17
rep 18
rep 19
rep 20
rep 21
rep 22
rep 23
rep 24
rep 25
rep 26
rep 27
rep 28
rep 29
rep 30
rep 31
rep 32
rep 33
rep 34
rep 35
rep 36
rep 37
rep 38
rep 39
rep 40
rep 41
rep 42
rep 43
rep 44
rep 45
rep 46
rep 47
rep 48
rep 49


### Plot results for all variants of our algorithm

In [None]:

legend = ["Algorithm 1 w/ no fairness constraints", "Algorithm 1 w/ exploit fairness", "Algorithm 1 w/ explore fairness", "Algorithm 1 w/ both exploit & explore fairness"]

ITERS = 40
plt.figure(figsize=(21,3))

plt.subplot(131)
def plot_y(dat, k, key, kl):
    ys = [np.mean([dat[i][k][j][key] for i in range(len(dat))]) for j in range(ITERS-1)]
    err = [np.std([dat[i][k][j][key] for i in range(len(dat))]) for j in range(ITERS-1)]
    plt.errorbar(range(1, ITERS), ys, yerr=err, fmt="-o", label=legend[kl])

key = "revenue"
plot_y(results_all, 0, key, 0)
plot_y(results_all, 1, key, 1)
plot_y(results_all, 2, key, 2)
plot_y(results_all, 3, key, 3)
plot_y(results_all_2, 4, key, 3)
    
plt.xlabel("Iteration", fontsize=15)
plt.ylabel("TPR for Black Applicants", fontsize=14)
# plt.ylim([0, 0.6])

plt.subplot(132)
key = "fdr"
plot_y(results_all, 0, key, 0)
plot_y(results_all, 1, key, 1)
plot_y(results_all, 2, key, 2)
plot_y(results_all, 3, key, 3)
plot_y(results_all, 4, key, 3)
    
plt.xlabel("Iteration", fontsize=15)
plt.ylabel("TPR for White Applicants", fontsize=14)
# plt.ylim([0, 0.6])


plt.subplot(133)
def get_sr(dat):
    return abs(dat['stat_rate'])

def plot_sr(dat, k, kl):
    ys = [np.mean([get_sr(dat[i][k][j])  for i in range(len(results_all))]) for j in range(ITERS-1)]
    err = [np.std([get_sr(dat[i][k][j]) for i in range(len(results_all))]) for j in range(ITERS-1)]
    plt.errorbar(range(1, ITERS), ys, yerr=err, fmt="-o", label=legend[kl])

plot_sr(results_all, 0, 0)
plot_sr(results_all, 1, 1)
plot_sr(results_all, 2, 2)
plot_sr(results_all, 3, 3)

plt.xlabel("Iteration", fontsize=15)
plt.ylabel("Acceptance rate\n disparity across groups", fontsize=14)
# plt.ylim([0, 0.16])

plt.legend(ncol=4, bbox_to_anchor=(1.05, 1.25), fontsize=14, columnspacing=0.8)


In [18]:
# [np.mean([data_all[i][j][0]['revenue'] for i in range(len(data_all))]) for j in range(ITERS-1)]
# data_all[0][0][0], sum(data_s[0][0][:,PROT_GRP_INDEX])


def pres(k, lab, keys):
    res = lab + " "
    for key in keys:
        if key == "total_loans":
            ms = np.mean([[abs(results_all[i][k][j][key])/len(data_s[0][1]) for i in range(len(results_all))] for j in range(ITERS-1)])
            ss = np.std([[abs(results_all[i][k][j][key])/len(data_s[0][1]) for i in range(len(results_all))] for j in range(ITERS-1)])            
            res += " & " + str(np.round(ms, 2)) + " (" + str(np.round(ss, 2)) + ") "
            
        elif key == "revenue":
            ms = np.mean([[abs(results_all[i][k][j][key])/1000 for i in range(len(results_all))] for j in range(ITERS-1)])
            ss = np.std([[abs(results_all[i][k][j][key])/1000 for i in range(len(results_all))] for j in range(ITERS-1)])                        
            res += " & " + str(np.round(ms, 1)) + " (" + str(np.round(ss, 1)) + ") "
            
        else:
            ms = np.mean([[abs(results_all[i][k][j][key]) for i in range(len(results_all))] for j in range(ITERS-1)])
            ss = np.std([[abs(results_all[i][k][j][key]) for i in range(len(results_all))] for j in range(ITERS-1)])
            res += " & " + str(np.round(ms, 2)) + " (" + str(np.round(ss, 2)) + ") "
        
    res += "\\\ "
    print (res)
    
keys = ["revenue", "fdr", "stat_rate", "tp_rate"]
labs = ["Algorithm~\\ref{alg:main_algorithm} - no fairness constraint", 
        "Algorithm~\\ref{alg:main_algorithm} - only exploit fairness", 
        "Algorithm~\\ref{alg:main_algorithm} - only explore fairness", 
        "Algorithm~\\ref{alg:main_algorithm} - both fairness constraints",
        "Baseline - \\textsc{Opt-offline}"] 
#         "Baseline - \\textsc{Fair-clf}"]

for i, lab in enumerate(labs):
    pres(i, lab, keys) 


Algorithm~\ref{alg:main_algorithm} - no fairness constraint  & 70.6 (13.1)  & 0.15 (0.02)  & 0.02 (0.02)  & 0.09 (0.06) \\ 
Algorithm~\ref{alg:main_algorithm} - only exploit fairness  & 71.9 (12.9)  & 0.15 (0.02)  & 0.02 (0.01)  & 0.08 (0.05) \\ 
Algorithm~\ref{alg:main_algorithm} - only explore fairness  & 73.8 (13.0)  & 0.14 (0.02)  & 0.03 (0.02)  & 0.07 (0.05) \\ 
Algorithm~\ref{alg:main_algorithm} - both fairness constraints  & 73.1 (13.0)  & 0.14 (0.02)  & 0.01 (0.01)  & 0.06 (0.04) \\ 
Baseline - \textsc{Opt-offline}  & 72.2 (10.5)  & 0.16 (0.02)  & 0.05 (0.03)  & 0.16 (0.1) \\ 


In [None]:
legend = ["Algorithm 1 w/ no fairness constraints", "Algorithm 1 w/ exploit fairness", "Algorithm 1 w/ explore fairness", "Algorithm 1 w/ both exploit & explore fairness", "OPT-OFFLINE"]

ITERS = 40
plt.figure(figsize=(21,3))

plt.subplot(131)
def plot_y(dat, k, key, kl):
    ys = [np.mean([(dat[i][k][j][key]) for i in range(len(dat))]) for j in range(ITERS-1)]
    err = [np.std([(dat[i][k][j][key]) for i in range(len(dat))]) for j in range(ITERS-1)]
    plt.errorbar(range(2, ITERS), ys[1:], yerr=err[1:], fmt="-o", label=legend[kl])

key = "fdr"
plot_y(results_all, 0, key, 0)
plot_y(results_all, 1, key, 1)
plot_y(results_all, 2, key, 2)
plot_y(results_all, 3, key, 3)
plot_y(results_all_2, 4, key, 4)
plt.legend(ncol=1, bbox_to_anchor=(1.05, 1.75), fontsize=14, columnspacing=0.8)
plt.xlabel("Iterations", fontsize=15)
plt.ylabel("Log revenue", fontsize=15)


In [6]:
# results_all