In [41]:
import pandas as pd
import json
import os
import deepsig
from IPython.display import display

In [42]:
cols = ['dataset', 'method', 'fitness_rule', 'fitness', 'ACC', 'MCC', 'f1_score', 'avg_odds_diff', 'stat_par_diff', 'eq_opp_diff']

In [43]:
mlp_baseline_results = pd.read_csv('mlp_baseline_results.csv')
mlp_baseline_results.replace({'simple_mlp_initializer': 'Standard MLP (baseline)'}, inplace=True)

mlp_pearson_results = pd.read_csv('mlp_pearson_results.csv')
mlp_pearson_results.replace({'mlp_preg_initializer': 'Pearson regularized MLP'}, inplace=True)

mlp_spearman_results = pd.read_csv('mlp_spearman_results.csv')
mlp_spearman_results.replace({'mlp_sreg_initializer': 'Spearman regularized MLP'}, inplace=True)

mlp_auto_reg_results = pd.read_csv('mlp_auto_reg_results.csv')
mlp_auto_reg_results.replace({'mlp_auto_reg_initializer': 'Auto corr regularized MLP'}, inplace=True)

mlp_results = pd.concat([mlp_baseline_results, mlp_pearson_results, mlp_spearman_results, mlp_auto_reg_results])

ftl_baseline_results = pd.read_csv('ftl_baseline_results.csv')
ftl_baseline_results.replace({'ftl_mlp_initializer': 'Standard FTL (baseline)'}, inplace=True)

ftl_pearson_results = pd.read_csv('ftl_pearson_results.csv')
ftl_pearson_results.replace({'ftl_mlp_preg_initializer': 'Pearson regularized FTL'}, inplace=True)

ftl_spearman_results = pd.read_csv('ftl_spearman_results.csv')
ftl_spearman_results.replace({'ftl_mlp_sreg_initializer': 'Spearman regularized FTL'}, inplace=True)

ftl_auto_reg_results = pd.read_csv('ftl_spearman_results.csv')
ftl_auto_reg_results.replace({'ftl_mlp_auto_reg_initializer': 'Auto corr regularized FTL'}, inplace=True)

ftl_results = pd.concat([ftl_baseline_results, ftl_pearson_results, ftl_spearman_results, ftl_auto_reg_results])

In [44]:
for results in [mlp_results,ftl_results]:
    results.replace({'adult_dataset_reader': 'Adult Income', 'compas_dataset_reader': 'Compas Recidivism', 'german_dataset_reader': 'German Credit', 'bank_dataset_reader': 'Bank Marketing'}, inplace=True)
    results.rename(columns={'avg_odds_diff': 'Equalized Odds', 'stat_par_diff': 'Statistical Parity', 'eq_opp_diff': 'Equal Opportunity', 'MCC': 'Mathew Correlation', 'ACC': 'Accuracy'}, inplace=True)

In [45]:
fitness_rules_target_metrics = {
    'mcc_parity': {'performance': 'Mathew Correlation', 'fairness': 'Statistical Parity'},
    'mcc_opportunity': {'performance': 'Mathew Correlation', 'fairness': 'Equal Opportunity'},
    'mcc_odds': {'performance': 'Mathew Correlation', 'fairness': 'Equalized Odds'},
    'acc_parity': {'performance': 'Accuracy', 'fairness': 'Statistical Parity'},
    'acc_opportunity': {'performance': 'Accuracy', 'fairness': 'Equal Opportunity'},
    'acc_odds': {'performance': 'Accuracy', 'fairness': 'Equalized Odds'}
}

fitness_rules_target_metrics = {
    'mcc_parity': ('Mathew Correlation', 'Statistical Parity'),
    'mcc_opportunity': ('Mathew Correlation', 'Equal Opportunity'),
    'mcc_odds': ('Mathew Correlation', 'Equalized Odds'),
    'acc_parity': ('Accuracy', 'Statistical Parity'),
    'acc_opportunity': ('Accuracy', 'Equal Opportunity'),
    'acc_odds': ('Accuracy', 'Equalized Odds')
}
fitness_rules_abvr = {
    'mcc_parity': 'Max(MCC - Stat. Parity)',
    'mcc_opportunity': 'Max(MCC - Eq. Odds)',
    'mcc_odds': 'Max(MCC - Eq. Opp.)',
    'acc_parity': 'Max(Acc - Stat. Parity)',
    'acc_opportunity': 'Max(Acc - Eq. Odds)',
    'acc_odds': 'Max(Acc - Eq. Opp.)'
}

for results in [mlp_results,ftl_results]:
    results['Performance'] = 0
    results['Fairness'] = 0
    results['Fitness Rule'] = ''
    for fitness_rule, (performance_metric, fairness_metric) in fitness_rules_target_metrics.items():
        results.loc[results.fitness_rule == fitness_rule,'Performance'] = results.loc[results.fitness_rule == fitness_rule,performance_metric]
        results.loc[results.fitness_rule == fitness_rule,'Fairness'] = results.loc[results.fitness_rule == fitness_rule,fairness_metric]
        results.loc[results.fitness_rule == fitness_rule,'Fitness Rule Abvr'] = fitness_rules_abvr[fitness_rule]
        results.loc[results.fitness_rule == fitness_rule,'Fitness Rule'] = 'Max(%s - %s)' % fitness_rules_target_metrics[fitness_rule]

 0.58075151 0.57811729 0.57147862 0.55847176 0.56402881 0.58191431
 0.58343253 0.57740222 0.51579307 0.30958237 0.37962947 0.55899189
 0.53558872 0.2598879  0.57717035 0.50086739 0.33275221 0.37929293
 0.47729109 0.29196869 0.29037704 0.52303609 0.27480633 0.38668781
 0.50626951 0.29960979 0.4042848  0.52119218 0.2936422  0.33522388
 0.49730721 0.25112662 0.35119067 0.510052   0.26843855 0.36081008
 0.52157495 0.30114722 0.42995147 0.48267704 0.29056691 0.23139881
 0.54223416 0.23653437 0.27041017 0.52121263 0.25759348 0.23912165
 0.5223892  0.27406652 0.22303564 0.54094974 0.27699558 0.30939251
 0.51731345 0.30339828 0.37825089 0.52113734 0.26323857 0.33963196
 0.5538225  0.27850228 0.30234984 0.51877126 0.29494101 0.2648939
 0.52742877 0.24492927 0.13407928 0.53530394 0.29087766 0.26820576
 0.55204731 0.29318377 0.14015216 0.51983069 0.3145689  0.47077449
 0.50770986 0.28880123 0.29866465 0.5180664  0.29093567 0.3636368
 0.53289704 0.29737768 0.38084556 0.54328387 0.28002914 0.282325

In [46]:
datasets = ['Adult Income', 'Bank Marketing', 'Compas Recidivism','German Credit']
datasets

['Adult Income', 'Bank Marketing', 'Compas Recidivism', 'German Credit']

In [47]:
fitness_rules = ['mcc_parity', 'mcc_opportunity', 'mcc_odds', 'acc_parity', 'acc_opportunity', 'acc_odds']
fitness_rules

['mcc_parity',
 'mcc_opportunity',
 'mcc_odds',
 'acc_parity',
 'acc_opportunity',
 'acc_odds']

In [48]:
ftl_methods = ['Standard FTL (baseline)', 'Pearson regularized FTL', 'Spearman regularized FTL', 'Auto corr regularized FTL']
mlp_methods = ['Standard MLP (baseline)', 'Pearson regularized MLP', 'Spearman regularized MLP', 'Auto corr regularized MLP']
significances = []
grouped_results_list = []

In [49]:
for path, methods, results in zip(['mlp_multi_aso_data_list.json', 'ftl_multi_aso_data_list.json'],
                                  [mlp_methods, ftl_methods],
                                  (mlp_results,ftl_results)):
    if os.path.exists(path):
        with open(path) as file:
            multi_aso_data_list = json.load(file)
    else:    
        multi_aso_data_list = []
        for d in datasets:
            multi_aso_data = []
            for f in fitness_rules:
                methods_results = []
                for m in methods:
                    r = results.loc[ (results['dataset'] == d) &
                                         (results['fitness_rule'] == f) &
                                         (results['method'] == m) ]\
                                .fitness.tolist()
                    if len(r) == 0:
                        r = [-1]
                    methods_results.append(r)
                min_eps = deepsig.multi_aso(methods_results, confidence_level=0.95)
                multi_aso_data_list.append({'fitness_rule': f, 'dataset': d, 'min_eps': min_eps.tolist()})
        with open(path, 'w') as file:
            json.dump(multi_aso_data_list, file)
    
    aso_df_resume = []
    for aso_result in sorted(multi_aso_data_list, key=lambda x: x['dataset']):
        fitness_rule = aso_result['fitness_rule']
        dataset = aso_result['dataset']
    
        aso_df = pd.DataFrame(aso_result['min_eps'], columns=methods)
        aso_df['method'] = methods
        aso_df['dataset'] = dataset
        aso_df['fitness_rule'] = fitness_rule
        aso_df_resume.append(aso_df)
    
    print('Significance Testing')
    significance = pd.concat(aso_df_resume)
    significance.replace(fitness_rules_abvr, inplace=True)
    significance = significance.set_index(['fitness_rule', 'dataset'])
    significance = significance.sort_values(by=['fitness_rule', 'dataset'], ascending=[False, True])
    significances.append(significance)
    
    grouped_results = results\
        .groupby(['Fitness Rule Abvr', 'dataset', 'method'])\
        .agg({'fitness': ['mean', 'std', 'count'], 'Performance': ['mean', 'std'], 'Fairness': ['mean', 'std']})\
        .sort_values(by=['Fitness Rule Abvr', 'dataset', ('fitness','mean')], ascending=False)
    grouped_results['formatted_fitness'] = grouped_results.apply(lambda row: f"${row[('fitness', 'mean')]:.3f} (\pm{row[('fitness', 'std')]:.2f})$", axis=1)
    grouped_results['formatted_performance'] = grouped_results.apply(lambda row: f"${row[('Performance', 'mean')]:.3f} (\pm{row[('Performance', 'std')]:.2f})$", axis=1)
    grouped_results['formatted_fairness'] = grouped_results.apply(lambda row: f"${row[('Fairness', 'mean')]:.3f} (\pm{row[('Fairness', 'std')]:.2f})$", axis=1)
    grouped_results = grouped_results.sort_values(by=['Fitness Rule Abvr', 'dataset'])
    grouped_results_list.append(grouped_results)

Significance Testing


Model comparisons: 100%|█████████▉| 5994/6000 [00:13<00:00, 439.34it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:13<00:00, 441.48it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:13<00:00, 438.22it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:13<00:00, 429.04it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:14<00:00, 424.87it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:13<00:00, 442.72it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:12<00:00, 465.43it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:12<00:00, 466.83it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:12<00:00, 469.72it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:12<00:00, 468.91it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:12<00:00, 461.86it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:12<00:00, 469.01it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:12<00:00, 464.18it/s]
Model comparisons: 100%|█████████▉| 5994/6000 [00:12<00:00, 473.

Significance Testing





In [50]:
significances[0].sort_values(by=['fitness_rule', 'dataset'], ascending=[False, True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Standard MLP (baseline),Pearson regularized MLP,Spearman regularized MLP,Auto corr regularized MLP,method
fitness_rule,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Max(MCC - Stat. Parity),Adult Income,1.000000,1.000000,0.921788,1.000000,Standard MLP (baseline)
Max(MCC - Stat. Parity),Adult Income,0.523903,1.000000,0.560019,1.000000,Pearson regularized MLP
Max(MCC - Stat. Parity),Adult Income,1.000000,1.000000,1.000000,1.000000,Spearman regularized MLP
Max(MCC - Stat. Parity),Adult Income,0.232197,0.705617,0.386660,1.000000,Auto corr regularized MLP
Max(MCC - Stat. Parity),Bank Marketing,1.000000,1.000000,1.000000,0.651367,Standard MLP (baseline)
...,...,...,...,...,...,...
Max(Acc - Eq. Odds),Compas Recidivism,1.000000,1.000000,1.000000,1.000000,Auto corr regularized MLP
Max(Acc - Eq. Odds),German Credit,1.000000,1.000000,1.000000,0.990951,Standard MLP (baseline)
Max(Acc - Eq. Odds),German Credit,0.197113,1.000000,0.232851,1.000000,Pearson regularized MLP
Max(Acc - Eq. Odds),German Credit,0.640273,1.000000,1.000000,0.999065,Spearman regularized MLP


In [51]:
grouped_results_list[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fitness,fitness,fitness,Performance,Performance,Fairness,Fairness,formatted_fitness,formatted_performance,formatted_fairness
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,count,mean,std,mean,std,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Fitness Rule Abvr,dataset,method,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Max(Acc - Eq. Odds),Adult Income,Auto corr regularized MLP,0.765534,0.022879,6,0.851446,0.002568,0.085913,0.022909,$0.766 (\pm0.02)$,$0.851 (\pm0.00)$,$0.086 (\pm0.02)$
Max(Acc - Eq. Odds),Adult Income,Spearman regularized MLP,0.761024,0.041571,6,0.847835,0.005097,0.086811,0.038167,$0.761 (\pm0.04)$,$0.848 (\pm0.01)$,$0.087 (\pm0.04)$
Max(Acc - Eq. Odds),Adult Income,Standard MLP (baseline),0.758290,0.035161,15,0.847864,0.004669,0.089574,0.035575,$0.758 (\pm0.04)$,$0.848 (\pm0.00)$,$0.090 (\pm0.04)$
Max(Acc - Eq. Odds),Adult Income,Pearson regularized MLP,0.741188,0.045045,6,0.850341,0.002727,0.109153,0.046080,$0.741 (\pm0.05)$,$0.850 (\pm0.00)$,$0.109 (\pm0.05)$
Max(Acc - Eq. Odds),Bank Marketing,Standard MLP (baseline),0.803182,0.070894,30,0.902154,0.002618,0.098971,0.070290,$0.803 (\pm0.07)$,$0.902 (\pm0.00)$,$0.099 (\pm0.07)$
...,...,...,...,...,...,...,...,...,...,...,...,...
Max(MCC - Stat. Parity),Compas Recidivism,Spearman regularized MLP,0.067990,0.069200,6,0.285655,0.022689,0.217666,0.054432,$0.068 (\pm0.07)$,$0.286 (\pm0.02)$,$0.218 (\pm0.05)$
Max(MCC - Stat. Parity),German Credit,Auto corr regularized MLP,0.288807,0.078940,6,0.386198,0.032812,0.097391,0.069361,$0.289 (\pm0.08)$,$0.386 (\pm0.03)$,$0.097 (\pm0.07)$
Max(MCC - Stat. Parity),German Credit,Standard MLP (baseline),0.265661,0.099801,30,0.329468,0.090834,0.063807,0.046639,$0.266 (\pm0.10)$,$0.329 (\pm0.09)$,$0.064 (\pm0.05)$
Max(MCC - Stat. Parity),German Credit,Pearson regularized MLP,0.262448,0.030490,6,0.345602,0.041158,0.083154,0.045541,$0.262 (\pm0.03)$,$0.346 (\pm0.04)$,$0.083 (\pm0.05)$


In [52]:
significances[1].sort_values(by=['fitness_rule', 'dataset'], ascending=[False, True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Standard FTL (baseline),Pearson regularized FTL,Spearman regularized FTL,Auto corr regularized FTL,method
fitness_rule,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Max(MCC - Stat. Parity),Adult Income,1.000000,1.000000,0.346959,0.0,Standard FTL (baseline)
Max(MCC - Stat. Parity),Adult Income,0.550914,1.000000,0.223979,0.0,Pearson regularized FTL
Max(MCC - Stat. Parity),Adult Income,1.000000,1.000000,1.000000,0.0,Spearman regularized FTL
Max(MCC - Stat. Parity),Adult Income,1.000000,1.000000,1.000000,1.0,Auto corr regularized FTL
Max(MCC - Stat. Parity),Bank Marketing,1.000000,1.000000,1.000000,0.0,Standard FTL (baseline)
...,...,...,...,...,...,...
Max(Acc - Eq. Odds),Compas Recidivism,1.000000,1.000000,1.000000,1.0,Auto corr regularized FTL
Max(Acc - Eq. Odds),German Credit,1.000000,0.795869,0.000000,0.0,Standard FTL (baseline)
Max(Acc - Eq. Odds),German Credit,1.000000,1.000000,0.000000,0.0,Pearson regularized FTL
Max(Acc - Eq. Odds),German Credit,1.000000,1.000000,1.000000,0.0,Spearman regularized FTL


In [53]:
grouped_results_list[1]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fitness,fitness,fitness,Performance,Performance,Fairness,Fairness,formatted_fitness,formatted_performance,formatted_fairness
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,count,mean,std,mean,std,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Fitness Rule Abvr,dataset,method,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Max(Acc - Eq. Odds),Adult Income,Pearson regularized FTL,0.816948,0.023932,17,0.844002,0.008850,0.027055,0.022190,$0.817 (\pm0.02)$,$0.844 (\pm0.01)$,$0.027 (\pm0.02)$
Max(Acc - Eq. Odds),Adult Income,Standard FTL (baseline),0.815099,0.022059,14,0.847485,0.003112,0.032386,0.020821,$0.815 (\pm0.02)$,$0.847 (\pm0.00)$,$0.032 (\pm0.02)$
Max(Acc - Eq. Odds),Adult Income,Spearman regularized FTL,0.796648,0.036947,13,0.843398,0.003089,0.046751,0.037872,$0.797 (\pm0.04)$,$0.843 (\pm0.00)$,$0.047 (\pm0.04)$
Max(Acc - Eq. Odds),Adult Income,ftl_mlp_sreg_initializer,0.796648,0.036947,13,0.843398,0.003089,0.046751,0.037872,$0.797 (\pm0.04)$,$0.843 (\pm0.00)$,$0.047 (\pm0.04)$
Max(Acc - Eq. Odds),Bank Marketing,Pearson regularized FTL,0.842059,0.037943,5,0.882781,0.017358,0.040722,0.053618,$0.842 (\pm0.04)$,$0.883 (\pm0.02)$,$0.041 (\pm0.05)$
...,...,...,...,...,...,...,...,...,...,...,...,...
Max(MCC - Stat. Parity),Compas Recidivism,ftl_mlp_sreg_initializer,0.235185,0.047326,5,0.283411,0.018978,0.048226,0.043467,$0.235 (\pm0.05)$,$0.283 (\pm0.02)$,$0.048 (\pm0.04)$
Max(MCC - Stat. Parity),German Credit,Pearson regularized FTL,0.298430,0.061080,4,0.349387,0.034872,0.050957,0.031027,$0.298 (\pm0.06)$,$0.349 (\pm0.03)$,$0.051 (\pm0.03)$
Max(MCC - Stat. Parity),German Credit,Standard FTL (baseline),0.256495,0.089117,4,0.351942,0.027961,0.095446,0.062298,$0.256 (\pm0.09)$,$0.352 (\pm0.03)$,$0.095 (\pm0.06)$
Max(MCC - Stat. Parity),German Credit,Spearman regularized FTL,0.183352,0.166189,4,0.313005,0.111170,0.129653,0.078399,$0.183 (\pm0.17)$,$0.313 (\pm0.11)$,$0.130 (\pm0.08)$
