In [31]:
import pandas as pd
import json
import os
import deepsig
from IPython.display import display

In [32]:
cols = ['dataset', 'method', 'fitness_rule', 'fitness', 'ACC', 'MCC', 'f1_score', 'avg_odds_diff', 'stat_par_diff', 'eq_opp_diff']

In [33]:
mlp_baseline_results = pd.read_csv('mlp_baseline_results.csv')
mlp_baseline_results.replace({'simple_mlp_initializer': 'Standard MLP (baseline)'}, inplace=True)

mlp_pearson_results = pd.read_csv('mlp_pearson_results.csv')
mlp_pearson_results.replace({'mlp_preg_initializer': 'Pearson regularized MLP'}, inplace=True)

mlp_spearman_results = pd.read_csv('mlp_spearman_results.csv')
mlp_spearman_results.replace({'mlp_sreg_initializer': 'Spearman regularized MLP'}, inplace=True)

mlp_results = pd.concat([mlp_baseline_results, mlp_pearson_results, mlp_spearman_results])

ftl_baseline_results = pd.read_csv('ftl_baseline_results.csv')
ftl_baseline_results.replace({'ftl_mlp_initializer': 'Standard FTL (baseline)'}, inplace=True)

ftl_pearson_results = pd.read_csv('ftl_pearson_results.csv')
ftl_pearson_results.replace({'ftl_mlp_initializer': 'Pearson regularized FTL'}, inplace=True)

ftl_spearman_results = pd.read_csv('ftl_spearman_results.csv')
ftl_spearman_results.replace({'ftl_mlp_initializer': 'Spearman regularized FTL'}, inplace=True)

ftl_results = pd.concat([ftl_baseline_results, ftl_pearson_results, ftl_spearman_results])

In [34]:
for results in [mlp_results,ftl_results]:
    results.replace({'adult_dataset_reader': 'Adult Income', 'compas_dataset_reader': 'Compas Recidivism', 'german_dataset_reader': 'German Credit', 'bank_dataset_reader': 'Bank Marketing'}, inplace=True)
    results.rename(columns={'avg_odds_diff': 'Equalized Odds', 'stat_par_diff': 'Statistical Parity', 'eq_opp_diff': 'Equal Opportunity', 'MCC': 'Mathew Correlation', 'ACC': 'Accuracy'}, inplace=True)

In [35]:
fitness_rules_target_metrics = {
    'mcc_parity': {'performance': 'Mathew Correlation', 'fairness': 'Statistical Parity'},
    'mcc_opportunity': {'performance': 'Mathew Correlation', 'fairness': 'Equal Opportunity'},
    'mcc_odds': {'performance': 'Mathew Correlation', 'fairness': 'Equalized Odds'},
    'acc_parity': {'performance': 'Accuracy', 'fairness': 'Statistical Parity'},
    'acc_opportunity': {'performance': 'Accuracy', 'fairness': 'Equal Opportunity'},
    'acc_odds': {'performance': 'Accuracy', 'fairness': 'Equalized Odds'}
}

fitness_rules_target_metrics = {
    'mcc_parity': ('Mathew Correlation', 'Statistical Parity'),
    'mcc_opportunity': ('Mathew Correlation', 'Equal Opportunity'),
    'mcc_odds': ('Mathew Correlation', 'Equalized Odds'),
    'acc_parity': ('Accuracy', 'Statistical Parity'),
    'acc_opportunity': ('Accuracy', 'Equal Opportunity'),
    'acc_odds': ('Accuracy', 'Equalized Odds')
}
fitness_rules_abvr = {
    'mcc_parity': 'Max(MCC - Stat. Parity)',
    'mcc_opportunity': 'Max(MCC - Eq. Odds)',
    'mcc_odds': 'Max(MCC - Eq. Opp.)',
    'acc_parity': 'Max(Acc - Stat. Parity)',
    'acc_opportunity': 'Max(Acc - Eq. Odds)',
    'acc_odds': 'Max(Acc - Eq. Opp.)'
}

for results in [mlp_results,ftl_results]:
    results['Performance'] = 0
    results['Fairness'] = 0
    results['Fitness Rule'] = ''
    for fitness_rule, (performance_metric, fairness_metric) in fitness_rules_target_metrics.items():
        results.loc[results.fitness_rule == fitness_rule,'Performance'] = results.loc[results.fitness_rule == fitness_rule,performance_metric]
        results.loc[results.fitness_rule == fitness_rule,'Fairness'] = results.loc[results.fitness_rule == fitness_rule,fairness_metric]
        results.loc[results.fitness_rule == fitness_rule,'Fitness Rule Abvr'] = fitness_rules_abvr[fitness_rule]
        results.loc[results.fitness_rule == fitness_rule,'Fitness Rule'] = 'Max(%s - %s)' % fitness_rules_target_metrics[fitness_rule]

 0.58075151 0.57811729 0.57147862 0.55847176 0.56402881 0.58191431
 0.58343253 0.57740222 0.51579307 0.30958237 0.37962947 0.55899189
 0.53558872 0.2598879  0.57717035 0.50086739 0.33275221 0.37929293
 0.47729109 0.29196869 0.29037704 0.52303609 0.27480633 0.38668781
 0.50626951 0.29960979 0.4042848  0.52119218 0.2936422  0.33522388
 0.49730721 0.25112662 0.35119067 0.510052   0.26843855 0.36081008
 0.52157495 0.30114722 0.42995147 0.48267704 0.29056691 0.23139881
 0.54223416 0.23653437 0.27041017 0.52121263 0.25759348 0.23912165
 0.5223892  0.27406652 0.22303564 0.54094974 0.27699558 0.30939251
 0.56841327 0.56481476 0.58141906 0.57136131 0.58334192 0.55288815
 0.56058845 0.5737951  0.57000708 0.57640052 0.58261133 0.57287806
 0.57274737 0.57959695 0.51101525 0.28242759 0.33054111 0.57762814
 0.5368048  0.31623248 0.3465674  0.53986748 0.2479788  0.36563621
 0.51959505 0.31018195 0.49570477 0.49453409 0.29658544 0.35273781
 0.57072426 0.26732327 0.33607704 0.5028025  0.28860323 0.4110

In [36]:
display(mlp_results)

Unnamed: 0,dataset,method,fitness_rule,fitness,Accuracy,Mathew Correlation,Equalized Odds,Statistical Parity,Equal Opportunity,solution,Performance,Fairness,Fitness Rule,Fitness Rule Abvr
0,Adult Income,Standard MLP (baseline),mcc_parity,0.392537,0.851299,0.585170,0.113127,0.192633,0.142532,{'dropout': 0.16228954240968418},0.585170,0.192633,Max(Mathew Correlation - Statistical Parity),Max(MCC - Stat. Parity)
1,Adult Income,Standard MLP (baseline),mcc_odds,0.473935,0.851520,0.585500,0.111565,0.198915,0.132052,{'dropout': 0.1482543145261793},0.585500,0.111565,Max(Mathew Correlation - Equalized Odds),Max(MCC - Eq. Opp.)
2,Adult Income,Standard MLP (baseline),mcc_opportunity,0.460175,0.851410,0.585064,0.103235,0.191606,0.124889,{'dropout': 0.17861515209545933},0.585064,0.124889,Max(Mathew Correlation - Equal Opportunity),Max(MCC - Eq. Odds)
3,Adult Income,Standard MLP (baseline),acc_parity,0.661796,0.844887,0.565373,0.082386,0.183090,0.084174,{'dropout': 0.15655371267306142},0.844887,0.183090,Max(Accuracy - Statistical Parity),Max(Acc - Stat. Parity)
4,Adult Income,Standard MLP (baseline),acc_odds,0.766170,0.850857,0.577932,0.084687,0.180366,0.093891,{'dropout': 0.1698638434734938},0.850857,0.084687,Max(Accuracy - Equalized Odds),Max(Acc - Eq. Opp.)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,German Credit,Spearman regularized MLP,mcc_odds,0.225149,0.730000,0.350563,0.125414,0.074725,0.050121,"{'l2': 0.0001, 'dropout': 0.152236259935622}",0.350563,0.125414,Max(Mathew Correlation - Equalized Odds),Max(MCC - Eq. Opp.)
356,German Credit,Spearman regularized MLP,mcc_opportunity,0.346545,0.765000,0.397535,0.102469,0.096646,0.050990,"{'l2': 0.001, 'dropout': 0.052866765100860254}",0.397535,0.050990,Max(Mathew Correlation - Equal Opportunity),Max(MCC - Eq. Odds)
357,German Credit,Spearman regularized MLP,acc_parity,0.685911,0.745000,0.236132,0.051943,0.059089,0.078632,"{'l2': 0.01, 'dropout': 0.12328619122121832}",0.745000,0.059089,Max(Accuracy - Statistical Parity),Max(Acc - Stat. Parity)
358,German Credit,Spearman regularized MLP,acc_odds,0.704234,0.760000,0.401230,0.055766,0.009692,0.015102,"{'l2': 0.001, 'dropout': 0.14909089094980715}",0.760000,0.055766,Max(Accuracy - Equalized Odds),Max(Acc - Eq. Opp.)


In [37]:
datasets = ['Adult Income', 'Bank Marketing', 'Compas Recidivism','German Credit']
datasets

['Adult Income', 'Bank Marketing', 'Compas Recidivism', 'German Credit']

In [38]:
fitness_rules = ['mcc_parity', 'mcc_opportunity', 'mcc_odds', 'acc_parity', 'acc_opportunity', 'acc_odds']
fitness_rules

['mcc_parity',
 'mcc_opportunity',
 'mcc_odds',
 'acc_parity',
 'acc_opportunity',
 'acc_odds']

In [42]:
ftl_methods = ['Standard FTL (baseline)', 'Pearson regularized FTL', 'Spearman regularized FTL']
mlp_methods = ['Standard MLP (baseline)', 'Pearson regularized MLP', 'Spearman regularized MLP']
significances = []

In [43]:
for path, methods, results in zip(['mlp_multi_aso_data_list.json', 'ftl_multi_aso_data_list.json'],
                                  [mlp_methods, ftl_methods],
                                  (mlp_results,ftl_results)):
    if os.path.exists(path):
        with open(path) as file:
            multi_aso_data_list = json.load(file)
    else:    
        multi_aso_data_list = []
        for d in datasets:
            multi_aso_data = []
            for f in fitness_rules:
                methods_results = []
                for m in methods:
                    r = results.loc[ (results['dataset'] == d) &
                                         (results['fitness_rule'] == f) &
                                         (results['method'] == m) ]\
                                .fitness.tolist()
                    if len(r) == 0:
                        r = [-1]
                    methods_results.append(r)
                min_eps = deepsig.multi_aso(methods_results, confidence_level=0.95)
                multi_aso_data_list.append({'fitness_rule': f, 'dataset': d, 'min_eps': min_eps.tolist()})
        with open(path, 'w') as file:
            json.dump(multi_aso_data_list, file)
    
    aso_df_resume = []
    for aso_result in sorted(multi_aso_data_list, key=lambda x: x['dataset']):
        fitness_rule = aso_result['fitness_rule']
        dataset = aso_result['dataset']
    
        aso_df = pd.DataFrame(aso_result['min_eps'], columns=methods)
        aso_df['method'] = methods
        aso_df['dataset'] = dataset
        aso_df['fitness_rule'] = fitness_rule
        aso_df_resume.append(aso_df)
    
    print('Significance Testing')
    significance = pd.concat(aso_df_resume)
    significance.replace(fitness_rules_abvr, inplace=True)
    significance = significance.set_index(['fitness_rule', 'dataset'])
    significance = significance.sort_values(by=['fitness_rule', 'dataset'], ascending=[False, True])
    display(significance)
    significances.append(significance)
    
    grouped_results = results\
        .groupby(['Fitness Rule Abvr', 'dataset', 'method'])\
        .agg({'fitness': ['mean', 'std', 'count'], 'Performance': ['mean', 'std'], 'Fairness': ['mean', 'std']})\
        .sort_values(by=['Fitness Rule Abvr', 'dataset', ('fitness','mean')], ascending=False)
    grouped_results['formatted_fitness'] = grouped_results.apply(lambda row: f"${row[('fitness', 'mean')]:.3f} (\pm{row[('fitness', 'std')]:.2f})$", axis=1)
    grouped_results['formatted_performance'] = grouped_results.apply(lambda row: f"${row[('Performance', 'mean')]:.3f} (\pm{row[('Performance', 'std')]:.2f})$", axis=1)
    grouped_results['formatted_fairness'] = grouped_results.apply(lambda row: f"${row[('Fairness', 'mean')]:.3f} (\pm{row[('Fairness', 'std')]:.2f})$", axis=1)
    grouped_results = grouped_results.sort_values(by=['Fitness Rule Abvr', 'dataset'])
    display(grouped_results)

Significance Testing


Unnamed: 0_level_0,Unnamed: 1_level_0,Standard MLP (baseline),Pearson regularized MLP,Spearman regularized MLP,method
fitness_rule,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Max(MCC - Stat. Parity),Adult Income,1.000000,1.000000,1.000000,Standard MLP (baseline)
Max(MCC - Stat. Parity),Adult Income,0.734049,1.000000,1.000000,Pearson regularized MLP
Max(MCC - Stat. Parity),Adult Income,0.130162,0.316765,1.000000,Spearman regularized MLP
Max(MCC - Stat. Parity),Bank Marketing,1.000000,0.468484,0.146821,Standard MLP (baseline)
Max(MCC - Stat. Parity),Bank Marketing,1.000000,1.000000,0.442639,Pearson regularized MLP
...,...,...,...,...,...
Max(Acc - Eq. Odds),Compas Recidivism,0.890634,1.000000,0.563892,Pearson regularized MLP
Max(Acc - Eq. Odds),Compas Recidivism,1.000000,1.000000,1.000000,Spearman regularized MLP
Max(Acc - Eq. Odds),German Credit,1.000000,1.000000,1.000000,Standard MLP (baseline)
Max(Acc - Eq. Odds),German Credit,0.451214,1.000000,0.884433,Pearson regularized MLP


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fitness,fitness,fitness,Performance,Performance,Fairness,Fairness,formatted_fitness,formatted_performance,formatted_fairness
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,count,mean,std,mean,std,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Fitness Rule Abvr,dataset,method,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Max(Acc - Eq. Odds),Adult Income,Spearman regularized MLP,0.768344,0.044868,15,0.847105,0.002409,0.078761,0.044443,$0.768 (\pm0.04)$,$0.847 (\pm0.00)$,$0.079 (\pm0.04)$
Max(Acc - Eq. Odds),Adult Income,Standard MLP (baseline),0.758290,0.035161,15,0.847864,0.004669,0.089574,0.035575,$0.758 (\pm0.04)$,$0.848 (\pm0.00)$,$0.090 (\pm0.04)$
Max(Acc - Eq. Odds),Adult Income,Pearson regularized MLP,0.746905,0.016776,15,0.849022,0.002941,0.102117,0.017054,$0.747 (\pm0.02)$,$0.849 (\pm0.00)$,$0.102 (\pm0.02)$
Max(Acc - Eq. Odds),Bank Marketing,Spearman regularized MLP,0.837676,0.064290,15,0.900022,0.003459,0.062346,0.063412,$0.838 (\pm0.06)$,$0.900 (\pm0.00)$,$0.062 (\pm0.06)$
Max(Acc - Eq. Odds),Bank Marketing,Pearson regularized MLP,0.829691,0.046308,15,0.901935,0.002759,0.072244,0.045095,$0.830 (\pm0.05)$,$0.902 (\pm0.00)$,$0.072 (\pm0.05)$
...,...,...,...,...,...,...,...,...,...,...,...,...
Max(MCC - Stat. Parity),Compas Recidivism,Standard MLP (baseline),0.067416,0.032416,15,0.281248,0.024988,0.213832,0.039258,$0.067 (\pm0.03)$,$0.281 (\pm0.02)$,$0.214 (\pm0.04)$
Max(MCC - Stat. Parity),Compas Recidivism,Pearson regularized MLP,0.062119,0.028627,15,0.289743,0.021580,0.227624,0.026098,$0.062 (\pm0.03)$,$0.290 (\pm0.02)$,$0.228 (\pm0.03)$
Max(MCC - Stat. Parity),German Credit,Standard MLP (baseline),0.284513,0.109260,15,0.344532,0.091782,0.060019,0.049881,$0.285 (\pm0.11)$,$0.345 (\pm0.09)$,$0.060 (\pm0.05)$
Max(MCC - Stat. Parity),German Credit,Spearman regularized MLP,0.264883,0.086960,15,0.368131,0.071892,0.103248,0.061236,$0.265 (\pm0.09)$,$0.368 (\pm0.07)$,$0.103 (\pm0.06)$


Significance Testing


Unnamed: 0_level_0,Unnamed: 1_level_0,Standard FTL (baseline),Pearson regularized FTL,Spearman regularized FTL,method
fitness_rule,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Max(MCC - Stat. Parity),Adult Income,1.0,1.0,0.414297,Standard FTL (baseline)
Max(MCC - Stat. Parity),Adult Income,0.570111,1.0,0.2001,Pearson regularized FTL
Max(MCC - Stat. Parity),Adult Income,1.0,1.0,1.0,Spearman regularized FTL
Max(MCC - Eq. Opp.),Adult Income,1.0,1.0,1.0,Standard FTL (baseline)
Max(MCC - Eq. Opp.),Adult Income,0.33012,1.0,0.693668,Pearson regularized FTL
Max(MCC - Eq. Opp.),Adult Income,0.40972,1.0,1.0,Spearman regularized FTL
Max(MCC - Eq. Odds),Adult Income,1.0,0.214025,0.59168,Standard FTL (baseline)
Max(MCC - Eq. Odds),Adult Income,1.0,1.0,1.0,Pearson regularized FTL
Max(MCC - Eq. Odds),Adult Income,1.0,0.463396,1.0,Spearman regularized FTL
Max(Acc - Stat. Parity),Adult Income,1.0,1.0,1.0,Standard FTL (baseline)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,fitness,fitness,fitness,Performance,Performance,Fairness,Fairness,formatted_fitness,formatted_performance,formatted_fairness
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,count,mean,std,mean,std,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Fitness Rule Abvr,dataset,method,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Max(Acc - Eq. Odds),Adult Income,Pearson regularized FTL,0.81644,0.024623,16,0.844002,0.00914,0.027562,0.022816,$0.816 (\pm0.02)$,$0.844 (\pm0.01)$,$0.028 (\pm0.02)$
Max(Acc - Eq. Odds),Adult Income,Standard FTL (baseline),0.8134,0.021986,13,0.847098,0.002867,0.033698,0.02106,$0.813 (\pm0.02)$,$0.847 (\pm0.00)$,$0.034 (\pm0.02)$
Max(Acc - Eq. Odds),Adult Income,Spearman regularized FTL,0.793284,0.036451,12,0.843265,0.003187,0.049981,0.037638,$0.793 (\pm0.04)$,$0.843 (\pm0.00)$,$0.050 (\pm0.04)$
Max(Acc - Eq. Opp.),Adult Income,Pearson regularized FTL,0.806716,0.015056,16,0.843954,0.007224,0.037238,0.01454,$0.807 (\pm0.02)$,$0.844 (\pm0.01)$,$0.037 (\pm0.01)$
Max(Acc - Eq. Opp.),Adult Income,Spearman regularized FTL,0.805827,0.011884,12,0.8425,0.003842,0.036674,0.011862,$0.806 (\pm0.01)$,$0.843 (\pm0.00)$,$0.037 (\pm0.01)$
Max(Acc - Eq. Opp.),Adult Income,Standard FTL (baseline),0.801817,0.015775,14,0.839035,0.008325,0.037218,0.017543,$0.802 (\pm0.02)$,$0.839 (\pm0.01)$,$0.037 (\pm0.02)$
Max(Acc - Stat. Parity),Adult Income,Pearson regularized FTL,0.81338,0.011659,16,0.828372,0.008529,0.014992,0.011506,$0.813 (\pm0.01)$,$0.828 (\pm0.01)$,$0.015 (\pm0.01)$
Max(Acc - Stat. Parity),Adult Income,Spearman regularized FTL,0.809098,0.012363,12,0.824912,0.009076,0.015814,0.008208,$0.809 (\pm0.01)$,$0.825 (\pm0.01)$,$0.016 (\pm0.01)$
Max(Acc - Stat. Parity),Adult Income,Standard FTL (baseline),0.799946,0.023751,14,0.827561,0.00875,0.027615,0.026391,$0.800 (\pm0.02)$,$0.828 (\pm0.01)$,$0.028 (\pm0.03)$
Max(MCC - Eq. Odds),Adult Income,Standard FTL (baseline),0.552515,0.029883,14,0.580809,0.010422,0.028294,0.02873,$0.553 (\pm0.03)$,$0.581 (\pm0.01)$,$0.028 (\pm0.03)$


In [46]:
significances[0].sort_values(by=['fitness_rule', 'dataset'], ascending=[False, True])

Unnamed: 0_level_0,Unnamed: 1_level_0,Standard MLP (baseline),Pearson regularized MLP,Spearman regularized MLP,method
fitness_rule,dataset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Max(MCC - Stat. Parity),Adult Income,1.000000,1.000000,1.000000,Standard MLP (baseline)
Max(MCC - Stat. Parity),Adult Income,0.734049,1.000000,1.000000,Pearson regularized MLP
Max(MCC - Stat. Parity),Adult Income,0.130162,0.316765,1.000000,Spearman regularized MLP
Max(MCC - Stat. Parity),Bank Marketing,1.000000,0.468484,0.146821,Standard MLP (baseline)
Max(MCC - Stat. Parity),Bank Marketing,1.000000,1.000000,0.442639,Pearson regularized MLP
...,...,...,...,...,...
Max(Acc - Eq. Odds),Compas Recidivism,0.890634,1.000000,0.563892,Pearson regularized MLP
Max(Acc - Eq. Odds),Compas Recidivism,1.000000,1.000000,1.000000,Spearman regularized MLP
Max(Acc - Eq. Odds),German Credit,1.000000,1.000000,1.000000,Standard MLP (baseline)
Max(Acc - Eq. Odds),German Credit,0.451214,1.000000,0.884433,Pearson regularized MLP
