In [1]:
import sys
sys.path.append('../../../')
from utils.packages import *
from utils.ml_fairness import *
from utils.standard_data import *
# Output directory for this experiment's result files (created if missing).
# NOTE: the name `dir` shadows the builtin; it is kept so other cells that
# reference it keep working.
dir = 'res/bank/'
Path(dir).mkdir(parents=True, exist_ok=True)

# Header of the cumulative "diff" file, to which every run appends one row.
d_fields = ['Name', 'Stage', 'CVR', 'CVD', 'V_SPD', 'V_EOD', 'V_AOD', 'V_ERD', 'Acc', 'F1','SPD', 'EOD', 'AOD', 'ERD']
diff_file = dir + 'diff' + '.csv'
if not os.path.isfile(diff_file):
    # newline='' is required by the csv module; without it the writer's own
    # '\r\n' terminator is translated again and blank rows appear on Windows.
    with open(diff_file, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(d_fields)

# Per-session metrics file: named after the number of visible (non-dot) files
# already present in the output directory.
f_count = len([name for name in os.listdir(dir) if os.path.isfile(os.path.join(dir, name)) and not name.startswith('.')])
fields = ['Acc', 'F1', 'DI','SPD', 'EOD', 'AOD', 'ERD', 'CNT', 'TI']
filename = dir + str(f_count) + '.csv'
with open(filename, 'a', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)

In [2]:
def custom_get_fair_metrics_and_plot(fname, data, model, model_aif=False):
    """Predict on `data`, binarise the scores and compute fairness metrics.

    Parameters
    ----------
    fname : forwarded unchanged as the first argument of `fair_metrics`.
    data : dataset object; `data.features` is fed to the model unless
        `model_aif` is set, in which case `data` itself is passed.
    model : object exposing `predict`.
    model_aif : when true, `model.predict(data)` is expected to return an
        object whose `.labels` attribute holds the predictions.

    Returns
    -------
    tuple
        `(pred, fair)` where `pred` is the prediction thresholded at 0.5
        (as 0/1) and `fair` is whatever `fair_metrics(fname, data, pred)`
        returns.
    """
    if model_aif:
        scores = model.predict(data).labels
    else:
        scores = model.predict(data.features)

    # Scores at or above 0.5 become 1, everything else 0.
    binary_pred = (scores >= 0.5) * 1
    return binary_pred, fair_metrics(fname, data, binary_pred)


In [3]:
# Load the bank-marketing CSV; the literal string 'unknown' is read as missing.
file_path = '../../data/bank/bank-additional-full.csv'

column_names = []
na_values=['unknown']

df = pd.read_csv(file_path, sep=';', na_values=na_values)

#### Drop na values
dropped = df.dropna()
count = df.shape[0] - dropped.shape[0]
print("Missing Data: {} rows removed.".format(count))
# Work on an explicit copy so the column assignment below is not a chained
# write on the result of dropna().
df = dropped.copy()

# Binarise age: 1.0 when age >= 25, else 0.0.
# (`np.float` was removed in NumPy 1.24, so the comparison is cast with
# `astype(float)` instead.)
df['age'] = (df['age'] >= 25).astype(float)

## Feature selection
# features_to_keep = []
# df = df[features_to_keep]

# Two encodings of the same cleaned frame are prepared:
#   df    -> one-hot encoded categoricals
#   y2_df -> label-encoded categoricals
y2_df = df.copy()
# Create a one-hot encoding of the categorical variables.
cat_feat = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
df = pd.get_dummies(df, columns=cat_feat, prefix_sep='=')

# A fresh LabelEncoder is fitted per column, replacing it with integer codes.
for feature in cat_feat:
    le = LabelEncoder()
    y2_df[feature] = le.fit_transform(y2_df[feature])


Missing Data: 10700 rows removed.


In [4]:
import lightgbm as lgb
# Not used in this cell; the import is retained in case other cells rely on the name.
from xgboost.sklearn import XGBClassifier

# LightGBM parameters shared by both models in every repetition.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'verbose': -1
}

# Column labels of the one-row summary frame, and the identifiers that are
# prepended to every row appended to `diff_file`.
cols = ['CVR', 'CVD ', 'AV_SPD', 'AV_EOD', 'AV_AOD', 'AV_ERD', 'Acc', 'F1','SPD', 'EOD', 'AOD', 'ERD']
stage = 'LabelEncoder'
model_name = 'bank2'


def _fit_booster(train_params, train_set, eval_set, evals_result):
    """Train a LightGBM booster (max 150 rounds, early stopping after 25).

    Early stopping and metric recording are supplied as callbacks: the
    `early_stopping_rounds=` / `evals_result=` keyword arguments of
    `lgb.train` were removed in LightGBM 4.0, while the callbacks are
    accepted by older releases as well. `evals_result` is filled in place.
    """
    return lgb.train(train_params,
                     train_set,
                     valid_sets=eval_set,
                     num_boost_round=150,
                     callbacks=[lgb.early_stopping(stopping_rounds=25),
                                lgb.record_evaluation(evals_result)])


def _signed_change(new, old):
    """Format the change of a signed metric from `old` to `new`.

    Returns '<tag> <delta>' with delta rounded to 3 decimals, where the tag
    encodes the signs of (old, new):
      '(+)'  both >= 0          -> delta is the change in magnitude
      '(-)'  both <  0          -> delta is the change in magnitude
      '(+-)' old >= 0, new < 0  -> delta is new - old
      '(-+)' old <  0, new >= 0 -> delta is new - old
    If either value is NaN no comparison holds; the tag is then empty and
    the delta is NaN instead of a stale or undefined value.
    """
    if new >= 0 and old >= 0:
        sign, delta = '(+)', abs(new) - abs(old)
    elif new < 0 and old < 0:
        sign, delta = '(-)', abs(new) - abs(old)
    elif new < 0 and old >= 0:
        sign, delta = '(+-)', new - old
    elif new >= 0 and old < 0:
        sign, delta = '(-+)', new - old
    else:
        sign, delta = '', new - old
    return sign + ' ' + str(round(delta, 3))


# Five repetitions, each with its own random 70/30 split. The same seed is
# used for both encodings within a repetition.
for run in range(5):
    pro_att_name = ['age'] # ['race', 'sex']
    priv_class = [1] # ['White', 'Male']
    reamining_cat_feat = []
    seed = randrange(100)

    # y2: label-encoded frame.
    y2_data_orig, y2_X, y2_y = load_bank_data(y2_df, pro_att_name, priv_class, reamining_cat_feat)
    y2_data_orig_train, y2_data_orig_test = y2_data_orig.split([0.7], shuffle=True, seed=seed)

    y2_X_train = y2_data_orig_train.features
    y2_y_train = y2_data_orig_train.labels.ravel()
    y2_X_test = y2_data_orig_test.features
    y2_y_test = y2_data_orig_test.labels.ravel()

    # y1: one-hot encoded frame.
    y1_data_orig, y1_X, y1_y = load_bank_data(df, pro_att_name, priv_class, reamining_cat_feat)
    y1_data_orig_train, y1_data_orig_test = y1_data_orig.split([0.7], shuffle=True, seed=seed)

    y1_X_train = y1_data_orig_train.features
    y1_y_train = y1_data_orig_train.labels.ravel()
    y1_X_test = y1_data_orig_test.features
    y1_y_test = y1_data_orig_test.labels.ravel()

    # NOTE(review): the test split doubles as the early-stopping validation
    # set, so the reported scores are not from fully held-out data - confirm
    # this is intended.
    y2_lgb_train = lgb.Dataset(data=y2_X_train, label=y2_y_train,  free_raw_data=False)
    y2_lgb_eval = lgb.Dataset(data=y2_X_test, label=y2_y_test, reference=y2_lgb_train,  free_raw_data=False)
    y2_evals_result = {}
    y2_mdl = _fit_booster(params, y2_lgb_train, y2_lgb_eval, y2_evals_result)

    y1_lgb_train = lgb.Dataset(data=y1_X_train, label=y1_y_train,  free_raw_data=False)
    y1_lgb_eval = lgb.Dataset(data=y1_X_test, label=y1_y_test, reference=y1_lgb_train,  free_raw_data=False)
    y1_evals_result = {}
    y1_mdl = _fit_booster(params, y1_lgb_train, y1_lgb_eval, y1_evals_result)

    # plot_model_performance(y2_mdl, y2_X_test, y2_y_test)
    y1_pred, y1_fair = custom_get_fair_metrics_and_plot(filename, y1_data_orig_test, y1_mdl)
    y2_pred, y2_fair = custom_get_fair_metrics_and_plot(filename, y2_data_orig_test, y2_mdl)

    # Keep only the columns compared below (Acc, F1, SPD, EOD, AOD, ERD per `fields`).
    y1_fair = y1_fair.drop(['DI', 'CNT', 'TI'], axis=1)
    y2_fair = y2_fair.drop(['DI', 'CNT', 'TI'], axis=1)
    CVR, CVD, AVR_EOD, AVD_EOD, AVR_SPD, AVD_SPD, AVD_AOD, AV_ERD = compute_new_metrics(y2_data_orig_test, y1_pred, y2_pred)
    row_y1 = y1_fair.iloc[[0]].values[0].tolist()
    row_y2 = y2_fair.iloc[[0]].values[0].tolist()

    # Row layout follows `cols`: six pairwise metrics, then per-metric changes.
    diff = [CVR, CVD, AVD_SPD, AVD_EOD, AVD_AOD, AV_ERD]
    for position in range(len(row_y2)):
        new_value, old_value = row_y2[position], row_y1[position]
        if position < 2:
            # First two columns: plain numeric difference.
            diff.append(new_value - old_value)
        else:
            # Remaining columns are signed metrics: report a tagged delta.
            diff.append(_signed_change(new_value, old_value))

    # metrics = pd.DataFrame(data=obj_fairness, index=['y1'], columns=cols)
    diff_df = pd.DataFrame(data=[diff], columns  = cols, index = ['Diff']).round(3)
    diff = [model_name, stage] + diff_df.iloc[0].values.tolist()
    # newline='' is required by the csv module to avoid blank rows on Windows.
    with open(diff_file, 'a', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(diff)

[1]	valid_0's auc: 0.919658
Training until validation scores don't improve for 25 rounds
[2]	valid_0's auc: 0.923733
[3]	valid_0's auc: 0.923999
[4]	valid_0's auc: 0.926285
[5]	valid_0's auc: 0.926634
[6]	valid_0's auc: 0.926927
[7]	valid_0's auc: 0.927171
[8]	valid_0's auc: 0.927329
[9]	valid_0's auc: 0.929488
[10]	valid_0's auc: 0.934567
[11]	valid_0's auc: 0.936874
[12]	valid_0's auc: 0.937699
[13]	valid_0's auc: 0.9379
[14]	valid_0's auc: 0.938166
[15]	valid_0's auc: 0.938788
[16]	valid_0's auc: 0.939036
[17]	valid_0's auc: 0.939133
[18]	valid_0's auc: 0.939268
[19]	valid_0's auc: 0.939478
[20]	valid_0's auc: 0.93982
[21]	valid_0's auc: 0.939855
[22]	valid_0's auc: 0.939926
[23]	valid_0's auc: 0.939912
[24]	valid_0's auc: 0.940104
[25]	valid_0's auc: 0.940355
[26]	valid_0's auc: 0.940439
[27]	valid_0's auc: 0.941488
[28]	valid_0's auc: 0.941545
[29]	valid_0's auc: 0.941747
[30]	valid_0's auc: 0.941915
[31]	valid_0's auc: 0.942185
[32]	valid_0's auc: 0.942263
[33]	valid_0's auc: 0.9

[1]	valid_0's auc: 0.926282
Training until validation scores don't improve for 25 rounds
[2]	valid_0's auc: 0.92737
[3]	valid_0's auc: 0.928177
[4]	valid_0's auc: 0.928894
[5]	valid_0's auc: 0.92913
[6]	valid_0's auc: 0.929196
[7]	valid_0's auc: 0.929349
[8]	valid_0's auc: 0.929639
[9]	valid_0's auc: 0.929769
[10]	valid_0's auc: 0.929825
[11]	valid_0's auc: 0.93418
[12]	valid_0's auc: 0.938183
[13]	valid_0's auc: 0.938975
[14]	valid_0's auc: 0.938933
[15]	valid_0's auc: 0.938884
[16]	valid_0's auc: 0.939064
[17]	valid_0's auc: 0.93965
[18]	valid_0's auc: 0.939736
[19]	valid_0's auc: 0.940088
[20]	valid_0's auc: 0.940231
[21]	valid_0's auc: 0.940361
[22]	valid_0's auc: 0.940556
[23]	valid_0's auc: 0.940802
[24]	valid_0's auc: 0.941016
[25]	valid_0's auc: 0.941057
[26]	valid_0's auc: 0.941143
[27]	valid_0's auc: 0.941187
[28]	valid_0's auc: 0.9415
[29]	valid_0's auc: 0.942364
[30]	valid_0's auc: 0.94298
[31]	valid_0's auc: 0.943225
[32]	valid_0's auc: 0.943515
[33]	valid_0's auc: 0.94371

[88]	valid_0's auc: 0.947293
[89]	valid_0's auc: 0.947297
[90]	valid_0's auc: 0.947309
[91]	valid_0's auc: 0.947304
[92]	valid_0's auc: 0.947321
[93]	valid_0's auc: 0.947354
[94]	valid_0's auc: 0.947378
[95]	valid_0's auc: 0.94739
[96]	valid_0's auc: 0.947387
[97]	valid_0's auc: 0.947405
[98]	valid_0's auc: 0.947411
[99]	valid_0's auc: 0.947439
[100]	valid_0's auc: 0.947419
[101]	valid_0's auc: 0.947426
[102]	valid_0's auc: 0.947456
[103]	valid_0's auc: 0.947429
[104]	valid_0's auc: 0.947437
[105]	valid_0's auc: 0.947433
[106]	valid_0's auc: 0.947445
[107]	valid_0's auc: 0.947466
[108]	valid_0's auc: 0.947491
[109]	valid_0's auc: 0.94745
[110]	valid_0's auc: 0.947479
[111]	valid_0's auc: 0.947461
[112]	valid_0's auc: 0.947467
[113]	valid_0's auc: 0.94746
[114]	valid_0's auc: 0.947466
[115]	valid_0's auc: 0.947484
[116]	valid_0's auc: 0.947498
[117]	valid_0's auc: 0.947477
[118]	valid_0's auc: 0.94745
[119]	valid_0's auc: 0.947442
[120]	valid_0's auc: 0.947446
[121]	valid_0's auc: 0.947

[89]	valid_0's auc: 0.94324
[90]	valid_0's auc: 0.943206
[91]	valid_0's auc: 0.94315
[92]	valid_0's auc: 0.943137
[93]	valid_0's auc: 0.943157
[94]	valid_0's auc: 0.943198
[95]	valid_0's auc: 0.943237
[96]	valid_0's auc: 0.943241
[97]	valid_0's auc: 0.943296
[98]	valid_0's auc: 0.943296
[99]	valid_0's auc: 0.943309
[100]	valid_0's auc: 0.943313
[101]	valid_0's auc: 0.943337
[102]	valid_0's auc: 0.943363
[103]	valid_0's auc: 0.943313
[104]	valid_0's auc: 0.943301
[105]	valid_0's auc: 0.943286
[106]	valid_0's auc: 0.943242
[107]	valid_0's auc: 0.943293
[108]	valid_0's auc: 0.943226
[109]	valid_0's auc: 0.943195
[110]	valid_0's auc: 0.943192
[111]	valid_0's auc: 0.943238
[112]	valid_0's auc: 0.943248
[113]	valid_0's auc: 0.943288
[114]	valid_0's auc: 0.943298
[115]	valid_0's auc: 0.943312
[116]	valid_0's auc: 0.943329
[117]	valid_0's auc: 0.943318
[118]	valid_0's auc: 0.943317
[119]	valid_0's auc: 0.943312
[120]	valid_0's auc: 0.943277
[121]	valid_0's auc: 0.943279
[122]	valid_0's auc: 0.