# Reweighing TEST

This notebook computes reweighing sample weights for every race × gender subgroup (one-vs-rest), averages the resulting weight vectors across all subgroup runs, and then reports the mean of those averaged weights within each subgroup.

In [1]:
from IPython.display import display, Markdown, Latex
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from IPython.display import set_matplotlib_formats

set_matplotlib_formats('svg')
# Sequential green palette (light -> dark) used for the plots.
colors = ['#edf8e9', '#c7e9c0', '#a1d99b', '#74c476', '#31a354', '#006d2c']

# When True, figures are meant to be written to disk in addition to being shown.
SAVE_PLOTS = False

# Lookup tables: "<attribute>_id" maps an integer code to its readable label.
dataset_dict = {
    'race_id': dict(enumerate(['white', 'black', 'asian', 'indian', 'others'])),
    'gender_id': dict(enumerate(['male', 'female'])),
    'age_id': dict(enumerate(['<10', '10-20', '20-30', '30-40', '40-60', '60-80', '80+'])),
}

# "<attribute>_alias" is the inverse mapping (label -> integer code).
# Insertion order gender, race, age is kept on purpose.
dataset_dict.update({
    f'{attribute}_alias': {label: code for code, label in dataset_dict[f'{attribute}_id'].items()}
    for attribute in ('gender', 'race', 'age')
})

In [2]:
# Read the CSV with the model predictions.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is replaced by a path relative to a configurable data dir.
predictions_path = r"C:\Users\thoma\Documents\_FAIRALGOS\utk-fairness-analysis\Predictions\df_predctions_all.csv"
df_predictions = pd.read_csv(predictions_path, index_col=0)

# The analysis only looks at the gender prediction, so the age/race predictions
# are dropped and the ground-truth age/race columns get short names.
df_bld = df_predictions.drop(columns=['age_pred', 'race_pred']).rename(columns={'age_true': 'age', 'race_true': 'race'})

# Collapse gender_true / gender_pred into one binary label:
# 1 if the gender prediction was correct, 0 otherwise.
# Vectorised comparison instead of a Python-level iterrows() loop.
pred_true = (df_bld['gender_true'] == df_bld['gender_pred']).astype('int64')

df_bld['pred_true'] = pred_true
df_bld = df_bld.drop(columns=['gender_pred']).rename(columns={'gender_true': 'gender'})

# Bin the age into 7 ordinal classes labelled 0..6.
# pd.cut uses right-closed intervals by default, i.e. (0, 10], (10, 20], ...,
# (80, inf]; an age of exactly 0 would fall outside every bin and become NaN.
# NOTE(review): the labels in dataset_dict['age_id'] ('<10', '10-20', ...) read as
# left-closed bins - confirm that the boundary ages land where intended.
bins = [0, 10, 20, 30, 40, 60, 80, np.inf]
age_binned = pd.cut(df_bld['age'], bins, labels=[0, 1, 2, 3, 4, 5, 6])
df_bld['age'] = age_binned

In [19]:
# one vs rest groupings
def one_vs_rest_df(df, conditions=None):
    """Return a copy of ``df`` with a binary ``group`` column (one-vs-rest).

    A row belongs to the subgroup when *every* ``column == value`` pair in
    ``conditions`` holds for it; all other rows form the "rest".

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    conditions : dict, optional
        Mapping of column name -> required value that defines the subgroup.
        Defaults to ``{'race': 1, 'gender': 1}``. An empty mapping puts every
        row into the subgroup.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an added int64 column ``group``:
        0 = row is in the subgroup, 1 = row is in the rest.

    Raises
    ------
    KeyError
        If a column named in ``conditions`` is not present in ``df``.
    """
    # None sentinel instead of a mutable dict default shared between calls.
    if conditions is None:
        conditions = {'race': 1, 'gender': 1}

    # Assume every row is in the subgroup until one condition fails.
    # Plain numpy arrays are used so the result is assigned positionally and
    # does not depend on the frame's index (which may contain duplicates).
    in_subgroup = np.ones(len(df), dtype=bool)
    for column, value in conditions.items():
        in_subgroup &= np.asarray(df[column] == value, dtype=bool)

    new_df = df.copy()
    # 1 equals rest, 0 equals subgroup
    new_df['group'] = (~in_subgroup).astype('int64')

    return new_df

In [20]:
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.preprocessing import Reweighing

# One instance-weight vector per (gender, race) subgroup, appended in the
# iteration order of the loops below (gender outer, race inner).
weights = []

for gender_id, gender_alias in dataset_dict['gender_id'].items():
    for race_id, race_alias in dataset_dict['race_id'].items():
        # Mark the current subgroup as group == 0 and everyone else as group == 1.
        df_sub = one_vs_rest_df(
            df_bld,
            conditions={'race': race_id, 'gender': gender_id},
        )

        # Label: was the gender prediction correct (1 = favorable outcome).
        bld = BinaryLabelDataset(
            df=df_sub,
            label_names=['pred_true'],
            protected_attribute_names=['race', 'age', 'gender', 'group'],
            favorable_label=1,
            unfavorable_label=0,
        )

        # First argument: unprivileged groups (the subgroup);
        # second argument: privileged groups (the rest).
        RW = Reweighing([{'group': 0}], [{'group': 1}])
        RW.fit(bld)
        bld_transformed = RW.transform(bld)

        # Flatten to a 1-D vector of per-row weights and keep it.
        w_train = bld_transformed.instance_weights.ravel()
        weights.append(w_train)
        print(len(w_train))

7040
7040
7040
7040
7040
7040
7040
7040
7040
7040


In [35]:
# Average the per-subgroup weight vectors element-wise: one mean weight per row.
# NOTE: the misspelled name `weigths_mean` is kept because other cells of the
# notebook may still refer to it.
weigths_mean = np.mean(weights, axis=0)

df_bld['weights'] = weigths_mean

# Result container: acc_weights[gender][race] = {'weight': mean weight, 'total': row count}.
# Built from dataset_dict instead of a hand-written literal so both stay in sync.
# Genders are listed in reverse id order ('female' first) to keep the printed
# layout of the original report. Subgroups without any rows keep weight NaN /
# total 0 instead of raising ZeroDivisionError.
acc_weights = {
    gender_alias: {
        race_alias: {'weight': float('nan'), 'total': 0}
        for race_alias in dataset_dict['race_id'].values()
    }
    for gender_alias in reversed(list(dataset_dict['gender_id'].values()))
}

# Sum of weights and number of rows per (gender, race) subgroup in one pass,
# replacing the row-by-row iterrows() accumulation.
subgroup_stats = df_bld.groupby(['gender', 'race'])['weights'].agg(['sum', 'size'])

for (gender_id, race_id), weight_sum, total in subgroup_stats.itertuples(name=None):
    if total == 0:
        continue  # nothing to average for an empty subgroup
    # Unknown gender/race codes raise KeyError here, as in the row-wise version.
    gender = dataset_dict['gender_id'][gender_id]
    race = dataset_dict['race_id'][race_id]
    acc_weights[gender][race]['weight'] = float(weight_sum) / int(total)
    acc_weights[gender][race]['total'] = int(total)


print(acc_weights)




{'female': {'white': {'weight': 1.0000859768999493, 'total': 1388}, 'black': {'weight': 1.0000947320909475, 'total': 655}, 'asian': {'weight': 0.9996453453531876, 'total': 568}, 'indian': {'weight': 1.000131283142888, 'total': 523}, 'others': {'weight': 1.0001248417406206, 'total': 298}}, 'male': {'white': {'weight': 1.00007922120209, 'total': 1616}, 'black': {'weight': 0.9998653674745138, 'total': 681}, 'asian': {'weight': 0.999727850814823, 'total': 459}, 'indian': {'weight': 0.9999673852528987, 'total': 637}, 'others': {'weight': 1.0001095340344832, 'total': 215}}}
