# Explore possible preprocessing methods with AIF360

In [1]:
import os
import glob

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



# Folder that holds the UTKFace images, relative to the working directory.
dataset_folder_name = 'Dataset/UTKFace'

# Width and height are set to the same value.
IM_WIDTH, IM_HEIGHT = 198, 198

# Integer code -> human-readable label for each categorical attribute.
dataset_dict = {
    'race_id': dict(enumerate(['white', 'black', 'asian', 'indian', 'others'])),
    'gender_id': dict(enumerate(['male', 'female'])),
}

# Reverse lookups: human-readable label -> integer code.
dataset_dict['gender_alias'] = {
    label: code for code, label in dataset_dict['gender_id'].items()
}
dataset_dict['race_alias'] = {
    label: code for code, label in dataset_dict['race_id'].items()
}


def parse_dataset(dataset_path, ext='jpg'):
    """
    Collect the labels encoded in the image file names of a dataset folder.

    Every file in ``dataset_path`` whose extension is ``ext`` is expected to be
    named ``<age>_<gender>_<race>_<anything>.<ext>``, where the first three
    fields are integers. Files whose name does not follow that pattern are
    left out of the result.

    Args:
        dataset_path: Directory that is searched (non-recursively) for files.
        ext: File extension to match, without the leading dot.

    Returns:
        A DataFrame with the columns ``age``, ``gender``, ``race`` and
        ``file`` (the path returned by ``glob``), one row per parsable file.
        The frame is empty, but still has these four columns, when no file
        matches. pandas may store the numeric columns as floats when at
        least one file had to be skipped.
    """
    def parse_info_from_file(path):
        """
        Return ``(age, gender, race)`` parsed from the file name of ``path``,
        or ``(None, None, None)`` when the name does not match the pattern.
        """
        stem = os.path.splitext(os.path.basename(path))[0]
        try:
            age, gender, race, _ = stem.split('_')
            return int(age), int(gender), int(race)
        except ValueError:
            # Raised both by a wrong number of '_'-separated fields and by a
            # non-integer field; such files are dropped further below.
            return None, None, None

    files = glob.glob(os.path.join(dataset_path, "*.%s" % ext))
    records = [parse_info_from_file(file) for file in files]

    # Naming the columns up front keeps the frame well-formed even when
    # ``files`` is empty (renaming the columns afterwards would fail then).
    df = pd.DataFrame(records, columns=['age', 'gender', 'race'])
    df['file'] = files

    # Rows of unparsable files contain only missing values; remove them.
    return df.dropna()


df = parse_dataset(dataset_folder_name)

# Age bucket edges. With pd.cut's defaults every bucket is right-inclusive:
# (0, 10], (10, 20], (20, 30], (30, 40], (40, 60], (60, 80], (80, inf].
bins = [0, 10, 20, 30, 40, 60, 80, np.inf]
# Replace the raw age with the index (0-6) of the bucket it falls into.
age_binned = pd.cut(df['age'], bins, labels=list(range(len(bins) - 1)))
df['age'] = age_binned

# Only the three attribute columns are kept from here on.
df = df.drop('file', axis=1)


print(df.head())

  age  gender  race
0   6     0.0   0.0
1   6     0.0   0.0
2   6     1.0   0.0
3   6     1.0   0.0
4   6     1.0   0.0


## One-vs-Rest Reweighing for the group with the lowest disparate impact

Target group: Asian women (`race == 2`, `gender == 1`).

In [20]:
# Flag the intersectional group of interest: rows with race code 2 and
# gender code 1 get a 1, every other row gets a 0.
# A vectorised comparison is used instead of a row-by-row iterrows() loop,
# which builds a Series per row and is slow on large frames.
is_target_group = (df['race'] == 2) & (df['gender'] == 1)
inter_groups = is_target_group.astype(int).tolist()


df['inter_group'] = inter_groups

# Number of rows that belong to the flagged group (shown as the cell output).
df[df['inter_group'] == 1].count()



age            1859
gender         1859
race           1859
inter_group    1859
dtype: int64

In [2]:
from aif360.datasets import BinaryLabelDataset

# Wrap the frame for AIF360: 'gender' is the label column (0 is declared the
# favorable value, 1 the unfavorable one) and 'race' and 'age' are declared
# as protected attributes.
bld = BinaryLabelDataset(
    df=df,
    label_names=['gender'],
    protected_attribute_names=['race', 'age'],
    favorable_label=0,
    unfavorable_label=1,
)

In [6]:
from aif360.algorithms.preprocessing import Reweighing


# Each group is described by a list of {protected attribute: value} dicts.
unprivileged = [{'race': 1}]
privileged = [{'race': 0}]
RW = Reweighing(unprivileged_groups=unprivileged, privileged_groups=privileged)

# Fit the reweighing on the dataset, then apply it to the same dataset.
RW.fit(bld)
bld_transformed = RW.transform(bld)




In [7]:
# Take the instance weights of the reweighed dataset and flatten them.
instance_weights = bld_transformed.instance_weights
w_train = instance_weights.ravel()