In [1]:
import math
import time
import copy
import numpy as np
import pandas as pd
from collections import Counter
from aif360.datasets import MEPSDataset19
from sklearn.preprocessing import LabelEncoder
import fairlearn.datasets as fairlearnDatasets
from sklearn.model_selection import train_test_split
from DebugRF import Dataset, FairnessMetric, FairnessDebuggingUsingMachineUnlearning

# Dataset class for the MEPS dataset

In [2]:
class MEPSDataset(Dataset):
    '''Load the MEPS train/test splits and expose two discretized views of them.

    Both views are built once, at construction time, from deep copies of the
    raw splits:

    * "normal" preprocessing: AGE, PCS42, MCS42 and K6SUM42 are binarized to
      the codes 0/1.
    * "categorization" preprocessing: the same columns are replaced by
      readable labels such as 'AGE = young' or 'PCS42 = high'.

    In both views the `status` column is cast to int. Every data-dependent
    threshold is taken from the training split, so train and test are always
    binned identically.
    '''

    # Continuous columns that both preprocessing variants discretize.
    # Kept as a list: it is used directly as a DataFrame column selector.
    _BINNED_COLUMNS = ['AGE', 'PCS42', 'MCS42', 'K6SUM42']
    # Right-inclusive age separating code 0 from code 1 in the binary view.
    _AGE_BINARY_SPLIT = 30

    def __init__(self, rootTrain, rootTest):
        '''Read both splits through the base class and precompute the derived views.

        rootTrain / rootTest are forwarded unchanged to Dataset.__init__.
        '''
        Dataset.__init__(self, rootTrain = rootTrain, rootTest = rootTest)
        # trainDataset / testDataset are not assigned in this class; presumably
        # Dataset.__init__ populates them -- TODO confirm against DebugRF.
        self.train = self.trainDataset
        self.test = self.testDataset
        self.trainProcessed = self.__preprocessDataset(self.train)
        self.testProcessed = self.__preprocessDataset(self.test)
        self.trainLattice = self.__preprocessDatasetForCategorization(self.train)
        self.testLattice = self.__preprocessDatasetForCategorization(self.test)

    def getDataset(self):
        '''Return (dataset, train, test) as raw, unprocessed frames.'''
        # NOTE(review): self.dataset is never assigned in this class; it is
        # presumably provided by the Dataset base class -- confirm.
        return self.dataset, self.train, self.test

    def getDatasetWithNormalPreprocessing(self):
        '''Return the (train, test) pair with the four binned columns coded 0/1.'''
        return self.trainProcessed, self.testProcessed

    def getDatasetWithCategorizationPreprocessing(self, decodeAttributeValues = False):
        '''Return the (train, test) pair with readable labels in the binned columns.

        When decodeAttributeValues is truthy, every remaining column except
        `status` is additionally rewritten as '<column> = <value>' strings.
        '''
        if decodeAttributeValues:
            return (self.__decodeAttributeCodeToRealValues(self.trainLattice),
                    self.__decodeAttributeCodeToRealValues(self.testLattice))
        return self.trainLattice, self.testLattice

    def __trainMedians(self):
        '''Return the training-split median of each binned column (Series keyed by column).'''
        return self.train[self._BINNED_COLUMNS].quantile(0.5)

    def __preprocessDataset(self, dataset):
        '''Return a copy of `dataset` with the binned columns coded 0/1 and `status` as int.

        AGE is split at 30 (<= 30 -> 0, > 30 -> 1); every other binned column
        is split at its training median (<= median -> 0, > median -> 1).
        '''
        df = copy.deepcopy(dataset)
        medians = self.__trainMedians()
        for col in self._BINNED_COLUMNS:
            threshold = self._AGE_BINARY_SPLIT if col == 'AGE' else medians[col]
            # Open-ended outer edges: pd.cut turns out-of-range values into NaN,
            # so edges derived from the training min/max would silently drop
            # test rows lying outside the range seen during training.
            df[col] = pd.cut(df[col],
                             [-math.inf, threshold, math.inf],
                             labels = [0, 1],
                             right = True,
                             include_lowest = True)
        df['status'] = df['status'].astype(int)
        return df

    def __preprocessDatasetForCategorization(self, dataset):
        '''Return a copy of `dataset` with readable labels in the binned columns.

        AGE becomes young / middle-aged / old using the fixed edges
        0-30-60-100; every other binned column becomes '<col> = low' or
        '<col> = high' around its training median. `status` is cast to int.
        '''
        df = copy.deepcopy(dataset)
        medians = self.__trainMedians()
        for col in self._BINNED_COLUMNS:
            if col == 'AGE':
                # Fixed domain bins; ages outside [0, 100] match no bin and become NaN.
                df[col] = pd.cut(df[col],
                                 [0, 30, 60, 100],
                                 labels = ['AGE = young', 'AGE = middle-aged', 'AGE = old'],
                                 right = True,
                                 include_lowest = True)
            else:
                # Open-ended outer edges so unseen extremes in the test split
                # are still assigned to low/high instead of NaN.
                df[col] = pd.cut(df[col],
                                 [-math.inf, medians[col], math.inf],
                                 labels = [str(col) + ' = low', str(col) + ' = high'],
                                 right = True,
                                 include_lowest = True)
        df['status'] = df['status'].astype(int)
        return df

    def __decodeAttributeCodeToRealValues(self, dataset):
        '''Return a copy of `dataset` whose remaining coded columns read '<column> = <value>'.

        The four binned columns and `status` are left untouched.
        '''
        df = copy.deepcopy(dataset)
        untouched = set(self._BINNED_COLUMNS) | {'status'}
        for col in df.columns:
            if col in untouched:
                continue
            # Each column's mapping depends only on that column, so it can be
            # built and applied in a single pass.
            codeToLabel = {key: col + " = " + str(key) for key in df[col].unique()}
            # fillna keeps the original value wherever the mapping yields NaN.
            df[col] = df[col].map(codeToLabel).fillna(df[col])
        return df

# Experiment

In [3]:
# Build the dataset wrapper from the pre-split MEPS CSV files.
myDataset = MEPSDataset(
    rootTrain = 'Dataset/MEPS_train.csv',
    rootTest = 'Dataset/MEPS_test.csv',
)

In [4]:
'''
Favorable and unfavorable labels
1.0 0.0
Protected attribute names
['RACE']
Privileged and unprivileged protected attribute values
[array([1.])] [array([0.])]
'''
# Presumably [protected attribute, privileged value, unprivileged value],
# matching the note above -- confirm against DebugRF. "status" is the column
# both preprocessing routines cast to int.
fairnessDebug = FairnessDebuggingUsingMachineUnlearning(
    myDataset,
    ["RACE", 1.0, 0.0],
    "status",
    FairnessMetric.SP,
)

# Report the baseline accuracy and the three parity figures on one line.
# getAccuracy() is concatenated as-is (it must already be a string); the
# parity values are converted explicitly.
print("".join([
    "OriginalAccuracy: ", fairnessDebug.getAccuracy(),
    ", originalSP: ", str(fairnessDebug.getDatasetStatisticalParity()),
    ", originalPP: ", str(fairnessDebug.getDatasetPredictiveParity()),
    ", originalEO: ", str(fairnessDebug.getDatasetEqualizingOddsParity()),
]))

OriginalAccuracy: 84.80732785849654%, originalSP: 0.03423535058248615, originalPP: 0.05171208944793859, originalEO: 0.037964663583078734


In [5]:
# Search the lattice for candidate bias-inducing subsets.
# NOTE(review): the positional arguments look like (number of levels,
# (min support, max support), preprocessing mode, flag) judging by the
# "level: 0 / level: 1" log and the Support values in the output --
# confirm against DebugRF.
bias_inducing_subsets = fairnessDebug.latticeSearchSubsets(
    2,
    (0.05, 0.15),
    "normal",
    True,
)
bias_inducing_subsets

level: 0
level: 1


Unnamed: 0,Subset,Size,Support,Parity,GT_Parity,Accuracy,GT_Accuracy,timeElapsedToTrain,timeElapsedToDelete,Parity_Reduction,Accuracy_Reduction
0,"{'CANCERDX = 1', 'CHBRON = 2'}",464,0.05862286797220467,0.00658754744971421,0.012117457131637506,0.8382817435249527,0.8423878711307644,0.2028498649597168,0.08526420593261719,80.758055,1.1545623836126617
1,"{'EMPST = 4', 'INSCOV = 1'}",850,0.10739102969046115,0.008214145468824992,0.013092194249312798,0.8382817435249527,0.8395451674036639,0.20641684532165527,0.11821985244750977,76.006831,1.1545623836126617
2,"{'CANCERDX = 1', 'EMPHDX = 2'}",460,0.05811749842072015,0.00878310572014486,0.015451808543130155,0.8408085912823753,0.8445988629185092,0.20779728889465332,0.09867453575134277,74.344923,0.8566108007448767
3,"{'CANCERDX = 1', 'COGLIM = 2'}",425,0.05369551484523057,0.00927091059819364,0.013499716392512762,0.8398610233733418,0.8436512950094757,0.20272302627563477,0.08204960823059082,72.920065,0.9683426443202977
4,{'CANCERDX = 1'},489,0.061781427668982945,0.00975871547624242,0.00878310572014486,0.8376500315855969,0.8401768793430195,0.20605897903442383,0.1240377426147461,71.495208,1.2290502793296176
...,...,...,...,...,...,...,...,...,...,...,...
151,"{'RTHLTH = 1', 'MIDX = -1'}",1161,0.1466835123183828,0.03415419520921506,0.036268598106374624,0.848389134554643,0.8493367024636765,0.2090592384338379,0.4011108875274658,0.237051,-0.03724394785846493
152,"{'RTHLTH = 1', 'ANGIDX = -1'}",1161,0.1466835123183828,0.03415419520921506,0.036268598106374624,0.848389134554643,0.8493367024636765,0.2000725269317627,0.3745994567871094,0.237051,-0.03724394785846493
153,"{'RTHLTH = 1', 'CHDDX = -1'}",1161,0.1466835123183828,0.03415419520921506,0.036268598106374624,0.848389134554643,0.8493367024636765,0.20105600357055664,0.4262092113494873,0.237051,-0.03724394785846493
154,"{'RTHLTH = 1', 'HIBPDX = -1'}",1161,0.1466835123183828,0.03415419520921506,0.036268598106374624,0.848389134554643,0.8493367024636765,0.22005915641784668,0.4376249313354492,0.237051,-0.03724394785846493


# How explainable are the bias-inducing subsets?

In [11]:
# The five highest-ranked subsets, copied by hand from the first rows of the
# lattice-search result table shown above (list order matches that table).
subsets = [
    {'CANCERDX = 1', 'CHBRON = 2'},
    {'EMPST = 4', 'INSCOV = 1'},
    {'CANCERDX = 1', 'EMPHDX = 2'},
    {'CANCERDX = 1', 'COGLIM = 2'},
    {'CANCERDX = 1'},
]

In [12]:
# Per-feature importance changes for each selected subset
# (one row per subset, one column per feature in the rendered table).
featImp = fairnessDebug.getFeatureImportanceChanges(
    subsets,
)
featImp

Unnamed: 0,Subset,ARTHTYPE,WLKLIM,EMPST,ACTDTY,ACTLIM,SOCLIM,RTHLTH,AGE,ARTHDX,...,ANGIDX,SEX,ASTHDX,REGION,FTSTU,DFSEE42,MCS42,EMPHDX,DFHEAR42,STRKDX
0,"{'CANCERDX = 1', 'CHBRON = 2'}",-27.377751868507094,-1.5826293278665589,-24.51602110829966,-8.758786679157266,49.67436944899554,23.88223967878682,48.26101420797017,-34.176303205234866,33.535984649980364,...,-19.98732949860651,50.3305899736448,110.96333081292556,65.93105384237086,40.27297598821297,-23.1188494052614,12.49012805522468,154.62495415867875,50.7191488411061,170.817035869671
1,"{'EMPST = 4', 'INSCOV = 1'}",-33.91279992524495,-5.972165427762742,-14.214092426263717,-1.23315489873609,30.831020816232936,25.05294113129525,29.55849780919425,3.632752274606781,77.90946410240285,...,-27.727952402247148,42.26181457513621,68.88807088042498,126.91777386073802,-59.39259428611854,72.607360636087,69.99120826547517,33.00642910865904,95.8277135482071,101.07414982257062
2,"{'CANCERDX = 1', 'EMPHDX = 2'}",-9.19858613580056,-2.0306515124014184,-42.52875933644951,-31.1587496653253,93.0974474016187,34.254259548177,5.423045675078247,-7.372035401797698,35.297126252750815,...,-10.74467456442053,39.79841469784552,160.09517980957676,66.31291060659962,-43.98450564951868,-0.2137027562636661,14.284009800221666,362.0062254129016,71.1379226026904,201.84369013139653
3,"{'CANCERDX = 1', 'COGLIM = 2'}",-25.961246705305285,-9.614700949287224,-43.20422176346598,-28.103726217042844,50.10853435031106,84.60934293226966,68.09939368628977,-40.43957220241478,44.5438020664587,...,-68.17809328602256,33.38450941869932,151.45257203598922,61.32282368477348,-33.342987174564044,7.946147407461456,37.25964470269765,259.2064942897382,151.38389845574764,133.9783548332582
4,{'CANCERDX = 1'},-19.71096601071488,-8.407238031077323,-41.59429629131559,-15.77902642619721,21.69966692651775,85.1249412180715,24.934265347684374,-57.870003940635,42.36770897996631,...,-25.527992007785937,11.588618255558044,72.60109576387362,23.08875674123772,-24.278244629552816,29.41399744145894,66.66367200605306,348.9649253281321,128.24220330884043,315.1278512330995


In [13]:
# Summarize each subset against the entire training set. The two strings
# show up as the group names in the result's column headers
# (Total_Blacks, Total_Non_Blacks, ...).
inference = fairnessDebug.drawInferencesFromResultSubsets(
    subsets,
    "Blacks",
    "Non_Blacks",
)
inference

Unnamed: 0,Subset,Size,Support,SupportRange,Total_Blacks,Total_Non_Blacks,Blacks_1s,Non_Blacks_1s,Blacks_0s,Non_Blacks_0s
0,Entire Train Dataset,7915,1.0,100%,5051,2864,0.13,0.26,0.87,0.74
1,"{'CANCERDX = 1', 'CHBRON = 2'}",464,0.0586228679722046,5-10%,154,310,0.14,0.22,0.86,0.78
2,"{'EMPST = 4', 'INSCOV = 1'}",850,0.1073910296904611,10-30%,391,459,0.18,0.18,0.82,0.82
3,"{'CANCERDX = 1', 'EMPHDX = 2'}",460,0.0581174984207201,5-10%,164,296,0.23,0.17,0.77,0.83
4,"{'CANCERDX = 1', 'COGLIM = 2'}",425,0.0536955148452305,5-10%,143,282,0.18,0.19,0.82,0.81
5,{'CANCERDX = 1'},489,0.0617814276689829,5-10%,169,320,0.21,0.17,0.79,0.82
