In [1]:
import math
import time
import copy
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
import fairlearn.datasets as fairlearnDatasets
from sklearn.model_selection import train_test_split
from FairDebugger import Dataset, FairnessMetric, FairnessDebuggingUsingMachineUnlearning

# Dataset class for ACS Income dataset

In [2]:
class ACSIncomeDataset(Dataset):
    '''Loads the ACS Income train/test splits and builds a categorical ("lattice") view of them.

    Two views of each split are kept:
      * self.train / self.test               -- the frames exposed by the base class as
                                                trainDataset / testDataset, untouched here.
      * self.trainLattice / self.testLattice -- copies in which AGEP and WKHP are binned
                                                and coded columns are replaced by readable
                                                "<COLUMN> = <label>" strings.

    Parameters
    ----------
    rootTrain, rootTest : forwarded unchanged to Dataset.__init__ (the notebook passes CSV paths).
    column_names : forwarded unchanged to Dataset.__init__; defaults to None.
    '''

    # Columns whose (stringified) values are prefixed with "<COLUMN> = ".
    _PREFIXED_COLUMNS = ('AGEP', 'WKHP', 'OCCP')

    # Integer code -> readable label, per coded column. Insertion order is the order in
    # which the columns are decoded.
    # TODO: add an OCCP code-to-label mapping here.
    _CODE_TO_LABEL = {
        "COW": {
            1: "COW = Private",
            2: "COW = Private-non-profit",
            3: "COW = Local-gov",
            4: "COW = State-gov",
            5: "COW = Fed-gov",
            6: "COW = Self-employed-not-own",
            7: "COW = Self-employed-own",
            8: "COW = Without-pay",
            9: "COW = Unemployed"
        },
        "SCHL": {
            1: "SCHL = No schooling completed",
            2: "SCHL = Nursery school/preschool",
            3: "SCHL = Kindergarten",
            4: "SCHL = Grade 1",
            5: "SCHL = Grade 2",
            6: "SCHL = Grade 3",
            7: "SCHL = Grade 4",
            8: "SCHL = Grade 5",
            9: "SCHL = Grade 6",
            10: "SCHL = Grade 7",
            11: "SCHL = Grade 8",
            12: "SCHL = Grade 9",
            13: "SCHL = Grade 10",
            14: "SCHL = Grade 11",
            15: "SCHL = 12th Grade-no diploma",
            16: "SCHL = Regular high school diploma",
            17: "SCHL = GED or alternative credential",
            18: "SCHL = Some college-<1 year",
            19: "SCHL = >=1 college credit but no degree",
            20: "SCHL = Associate’s degree",
            21: "SCHL = Bachelor’s degree",
            22: "SCHL = Master’s degree",
            23: "SCHL = Professional degree beyond a bachelor’s degree",
            24: "SCHL = Doctorate degree"
        },
        "MAR": {
            1: "MAR = Married",
            2: "MAR = Widowed",
            3: "MAR = Divorced",
            4: "MAR = Separated",
            5: "MAR = Never married or <15 yrs old"
        },
        "RELP": {
            0: "RELP = Reference person",
            1: "RELP = Husband/wife",
            2: "RELP = Biological son or daughter",
            3: "RELP = Adopted son or daughter",
            4: "RELP = Stepson or stepdaughter",
            5: "RELP = Brother or sister",
            6: "RELP = Father or mother",
            7: "RELP = Grandchild",
            8: "RELP = Parent-in-law",
            9: "RELP = Son-in-law or daughter-in-law",
            10: "RELP = Other relative",
            11: "RELP = Roomer or boarder",
            12: "RELP = Housemate or roommate",
            13: "RELP = Unmarried partner",
            14: "RELP = Foster child",
            15: "RELP = Other nonrelative",
            16: "RELP = Institutionalized grp quarters pop",
            17: "RELP = Noninstitutionalized grp quarters pop"
        },
        "SEX": {
            1: "SEX = Male",
            2: "SEX = Female"
        },
        "RAC1P": {
            1: "RAC1P = White alone",
            2: "RAC1P = Black or African American alone",
            3: "RAC1P = American Indian alone",
            4: "RAC1P = Alaska Native alone",
            5: "RAC1P = American Indian and/or Alaska Native tribes",
            6: "RAC1P = Asian alone",
            7: "RAC1P = Native Hawaiian and Other Pacific Islander alone",
            8: "RAC1P = Some Other Race alone",
            9: "RAC1P = Two or More Races"
        }
    }

    def __init__(self, rootTrain, rootTest, column_names = None):
        super().__init__(rootTrain = rootTrain,
                         rootTest = rootTest,
                         column_names = column_names)
        # trainDataset / testDataset are read right after the base initializer runs,
        # so they are presumably populated by Dataset.__init__ -- TODO confirm.
        self.train = self.trainDataset
        self.test = self.testDataset
        self.trainLattice = self.__preprocessDatasetForCategorization(self.train)
        self.testLattice = self.__preprocessDatasetForCategorization(self.test)

    def getDataset(self):
        '''Return (self.dataset, self.train, self.test).

        NOTE(review): self.dataset is never assigned in this class; presumably the base
        Dataset provides it -- confirm, otherwise this raises AttributeError.
        '''
        return self.dataset, self.train, self.test

    def getDatasetWithNormalPreprocessing(self):
        '''Return the (train, test) frames as provided by the base class.'''
        return self.train, self.test

    def getDatasetWithCategorizationPreprocessing(self, decodeAttributeValues = False):
        '''Return the (trainLattice, testLattice) frames built in __init__.

        decodeAttributeValues is accepted but not used: the lattice frames are always
        returned with decoded labels, whatever value is passed.
        '''
        return self.trainLattice, self.testLattice

    def __preprocessDatasetForCategorization(self, dataset):
        '''Return a categorized, label-decoded copy of `dataset` with a fresh 0..n-1 index.

        AGEP is binned into [0, 30] / (30, 60] / (60, 100] and WKHP into
        [0, 20] / (20, 40] / (40, 100]; pd.cut assigns no label to values outside
        those edges. The input frame is not modified.
        '''
        df = copy.deepcopy(dataset)
        # Converting Continuous Variables to Categorical
        df['AGEP'] = pd.cut(df['AGEP'], [0, 30, 60, 100],
                            labels = ['Young', 'Middle-aged', 'Senior'],
                            right = True, include_lowest = True)
        df['WKHP'] = pd.cut(df['WKHP'], [0, 20, 40, 100],
                            labels = ['Part-time', 'Full-time', 'Over-time'],
                            right = True, include_lowest = True)
        # Prefix only the columns that are present; OCCP may be absent from the frame.
        for col in self._PREFIXED_COLUMNS:
            if col in df.columns:
                df[col] = col + " = " + df[col].astype(str)
        df = df.reset_index(drop = True)
        return self.__decodeAttributeCodeToRealValues(df)

    def __decodeAttributeCodeToRealValues(self, dataset):
        '''Return a copy of `dataset` with coded columns replaced by readable labels.

        Every column listed in _CODE_TO_LABEL must exist in `dataset`. Codes that have
        no entry in the mapping keep their original value (the fillna fallback).
        '''
        df = copy.deepcopy(dataset)
        for col, labels in self._CODE_TO_LABEL.items():
            df[col] = df[col].map(labels).fillna(df[col])
        return df

# Experiment

In [3]:
# Locations of the train / test CSV splits, relative to the notebook's working directory.
rootTrain, rootTest = 'Dataset/trainACSIncome.csv', 'Dataset/testACSIncome.csv'

# Building the dataset object also prepares the categorized "lattice" frames.
myDataset = ACSIncomeDataset(
    rootTrain = rootTrain,
    rootTest = rootTest,
)

In [4]:
# Set up the fairness debugger on the SEX attribute (codes 1.0 / 2.0) with PINCP as the
# second column argument and statistical parity as the metric.
# NOTE(review): presumably PINCP is the prediction target and the two SEX codes are the
# compared groups -- confirm against FairDebugger.
fairnessDebug = FairnessDebuggingUsingMachineUnlearning(
    myDataset,
    ["SEX", 1.0, 2.0],
    "PINCP",
    FairnessMetric.SP,
)

# Baseline numbers before any subset is removed. getAccuracy() is concatenated as-is,
# so it must already return a string; the parity values are stringified explicitly.
print(", ".join([
    "OriginalAccuracy: " + fairnessDebug.getAccuracy(),
    "originalSP: " + str(fairnessDebug.getDatasetStatisticalParity()),
    "originalPP: " + str(fairnessDebug.getDatasetPredictiveParity()),
    "originalEO: " + str(fairnessDebug.getDatasetEqualizingOddsParity()),
]))

OriginalAccuracy: 78.65341295097794%, originalSP: 0.07633216366900852, originalPP: 0.09280686807274141, originalEO: 0.016399061449940605


In [12]:
# Search the attribute lattice for subsets of the training data that induce bias.
# NOTE(review): the arguments are positional and their meaning is defined in FairDebugger;
# the notes below are inferred from this cell's output -- confirm before relying on them.
bias_inducing_subsets = fairnessDebug.latticeSearchSubsets(
    4,             # presumably the number of lattice levels (the log prints level 0..3)
    (0.05, 0.15),  # presumably the allowed support range (reported supports fall inside it)
    "normal",
    True,
)
bias_inducing_subsets

level: 0
level: 1
level: 2
level: 3


Unnamed: 0,Subset,Size,Support,Parity,GT_Parity,Accuracy,GT_Accuracy,timeElapsedToTrain,timeElapsedToDelete,Parity_Reduction,Accuracy_Reduction
0,"{'WKHP = Over-time', 'COW = Private'}",16494,0.1474442636726083,0.0554741374136225,0.0707698954148354,0.7803840240283191,0.7826009225158223,2.830281972885132,16.51186203956604,27.325344,0.781924807928359
1,{'AGEP = Senior'},11647,0.1041156383530295,0.0608331212685462,0.0809539633519504,0.7832087817785247,0.7851396288482855,2.836836814880371,26.12115788459778,20.304733,0.4227849252170761
2,"{'AGEP = Middle-aged', 'SCHL = >=1 college cre...",10737,0.0959809057264942,0.0648733430586873,0.0791200135647782,0.7841026924589695,0.7875710658990953,2.892995595932007,9.647401809692385,15.011785,0.30913306359959
3,{'WKHP = Part-time'},16015,0.143162354960399,0.0654789135148067,0.0847387556772038,0.7809561268638038,0.7839596667500983,2.7843971252441406,29.496341943740845,14.21845,0.7091876164931605
4,{'COW = Local-gov'},9604,0.0858527166431266,0.0669430892406999,0.0678810096574135,0.7834590767690492,0.7851753852755032,2.745922327041626,6.40424919128418,12.300286,0.3909624039641795


# How explainable are the bias-inducing subsets?

In [14]:
# The five subsets reported by the lattice search, written out by hand with their
# full (untruncated) attribute labels.
subsets = [
    {'WKHP = Over-time', 'COW = Private'},
    {'AGEP = Senior'},
    {'AGEP = Middle-aged', 'SCHL = >=1 college credit but no degree'},
    {'WKHP = Part-time'},
    {'COW = Local-gov'},
]

In [15]:
# Per-feature importance changes for each of the hand-listed subsets.
# NOTE(review): presumably the relative change in importance once a subset is removed;
# the exact definition lives in FairDebugger -- confirm.
featImp = fairnessDebug.getFeatureImportanceChanges(subsets)
featImp

Unnamed: 0,Subset,WKHP,OCCP,RELP,AGEP,SCHL,MAR,SEX,COW,RAC1P
0,"{'COW = Private', 'WKHP = Over-time'}",-15.56089508226709,3.1712009166887074,18.297729818822884,-10.088977379298582,0.8118523789995513,8.065375080306092,-5.2246588049110345,615.3505224145963,-45.79848406220681
1,{'AGEP = Senior'},12.947944106312487,-7.491516110989635,23.25913332168489,-19.67487909670937,-11.307442207828544,2.321207380226812,-14.459366495120674,-9.468947336134104,-67.21270938092538
2,"{'AGEP = Middle-aged', 'SCHL = >=1 college cre...",-15.889564345028584,-11.569901600962613,-4.546650974705159,-19.847821874978635,64.137770625406,59.7697684222348,-9.590983028481274,43.70533506623185,-51.79614884839039
3,{'WKHP = Part-time'},-47.36113930253991,30.20993470022737,-2.997594341807226,-1.2866209627271417,19.607380180940936,63.793100979917085,10.266519690540123,63.18321497422297,-33.093177233116535
4,{'COW = Local-gov'},-14.78399082844805,15.731653075834544,-15.89086818400162,13.33047207246286,-6.967021244963428,60.34496813595602,-11.060411381347782,-7.5305032563927154,-67.65426772165587


In [16]:
# Group-level summary of each subset next to the whole training set.
# NOTE(review): the two strings appear as column-name prefixes in the output
# (Total_Female, Male_1s, ...); their argument roles are defined in FairDebugger -- confirm.
inference = fairnessDebug.drawInferencesFromResultSubsets(
    subsets,
    "Female",
    "Male",
)
inference

Unnamed: 0,Subset,Size,Support,SupportRange,Total_Female,Total_Male,Female_1s,Male_1s,Female_0s,Male_0s
0,Entire Train Dataset,111866,1.0,100%,54279,57587,0.31,0.44,0.69,0.56
1,"{'COW = Private', 'WKHP = Over-time'}",16494,0.1474442636726083,10-30%,5277,11217,0.38,0.37,0.62,0.63
2,{'AGEP = Senior'},11647,0.1041156383530295,10-30%,5570,6077,0.37,0.37,0.63,0.63
3,"{'AGEP = Middle-aged', 'SCHL = >=1 college cre...",10737,0.0959809057264942,5-10%,5214,5523,0.38,0.37,0.62,0.63
4,{'WKHP = Part-time'},16015,0.143162354960399,10-30%,9824,6191,0.38,0.37,0.62,0.63
5,{'COW = Local-gov'},9604,0.0858527166431266,5-10%,5610,3994,0.38,0.37,0.62,0.63
