In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix


In [17]:
class FairnessLabel:
    
    def __init__(self, data,target_v, model_flag = False, sensi_lab=[], protected_values=[], positive_label = None):
        self.model_flag = model_flag
        
        # Self.data to store data for function use
        self.data = data
        # If positive variable not given, auto_detect one 
        if positive_label:
            self.positive_label = positive_label
        else:
            self.positive_label = data[target_v].unique().tolist()[0]
            
        self.target_v = target_v
        
        # USER CAN NOT INPUT Attributes = 1
        if len(sensi_lab) == 1: 
            raise ValueError
        
         # TODO 
        # store the auto-detected sensitive attribuets in the order of difference of the target feature for the protected value among the values from the sensitive attribute
        # for example. black is protected value of rave and it has most different number of positive and negative target label, then race is the first attribuet in this list
        self.sensitive_atts = {}
        # store the auto-detected values for each sensitive attribute in above list, key is the sensitive attribute name. value is a ordered list of values
        # values are in the order of privilege from the most privileged value to the protected value. For example, for race, the value is [white, asian, black] since black is protected value
        self.sensi_atts_values = {}
        # store the auto-detected protected value for each sensitive attribute in above list: key is the sensitive attribut and value is the protevted value of this sensitive attribute
        # if "protected_values" is specified by user, then formatt the content in "protected_values" as dict: key and value is the same as above
        self.protected_values = {}
        # generate all pair-wise combinations in variable pair_comb
        self.pair_comb = []
        
        for i in range(len(protected_values)):
            self.protected_values.update({sensi_lab[i]: protected_values[i]})
        
        if sensi_lab and protected_values:
            
            # UPDATE self.pair_comb
            for i in range(len(sensi_lab) - 1):
                for j in range(i + 1,len(sensi_lab)):
                    self.pair_comb += [(sensi_lab[i], sensi_lab[j])]
            
            # UPDATE  self.sensitive_atts AND  self.sensi_atts_values
            for label in sensi_lab:
                categories = data[label].unique()
                # temporarily store each categories and corresponding values 
                cat_num = []
                for cat in categories:
                    cat_num += [(cat, len(data.loc[data[label] == cat ]))]
                cat_num = sorted(cat_num, key = lambda x: x[1], reverse = True)
                # UPDATE self.sensitive_atts for each label 
                self.sensitive_atts.update({label: [cat_num[i][0] for i in range(len(cat_num))]})
                
                # update self.sensi_atts_values for each label 
                self.sensi_atts_values.update({label: cat_num[-1][0]})
                    
                
                    
        else:
            # loop through all variables and find the categorical values 
            sensi_lab = []
            for var_name in list(data.columns):
                # we regard all variables with distinct values <= 5 as categorical values
                if len(data[var_name].unique()) <= 5:
                    sensi_lab += [var_name]
            if target_v in sensi_lab:
                sensi_lab.remove(target_v)
                    
            
            # UPDATE self.pair_comb
            for i in range(len(sensi_lab) - 1):
                for j in range(i + 1,len(sensi_lab)):
                    self.pair_comb += [(sensi_lab[i], sensi_lab[j])]
            # run auto-detection
            
            # UPDATE  self.sensitive_atts AND  self.sensi_atts_values
            for label in sensi_lab:
                categories = data[label].unique()
                # temporarily store each categories and corresponding values 
                cat_num = []
                for cat in categories:
                    cat_num += [(cat, len(data.loc[data[label] == cat ]))]
                cat_num = sorted(cat_num, key = lambda x: x[1], reverse = True)
                # update self.sensitive_atts for each label 
                self.sensitive_atts.update({label: [cat_num[i][0] for i in range(len(cat_num))]})
                # update self.sensi_atts_values for each label 
                self.sensi_atts_values.update({label: cat_num[-1][0]})
            
       
        
        # help dict for value normalization, use later
        self.base_value = {}
        
        # update after extraction
        self.static_label = {}
        self.performance_label = {}
        
        
   
    def extract_static_label(self):
        # TODO: add extraction code
        
        # renew self.staic_label
            self.static_label = {}
    
            # update static label
            for pair in self.pair_comb:
                # temporary dic to store values
                label1 = pair[0]
                label2 = pair[1]
                # calculate static label for each pair 
                numerator = self.data.groupby([label1,label2,self.target_v]).count()
                denominator = self.data.groupby([label1,label2]).count()
                table = (numerator / denominator).to_dict()
                table = table[list(table.keys())[0]]
                keys = list(table.keys())
                for i in range(0,len(keys)-1, 2):
                    if self.target_v in keys[i]:
                        pos = table[keys[i]]
                        neg = table[keys[i+1]]
                    else:
                        pos = table[keys[i+1]]
                        neg = table[keys[i]]
                    self.static_label.update({(keys[i][0],keys[i][1]):{'positive': pos, 'negative': neg}})
                
    
                                         
                            
                        
                        
                        
                        
    def compute_evaluation_metric(self,data, data_prediction, y_label, positive_label, label_order):
        #TODO: add weight for generalized metric
        # validity of the input data, must have two labels
        if len(data[y_label].unique()) != 2 or len(data_prediction[y_label].unique()) != 2: # compute manually
            
            TP = data[data[y_label] == positive_label].shape[0]
            TN = data.shape[0] - TP
            FP = len(set(data_prediction[data_prediction[y_label] == positive_label].index).intersection(data[data[y_label] == positive_label].index))
            FN = len(set(data_prediction[data_prediction[y_label] != positive_label].index).intersection(data[data[y_label] != positive_label].index))
        else:    
            TN, FP, FN, TP = confusion_matrix(list(data[y_label]), list(data_prediction[y_label]), labels=label_order).ravel()
        P = TP + FN
        N = TN + FP
        ACC = (TP+TN) / (P+N) if (P+N) > 0.0 else np.float64(0.0)
        return dict(
                    PR = P/ (P+N), P = TP + FN, N = TN + FP,
                    TPR=TP / P, TNR=TN / N, FPR=FP / N, FNR=FN / P,
                    PPV=TP / (TP+FP) if (TP+FP) > 0.0 else np.float64(0.0),
                    NPV=TN / (TN+FN) if (TN+FN) > 0.0 else np.float64(0.0),
                    FDR=FP / (FP+TP) if (FP+TP) > 0.0 else np.float64(0.0),
                    FOR=FN / (FN+TN) if (FN+TN) > 0.0 else np.float64(0.0),
                    ACC=ACC,
                    ERR=1-ACC,
                    F1=2*TP / (2*TP+FP+FN) if (2*TP+FP+FN) > 0.0 else np.float64(0.0)
                )
        
        
                        
    
                        
    
    def extract_performance_label(self,data, data_prediction, y_label, positive_label, label_order):
        
        for pair in self.pair_comb:
            att1 = pair[0]
            att2 = pair[1]
            cat_1 = data[att1].unique().tolist()
            cat_2 = data[att2].unique().tolist()
            for cat1 in cat_1:
                for cat2 in cat_2:
                    data_cat = data[(data[att1] == cat1) & (data[att2] == cat2)]
                    data_pred_cat = data_prediction[(data_prediction[att1] == cat1) & (data_prediction[att2] == cat2) ]
                    performance_info = self.compute_evaluation_metric(data_cat, data_pred_cat, y_label, positive_label, label_order)
                    self.performance_label.update({(cat1,cat2):performance_info})
            
        # round float numbers to 3 digits
        for tuple_key in list(self.performance_label.keys()):
            for key in list(self.performance_label[tuple_key].keys()):
                self.performance_label[tuple_key][key] = round(self.performance_label[tuple_key][key], 3)

                
                
            
            
    
    def add_sensitive_att(self, new_sensitive_att):
        # new_sensitive_att is usually a non-traditional sensitive feature like race or gender but user want to treat it as sensitive
        # update all the information in the __init__ accordingly
        for label in new_sensitive_att:
            categories = self.data[label].unique()
                # temporarily store each categories and corresponding values 
            cat_num = []
            for cat in categories:
                cat_num += [(cat, len(self.data.loc[self.data[label] == cat ]))]
                cat_num = sorted(cat_num, key = lambda x: x[1], reverse = True)
                # update self.sensitive_atts for each label 
                self.sensitive_atts.update({label: [cat_num[i][0] for i in range(len(cat_num))]})
                # update self.sensi_atts_values for each label
                self.sensi_atts_values.update({label: cat_num[-1][0]})
        
        # auto renew static_label
        self.extract_static_label()
            

print('done')
        

done


In [18]:
# EXAMPLE1: DO NOT PASS VARIABLES TO VARIABLES (static label)
data = pd.read_csv('../data/adult.csv')
fair = FairnessLabel(data,'income-per-year', False, [], [] ,'>50K')
fair.extract_static_label()
fair.static_label

{('Amer-Indian-Eskimo', 'Female'): {'positive': 0.10084033613445378,
  'negative': 0.8991596638655462},
 ('Amer-Indian-Eskimo', 'Male'): {'positive': 0.125, 'negative': 0.875},
 ('Asian-Pac-Islander', 'Female'): {'positive': 0.12427745664739884,
  'negative': 0.8757225433526011},
 ('Asian-Pac-Islander', 'Male'): {'positive': 0.3362193362193362,
  'negative': 0.6637806637806638},
 ('Black', 'Female'): {'positive': 0.05787781350482315,
  'negative': 0.9421221864951769},
 ('Black', 'Male'): {'positive': 0.18929254302103252,
  'negative': 0.8107074569789675},
 ('Other', 'Female'): {'positive': 0.05504587155963303,
  'negative': 0.944954128440367},
 ('Other', 'Male'): {'positive': 0.11728395061728394,
  'negative': 0.8827160493827161},
 ('White', 'Female'): {'positive': 0.118953945845869,
  'negative': 0.881046054154131},
 ('White', 'Male'): {'positive': 0.31756545321789925,
  'negative': 0.6824345467821008}}

In [19]:
# EXAMPLE2: PASS protected attributes to the variables (static label)
data = pd.read_csv('../data/adult.csv')

'''
target_var = data['income-per-year'].unique()
for i, value in enumerate(target_var):
    if i == 0 :
        data = data.replace(value, 0)
    else:
        data = data.replace(value, 1)
display(data)
'''
fair = FairnessLabel(data,'income-per-year', False, ['race', 'sex','relationship'], ['White','Female','Not-in-family'],'>50K')
fair.extract_static_label()
fair.static_label

{('Amer-Indian-Eskimo', 'Female'): {'positive': 0.10084033613445378,
  'negative': 0.8991596638655462},
 ('Amer-Indian-Eskimo', 'Male'): {'positive': 0.125, 'negative': 0.875},
 ('Asian-Pac-Islander', 'Female'): {'positive': 0.12427745664739884,
  'negative': 0.8757225433526011},
 ('Asian-Pac-Islander', 'Male'): {'positive': 0.3362193362193362,
  'negative': 0.6637806637806638},
 ('Black', 'Female'): {'positive': 0.05787781350482315,
  'negative': 0.9421221864951769},
 ('Black', 'Male'): {'positive': 0.18929254302103252,
  'negative': 0.8107074569789675},
 ('Other', 'Female'): {'positive': 0.05504587155963303,
  'negative': 0.944954128440367},
 ('Other', 'Male'): {'positive': 0.11728395061728394,
  'negative': 0.8827160493827161},
 ('White', 'Female'): {'positive': 0.118953945845869,
  'negative': 0.881046054154131},
 ('White', 'Male'): {'positive': 0.31756545321789925,
  'negative': 0.6824345467821008},
 ('Amer-Indian-Eskimo', 'Husband'): {'positive': 0.18478260869565216,
  'negative'

In [20]:
# EXAMPLE 3: PERFORMANCE LABELS 
fair.extract_performance_label(data, data, 'income-per-year', '>50K', ['>50K','<=50K'])
fair.performance_label

{('White', 'Male'): {'PR': 0.682,
  'P': 13085,
  'N': 6089,
  'TPR': 1.0,
  'TNR': 1.0,
  'FPR': 0.0,
  'FNR': 0.0,
  'PPV': 1.0,
  'NPV': 1.0,
  'FDR': 0.0,
  'FOR': 0.0,
  'ACC': 1.0,
  'ERR': 0.0,
  'F1': 1.0},
 ('White', 'Female'): {'PR': 0.881,
  'P': 7614,
  'N': 1028,
  'TPR': 1.0,
  'TNR': 1.0,
  'FPR': 0.0,
  'FNR': 0.0,
  'PPV': 1.0,
  'NPV': 1.0,
  'FDR': 0.0,
  'FOR': 0.0,
  'ACC': 1.0,
  'ERR': 0.0,
  'F1': 1.0},
 ('Black', 'Male'): {'PR': 0.811,
  'P': 1272,
  'N': 297,
  'TPR': 1.0,
  'TNR': 1.0,
  'FPR': 0.0,
  'FNR': 0.0,
  'PPV': 1.0,
  'NPV': 1.0,
  'FDR': 0.0,
  'FOR': 0.0,
  'ACC': 1.0,
  'ERR': 0.0,
  'F1': 1.0},
 ('Black', 'Female'): {'PR': 0.942,
  'P': 1465,
  'N': 90,
  'TPR': 1.0,
  'TNR': 1.0,
  'FPR': 0.0,
  'FNR': 0.0,
  'PPV': 1.0,
  'NPV': 1.0,
  'FDR': 0.0,
  'FOR': 0.0,
  'ACC': 1.0,
  'ERR': 0.0,
  'F1': 1.0},
 ('Asian-Pac-Islander', 'Male'): {'PR': 0.664,
  'P': 460,
  'N': 233,
  'TPR': 1.0,
  'TNR': 1.0,
  'FPR': 0.0,
  'FNR': 0.0,
  'PPV': 1.0,
 