In [113]:
pip install seaborn

Collecting seaborn
[?25l  Downloading https://files.pythonhosted.org/packages/a8/76/220ba4420459d9c4c9c9587c6ce607bf56c25b3d3d2de62056efe482dadc/seaborn-0.9.0-py3-none-any.whl (208kB)
[K     |████████████████████████████████| 215kB 5.3MB/s eta 0:00:01
Installing collected packages: seaborn
Successfully installed seaborn-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [81]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sns
import itertools

In [115]:
from itertools import combinations
lis = ["sex", "race", 'zip']
list(combinations(lis, 3))

[('sex', 'race', 'zip')]

In [168]:
class FairnessLabel:
    
    def __init__(self, data, target, model_flag = False, 
                 sensi_lab = [], protected_values = [], positive_label = None):
        
        self.model_flag = model_flag
        self.data = data
        self.target = target
        
        ''' If positive variable not given, auto_detect one '''
        if positive_label:
            self.positive_label = positive_label
        else:
            self.positive_label = data[target].unique().tolist()[0]
            
        # NOW WE CAN ACCEPT USER INPUT OF 1 ATTRIBUTES 
        if len(sensi_lab) == 1:
            self.sensitive_atts = sensi_lab
            self.protected_values = protected_values
        
        else:
            '''
            Store **auto-detected** sensitive attributes in the order of difference of the target feature 
            for the protected value among the values from the sensitive attribute.
            For example: Black is protected value of "race" and it has most different number of positive and negative target labels, 
            then race is the first attribute in this list
            '''
            self.sensitive_atts = {}

            '''
            Store **auto-detected** values for each sensitive attribute, where
            keys are names of sensitive attributes, and values are their ordered list of values.
            Values are in the order of privilege from the most privileged value to the protected value. 
            For example, for race, the value is [White, Asian, Black] since black is protected value
            '''
            self.sensi_atts_values = {}

            '''
            Store **auto-detected** protected value for each sensitive attribute,
            where key is the sensitive attribute and value is the protected value of this sensitive attribute.
            If "protected_values" is specified by user, then format the content 
            in "protected_values" as dict: key and value is the same as above
            '''
            self.protected_values = {}

            self.pair_comb = [] # generate all pair-wise combinations in variable pair_comb

            for i in range(len(protected_values)):
                self.protected_values.update({sensi_lab[i]: protected_values[i]})

            if sensi_lab:

                ''''''
                # UPDATE self.pair_comb
                for i in range(len(sensi_lab) - 1):
                    for j in range(i + 1,len(sensi_lab)):
                        self.pair_comb += [(sensi_lab[i], sensi_lab[j])]
                ''''''
                # UPDATE self.sensitive_atts AND self.sensi_atts_values
                for label in sensi_lab:
                    categories = data[label].unique()
                    # temporarily store each categories and corresponding values 
                    cat_num = []
                    for cat in categories:
                        cat_num += [(cat, len(data.loc[data[label] == cat ]))]
                    cat_num = sorted(cat_num, key = lambda x: x[1], reverse = True)
                    # UPDATE self.sensitive_atts for each label 
                    self.sensitive_atts.update({label: [cat_num[i][0] for i in range(len(cat_num))]})

                    # update self.sensi_atts_values for each label 
                    self.sensi_atts_values.update({label: cat_num[-1][0]})      

            else:
                # loop through all variables and find the categorical values 
                sensi_lab = []
                for var_name in list(data.columns):
                    if len(data[var_name].unique()) <= 5: # regard variables with distinct values <= 5 as categorical values
                        sensi_lab += [var_name]
                if target in sensi_lab:
                    sensi_lab.remove(target)

                ''''''
                # UPDATE self.pair_comb
                for i in range(len(sensi_lab) - 1):
                    for j in range(i + 1,len(sensi_lab)):
                        self.pair_comb += [(sensi_lab[i], sensi_lab[j])]
                # run auto-detection

                # UPDATE  self.sensitive_atts AND  self.sensi_atts_values
                for label in sensi_lab:
                    categories = data[label].unique()
                    # temporarily store each categories and corresponding values 
                    cat_num = []
                    for cat in categories:
                        cat_num += [(cat, len(data.loc[data[label] == cat ]))]
                    cat_num = sorted(cat_num, key = lambda x: x[1], reverse = True)
                    # update self.sensitive_atts for each label 
                    self.sensitive_atts.update({label: [cat_num[i][0] for i in range(len(cat_num))]})
                    # update self.sensi_atts_values for each label 
                    self.sensi_atts_values.update({label: cat_num[-1][0]})
            
       
        
        # help dict for value normalization, use later
        self.base_value = {}
        
        # update after extraction
        self.static_labels = {}
        self.performance_labels = {}
        
        
   
    def extract_static_label(self):
        # TODO: add extraction code
        
        # renew self.staic_label
        self.static_labels = {}
        negative_labels = self.data[self.target].unique().tolist()
        negative_labels.remove(self.positive_label)
        print(negative_labels)

        if len(self.sensitive_atts) == 1:
            numerator = self.data.groupby([self.sensitive_atts[0],self.target]).count()
            denominator = self.data.groupby(self.sensitive_atts[0]).count()
            table = (numerator / denominator).to_dict()
            table = table[list(table.keys())[0]]
            keys = list(table.keys())
            tem_info = {}
            for i in range(0,len(keys)-1, 2):
                if self.target in keys[i]:
                    pos = table[keys[i]]
                    neg = table[keys[i+1]]
                else:
                    pos = table[keys[i+1]]
                    neg = table[keys[i]]
                tem_info.update({(keys[i][0]):{'positive': pos, 'negative': neg}})
            self.static_labels.update({self.sensitive_atts[0]:tem_info})
        else:
            # update static label
            for pair in self.pair_comb:  # calculate static label for each pair 
                tem_info = {}
                cat_1 = self.data[pair[0]].unique().tolist()
                cat_2 = self.data[pair[1]].unique().tolist()
                pair = [i for i in pair]
                column_numerator = pair + [self.target]
                column_denominator = pair[0]
                numerator = self.data[column_numerator].groupby(column_numerator)
                denominator = self.data[[pair[0]]].groupby([pair[0]])
                for cat1 in cat_1:
                    dem = len(denominator.get_group((cat1))[pair[0]])
                    for cat2 in cat_2:
                        negative_num = 0
                        try:
                            postive_num = len(numerator.get_group((cat1,cat2,self.positive_label))[self.target])
                        except:
                            postive_num  = 0
                       
                            positve_num = 0
                        for negtive_label in negative_labels:
                            try:
                                negative_num += len(numerator.get_group((cat1,cat2,negtive_label))[self.target])
                            except:
                                pass
                        tem_info.update({(cat1,cat2):{'positive': round(postive_num/dem,3), 'negative': round(negative_num/dem,3)}})
                    self.static_labels.update({(pair[0],pair[1]): tem_info})
            
                        
                        
                        
                        
                        
                

                self.static_labels.update({(pair[0],pair[1]):tem_info})
                
            
    def compute_evaluation_metric(self, data, data_prediction, y_label, positive_label, label_order):
        #TODO: add weight for generalized metric
        # validity of the input data, must have two labels
        
        if len(data[y_label].unique()) != 2 or len(data_prediction[y_label].unique()) != 2: # compute manually
            
            TP = data[data[y_label] == positive_label].shape[0]
            TN = data.shape[0] - TP
            FP = len(set(data_prediction[data_prediction[y_label] == positive_label].index).intersection(data[data[y_label] == positive_label].index))
            FN = len(set(data_prediction[data_prediction[y_label] != positive_label].index).intersection(data[data[y_label] != positive_label].index))
        
        else:    
            TN, FP, FN, TP = confusion_matrix(list(data[y_label]), list(data_prediction[y_label]), labels=label_order).ravel()
        
        P = TP + FN
        N = TN + FP
        ACC = (TP+TN) / (P+N) if (P+N) > 0.0 else np.float64(0.0)
        
        return dict(
                    PR = P/ (P+N) if (P+N) > 0.0 else np.float64(0.0), N = TN + FP,
                    TPR=TP / P if P > 0.0 else np.float64(0.0), 
                    TNR=TN / N if N > 0.0 else np.float64(0.0), 
                    FPR=FP / N if N > 0.0 else np.float64(0.0), 
                    FNR=FN / P if P > 0.0 else np.float64(0.0),
                    PPV=TP / (TP+FP) if (TP+FP) > 0.0 else np.float64(0.0),
                    NPV=TN / (TN+FN) if (TN+FN) > 0.0 else np.float64(0.0),
                    FDR=FP / (FP+TP) if (FP+TP) > 0.0 else np.float64(0.0),
                    FOR=FN / (FN+TN) if (FN+TN) > 0.0 else np.float64(0.0),
                    ACC=ACC,
                    ERR=1-ACC,
                    F1=2*TP / (2*TP+FP+FN) if (2*TP+FP+FN) > 0.0 else np.float64(0.0)
                )
    
        
    def extract_performance_label(self, data, data_prediction, y_label, positive_label, label_order):
        
        if len(self.sensitive_atts) == 1:
            cats = data[self.sensitive_atts[0]].unique().tolist()
            tem_info = {}
            for cat in cats:
                data_cat = data[data[self.sensitive_atts[0]] == cat]
                data_pred_cat = data_prediction[data_prediction[self.sensitive_atts[0]] == cat]
                performance_info = self.compute_evaluation_metric(data_cat, data_pred_cat, y_label, positive_label, label_order)
                tem_info.update({(cat):performance_info})
            self.performance_labels.update({(self.sensitive_atts[0]):tem_info})
                
        else:
            for pair in self.pair_comb:
                tem_info = {}
                att1 = pair[0]
                att2 = pair[1]
                cat_1 = data[att1].unique().tolist()
                cat_2 = data[att2].unique().tolist()
                for cat1 in cat_1:
                    for cat2 in cat_2:
                        data_cat = data[(data[att1] == cat1) & (data[att2] == cat2)]
                        data_pred_cat = data_prediction[(data_prediction[att1] == cat1) & (data_prediction[att2] == cat2) ]
                        performance_info = self.compute_evaluation_metric(data_cat, data_pred_cat, y_label, positive_label, label_order)
                        tem_info.update({(cat1,cat2):performance_info})
                self.performance_labels.update({(pair[0],pair[1]):tem_info})
            
        # round float numbers to 3 digits
        for tuple_key in list(self.performance_labels.keys()):
            for tuple_key2 in list(self.performance_labels[tuple_key].keys()):
                for key in list(self.performance_labels[tuple_key][tuple_key2].keys()):
                    self.performance_labels[tuple_key][tuple_key2][key] = round(self.performance_labels[tuple_key][tuple_key2][key], 3)
                
                
    
    def add_sensitive_att(self, new_sensitive_att):
        '''
        new_sensitive_att is usually a non-traditional sensitive feature unlike race or gender but user want to treat it as sensitive
        update all the information in the __init__ accordingly
        '''
        for label in new_sensitive_att:
            categories = self.data[label].unique()
            # temporarily store each categories and corresponding values 
            cat_num = []
            for cat in categories:
                cat_num += [(cat, len(self.data.loc[self.data[label] == cat]))]
                cat_num = sorted(cat_num, key = lambda x: x[1], reverse = True)
                # update self.sensitive_atts for each label 
                self.sensitive_atts.update({label: [cat_num[i][0] for i in range(len(cat_num))]})
                # update self.sensi_atts_values for each label 
                self.sensi_atts_values.update({label: cat_num[-1][0]})
        
        # auto renew static_label
        self.extract_static_label()
    
    '''Visualizations'''
        
    def static_labels_table(self, interests=[]):
        '''
        if interests:
            out = {}
            for label in intetests:
                out[label] = self.static_labels[label]
            return pd.DataFrame(data = out).T  
        else:
            return pd.DataFrame(data = self.static_labels).T
        '''
        sensitive = ["race", "sex"] # change this
        static_statistics = pd.DataFrame(self.static_labels.values())
        attr = pd.DataFrame(self.static_labels.keys())
        attr.columns = sensitive
        static_table = pd.concat([attr, static_statistics], sort = False, axis = 1)
        self.static_table = static_table
        return self.static_table
        
    def static_labels_plot(self, attr, measure):
        '''
        temp_data = self.data
        encoding = 0
        for i in self.data[self.target].unique():
            temp_data[self.target].replace(i, encoding, inplace=True)
            encoding += 1
            
        sns_plot = sns.catplot(x=attr[0], y = self.target, hue = attr[1], kind = "bar", data = temp_data)
        return sns_plot
        '''
        #sns.barplot(x = "sex", y = "positive", hue = "race", data = static_table)
        static_plot = sns.barplot(x = attr[0], y = measure, hue = attr[1], data = self.static_table)
        static_plot.set_xticklabels(static_plot.get_xticklabels(), rotation=45, horizontalalignment='right')
        return static_plot
                                  
    def performance_labels_table(self, interests=[]):
        performance_formatted = {}
        for key, value in self.performance_labels.items():
            for k, v in value.items():
                performance_formatted[(key, k)] = v
        '''
        if interests:
            out = {}
            for label in intetests:
                out[label] = performance_new[label]
            return pd.DataFrame(data = out).T
            
        else:
            return pd.DataFrame(data = performance_new).T
        '''
        sensitive = ["race", "sex"]
        performance_statistics = pd.DataFrame(performance_formatted.values())
        performance_attr = pd.DataFrame(performance_formatted.keys())
        performance_attr.columns = sensitive
        performance_table = pd.concat([attr, performance_statistics], sort = False, axis = 1)
        self.performance_table = performance_table
        
        return self.performance_table
    
    def performance_labels_plot(self, attr, measure):
        #sns.barplot(x = "sex", y = "PR", hue = "race", data = performance_table)
        performance_plot = sns.barplot(x = attr[0], y = measure, hue = attr[1], data = self.performance_table)
        return performance_plot

In [171]:
data = pd.read_csv('data/adult.csv')
labels =FairnessLabel(data, 'income-per-year', model_flag = False, 
                 sensi_lab = ['race','sex','education'], protected_values = [], positive_label = '>50K')

labels.extract_static_label()
labels.static_labels

['<=50K']


{('race', 'sex'): {('White', 'Male'): {'positive': 0.219, 'negative': 0.47},
  ('White', 'Female'): {'positive': 0.037, 'negative': 0.274},
  ('Black', 'Male'): {'positive': 0.095, 'negative': 0.407},
  ('Black', 'Female'): {'positive': 0.029, 'negative': 0.469},
  ('Asian-Pac-Islander', 'Male'): {'positive': 0.224, 'negative': 0.443},
  ('Asian-Pac-Islander', 'Female'): {'positive': 0.041, 'negative': 0.292},
  ('Amer-Indian-Eskimo', 'Male'): {'positive': 0.077, 'negative': 0.54},
  ('Amer-Indian-Eskimo', 'Female'): {'positive': 0.039, 'negative': 0.344},
  ('Other', 'Male'): {'positive': 0.07, 'negative': 0.528},
  ('Other', 'Female'): {'positive': 0.022, 'negative': 0.38}},
 ('race',
  'education'): {('White', 'Bachelors'): {'positive': 0.072,
   'negative': 0.096}, ('White', 'HS-grad'): {'positive': 0.055,
   'negative': 0.265}, ('White', '11th'): {'positive': 0.002,
   'negative': 0.033}, ('White', 'Masters'): {'positive': 0.031,
   'negative': 0.024}, ('White', '9th'): {'positive

In [190]:
labels.static_labels.keys()

dict_keys([('race', 'sex')])

In [18]:
pd.DataFrame(table).T

Unnamed: 0_level_0,Amer-Indian-Eskimo,Amer-Indian-Eskimo,Asian-Pac-Islander,Asian-Pac-Islander,Black,Black,Other,Other,White,White
Unnamed: 0_level_1,<=50K,>50K,<=50K,>50K,<=50K,>50K,<=50K,>50K,<=50K,>50K
age,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586
capital-gain,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586
capital-loss,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586
education,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586
education-num,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586
fnlwgt,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586
hours-per-week,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586
income-per-year,,,,,,,,,,
marital-status,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586
native-country,0.884244,0.115756,0.73436,0.26564,0.87612,0.12388,0.907749,0.092251,0.74414,0.25586


In [70]:
data = pd.read_csv('data/adult.csv')
numerator = data[['race',"sex", "education",'income-per-year']].groupby(['race',"sex","education"])
numerator.groupby["income-per-year"].count()

AttributeError: Cannot access callable attribute 'groupby' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [2]:
data = pd.read_csv('data/adult.csv')


In [10]:
columns = ['race','sex','education','income-per-year']
numerator = data[columns].groupby(['race','sex','education'])
#denominator = data.groupby(['race','sex','education']).count()
a = numerator.get_group(("Black", "Male", "10th"))
a

Unnamed: 0,race,sex,education,income-per-year
1356,Black,Male,10th,<=50K
1394,Black,Male,10th,<=50K
1461,Black,Male,10th,<=50K
1533,Black,Male,10th,>50K
1763,Black,Male,10th,<=50K
...,...,...,...,...
30635,Black,Male,10th,<=50K
31123,Black,Male,10th,<=50K
31867,Black,Male,10th,<=50K
32116,Black,Male,10th,<=50K


In [100]:
b = a["income-per-year"].value_counts()
b

<=50K    63
>50K      6
Name: income-per-year, dtype: int64

In [101]:
dict(b)

{'<=50K': 63, '>50K': 6}

In [102]:
dict(b).keys()

dict_keys(['<=50K', '>50K'])

In [105]:
sum(dict(b).values())

69

In [108]:
dict(b/sum(dict(b).values()))

{'<=50K': 0.9130434782608695, '>50K': 0.08695652173913043}

In [55]:
att1 = 'race'
att2 = 'sex'
target_v = 'income-per-year'
numerator = data[['race','sex','income-per-year']].groupby(['race','sex','income-per-year'])
cat_1 = data[att1].unique().tolist()
cat_2 = data[att2].unique().tolist()
positive_label = '>50K'
target_labels = list(data[target_v].unique())

if positive_label != target_labels[0]:
    for i, value in enumerate(target_labels):
        print(value)
        if str(value) == str(positive_label):
            print(True)
            target_labels[0], target_labels[i] = target_labels[i], target_labels[0]

for cat1 in cat_1:
    for cat2 in cat_2

<=50K
>50K
True


['>50K', '<=50K']

In [154]:
numerator = data[['race','education','income-per-year']].groupby(['race','education','income-per-year'])
denominator = data[['race']].groupby(['race'])
A = numerator.get_group(('Black','Female','>50K'))
B = denominator.get_group(('Black'))
g = A['race'].value_counts()
h = B['race'].value_counts()
list(g.to_dict().values())

KeyError: ('Black', 'Female', '>50K')

In [157]:
numerator = data[['race','education','income-per-year']].groupby(['race','education','income-per-year'])
numerator.get_group(('Black','Preschool','<=50K'))

Unnamed: 0,race,education,income-per-year
2884,Black,Preschool,<=50K
11734,Black,Preschool,<=50K
27239,Black,Preschool,<=50K
28395,Black,Preschool,<=50K
32359,Black,Preschool,<=50K


In [122]:
a = ['ab','c']
a.remove('ab')
a

['c']

In [167]:
try:
    print(123 == 1)
except:
    print('gg')

False
