In [1]:
import pandas as pd 
import math
from joblib import dump, load
import random
import re
import copy
import json
import os
# Directory that holds cached files; joined with a file name via os.path.join
# when the model samples are loaded further down.
cache_path = './cache'
# Column name of the protected attribute; passed to FindGroups.locate_items.
protect_attr='sex'
class FindGroups(object):
    '''
    Enumerate every combination of key-attribute values ("key groups") and
    score each group by how differently the protected-attribute values are
    classified among the samples that fall into it.
    '''

    def __init__(self, key_vals):
        '''
        Build the list of key groups for the given attribute values.

        Args:
            key_vals(dict): candidate values for each key attribute,
                e.g., {'attribute_a': ['a', 'b', 'c'], 'attribute_b': ['m', 'n', 'k']}

        Attributes set:
            key_vals(dict): the mapping passed in.
            key_attrs(list): the key attribute names, in the dict's order.
            key_groups(list): one dict per value combination,
                e.g., {'attribute_a': 'a', 'attribute_b': 'k'}.
                Empty when key_vals is empty.
        '''
        self.key_vals = key_vals
        self.key_attrs = [k for k in key_vals]
        self.key_groups = []
        self.generate_groups(0, {})

    def generate_groups(self, depth, index):
        '''
        Recursively append every combination of key-attribute values to
        self.key_groups.
            e.g., for {'attribute_a': ['a', 'b'], 'attribute_b': ['m', 'n']}
            [
                {'attribute_a':'a', 'attribute_b': 'm'},
                {'attribute_a':'a', 'attribute_b': 'n'},
                {'attribute_a':'b', 'attribute_b': 'm'},
                {'attribute_a':'b', 'attribute_b': 'n'}
            ]
        Args:
            depth(int): recursion depth, i.e. the position in self.key_attrs
                of the attribute being expanded by this call.
            index(dict): values already chosen for the attributes before
                `depth`, e.g., {'attribute_a': 'a'}. It is not modified.
        '''
        # Nothing to expand: no key attributes at all, or depth past the last
        # one. Without this guard the lookup below raises IndexError.
        if depth >= len(self.key_attrs):
            return

        attr = self.key_attrs[depth]
        is_last_attr = depth == len(self.key_attrs) - 1
        for val in self.key_vals[attr]:
            # index maps attribute names to the chosen values, so a shallow
            # copy is enough to keep sibling branches independent.
            index_ = dict(index)
            index_[attr] = val
            if is_last_attr:
                self.key_groups.append(index_)
            else:
                self.generate_groups(depth + 1, index_)

    def locate_items(self, model_samples, protect_attr):
        '''
        Attach the matching sample ids and a disparity score to every key
        group, then sort the groups by that score (highest first).

        Each group dict gains three keys:
            'items'  : index labels of the rows of model_samples matching
                       every key attribute value of the group.
            'scores' : for each protected value present in the group, the
                       fraction of its rows whose 'class' column equals 1.
                       Ordered by first appearance of the protected value in
                       model_samples.
            'score'  : 0 when 'scores' is empty, the single value when only
                       one protected value is present, otherwise the spread
                       (max - min) of 'scores'. For exactly two protected
                       values this equals abs(scores[0] - scores[1]).

        Args:
            model_samples(pandas.DataFrame): must contain the key attribute
                columns, the protect_attr column and a 'class' column.
            protect_attr(str): name of the protected-attribute column.

        Returns:
            list: self.key_groups, sorted in place by 'score' descending.
        '''
        # unique() keeps first-appearance order, which makes 'scores'
        # reproducible; NaN never equals anything, so it is dropped up front.
        protect_vals = model_samples[protect_attr].dropna().unique().tolist()

        for group in self.key_groups:
            # Filter on the key attributes only. Iterating over the group dict
            # itself would also visit 'items'/'score'/'scores' once this
            # method has run, making a second call fail with KeyError.
            group_items = model_samples
            for attr in self.key_attrs:
                group_items = group_items.loc[group_items[attr] == group[attr]]
            group['items'] = group_items.index.tolist()

            score = []
            if len(group_items) > 0:
                for val in protect_vals:
                    group_items_ = group_items.loc[group_items[protect_attr] == val]
                    if len(group_items_) > 0:
                        group_accept = group_items_.loc[group_items_['class'] == 1]
                        score.append(len(group_accept) / len(group_items_))

            if not score:
                group['score'] = 0
            elif len(score) == 1:
                # NOTE(review): with a single protected value there is nothing
                # to compare against, yet the acceptance rate itself is used
                # as the score (kept for backward compatibility) - confirm
                # this ranking is intended.
                group['score'] = abs(score[0])
            else:
                group['score'] = max(score) - min(score)
            group['scores'] = score

        self.key_groups.sort(key=lambda x: x['score'], reverse=True)
        return self.key_groups



In [2]:
def findRange(thresholds, v):
    '''
    Return the label of the bin that value v falls into.

    Args:
        thresholds(list): ascending bin edges, e.g., [0, 20, 40, 60, 80]
        v(number): the value to label

    Returns:
        str or None:
            "x<{first}"    when v <= the first threshold,
            "{lo}<x<{hi}"  when lo < v <= hi for two inner neighbours,
            "x>{lo}"       when v is above the second-to-last threshold
                           (including values beyond the last threshold),
            None           when thresholds is empty or v compares with
                           nothing (e.g. NaN).
    '''
    last = len(thresholds) - 1
    for i, th in enumerate(thresholds):
        if v <= th:
            if i == 0:
                return "x<{}".format(th)
            if i == last:
                return "x>{}".format(thresholds[i-1])
            return "{}<x<{}".format(thresholds[i-1], th)

    # v exceeds every threshold: put it in the top bin instead of silently
    # returning None. The explicit comparison keeps NaN mapped to None.
    if len(thresholds) > 0 and v > thresholds[-1]:
        lower = thresholds[-2] if len(thresholds) > 1 else thresholds[-1]
        return "x>{}".format(lower)
    return None

def convert_cate(arr, n=4):
    '''
    Turn numeric values into range labels produced by findRange.

    The span [min, max] of the input is cut at n+1 thresholds
    floor(i*(max-min)/n)+min for i = 0..n, and every value is replaced by the
    label of the threshold interval it falls into.

    Args:
        arr(iterable): numeric values; consumed once and materialised.
        n(int): number of parts the span is divided into (default 4).

    Returns:
        pandas.Series: one label per input value, with a default RangeIndex.
            Empty (dtype object) when arr is empty.

    Raises:
        ValueError: if n is smaller than 1.
    '''
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    values = list(arr)
    if not values:
        # max()/min() would raise on an empty sequence.
        return pd.Series([], dtype=object)

    maxValue = max(values)
    minValue = min(values)
    thresholds = [ math.floor(i*(maxValue-minValue)/n)+minValue for i in range(n+1)]
    # For non-integer data the floor can leave the top threshold below the
    # maximum, so the largest values would match no bin. The top bin's label
    # is built from the second-to-last threshold, so pinning the last edge to
    # the maximum does not change any label.
    thresholds[-1] = maxValue

    return pd.Series([findRange(thresholds, v) for v in values])


def num2cate(dataIn):
    '''
    Return a copy of dataIn in which every int64 column is replaced by the
    range labels computed by convert_cate.

    Args:
        dataIn(pandas.DataFrame): the input frame; it is left unmodified.

    Returns:
        pandas.DataFrame: an independent copy with int64 columns converted to
            label strings. Other columns are unchanged.
    '''
    # A real copy: slicing with [:] yields a frame tied to the original and
    # triggers pandas' chained-assignment handling on column writes.
    df = dataIn.copy()
    if len(df) == 0:
        # No values to derive thresholds from.
        return df

    # Resolve the integer columns once, before any of them is overwritten.
    int_cols = df.select_dtypes(include=['int64']).columns
    for k in int_cols:
        labels = convert_cate(df[k].tolist())
        # convert_cate returns a RangeIndex-ed Series; assign positionally so
        # frames with a non-default index are not misaligned into NaN.
        df[k] = labels.values

    return df

In [3]:
    # Load the test data and bucket its int64 columns into range labels.
    dataset_path = '../data/dataTest.csv'
    data = pd.read_csv(dataset_path)
    data = num2cate(data)

    # get model samples
    sample_path = os.path.join(cache_path, 'dataTest_knn_samples.csv')
    model_samples = pd.read_csv(sample_path)
    key_attrs = ['relationship','race','education_num']

    key_vals = {}
    key_groups = []

    # Collect the distinct values of each key attribute. unique() keeps the
    # order of first appearance, so the order in which groups are generated
    # (and therefore the order of equally scored groups) is the same on every
    # kernel restart; iterating a set of strings is not.
    for key_attr in key_attrs:
        key_vals[key_attr] = data[key_attr].unique().tolist()

    if key_vals:
        findGroups = FindGroups(key_vals)
        key_groups = findGroups.locate_items(model_samples, protect_attr)

    key_groups

[{'relationship': ' Unmarried',
  'race': ' White',
  'education_num': 'x<1',
  'items': [266, 1972, 1995],
  'score': 1.0,
  'scores': [1.0]},
 {'relationship': ' Not_in_family',
  'race': ' Asian_Pac_Islander',
  'education_num': 'x<1',
  'items': [241, 455],
  'score': 1.0,
  'scores': [1.0]},
 {'relationship': ' Husband',
  'race': ' Asian_Pac_Islander',
  'education_num': '8<x<12',
  'items': [50,
   224,
   598,
   646,
   796,
   833,
   925,
   1074,
   1565,
   1700,
   1876,
   1913,
   1968,
   2080,
   2174,
   2178,
   2201,
   2294,
   2364],
  'score': 0.6888888888888889,
  'scores': [0.1111111111111111, 0.8]},
 {'relationship': ' Husband',
  'race': ' Other',
  'education_num': 'x>12',
  'items': [301,
   332,
   335,
   458,
   470,
   519,
   520,
   670,
   734,
   896,
   943,
   978,
   1103,
   1146,
   1168,
   1215,
   1229,
   1342,
   1426,
   1584,
   1608,
   1614,
   1757,
   1869,
   2427,
   2467,
   2567,
   2570,
   2734,
   2817,
   2971],
  'score': 0

In [8]:
# Scratch check of findRange against thresholds for a fixed 0..80 span
# divided into n parts, applied to the small values 0..6.
n = 4
arr = list(range(7))
minValue, maxValue = 0, 80
thresholds = [minValue + math.floor(i * (maxValue - minValue) / n) for i in range(n + 1)]
print(pd.Series(list(map(lambda value: findRange(thresholds, value), arr))))

0       x<0
1    0<x<20
2    0<x<20
3    0<x<20
4    0<x<20
5    0<x<20
6    0<x<20
dtype: object


In [26]:
import pandas as pd

# Count the rows of the cleaned data set where relationship is ' Wife',
# education_num carries the label 'x>12' and sex is ' Female'.
df = pd.read_csv('dataTest_clean.csv')
df_ex = df[
    df['relationship'].eq(' Wife')
    & df['education_num'].eq('x>12')
    & df['sex'].eq(' Female')
]
len(df_ex)


17