In [151]:

import os
import pandas as pd
import json
import numpy as np

from model import num2cate_fit, num2cate_transform, generate_samples, generate_model_samples, ModelGene, findKeyAttrs, FindGroups, get_numAttrs, find_rules
from model.samples import DataGene
from model.data_encoder import DataEncoder
from joblib import dump, load

from sklearn.model_selection import KFold
import dill
import pickle
import warnings
warnings.filterwarnings('ignore')

from itertools import chain


In [87]:


store_path = '../../front/src/testdata/'



    
    
def init_samples(dataset_name, find_key=False):
    sample_num = 1000 # number of generated data 
    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)
    
    mdlp = num2cate_fit(data, 2)
    
    if find_key:
        # find key_attrs
        key_attrs = findKeyAttrs(num2cate_transform(data, mdlp))
        f = open(store_path + '{}_key.json'.format(dataset_name),'w')
        json.dump(key_attrs, f)
    
    #  save mdelp
    f=open('{}_mdlp.pkl'.format(dataset_name), 'wb')
    dill.dump(mdlp, f, -1)
    
    
    # generate samples
    samplesInit = generate_samples(data, sample_num)
    samples_path = os.path.join(store_path, '{}_samples.json'.format(dataset_name))
    samplesInit.to_json(samples_path, orient='records')


def get_model_samples(dataset_name, models=['lr', 'knn', 'xgb'], protect_attr=''):
    """
    train model on the training data,
    return the generated samples based on the training data
    """
    if os.path.isfile('{}_mdlp.pkl'.format(dataset_name)):
        f=open('{}_mdlp.pkl'.format(dataset_name), 'rb')
        mdlp = dill.load(f)
    else:
        raise Exception('no mdlp exists, run init_samples first')
        
    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)
    
    

    samples_path = os.path.join(store_path, '{}_samples.json'.format(dataset_name))
    samplesInit = pd.read_json(samples_path)
    
    


    
    for var in models:
        
        # set name
        model_name = dataset_name+"_"+var
        model_gene = ModelGene(model_name)
    
        # train model
        model, encoder, score = model_gene.fit_model( num2cate_transform(data, mdlp) )
        f = open('./cache/models/model_{}_{}.pkl'.format(var, dataset_name), 'wb')    
        pickle.dump(model, f)  
        
        # general samples
    
        num_samples, cate_samples = generate_model_samples(samplesInit, mdlp, model, encoder) 
        
        # add the ID col 
        cate_samples.insert(loc=0, column='id', value= cate_samples.index)
        num_samples.insert(loc=0, column='id', value= num_samples.index)
        
        # save mdeol & samples to cache
        dataOut = pd.concat([num_samples,cate_samples])
        
        samples_path = os.path.join(store_path, '{}_samples.json'.format(model_name))
        dataOut.to_json(samples_path, orient='records')

        samples_path = os.path.join(store_path, '{}_samples.json'.format(model_name))
        dataOut.to_json(samples_path, index=False)
        
        print(dataset_name + ' ' + var + ' accuracy: ' + str(score))
        
    print(dataset_name + ' all done')
    
def get_rules(dataset_name, protect_attr='',model_name=None, min_support = 5):
    
    model_name = '{}_{}'.format(dataset_name, model_name)
    sample_path = os.path.join(store_path, '{}_samples.json'.format(model_name))
    model_samples = pd.read_json(sample_path)
    model_samples = model_samples.iloc[int(len(model_samples)/2):]
    rules = find_rules(model_samples, minimum_support=min_support, min_len=1, protect_attr = protect_attr, target_attr='class', elift_th=[1, 1])

    
    rules.to_json(store_path + '{}_rules.json'.format(model_name),orient='records')

def get_all_rules(protect, models=['xgb','knn','lr'], dataset='adult'):
    
    for model in models:
        get_rules(dataset, protect, model)
        get_rules(dataset, protect, model)
        get_rules(dataset, protect, model)
    for model in models:
        sample_name = os.path.join(store_path, dataset + '_' + model + '_samples.json')
        rule_name = os.path.join(store_path, dataset + '_' + model + '_rules.json')
        samples = pd.read_json(sample_name)

        # add item id to rules
        def item_within_rule(item, rule_context):
            for attr_val in rule_context:
                attr, val = attr_val.split('=')
                if not item[attr] == val :
                    return False
            return True

        rules = pd.read_json(rule_name)
        rules['items'] = ''
        for idx, rule in rules.iterrows():
            rules.at[idx, 'items'] = [sample['id'] for i,sample in samples.iloc[int(len(samples)/2):].iterrows() if item_within_rule(sample, rule["antecedent"])] 
        rules.to_json(rule_name, orient='records')

        print(dataset+'_'+model+' has done')

    print('All finished')



In [7]:
models =['svm', 'lr', 'knn', 'dt', 'rf', 'xgb']
dataset = 'adult'
init_samples(dataset)
get_model_samples(dataset, models)


adult svm accuracy: 0.8093239898158519
adult lr accuracy: 0.8078908389866524
adult knn accuracy: 0.8179639810666528
adult dt accuracy: 0.8234511754703175
adult rf accuracy: 0.8332786382637953
adult xgb accuracy: 0.835551428414055
adult all done


In [6]:
get_all_rules('gender=F', models, dataset)

academic_svm has done
academic_lr has done
academic_knn has done
academic_dt has done
academic_rf has done
academic_xgb has done
All finished


In [12]:
models =['xgb']
dataset = 'adult'
get_model_samples(dataset, models)
get_all_rules('race=Female', models, dataset)

adult xgb accuracy: 0.8351828109761368
adult all done
adult_xgb has done
All finished


In [11]:
get_all_rules('race=Black',['lr', 'xgb', 'knn', 'rf', 'dt', 'svm'], 'adult')

adult_lr has done
adult_xgb has done
adult_knn has done
adult_rf has done
adult_dt has done
adult_svm has done
All finished


In [5]:
models =['gnb']
dataset = 'adult'
get_model_samples(dataset, models)

adult gnb accuracy: 0.7715080396470508
adult all done


### reject option 

a post processing method for removing algorithmic discrimination.  
For details, please refer to **Decision Theory for Discrimination-Aware Classification, 10.1109/ICDM.2012.45**

In [168]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

def predict_post(x_samples, model, theta, samples, group, includes, excludes):
    y_samples = model.predict(x_samples)
    y_probs = model.predict_proba(x_samples)
    y_post_samples = []
    
    
    
    for idx, prob in enumerate( y_probs):
        is_include = any(\
            (all([ include[1] in samples.iloc[idx][include[0]] \
                  for include in include_group])) \
            for include_group in includes) \
        or len(includes)==0
        
        is_exclude = all(\
            (all([ exclude[1] not in samples.iloc[idx][exclude[0]] \
                  for exclude in exclude_group])) \
            for exclude_group in excludes) \
            or len(excludes)==0
        
        
        # if in the reject option space, modify label
        if max(prob[1], 1- prob[1])<theta \
        and is_include\
        and is_exclude:
            if group[1] in samples.iloc[idx][group[0]]:
                y_post_samples.append(1)
            else: 
                y_post_samples.append(0)
        # else, remain the same
        else:
            y_post_samples.append(y_samples[idx])
    return y_post_samples
        

def run_reject_option(model_name, dataset_name, theta, group=['gender', 'M'], includes=[[]], excludes=[[]], context=True):
#     f = open('./cache/models/model_{}_{}.pkl'.format(model_name, dataset_name), 'rb')    
#     model = pickle.load(f)  
    
    if os.path.isfile('{}_mdlp.pkl'.format(dataset_name)):
        f=open('{}_mdlp.pkl'.format(dataset_name), 'rb')
        mdlp = dill.load(f)
    else:
        raise Exception('no mdlp exists, run init_samples first')
        
    #  training data
    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)
    encoder = DataEncoder()
    encoder.fit(num2cate_transform(data, mdlp))
    x, y = encoder.transform(num2cate_transform(data, mdlp))
    
#     score_cross = cross_val_score(model, x, y, scoring='accuracy', cv = KFold(n_splits=5, shuffle = True)) 
#     print('cross valide', score_cross)

    
#     load the discriminatory region
    if context:
        f = open(store_path + '{}_{}_default_region.json'.format(model_name, dataset_name),'r')
        discri_region = json.load(f)
        discri_region = [[a.split('=') for a in i['antecedent']] for i in discri_region]
        includes_ = includes + discri_region 
    else:
        includes_ = includes
    
    k =5
    scores = []
    kf = KFold(n_splits=k)
    kf.get_n_splits(x)

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model_gene = ModelGene('{}_{}'.format(dataset_name,model_name))
        model = model_gene.model
        
        model.fit(x_train, y_train)
        score = accuracy_score(y_test, model.predict(x_test))
        scores.append(score)
    print ('score of {}'.format(model_name), sum(scores)/len(scores))
    
    scores = []
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model_gene = ModelGene('{}_{}'.format(dataset_name,model_name))
        model = model_gene.model
        model.fit(x_train, y_train)
        
        y_post_test = predict_post(x_test, model, theta, num2cate_transform(data, mdlp), group, includes_, excludes)
        score = accuracy_score(y_test, y_post_test)
        scores.append(score)
    print ('score after roc of {}'.format(model_name), sum(scores)/len(scores))
        
#     #  for synthetic data, save samples for rules mining
#     model_gene = ModelGene('{}_{}'.format(dataset_name,model_name))
#     model = model_gene.model
#     model.fit(x, y)
    
    # load model from the memory to avoid randomness
    f = open('./cache/models/model_{}_{}.pkl'.format(model_name, dataset_name), 'rb')    
    model = pickle.load(f) 
    
        
    samples_path = os.path.join(store_path, '{}_samples.json'.format(dataset_name))
    samplesInit = pd.read_json(samples_path)
    samples = num2cate_transform(samplesInit, mdlp)
    x_samples, _ = encoder.transform(samples)
    y_samples_post = predict_post(x_samples, model, theta, samples, group, includes_, excludes)
    
    #  concate post processing result to samples
    num_samples = samplesInit.copy()
    num_samples['class'] = pd.Series(np.asarray(y_samples_post), index= samples.index) 

    cate_samples = samples.copy()
    cate_samples['class'] = pd.Series(np.asarray(y_samples_post), index= samples.index) 
    
    
    # add the ID col 
    cate_samples.insert(loc=0, column='id', value= cate_samples.index)
    num_samples.insert(loc=0, column='id', value= num_samples.index)

    # save mdeol & samples to cache
    dataOut = pd.concat([num_samples,cate_samples])

    samples_path = os.path.join(store_path, '{}_{}_post{}{}_samples.json'.format(dataset_name, model_name, len(includes), len(excludes)))
    dataOut.to_json(samples_path, orient='records')

    samples_path = os.path.join(store_path, '{}_{}_post{}{}_samples.csv'.format(dataset_name, model_name, len(includes), len(excludes)))
    dataOut.to_csv(samples_path, index=False)
    

## random forest

In [167]:
# rf_post00 0.78
# run_reject_option('rf', 'academic', 0.8, ['gender', 'M'], [], [])
# get_all_rules('gender=F', ['rf_post00'], 'academic')


# rf_post31 0.90
run_reject_option('rf', 'academic', 0.8, \
                  ['gender', 'M'], \
                  [],\
                  []
                 )
get_all_rules('gender=F', ['rf_post00'], 'academic')

# # rf_post11 0.89
# run_reject_option('rf', 'academic', 0.8, \
#                   ['gender', 'M'], \
#                   [\
#                    [['raisedhands', '50']], \
#                   ],\
#                   [\
#                    [['Semester', 'S']]
#                   ]
#                  )


# # rf_post21 0.90
# run_reject_option('rf', 'academic', 0.8, \
#                   ['gender', 'M'], \
#                   [\
#                    [['raisedhands', '50']], \
#                    [['AnnouncementsView', '66']], \
#                   ],\
#                   [\
#                    [['Semester', 'S']]
#                   ]
#                  )

# rf_post41 0.89
# run_reject_option('rf', 'academic', 0.8, \
#                   ['gender', 'M'], \
#                   [\
#                    [['raisedhands', '50']], \
#                    [['raisedhands', '23']], \
#                    [['AnnouncementsView', '66']], \
#                    [['StudentAbsenceDays','Under-7']]  \
#                   ],\
#                   [\
#                    [['Semester', 'S']]
#                   ]
#                  )


score of rf 0.9104166666666668
score after roc of rf 0.9041666666666668
academic_rf_post00 has done
All finished


## KNN

In [169]:
# run_reject_option('knn', 'academic', 0.8, [['gender', 'M'], ['StudentAbsenceDays', 'Above-7']])
# run_reject_option('knn', 'academic', 0.8, [['gender', 'M'],  ['VisITedResources', '15']])


# # knn_post00 0.860
context = True
run_reject_option('knn', 'academic', 0.8, ['gender', 'M'], [], [], context)
get_all_rules('gender=F', ['knn_post00'], 'academic')




# # knn_post41 0.9
# run_reject_option('knn', 'academic', 0.8, \
#                   ['gender', 'M'], \
#                   [\
#                    [['raisedhands', '50']], \
#                    [['raisedhands', '23']], \
#                    [['AnnouncementsView', '66']], \
#                    [['StudentAbsenceDays','Under-7']]  \
#                   ],\
#                   [\
#                    [['StudentAbsenceDays','Above-7'], ['raisedhands', 'x>50'], ['AnnouncementsView', '10'], ['Relation','Father']]
#                   ]
#                  )

# # # knn_post11 0.875
# run_reject_option('knn', 'academic', 0.8, \
#                   ['gender', 'M'], \
#                   [\
#                    [['StudentAbsenceDays','Above-7']],  \
#                    [['raisedhands', '23']],\
#                    [['AnnouncementsView', '20']]\
#                   ],\
#                   [\
# #                    [ ['AnnouncementsView', '10']]\
#                    [['raisedhands', '50'], ['AnnouncementsView', '10']]\
#                   ]
#                  )

# get_all_rules('gender=F', ['knn_post31'], 'academic')

score of knn 0.8979166666666668
score after roc of knn 0.8604166666666668
academic_knn_post00 has done
All finished


## Decision Tree

In [171]:
#  dt_post0 original reject option, 0.808
# dt_post00 context reject option

run_reject_option('dt', 'academic', 0.9, \
                  ['gender', 'M'], \
                  [],\
                  [],\
                  True
                 )
get_all_rules('gender=F', ['dt_post00'], 'academic')

score of dt 0.8895833333333334
score after roc of dt 0.8895833333333334


KeyboardInterrupt: 

In [73]:
any([[] for i in []])

False