In [1]:

import os
import pandas as pd
import json
import numpy as np

from model import num2cate_fit, num2cate_transform, generate_samples, generate_model_samples, ModelGene, findKeyAttrs, FindGroups, get_numAttrs, find_rules
from model.samples import DataGene
from model.data_encoder import DataEncoder
from joblib import dump, load

from sklearn.model_selection import KFold
import dill
import pickle
import warnings
warnings.filterwarnings('ignore')


In [2]:


store_path = '../../front/src/testdata/'


def get_rules(dataset_name, protect_attr='',model_name=None):
    
    model_name = '{}_{}'.format(dataset_name, model_name)
    sample_path = os.path.join(store_path, '{}_samples.csv'.format(model_name))
    model_samples = pd.read_csv(sample_path)
    model_samples = model_samples.iloc[int(len(model_samples)/2):]
    rules = find_rules(model_samples, minimum_support=5, min_len=1, protect_attr = protect_attr, target_attr='class', elift_th=[1, 1])

    
    rules.to_json(store_path + '{}_rules.json'.format(model_name),orient='records')
    
    
def init_samples(dataset_name, find_key=False):
    sample_num = 1000 # number of generated data 
    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)
    
    mdlp = num2cate_fit(data, 2)
    
    if find_key:
        # find key_attrs
        key_attrs = findKeyAttrs(num2cate_transform(data, mdlp))
        f = open(store_path + '{}_key.json'.format(dataset_name),'w')
        json.dump(key_attrs, f)
    
    #  save mdelp
    f=open('{}_mdlp.pkl'.format(dataset_name), 'wb')
    dill.dump(mdlp, f, -1)
    
    
    # generate samples
    samplesInit = generate_samples(data, sample_num)
    samples_path = os.path.join(store_path, '{}_samples.json'.format(dataset_name))
    samplesInit.to_json(samples_path, orient='records')


def get_model_samples(dataset_name, models=['lr', 'knn', 'xgb'], protect_attr=''):
    """
    train model on the training data,
    return the generated samples based on the training data
    """
    if os.path.isfile('{}_mdlp.pkl'.format(dataset_name)):
        f=open('{}_mdlp.pkl'.format(dataset_name), 'rb')
        mdlp = dill.load(f)
    else:
        raise Exception('no mdlp exists, run init_samples first')
        
    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)
    
    

    samples_path = os.path.join(store_path, '{}_samples.json'.format(dataset_name))
    samplesInit = pd.read_json(samples_path)
    
    


    
    for var in models:
        
        # set name
        model_name = dataset_name+"_"+var
        model_gene = ModelGene(model_name)
    
        # train model
        model, encoder, score = model_gene.fit_model( num2cate_transform(data, mdlp) )
        f = open('./cache/models/model_{}_{}.pkl'.format(var, dataset_name), 'wb')    
        pickle.dump(model, f)  
        
        # general samples
    
        num_samples, cate_samples = generate_model_samples(samplesInit, mdlp, model, encoder) 
        
        # add the ID col 
        cate_samples.insert(loc=0, column='id', value= cate_samples.index)
        num_samples.insert(loc=0, column='id', value= num_samples.index)
        
        # save mdeol & samples to cache
        dataOut = pd.concat([num_samples,cate_samples])
        
        samples_path = os.path.join(store_path, '{}_samples.json'.format(model_name))
        dataOut.to_json(samples_path, orient='records')

        samples_path = os.path.join(store_path, '{}_samples.csv'.format(model_name))
        dataOut.to_csv(samples_path, index=False)
        
        print(dataset_name + ' ' + var + ' accuracy: ' + str(score))
        
    print(dataset_name + ' all done')

def get_all_rules(protect, models=['xgb','knn','lr'], dataset='adult'):
    
    for model in models:
        get_rules(dataset, protect, model)
        get_rules(dataset, protect, model)
        get_rules(dataset, protect, model)
    for model in models:
        sample_name = os.path.join(store_path, dataset + '_' + model + '_samples.json')
        rule_name = os.path.join(store_path, dataset + '_' + model + '_rules.json')
        samples = pd.read_json(sample_name)

        # add item id to rules
        def item_within_rule(item, rule_context):
            for attr_val in rule_context:
                attr, val = attr_val.split('=')
                if not item[attr] == val :
                    return False
            return True

        rules = pd.read_json(rule_name)
        rules['items'] = ''
        for idx, rule in rules.iterrows():
            rules.at[idx, 'items'] = [sample['id'] for i,sample in samples.iloc[int(len(samples)/2):].iterrows() if item_within_rule(sample, rule["antecedent"])] 
        rules.to_json(rule_name, orient='records')

        print(dataset+'_'+model+' has done')

    print('All finished')



In [5]:
models =['svm', 'lr', 'knn', 'dt', 'rf', 'xgb']
dataset = 'academic'
init_samples(dataset)
get_model_samples(dataset, models)


academic svm accuracy: 0.9125
academic lr accuracy: 0.9145833333333332
academic knn accuracy: 0.9
academic dt accuracy: 0.8916666666666666
academic rf accuracy: 0.9083333333333334
academic xgb accuracy: 0.9083333333333332
academic all done


In [6]:
get_all_rules('gender=F', models, dataset)

academic_svm has done
academic_lr has done
academic_knn has done
academic_dt has done
academic_rf has done
academic_xgb has done
All finished


### reject option 

In [3]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

def predict_post(x_samples, model, theta, samples, constraints=[['gender', 'M']]):
    y_samples = model.predict(x_samples)
    y_probs = model.predict_proba(x_samples)
    y_post_samples = []
    for idx, prob in enumerate( y_probs):
        # if in the reject option space, modify label
        if max(prob[1], 1- prob[1])<theta and all([samples.iloc[idx][constraint[0]]==constraint[1] for constraint in constraints]):
            y_post_samples.append(1)
        # else, remain the same
        else:
            y_post_samples.append(y_samples[idx])
    return y_post_samples
        

def run_reject_option(model_name, dataset_name, theta, constraints=[['gender', 'M']]):
#     f = open('./cache/models/model_{}_{}.pkl'.format(model_name, dataset_name), 'rb')    
#     model = pickle.load(f)  
    
    if os.path.isfile('{}_mdlp.pkl'.format(dataset_name)):
        f=open('{}_mdlp.pkl'.format(dataset_name), 'rb')
        mdlp = dill.load(f)
    else:
        raise Exception('no mdlp exists, run init_samples first')
        
    #  training data
    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)
    encoder = DataEncoder()
    encoder.fit(num2cate_transform(data, mdlp))
    x, y = encoder.transform(num2cate_transform(data, mdlp))
    
#     score_cross = cross_val_score(model, x, y, scoring='accuracy', cv = KFold(n_splits=5, shuffle = True)) 
#     print('cross valide', score_cross)
    
    k =5
    scores = []
    kf = KFold(n_splits=k)
    kf.get_n_splits(x)

    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model_gene = ModelGene('{}_{}'.format(dataset_name,model_name))
        model = model_gene.model
        
        model.fit(x_train, y_train)
        score = accuracy_score(y_test, model.predict(x_test))
        scores.append(score)
    print ('score of {}'.format(model_name), sum(scores)/len(scores))
    
    scores = []
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model_gene = ModelGene('{}_{}'.format(dataset_name,model_name))
        model = model_gene.model
        model.fit(x_train, y_train)
        
        y_post_test = predict_post(x_test, model, theta, data, constraints)
        score = accuracy_score(y_test, y_post_test)
        scores.append(score)
    print ('score after roc of {}'.format(model_name), sum(scores)/len(scores))
        
    #  for synthetic data, save samples for rules mining
    model_gene = ModelGene('{}_{}'.format(dataset_name,model_name))
    model = model_gene.model
    model.fit(x, y)
        
    samples_path = os.path.join(store_path, '{}_samples.json'.format(dataset_name))
    samplesInit = pd.read_json(samples_path)
    samples = num2cate_transform(samplesInit, mdlp)
    x_samples, _ = encoder.transform(samples)
    y_samples_post = predict_post(x_samples, model, theta, samples, constraints)
    
    #  concate post processing result to samples
    num_samples = samplesInit.copy()
    num_samples['class'] = pd.Series(np.asarray(y_samples_post), index= samples.index) 

    cate_samples = samples.copy()
    cate_samples['class'] = pd.Series(np.asarray(y_samples_post), index= samples.index) 
    
    
    # add the ID col 
    cate_samples.insert(loc=0, column='id', value= cate_samples.index)
    num_samples.insert(loc=0, column='id', value= num_samples.index)

    # save mdeol & samples to cache
    dataOut = pd.concat([num_samples,cate_samples])

    samples_path = os.path.join(store_path, '{}_{}_post{}_samples.json'.format(dataset_name, model_name, len(constraints)))
    dataOut.to_json(samples_path, orient='records')

    samples_path = os.path.join(store_path, '{}_{}_post{}_samples.csv'.format(dataset_name, model_name, len(constraints)))
    dataOut.to_csv(samples_path, index=False)
    

In [4]:
run_reject_option('knn', 'academic', 0.8, [['gender', 'M']])

score of knn 0.8916666666666666
score after roc of knn 0.8708333333333333


In [5]:
get_all_rules('gender=F', ['knn_post1'], 'academic')

academic_knn_post1 has done
All finished


In [None]:
'''explore the number of samples for each rules'''
import pandas as pd

df = pd.read_json('academic_knn_rules.json')
df = pd.DataFrame(df)
for i,d in df.iterrows():
    print(df.at[i,'antecedent'])
    if(df.at[i,'antecedent']==['StudentAbsenceDays=Under-7', 'raisedhands=0<x<25', 'Discussion=1<x<25']):
        print(df.at[i,'id'])

In [None]:
samples = pd.read_json(os.path.join(store_path, '{}_samples.json'.format('academic')))
samples.iloc[int(len(samples)/2):]

In [7]:
dataset_name = 'academic'
dataset_path = '../server/{}.csv'.format(dataset_name)
data = pd.read_csv(dataset_path)
    
mdlp = num2cate_fit(data, 2)

key_attrs = findKeyAttrs(num2cate_transform(data, mdlp))

samplePrior: Sample prior (min = 1.0) (java.lang.Double) [default:1.0]
structurePrior: Structure prior coefficient (min = 1.0) (java.lang.Double) [default:1.0]
faithfulnessAssumed: Yes if (one edge) faithfulness should be assumed (java.lang.Boolean) [default:true]
symmetricFirstStep: Yes if the first step step for FGES should do scoring for both X->Y and Y->X (java.lang.Boolean) [default:false]
maxDegree: The maximum degree of the graph (min = -1) (java.lang.Integer) [default:100]
verbose: Yes if verbose output should be printed or logged (java.lang.Boolean) [default:true]
numberResampling: The number of resampling iterations (min = 0) (java.lang.Integer) [default:0]
resampleSize: The resample size (min = 1) (java.lang.Integer) [default:1]
resamplingWithReplacement: Yes, if resampling with replacement (bootstrapping) (java.lang.Boolean) [default:true]
resamplingEnsemble: Ensemble method: Preserved (0), Highest (1), Majority (2) (java.lang.Integer) [default:1]
edges ['class --- StudentA