In [1]:

import os
import pandas as pd
import json

from model import num2cate_fit, num2cate_transform, generate_samples, generate_model_samples, ModelGene, findKeyAttrs, FindGroups, get_numAttrs, find_rules
from model.samples import DataGene
from model.data_encoder import DataEncoder
from joblib import dump, load

import dill
import pickle
import warnings
warnings.filterwarnings('ignore')


In [2]:


# Output directory (relative to this notebook) for the generated samples, key
# attributes and rule files. The trailing slash matters: some callers build
# paths with plain string concatenation rather than os.path.join.
store_path = '../../front/src/testdata/'


def get_rules(dataset_name, protect_attr='', model_name=None):
    """
    Mine rules from the saved samples of one model and store them as JSON.

    Reads ``<store_path>/<dataset_name>_<model_name>_samples.csv``, keeps the
    second half of its rows, hands them to ``find_rules`` and writes the
    result to ``<store_path><dataset_name>_<model_name>_rules.json``.

    dataset_name: dataset prefix used in the file names
    protect_attr: forwarded unchanged to ``find_rules``
    model_name:   model suffix used in the file names
    """
    prefix = '{}_{}'.format(dataset_name, model_name)

    csv_path = os.path.join(store_path, '{}_samples.csv'.format(prefix))
    all_rows = pd.read_csv(csv_path)

    # Only the second half of the file is mined.
    # NOTE(review): presumably these are the categorical samples appended last
    # by get_model_samples -- confirm against the writer.
    second_half = all_rows.iloc[len(all_rows) // 2:]

    mined = find_rules(
        second_half,
        minimum_support=5,
        min_len=1,
        protect_attr=protect_attr,
        target_attr='class',
        elift_th=[1, 1],
    )

    json_path = store_path + '{}_rules.json'.format(prefix)
    mined.to_json(json_path, orient='records')
    
    
def init_samples(dataset_name, find_key=False):
    """
    Fit the discretizer for a dataset and generate its initial samples.

    Reads ``../server/<dataset_name>.csv``, fits ``num2cate_fit`` on it and
    writes:
      * ``<dataset_name>_mdlp.pkl`` in the current directory: the fitted
        object, serialized with dill;
      * ``<store_path>/<dataset_name>_samples.json``: the output of
        ``generate_samples``;
      * ``<store_path><dataset_name>_key.json``: the output of
        ``findKeyAttrs`` (only when ``find_key`` is true).

    dataset_name: base name of the CSV file and prefix of every output file
    find_key:     when true, also compute and store the key attributes
    """
    sample_num = 1000  # number of generated data
    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)

    mdlp = num2cate_fit(data, 2)

    if find_key:
        # find key_attrs on the transformed data
        key_attrs = findKeyAttrs(num2cate_transform(data, mdlp))
        # `with` guarantees the JSON is flushed and the handle closed
        with open(store_path + '{}_key.json'.format(dataset_name), 'w') as key_file:
            json.dump(key_attrs, key_file)

    # save mdlp; protocol -1 selects the highest available pickle protocol
    with open('{}_mdlp.pkl'.format(dataset_name), 'wb') as mdlp_file:
        dill.dump(mdlp, mdlp_file, -1)

    # generate samples
    samplesInit = generate_samples(data, sample_num)
    samples_path = os.path.join(store_path, '{}_samples.json'.format(dataset_name))
    samplesInit.to_json(samples_path, orient='records')


def get_model_samples(dataset_name, models=['lr', 'knn', 'xgb'], protect_attr=''):
    """
    Train each requested model on the dataset and save the samples it produces.

    Requires ``<dataset_name>_mdlp.pkl`` and
    ``<store_path>/<dataset_name>_samples.json``, both written by
    ``init_samples``. For every entry of ``models`` this function:
      * fits a ``ModelGene`` on the transformed training data,
      * pickles the fitted model to ``./cache/models/model_<model>_<dataset>.pkl``,
      * runs ``generate_model_samples`` on the initial samples,
      * writes the result to ``<store_path>/<dataset>_<model>_samples.json``
        and ``.csv``,
      * prints the score returned by ``fit_model``.

    Nothing is returned; all results are written to disk.

    dataset_name: base name of the dataset CSV under ``../server/``
    models:       iterable of model short names (the default list is only read,
                  never mutated)
    protect_attr: currently unused; kept for interface compatibility

    Raises Exception when the mdlp pickle does not exist.
    """
    mdlp_path = '{}_mdlp.pkl'.format(dataset_name)
    if not os.path.isfile(mdlp_path):
        raise Exception('no mdlp exists, run init_samples first')
    # NOTE: dill.load executes arbitrary code from the file; only load pickles
    # produced locally by init_samples.
    with open(mdlp_path, 'rb') as mdlp_file:
        mdlp = dill.load(mdlp_file)

    dataset_path = '../server/{}.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)

    samples_path = os.path.join(store_path, '{}_samples.json'.format(dataset_name))
    samplesInit = pd.read_json(samples_path)

    for var in models:

        # set name
        model_name = dataset_name + "_" + var
        model_gene = ModelGene(model_name)

        # train model (the data is re-transformed for every model, as fit_model
        # receives a fresh frame each time)
        model, encoder, score = model_gene.fit_model(num2cate_transform(data, mdlp))

        # save model to cache; create the directory so a fresh checkout does
        # not fail after the (possibly long) training step
        model_path = './cache/models/model_{}_{}.pkl'.format(var, dataset_name)
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

        # generate samples
        num_samples, cate_samples = generate_model_samples(samplesInit, mdlp, model, encoder)

        # add the ID col
        cate_samples.insert(loc=0, column='id', value=cate_samples.index)
        num_samples.insert(loc=0, column='id', value=num_samples.index)

        # save samples: num_samples rows first, cate_samples rows second
        dataOut = pd.concat([num_samples, cate_samples])

        samples_path = os.path.join(store_path, '{}_samples.json'.format(model_name))
        dataOut.to_json(samples_path, orient='records')

        samples_path = os.path.join(store_path, '{}_samples.csv'.format(model_name))
        dataOut.to_csv(samples_path, index=False)

        print(dataset_name + ' ' + var + ' accuracy: ' + str(score))

    print(dataset_name + ' all done')

def get_all_rules(protect, models=['xgb','knn','lr'], dataset='adult'):
    """
    Mine the rules of every model and attach the ids of the matching samples.

    First calls ``get_rules`` once per model, which writes
    ``<store_path>/<dataset>_<model>_rules.json``. Then, for every model, the
    rule file is re-read, an ``items`` column is added that lists the ``id``
    of every sample (taken from the second half of
    ``<dataset>_<model>_samples.json``) satisfying all conditions of the
    rule's ``antecedent``, and the file is written back in place.

    protect: protected attribute, forwarded to ``get_rules``
    models:  iterable of model short names (the default list is only read)
    dataset: dataset prefix used in the file names
    """

    def item_within_rule(item, rule_context):
        """Return True when `item` satisfies every 'attr=value' condition."""
        for attr_val in rule_context:
            # Split on the first '=' only, so values that themselves contain
            # '=' (e.g. 'x>=25') do not break the unpacking.
            attr, val = attr_val.split('=', 1)
            # NOTE(review): the sample value is compared to a string; a
            # non-string column would never match -- confirm the samples
            # are all categorical strings.
            if not item[attr] == val:
                return False
        return True

    # One call per model is enough: repeated calls with identical arguments
    # just re-read the same CSV and overwrite the same rules file.
    for model in models:
        get_rules(dataset, protect, model)

    for model in models:
        sample_name = os.path.join(store_path, dataset + '_' + model + '_samples.json')
        rule_name = os.path.join(store_path, dataset + '_' + model + '_rules.json')
        samples = pd.read_json(sample_name)
        # Only the second half of the samples is matched against the rules;
        # computed once here instead of once per rule.
        candidates = samples.iloc[len(samples) // 2:]

        # add item id to rules
        rules = pd.read_json(rule_name)
        rules['items'] = ''
        for idx, rule in rules.iterrows():
            rules.at[idx, 'items'] = [
                sample['id']
                for _, sample in candidates.iterrows()
                if item_within_rule(sample, rule["antecedent"])
            ]
        rules.to_json(rule_name, orient='records')

        print(dataset+'_'+model+' has done')

    print('All finished')



In [3]:
# Models to train and the dataset to process; both names are reused by the
# following cell.
models =['svm', 'lr', 'knn', 'dt', 'rf', 'xgb']
dataset = 'academic'
# init_samples must run first: get_model_samples raises if the mdlp pickle
# it writes is missing.
init_samples(dataset)
get_model_samples(dataset, models)


academic svm accuracy: 0.9001659432085367
academic lr accuracy: 0.904289654548743
academic knn accuracy: 0.8878605986616025
academic dt accuracy: 0.9022716585277628
academic rf accuracy: 0.8981479471875564
academic xgb accuracy: 0.9044207813347802
academic all done


In [4]:
# Mine rules for every trained model with 'gender=F' passed as the protected
# attribute, then attach the ids of the matching samples to each rule.
get_all_rules('gender=F', models, dataset)

academic_svm has done
academic_lr has done
academic_knn has done
academic_dt has done
academic_rf has done
academic_xgb has done
All finished


In [None]:
dataset_name = 'academic'

# Load the discretizer saved by init_samples. It is stored as
# '<dataset>_mdlp.pkl' (not '<dataset>.pkl'), and the name is derived from
# dataset_name so the cell follows the variable above.
with open('{}_mdlp.pkl'.format(dataset_name), 'rb') as f:
    mdlp = dill.load(f)
dataset_path = '../server/{}.csv'.format(dataset_name)
data = pd.read_csv(dataset_path)
# Display the raw data after applying the fitted transform.
num2cate_transform(data, mdlp)

In [None]:
import pandas as pd


# Split the combined 'personal_status' column of german_credit.csv into
# separate 'gender' and 'status' columns and rewrite the file in place.
df = pd.read_csv('german_credit.csv')

# The file is overwritten below, so guard on the source column: re-running the
# cell on an already converted file is then a no-op instead of a KeyError.
if 'personal_status' in df.columns:
    # 'personal_status' holds '<gender> <status>'; the first two
    # space-separated tokens are used (a value without a space raises
    # IndexError, as before).
    status_parts = [value.split(' ') for value in df['personal_status']]
    df['gender'] = [parts[0] for parts in status_parts]
    df['status'] = [parts[1] for parts in status_parts]

    # DataFrame.drop returns a new frame; the result must be assigned,
    # otherwise the column is still written back to the CSV.
    df = df.drop(columns=['personal_status'])
    df.to_csv('german_credit.csv', index=False)

In [None]:
# Explore the rules: print every antecedent, and additionally print the rule
# id whenever the antecedent equals the pattern of interest below.
import pandas as pd

target_antecedent = ['StudentAbsenceDays=Under-7', 'raisedhands=0<x<25', 'Discussion=1<x<25']

df = pd.read_json('academic_knn_rules.json')
for i in df.index:
    antecedent = df.at[i, 'antecedent']
    print(antecedent)
    if antecedent == target_antecedent:
        print(df.at[i, 'id'])

In [None]:
# Load the initial samples stored for the 'academic' dataset and display the
# second half of the rows.
samples = pd.read_json(os.path.join(store_path, '{}_samples.json'.format('academic')))
samples.iloc[int(len(samples)/2):]

In [None]:
# Fit the transform on the 'adult' dataset and display the transformed data.
# NOTE: this rebinds the notebook-level names dataset_name, data and mdlp.
dataset_name = 'adult'
dataset_path = '../server/{}.csv'.format(dataset_name)
data = pd.read_csv(dataset_path)

# presumably numeric -> categorical discretization, judging by the helper
# names; the meaning of the second argument (2) is not visible here -- TODO confirm
mdlp = num2cate_fit(data, 2)
num2cate_transform(data, mdlp)