In [1]:
from flask import request, jsonify, Blueprint, current_app, Response
from model import generate_samples, generate_model_samples, ModelGene, findKeyAttrs, FindGroups
import pandas as pd
from joblib import dump, load

import json
import os

api = Blueprint('api', __name__)
cache_path = './cache'

######################
# API Starts here
######################
@api.route('/dataset/<string:dataset_name>', methods=['GET'])
def get_dataset(dataset_name):
    """Fetch dataset by id"""
    dataset_path = '../data/{}_clean.csv'.format(dataset_name)
    df = pd.read_csv(dataset_path, 'r')
    return df.to_json()



@api.route('/samples', methods=['GET'])
def get_samples():
    """
    train model on the training data,
    return the generated samples based on the training data
    E.g.: /api/samples?dataset=credit&model=knn
    """
    dataset_name = request.args.get('dataset', None, type=str)
    model_name = request.args.get('model', None, type=str)
    dataset_path = '../data/{}_clean.csv'.format(dataset_name)

    sample_num = 3000
    data = pd.read_csv(dataset_path)
    # samples = generate_samples(data, sample_num)

    model_gene = ModelGene(model_name)
    model, encoder, score = model_gene.fit_model(data)
    model_samples = generate_model_samples(data, sample_num, model, encoder)
    # add the ID col
    # model_samples['id'] = model_samples.index
    model_samples.insert(loc=0, column='id', value=model_samples.index)
    # print('model score', score)

    # save mdodl & samples to cache
    samples_path = os.path.join(cache_path, '{}_{}_samples.csv'.format(dataset_name, model_name))
    model_samples.to_csv(samples_path, index=False)
    model_samples.to_json('../../front/src/testdata/test.json', orient='records')
    model_path = os.path.join(cache_path, '{}_{}.joblib'.format(dataset_name, model_name))
    dump(model, model_path) 
    jsonfile = model_samples.to_json(orient='records')
    
    
    return jsonfile


@api.route('/groups', methods=['GET'])
def get_groups():
    """
    Fetch the info of classifiers.
    E.g.: /api/groups?dataset=credit&model=knn&protect=age
    """

    dataset_name = request.args.get('dataset', None, type=str)
    model_name = request.args.get('model', None, type=str)
    protect_attr = request.args.get('protect', None, type=str)

    # get training data
    dataset_path = '../data/{}_clean.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)

    # get model samples
    sample_path = os.path.join(cache_path, '{}_{}_samples.csv'.format(dataset_name, model_name))
    model_samples = pd.read_csv(sample_path)

    
    key_attrs = findKeyAttrs(data, protect_attr)

    key_vals = {}
    for key_attr in key_attrs:
        key_vals[key_attr] = list(set(data[key_attr]))

    findGroups = FindGroups(key_vals)
    key_groups = findGroups.locate_items(model_samples, protect_attr)
    
    

    return_value = {
        'key_attrs': key_attrs,
        'key_groups': key_groups
    }

    f = open('../../front/src/testdata/test2.json','w')
    json.dump(return_value, f)
   
    return jsonify(return_value)







@api.route('/key_attrs/<string:dataset_name>', methods=['GET'])
def get_key_attrs(dataset_name):
    """
    Fetch the key attributes of a dataset (training data).
    E.g.: /api/key_attrs/credit
    """
    protect_attr = ''
    dataset_path = '../data/{}_clean.csv'.format(dataset_name)
    data = pd.read_csv(dataset_path)
    key_attrs = findKeyAttrs(data, protect_attr)


    return json.dumps({
        'key_attrs':key_attrs
        })

In [61]:
dataset_name = 'adult'
model_name = 'knn'
protect_attr = 'sex'

# get training data
dataset_path = '../data/{}_clean.csv'.format(dataset_name)
data = pd.read_csv(dataset_path)

# get model samples
sample_path = os.path.join(cache_path, '{}_{}_samples.csv'.format(dataset_name, model_name))
model_samples = pd.read_csv(sample_path)


key_attrs = findKeyAttrs(data, protect_attr)

key_vals = {}
for key_attr in key_attrs:
    key_vals[key_attr] = list(set(data[key_attr]))

    
print('key_attrs:', key_attrs)
print('key_vals:', key_vals)
findGroups = FindGroups(key_vals)
key_groups = findGroups.locate_items(model_samples, protect_attr)
for i, group in enumerate(key_groups):
    key_groups[i]['items'] = list(map(int, key_groups[i]['items']))


return_value = {
    'key_attrs': key_attrs,
    'key_groups': key_groups
}

f = open('test2.json','w')
json.dump(return_value, f)

samplePrior: Sample prior (min = 1.0) (java.lang.Double) [default:1.0]
structurePrior: Structure prior coefficient (min = 1.0) (java.lang.Double) [default:1.0]
faithfulnessAssumed: Yes if (one edge) faithfulness should be assumed (java.lang.Boolean) [default:true]
symmetricFirstStep: Yes if the first step step for FGES should do scoring for both X->Y and Y->X (java.lang.Boolean) [default:false]
maxDegree: The maximum degree of the graph (min = -1) (java.lang.Integer) [default:100]
verbose: Yes if verbose output should be printed or logged (java.lang.Boolean) [default:true]
numberResampling: The number of resampling iterations (min = 0) (java.lang.Integer) [default:0]
resampleSize: The resample size (min = 1) (java.lang.Integer) [default:1]
resamplingWithReplacement: Yes, if resampling with replacement (bootstrapping) (java.lang.Boolean) [default:true]
resamplingEnsemble: Ensemble method: Preserved (0), Highest (1), Majority (2) (java.lang.Integer) [default:1]
edges ['capital_loss --- c

KeyError: 'occupationwork'

In [100]:
dataset_path = '../data/{}_clean.csv'.format('adult')
df = pd.read_csv(dataset_path)
df = df[0:1000]
a = df.to_json()


In [101]:
a

'{"age":{"0":"35<x<53","1":"35<x<53","2":"35<x<53","3":"35<x<53","4":"17<x<35","5":"35<x<53","6":"35<x<53","7":"35<x<53","8":"17<x<35","9":"35<x<53","10":"35<x<53","11":"17<x<35","12":"17<x<35","13":"17<x<35","14":"17<x<35","15":"17<x<35","16":"17<x<35","17":"35<x<53","18":"35<x<53","19":"35<x<53","20":"53<x<71","21":"17<x<35","22":"35<x<53","23":"53<x<71","24":"53<x<71","25":"17<x<35","26":"35<x<53","27":"35<x<53","28":"17<x<35","29":"17<x<35","30":"35<x<53","31":"17<x<35","32":"17<x<35","33":"35<x<53","34":"17<x<35","35":"17<x<35","36":"35<x<53","37":"17<x<35","38":"35<x<53","39":"17<x<35","40":"35<x<53","41":"17<x<35","42":"53<x<71","43":"35<x<53","44":"35<x<53","45":"35<x<53","46":"17<x<35","47":"17<x<35","48":"35<x<53","49":"35<x<53","50":"35<x<53","51":"35<x<53","52":"35<x<53","53":"17<x<35","54":"35<x<53","55":"17<x<35","56":"17<x<35","57":"35<x<53","58":"35<x<53","59":"17<x<35","60":"35<x<53","61":"17<x<35","62":"35<x<53","63":"35<x<53","64":"17<x<35","65":"17<x<35","66":"17<x<

In [95]:
a

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
