In [1]:
#!/usr/bin/env python
import sys
sys.path.append('../../')

import privex.evaluation.run as run
from privex.components.basic import Schema, Dataset, GroupbyQuery, Question

import pandas as pd

import logging
logger = logging.getLogger(__name__)

# Load the synthetic German credit table and the JSON schema that goes with it,
# then wrap both in the Dataset object that run.run takes as `dataset`.
df = pd.read_csv('../../data/german/synthetic_german.csv')
schema = Schema.from_json('../../data/german/german.json')

dataset = Dataset(df, schema)

# Attribute names passed to run.run as `full_attributes`.
full_attributes = [
    'status',
    'duration',
    'credit-history',
    'purpose',
    'credit-amount',
    'saving-account',
    'employment',
    'installment-rate',
    'sex-marst',
    'other-debtors',
    'residence',
    'property',
    'age',
    'other-installment-plans',
    'housing',
    'existing-credits',
    'job',
    'people-liable',
    'telephone',
    'foreign-worker',
]

# Take a copy instead of aliasing `run.default`: it is a module attribute, so
# assigning into it directly would change the defaults seen by every other
# user of `privex.evaluation.run` in this kernel, not just this notebook.
default = run.default.copy()

# Notebook-specific settings: aggregated attribute, grouping attribute and the
# two group values.
default['attr_agg'] = 'good-credit'
default['attr_group'] = 'status'
default['group_a'] = '... < 0 DM'
default['group_b'] = 'no checking account'

# Optional overrides, left disabled:
# for option in ['rho_query', 'rho_topk', 'rho_influ', 'rho_rank']:
#     default[option] = default[option] * 1

# default['reps'] = 1
# default['scale'] = 0.1

# Per-run settings start from the notebook defaults; `reps` is overridden to 1
# for this run only, leaving `default` untouched.
setting = default.copy()
setting['reps'] = 1

# Entries of `setting` that are forwarded to run.run under the same keyword
# name, in the order they are looked up.
forwarded_keys = (
    'scale',
    'rho_query',
    'agg',
    'attr_agg',
    'attr_group',
    'group_a',
    'group_b',
    'rho_topk',
    'rho_influ',
    'rho_rank',
    'gamma',
    'predicate_strategy',
    'k',
    'split_factor',
    'reps',
)

# `dataset` and `full_attributes` come from the notebook itself; every other
# keyword argument is read from `setting`.
reports = run.run(
    dataset=dataset,
    full_attributes=full_attributes,
    **{key: setting[key] for key in forwarded_keys}
)
    

2022-04-20 21:56:16,94 INFO     [meta_explanation_session.py:23] 82 predicates for the explanation.
2022-04-20 21:56:16,96 INFO     [run.py:168] loaded dataset with 1000000 rows and columns ['status', 'credit-history', 'purpose', 'saving-account', 'employment', 'installment-rate', 'sex-marst', 'other-debtors', 'residence', 'property', 'other-installment-plans', 'housing', 'existing-credits', 'job', 'people-liable', 'telephone', 'foreign-worker', 'good-credit', 'duration', 'credit-amount', 'age'].
2022-04-20 21:56:16,576 INFO     [run.py:180] submited queries with rho = 0.1
(`status` == "no checking account")or(`status` == "... < 0 DM")
2022-04-20 21:56:16,688 INFO     [influence_function.py:24] Dataset relative to the question has length 542650
2022-04-20 21:56:49,761 DEBUG    [meta_explanation_session.py:155] 82 predicates and their influences & scores have been loaded.
2022-04-20 21:56:49,768 DEBUG    [meta_explanation_session.py:159] 
       influence         score
count  82.000000 

100%|██████████| 5/5 [00:02<00:00,  1.67it/s]

2022-04-20 21:56:52,767 DEBUG    [meta_explanation_session.py:237] [(0.03658576908876855, 0.037223033156367574), (0.033544336306650754, 0.0341836670527422), (0.025684551931869457, 0.02633362435214833), (0.02337190362830731, 0.02404899470091711), (0.02290765441879217, 0.023578643005510668)]
2022-04-20 21:56:52,768 INFO     [meta_explanation_session.py:240] computing rank ci



100%|██████████| 5/5 [00:00<00:00, 8848.74it/s]

2022-04-20 21:56:52,770 DEBUG    [meta_explanation_session.py:253] [(1, 1), (1, 2), (1, 4), (2, 5), (2, 5)]





2022-04-20 21:56:52,993 INFO     [run.py:50] true question: 0.04789193497607158
2022-04-20 21:56:52,993 INFO     [run.py:51] question_point: 0.047895897784428376
2022-04-20 21:56:52,994 INFO     [run.py:52] question_ci: (0.04780560957004254, 0.04798618616488737)
2022-04-20 21:56:53,257 INFO     [run.py:54] groundtruth_ci: (0.04785625549055536, 0.04792770963009489)
2022-04-20 21:56:53,257 INFO     [run.py:59] percentage_of_missing: 0.0
2022-04-20 21:56:53,479 INFO     [run.py:63] percentage_of_extra: 0.0022785142291665158
2022-04-20 21:56:53,479 INFO     [run.py:68] noise quantification error: 0.0011392571145832579
2022-04-20 21:56:53,480 INFO     [run.py:75] topk true top_k influence: [0.036828077256122645, 0.03384325947850492, 0.02596021355695329, 0.023686919061497044, 0.02315438433232411]
2022-04-20 21:56:53,480 INFO     [run.py:76] topk true top_k score: [10073.068378785641, 9256.672959522752, 7100.533771243636, 6478.751354024425, 6333.094585039961]
2022-04-20 21:56:53,480 INFO     

In [2]:
# Shell escape: long-format listing of test.pkl (permissions, owner,
# human-readable size, modification time) in the current working directory.
# NOTE(review): presumably this file is written by the run.run call above — confirm.
!ls -alh test.pkl

-rw-rw-r-- 1 yuchao yuchao 5.4K Apr 20 21:56 test.pkl


In [3]:
import pickle

In [4]:
# Read the pickled object back from test.pkl into `es`.
# pickle.load can execute arbitrary code embedded in the file, so only use this
# on a test.pkl that was produced locally by a trusted run.
with open('test.pkl', mode='rb') as pkl_file:
    es = pickle.load(pkl_file)

In [5]:
# Bare expression as the cell's last line so the notebook renders the object
# loaded from test.pkl.
es

{`duration` == "< 1 yr": 0.006295569188570303,
 `duration` == "1 <= ... < 4 yrs": -0.005165177105530383,
 `duration` == "4 <= ... < 7 yrs": -0.0009850615285597452,
 `duration` == ">= 7 yrs": 0.0,
 `credit-history` == "delay in paying off in the past": 0.006334862154941053,
 `credit-history` == "critical account/other credits elsewhere": 0.010554190285299768,
 `credit-history` == "no credits taken/all credits paid back duly": 0.02315438433232411,
 `credit-history` == "existing credits paid back duly till now": 0.018003187579113084,
 `credit-history` == "all credits at this bank paid back duly": -0.05791188914362722,
 `purpose` == "others": 0.004747468792784176,
 `purpose` == "car (new)": 0.005806153734366798,
 `purpose` == "car (used)": -0.03285595653774375,
 `purpose` == "furniture/equipment": 0.011012548433109417,
 `purpose` == "radio/television": 0.0012339668724354593,
 `purpose` == "domestic appliances": -9.373779783875403e-06,
 `purpose` == "repairs": 0.005634376155573154,
 `purpos