In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local source files
# (e.g. the `privex` package imported below) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../')

import pandas as pd
import pprint

from privex.components.basic import Schema, Dataset, GroupbyQuery, Question
from privex.components.utils import generate_explanation_predicates
from privex.framework.solution import ExplanationSession

import logging
logger = logging.getLogger(__name__)

In [3]:
# Load the synthetic German credit table and its schema, then wrap both in a Dataset.
df = pd.read_csv('../data/german/synthetic_german.csv')
print(len(df))
schema = Schema.from_json('../data/german/german.json')
dataset = Dataset(df, schema)

# Confidence level; later cells report it as the "{gamma*100}% confidence interval".
gamma = 0.95

# Attributes from which candidate explanation predicates are generated.
attributes = [
    'duration',
    'credit-history',
    'purpose',
    'credit-amount',
    'saving-account',
    'employment',
    'installment-rate',
    'sex-marst',
    'other-debtors',
    'residence',
    'property',
    'age',
    'other-installment-plans',
    'housing',
    'existing-credits',
    'job',
    'people-liable',
    'telephone',
    'foreign-worker',
]
predicates = generate_explanation_predicates(
    attributes,
    schema,
    strategy='1-way marginal',
)
# Disabled variants kept for reference:
#   predicates = predicates[:10]
#   predicates += generate_explanation_predicates(attributes, schema, strategy='2-way marginal')

# Fixed seed so that the session's randomness is repeatable across runs.
es = ExplanationSession(dataset, gamma, predicates, random_seed=152636)

1000000
2022-05-27 16:32:26,709 INFO     [meta_explanation_session.py:25] 82 predicates for the explanation.


In [4]:
# Phase 1
# Group-by query submitted to the session: AVG(good-credit) per value of `status`.
groupby_query = GroupbyQuery(
    agg='AVG',
    attr_agg='good-credit',
    predicate=None,
    attr_group='status',
    schema=schema,
)
rho_query = 0.1  # presumably the privacy budget charged for this query -- confirm against ExplanationSession
es.phase_1_submit_query(groupby_query, rho_query)
print(f'submitted queries with rho = {rho_query}')

# Each `group` entry is indexable and only its first element is needed (the
# labels look like 1-tuples, matching the tuple keys used for the Phase 2
# question). Build a new frame with `assign` instead of writing into the
# returned one: the session may hand back its own internal frame, and mutating
# it in place would also make this cell non-idempotent.
nr = es.phase_1_show_query_results()
nr = nr.assign(group=nr['group'].map(lambda label: label[0]))

# Exact per-group means computed directly on the raw data, for comparison only.
gt = (
    df.groupby('status')
    .agg(answer=('good-credit', 'mean'))
    .reset_index()
    .rename(columns={'status': 'group'})
)

# Session answers next to the exact values, ordered by the exact value.
print(
    nr.merge(gt, on='group')
    .rename(columns={'answer_x': 'answer', 'answer_y': 'truth (hidden)'})
    .sort_values('truth (hidden)')
)

submiited queries with rho = 0.1
                                        group    answer  truth (hidden)
0                         no checking account  0.526573        0.526574
1                                  ... < 0 DM  0.574456        0.574466
2                            0<= ... < 200 DM  0.756216        0.756344
3  ... >= 200 DM / salary for at least 1 year  0.863093        0.863093


In [8]:
# Phase 2
# Compare two groups of the Phase 1 query: '... < 0 DM' versus 'no checking account'.
# The natural-language form of the question is printed below.
question = Question.from_group_comparison(groupby_query, ('... < 0 DM',), ('no checking account',))
# The session calls below are stateful; keep them in this order (submit -> prepare -> show).
es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()
print('question: ', question.to_natural_language())
# `ci` is reported as the gamma-level confidence interval of the difference between the two groups.
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)
logger.info(f'question_point: {es.question_point}')
# groundtruth_ci = es.phase_2_ground_truth_ci(rho_query, gamma)
# Evaluate the question on the session's dataset; logged as the "true" value for comparison.
logger.info(f'true question: {es.question.evaluation(es.dataset)}')
# print(f'The {gamma*100:.0f}% confidence interval of ground truth is ', groundtruth_ci)

2022-05-27 16:33:16,529 INFO     [image.py:20] answers: [144026.5750581061, 273516.72303035343, 154607.9269292538, 269138.1833127718]
2022-05-27 16:33:16,530 INFO     [image.py:21] sigmas: [3.162277660168379, 3.162277660168379, 3.162277660168379, 3.162277660168379]
2022-05-27 16:33:16,530 INFO     [image.py:22] bounds: [(144018.6766198827, 144034.4734963295), (273508.82459213, 273524.62146857684), (154600.0284910304, 154615.8253674772), (269130.2848745484, 269146.0817509952)]
question:  Why AVG(good-credit) WHERE `status` == "... < 0 DM" >= AVG(good-credit) WHERE `status` == "no checking account"?
The 95% confidence interval of the difference is  (0.04779207887063275, 0.0479726572110577)
2022-05-27 16:33:16,532 INFO     [<ipython-input-8-54446e20c51d>:8] question_point: 0.047882367957848726
2022-05-27 16:33:16,800 INFO     [<ipython-input-8-54446e20c51d>:10] true question: 0.04789193497607158


In [9]:
# Phase 3
# Number of top-ranked predicates requested for the explanation.
k = 5
logger.debug(f'Length of predicates is {len(predicates)}')
es.phase_3_submit_explanation_request()
es.phase_3_prepare_explanation(
    k,
    {'rho_topk': 0.5, 'rho_influ': 0.5, 'rho_rank': 1.0},
    split_factor=0.9,
)

# Lift pandas' display limits only while printing, so long predicate strings are not truncated.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
):
    t1, t2 = es.phase_3_show_explanation_table()
    print(t1, t2, sep='\n')

2022-05-27 16:33:18,304 DEBUG    [<ipython-input-9-716482da31c9>:3] Length of predicates is 82
(`status` == "no checking account")or(`status` == "... < 0 DM")
2022-05-27 16:33:18,417 INFO     [influence_function.py:24] Dataset relative to the question has length 542650
2022-05-27 16:33:57,982 DEBUG    [meta_explanation_session.py:169] 82 predicates and their influences & scores have been loaded.
2022-05-27 16:33:57,989 DEBUG    [meta_explanation_session.py:173] 
       influence         score
count  82.000000     82.000000
mean    0.000333     91.117826
std     0.014304   3912.349348
min    -0.057912 -15839.828271
25%    -0.004585  -1253.962692
50%     0.000003      0.761900
75%     0.005414   1480.750210
max     0.036828  10073.068379
2022-05-27 16:33:57,990 DEBUG    [meta_explanation_session.py:213] total rho_expl is 2.0
2022-05-27 16:33:57,990 DEBUG    [meta_explanation_session.py:214] rho_topk is 0.5
2022-05-27 16:33:57,990 DEBUG    [meta_explanation_session.py:215] rho_ci is 0.5
2

100%|██████████| 5/5 [00:02<00:00,  2.45it/s]

2022-05-27 16:34:00,38 DEBUG    [meta_explanation_session.py:259] [(10038.644554585431, 10178.888161077683), (9176.263254853591, 9316.506861345842), (7002.181555648282, 7142.425162140535), (6445.289253961989, 6585.532860454242), (6323.885055837214, 6464.128662329467)]
2022-05-27 16:34:00,38 INFO     [meta_explanation_session.py:267] computing rank ci



100%|██████████| 5/5 [00:00<00:00, 8405.42it/s]

2022-05-27 16:34:00,41 DEBUG    [meta_explanation_session.py:280] [(1, 1), (1, 2), (2, 4), (2, 5), (2, 5)]
                                                          predicates Rel Inf 90-CI L Rel Inf 90-CI R  Rnk 95-CI L  Rnk 95-CI R
0                                          `existing-credits` == "1"          77.90%          78.99%            1            1
1                             `job` == "skilled employee / official"          71.21%          72.29%            1            2
2                            `sex-marst` == "male : married/widowed"          54.34%          55.42%            2            4
3                                   `credit-amount` == "(500, 2500]"          50.01%          51.10%            2            5
4  `credit-history` == "no credits taken/all credits paid back duly"          49.07%          50.16%            2            5
                                                          predicates   Inf 95-CI L   Inf 95-CI R  Rnk 95-CI L  Rnk 95-CI R
0       




In [10]:
def get_influence_at_rank_i(es, i):
    """Return the influence stored for the predicate at 1-based rank ``i``.

    Args:
        es: Object exposing ``sorted_predicates`` (an indexable sequence of
            predicate keys) and ``predicates_with_influences`` (a mapping-like
            object keyed by those predicates).
        i: 1-based rank; ``i == 1`` selects ``es.sorted_predicates[0]``.

    Returns:
        The entry of ``es.predicates_with_influences`` for the selected predicate.

    Raises:
        ValueError: If ``i`` is smaller than 1.
        IndexError: If ``i`` exceeds the number of sorted predicates.
    """
    # Without this guard, i <= 0 would be turned into a negative index and
    # silently return a value from the *end* of the ranking.
    if i < 1:
        raise ValueError(f'rank i must be >= 1, got {i}')
    predicate = es.sorted_predicates[i - 1]
    return es.predicates_with_influences[predicate]

def get_score_at_rank_i(es, i):
    """Return the score stored for the predicate at 1-based rank ``i``.

    Args:
        es: Object exposing ``sorted_predicates`` (an indexable sequence of
            predicate keys) and ``predicates_with_scores`` (a mapping-like
            object keyed by those predicates).
        i: 1-based rank; ``i == 1`` selects ``es.sorted_predicates[0]``.

    Returns:
        The entry of ``es.predicates_with_scores`` for the selected predicate.

    Raises:
        ValueError: If ``i`` is smaller than 1.
        IndexError: If ``i`` exceeds the number of sorted predicates.
    """
    # Without this guard, i <= 0 would be turned into a negative index and
    # silently return a value from the *end* of the ranking.
    if i < 1:
        raise ValueError(f'rank i must be >= 1, got {i}')
    predicate = es.sorted_predicates[i - 1]
    return es.predicates_with_scores[predicate]

In [11]:
# Influence stored by the session for the predicate at rank k.
true_kth_influence = get_influence_at_rank_i(es, k)
# Log the stored influences and scores for ranks 1..k (ranks are 1-based).
logger.info(f'topk true top_k influence: {[get_influence_at_rank_i(es, rank) for rank in range(1, k + 1)]}')
logger.info(f'topk true top_k score: {[get_score_at_rank_i(es, rank) for rank in range(1, k + 1)]}')

2022-05-27 16:34:00,77 INFO     [<ipython-input-11-8a65cf92c78b>:2] topk true top_k influence: [0.036828077256122645, 0.03384325947850492, 0.02596021355695329, 0.023686919061497044, 0.02315438433232411]
2022-05-27 16:34:00,78 INFO     [<ipython-input-11-8a65cf92c78b>:3] topk true top_k score: [10073.068378785641, 9256.672959522752, 7100.533771243636, 6478.751354024425, 6333.094585039961]


In [12]:
# Print the session's "true" top-k table with pandas' display limits lifted,
# so that long predicate strings are shown in full.
k = 5
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
):
    print(es.phase_3_true_top_k(k))

Positive Influences:  38
                                                                topk rel-influence
0                                          `existing-credits` == "1"        78.16%
1                             `job` == "skilled employee / official"        71.83%
2                            `sex-marst` == "male : married/widowed"        55.10%
3                                   `credit-amount` == "(500, 2500]"        50.27%
4  `credit-history` == "no credits taken/all credits paid back duly"        49.14%
