In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to local source files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# Read the adult CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv('../data/adult.csv')

In [4]:
# Per education level: mean of `high_income` (ratio) and number of non-null rows (cnt),
# ordered from the lowest ratio to the highest.
(
    df.groupby('education')
      .agg(
          ratio=('high_income', 'mean'),
          cnt=('high_income', 'count'),
      )
      .sort_values('ratio')
)

Unnamed: 0_level_0,ratio,cnt
education,Unnamed: 1_level_1,Unnamed: 2_level_1
Preschool,0.012048,83
1st-4th,0.032389,247
11th,0.050773,1812
5th-6th,0.053045,509
9th,0.054233,756
10th,0.062635,1389
7th-8th,0.064921,955
12th,0.073059,657
HS-grad,0.158578,15784
Some-college,0.189649,10878


In [105]:
import sys
# Put the parent directory on the import path. This has to run before the
# `privex` imports below -- presumably that is where the package lives (confirm).
sys.path.append('../')

import pandas as pd
import pprint

# Project-local building blocks used by the explanation workflow in later cells.
from privex.components.basic import Schema, Dataset, GroupbyQuery, Question
from privex.components.utils import generate_explanation_predicates
from privex.framework.solution import ExplanationSession

import logging
# Logger named after the current module; used for debug messages in later cells.
logger = logging.getLogger(__name__)

In [106]:
# --- Tunable inputs ----------------------------------------------------------
# Confidence level handed to the session; later cells print it as a percentage.
gamma = 0.95
# Attributes from which candidate explanation predicates are generated.
attributes = [
    'education',
    'occupation',
    'age',
    'relationship',
    'race',
    'workclass',
    'sex',
    'native-country',
]

# --- Data, schema and candidate predicates -----------------------------------
df = pd.read_csv('../data/adult.csv')
schema = Schema.from_json('../data/adult.json')
dataset = Dataset(df, schema)

predicates = generate_explanation_predicates(attributes, schema, strategy='1-way marginal')
# Disabled variations: truncate the candidate list, or also add 2-way marginals.
#predicates = predicates[:10]
#predicates += generate_explanation_predicates(attributes, schema, strategy='2-way marginal')

# --- Explanation session shared by the Phase 1-3 cells ------------------------
es = ExplanationSession(dataset, gamma, predicates)

In [107]:
# Phase 1: submit a group-by query to the session and compare the answers it
# reports with the exact per-group means computed directly from `df`.
groupby_query = GroupbyQuery(
    agg = 'AVG',
    attr_agg = 'high_income',
    predicate = None,
    attr_group = 'marital-status',
    schema = schema
)
rho_query = 0.1
es.phase_1_submit_query(groupby_query, rho_query, random_seed = 152636)
print(f'submitted queries with rho = {rho_query}')

# Flatten each group key to its first element so it can be joined with the plain
# `marital-status` values below (the keys look like 1-tuples -- confirm).
# `.assign` builds a new frame instead of writing into the one returned by the
# session, so re-running this cell cannot index into an already-flattened key.
nr = es.phase_1_show_query_results()
nr = nr.assign(group=nr['group'].map(lambda key: key[0]))

# Exact answer per group, keyed the same way as the session's result table.
gt = (
    df.groupby('marital-status')
      .agg(answer=('high_income', 'mean'))
      .reset_index()
      .rename(columns={'marital-status': 'group'})
)

# Both frames carry an `answer` column, so the merge suffixes them with _x / _y;
# relabel them as the reported answer and the hidden truth, sorted by the truth.
comparison = (
    nr.merge(gt, on='group')
      .rename(columns={'answer_x': 'answer', 'answer_y': 'truth (hidden)'})
      .sort_values('truth (hidden)')
)
print(comparison)

submiited queries with rho = 0.1
                   group    answer  truth (hidden)
0          Never-married  0.045511        0.045480
1              Separated  0.064712        0.064706
2                Widowed  0.082854        0.084321
3  Married-spouse-absent  0.089988        0.092357
4               Divorced  0.101578        0.101161
6      Married-AF-spouse  0.463193        0.378378
5     Married-civ-spouse  0.446021        0.446133


In [108]:
# Phase 2: ask the session to compare two groups of the Phase 1 query and
# report the point estimate and confidence interval of their difference.
# Group keys are passed as 1-tuples.
first_group = ('Married-AF-spouse',)
second_group = ('Married-civ-spouse',)
question = Question.from_group_comparison(groupby_query, first_group, second_group)

# Register the question, prepare its interval, then read both results back.
es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()
point = es.phase_2_show_question_point()

print('question: ', question.to_natural_language())
print('The noisy group difference is ', point)
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)

question:  Why AVG(high_income) WHERE `marital-status` == "Married-AF-spouse" >= AVG(high_income) WHERE `marital-status` == "Married-civ-spouse"?
The noisy group difference is  0.01717194739104899
The 95% confidence interval of the difference is  (-0.2589346986687564, 0.4598422551149035)


In [109]:
# Phase 2 (second question): compare the Married-civ-spouse group against the
# Never-married group of the Phase 1 query. Group keys are passed as 1-tuples.
first_group = ('Married-civ-spouse',)
second_group = ('Never-married',)
question = Question.from_group_comparison(groupby_query, first_group, second_group)

# Register the question, prepare its interval, then read both results back.
es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()
point = es.phase_2_show_question_point()

print('question: ', question.to_natural_language())
print('The noisy group difference is ', point)
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)

question:  Why AVG(high_income) WHERE `marital-status` == "Married-civ-spouse" >= AVG(high_income) WHERE `marital-status` == "Never-married"?
The noisy group difference is  0.4005103977536667
The 95% confidence interval of the difference is  (0.39948772348837247, 0.40151707072996684)


In [111]:
# Phase 3: request an explanation for the submitted question and print the two
# tables the session returns.
k = 5            # number of explanation predicates requested
rho_expl = 2.0   # budget value passed to the explanation step
logger.debug(f'Length of predicates is {len(predicates)}')

es.phase_3_submit_explanation_request()
# Other seeds noted by the author: 12535, 12534
es.phase_3_prepare_explanation(k, rho_expl, random_seed = 12532)

# Widen pandas' display limits for this block only, so long predicate strings
# and all columns are printed without truncation.
wide_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with pd.option_context(*wide_display):
    t1, t2 = es.phase_3_show_explanation_table()
    print(t1)
    print(t2)

2022-04-10 16:57:57,287 DEBUG    [<ipython-input-111-345a1ae8a650>:3] Length of predicates is 103
2022-04-10 16:58:02,308 DEBUG    [meta_explanation_session.py:143] 103 predicates and their influences & scores have been loaded.
2022-04-10 16:58:02,315 DEBUG    [meta_explanation_session.py:147] 
        influence       score
count  103.000000  103.000000
mean     0.000325    7.266483
std      0.006527  146.071226
min     -0.026839 -600.661181
25%     -0.000606  -13.551499
50%     -0.000017   -0.385618
75%      0.000368    8.235890
max      0.024788  554.765178
2022-04-10 16:58:02,316 DEBUG    [meta_explanation_session.py:185] total rho_expl is 2.0
2022-04-10 16:58:02,316 DEBUG    [meta_explanation_session.py:186] rho_topk is 0.05
2022-04-10 16:58:02,316 DEBUG    [meta_explanation_session.py:187] rho_ci is 0.05
2022-04-10 16:58:02,317 DEBUG    [meta_explanation_session.py:188] rho_rank is 1.9
2022-04-10 16:58:02,317 INFO     [meta_explanation_session.py:193] computing top k
2022-04-10 16

100%|██████████| 5/5 [00:00<00:00,  8.55it/s]

2022-04-10 16:58:02,904 DEBUG    [meta_explanation_session.py:214] [(0.014692073122944162, 0.03717375649415368), (0.009505671946713437, 0.033271807497719796), (-0.00015150540984193594, 0.015213433272726305), (0.008901350183101082, 0.03056368176688522), (0.013050651990698912, 0.03417829218812417)]
2022-04-10 16:58:02,905 INFO     [meta_explanation_session.py:217] computing rank ci



100%|██████████| 5/5 [00:00<00:00, 7828.11it/s]

2022-04-10 16:58:02,906 DEBUG    [meta_explanation_session.py:229] [(1, 6), (1, 9), (1, 92), (1, 16), (1, 8)]
                          predicates Rel Inf 90-CI L Rel Inf 90-CI R  Rnk 95-CI L  Rnk 95-CI R
0  `occupation` == "Exec-managerial"           3.67%           9.28%            1            6
1         `education` == "Bachelors"           3.26%           8.53%            1            8
2                `age` == "(40, 50]"           2.37%           8.31%            1            9
3   `occupation` == "Prof-specialty"           2.22%           7.63%            1           16
4      `relationship` == "Own-child"          -0.04%           3.80%            1           92
                          predicates  Inf 95-CI L  Inf 95-CI R  Rnk 95-CI L  Rnk 95-CI R
0  `occupation` == "Exec-managerial"     0.014692     0.037174            1            6
1         `education` == "Bachelors"     0.013051     0.034178            1            8
2                `age` == "(40, 50]"     0.009506    




In [104]:
# Print the session's true top-k table for comparison with the Phase 3 output.
k = 5

# Widen pandas' display limits for this block only so nothing is truncated.
wide_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with pd.option_context(*wide_display):
    true_top_k = es.phase_3_true_top_k(k)
    print(true_top_k)

Positive Influences:  13
                                topk rel-influence
0  `occupation` == "Exec-managerial"         6.19%
1         `education` == "Bachelors"         6.11%
2                `age` == "(40, 50]"         5.59%
3   `occupation` == "Prof-specialty"         4.84%
4           `education` == "Masters"         2.81%
