In [1]:
# Load IPython's autoreload extension and set it to mode 2 so that edits to
# imported modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# Read the adult dataset CSV (path is relative to the notebook's directory).
df = pd.read_csv(filepath_or_buffer='../data/adult.csv')

In [4]:
# Per education level: the mean of `high_income` (as `ratio`) and the number of
# non-null `high_income` values (as `cnt`), ordered by ascending `ratio`.
(
    df.groupby('education')
    .agg(
        ratio=pd.NamedAgg(column='high_income', aggfunc='mean'),
        cnt=pd.NamedAgg(column='high_income', aggfunc='count'),
    )
    .sort_values(by='ratio')
)

Unnamed: 0_level_0,ratio,cnt
education,Unnamed: 1_level_1,Unnamed: 2_level_1
Preschool,0.012048,83
1st-4th,0.032389,247
11th,0.050773,1812
5th-6th,0.053045,509
9th,0.054233,756
10th,0.062635,1389
7th-8th,0.064921,955
12th,0.073059,657
HS-grad,0.158578,15784
Some-college,0.189649,10878


In [3]:
import sys
# Add the parent directory to the import path; presumably this is what lets the
# `privex` imports below resolve when the notebook runs from its own folder.
sys.path.append('../')

# NOTE(review): pandas is already imported in an earlier cell; the repeat is
# harmless. `pprint` is not used by any cell visible in this part of the notebook.
import pandas as pd
import pprint

# Project components used by the explanation workflow in the cells below.
from privex.components.basic import Schema, Dataset, GroupbyQuery, Question
from privex.components.utils import generate_explanation_predicates
from privex.framework.solution import ExplanationSession

# Logger for this notebook; used for the debug message in the Phase 3 cell.
import logging
logger = logging.getLogger(__name__)

In [4]:
# Session parameters. `gamma` is later reported as the confidence level of the
# intervals (gamma * 100 percent).
gamma = 0.95
attributes = [
    'education',
    'occupation',
    'age',
    'relationship',
    'race',
    'workclass',
    'sex',
    'native-country',
]

# Raw data plus its schema, wrapped together as a Dataset.
df = pd.read_csv('../data/adult.csv')
schema = Schema.from_json('../data/adult.json')
dataset = Dataset(df, schema)

# Candidate explanation predicates built from the attributes above.
predicates = generate_explanation_predicates(attributes, schema, strategy='1-way marginal')
# Disabled alternatives: truncate the candidate list, or extend it with
# 2-way marginal predicates.
#predicates = predicates[:10]
#predicates += generate_explanation_predicates(attributes, schema, strategy='2-way marginal')

es = ExplanationSession(dataset, gamma, predicates)

2022-05-07 22:12:45,752 INFO     [meta_explanation_session.py:25] 103 predicates for the explanation.


In [5]:
# Phase 1: submit a group-by query to the session and print its answers next to
# the values computed directly from the raw data.
# The query is AVG(high_income) grouped by marital-status, with no predicate.
groupby_query = GroupbyQuery(
    agg = 'AVG',
    attr_agg = 'high_income',
    predicate = None,
    attr_group = 'marital-status',
    schema = schema
)
rho_query = 0.1  # passed to the session together with the query
es.phase_1_submit_query(groupby_query, rho_query, random_seed = 152636)
print(f'submiited queries with rho = {rho_query}')

# Build a new frame rather than overwriting the 'group' column on the object the
# session returned: if that object is the session's own frame, an in-place
# overwrite would leak into session state and a second unwrap would apply
# `row[0]` to an already unwrapped value.
# Each group key is indexable (the questions pass groups as 1-tuples); keep its
# first element so it can be joined against the plain group labels below.
nr = es.phase_1_show_query_results().assign(
    group=lambda results: results['group'].apply(lambda row: row[0])
)

# Per-group mean of `high_income` computed directly on the raw DataFrame.
gt = (
    df.groupby('marital-status')
    .agg(answer=('high_income', 'mean'))
    .reset_index()
    .rename(columns={'marital-status': 'group'})
)

# Both frames carry an 'answer' column, so the merge suffixes them with _x / _y;
# rename those to readable labels and order rows by the directly computed value.
print(
    nr.merge(gt, on='group')
    .rename(columns={'answer_x': 'answer', 'answer_y': 'truth (hidden)'})
    .sort_values('truth (hidden)')
)

2022-05-07 22:12:49,682 INFO     [utils.py:141] NumExpr defaulting to 8 threads.
submiited queries with rho = 0.1
                   group    answer  truth (hidden)
0          Never-married  0.045511        0.045480
1              Separated  0.064712        0.064706
2                Widowed  0.082854        0.084321
3  Married-spouse-absent  0.089988        0.092357
4               Divorced  0.101578        0.101161
6      Married-AF-spouse  0.463193        0.378378
5     Married-civ-spouse  0.446021        0.446133


In [6]:
# Phase 2: pose a comparison question between two groups of the Phase 1 query,
# then print the question, the noisy group difference and its confidence interval.
# Disabled earlier variant of the question, kept for reference:
#question = Question.from_group_comparison(groupby_query, 'Prof-school', 'Doctorate')
# Groups are given as 1-tuples of the group-by value.
question = Question.from_group_comparison(groupby_query, ('Married-AF-spouse',), ('Married-civ-spouse',))
# The session calls run in this order: submit the question, prepare its
# confidence interval, then read back the interval and the point value.
es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()
point = es.phase_2_show_question_point()
print('question: ', question.to_natural_language())
print('The noisy group difference is ', point)
# `gamma` (set in the setup cell) is shown here as a percentage confidence level.
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)

2022-05-07 22:12:51,956 INFO     [image.py:20] answers: [9982.926929253796, 22382.183312771773, 15.765188211211795, 34.03588831851936]
2022-05-07 22:12:51,957 INFO     [image.py:21] sigmas: [3.162277660168379, 3.162277660168379, 3.162277660168379, 3.162277660168379]
2022-05-07 22:12:51,957 INFO     [image.py:22] bounds: [(9975.028491030382, 9990.82536747721), (22374.284874548357, 22390.08175099519), (7.866749987797281, 23.66362643462631), (26.137450095104846, 41.93432654193388)]
question:  Why AVG(high_income) WHERE `marital-status` == "Married-AF-spouse" >= AVG(high_income) WHERE `marital-status` == "Married-civ-spouse"?
The noisy group difference is  0.01717194739104899
The 95% confidence interval of the difference is  (-0.2589346986687564, 0.4598422551149035)


In [7]:
# Phase 2 (second question): compare the 'Married-civ-spouse' group against the
# 'Never-married' group of the Phase 1 query. This overwrites `question`, `ci`
# and `point` from the previous Phase 2 cell.
# Disabled earlier variant of the question, kept for reference:
#question = Question.from_group_comparison(groupby_query, 'Prof-school', 'Doctorate')
# Groups are given as 1-tuples of the group-by value.
question = Question.from_group_comparison(groupby_query, ('Married-civ-spouse',), ('Never-married',))
# The session calls run in this order: submit the question, prepare its
# confidence interval, then read back the interval and the point value.
es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()
point = es.phase_2_show_question_point()
print('question: ', question.to_natural_language())
print('The noisy group difference is ', point)
# `gamma` (set in the setup cell) is shown here as a percentage confidence level.
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)

2022-05-07 22:12:54,948 INFO     [image.py:20] answers: [733.5750581061126, 16118.723030353407, 9982.926929253796, 22382.183312771773]
2022-05-07 22:12:54,948 INFO     [image.py:21] sigmas: [3.162277660168379, 3.162277660168379, 3.162277660168379, 3.162277660168379]
2022-05-07 22:12:54,948 INFO     [image.py:22] bounds: [(725.6766198826981, 741.473496329527), (16110.824592129993, 16126.621468576821), (9975.028491030382, 9990.82536747721), (22374.284874548357, 22390.08175099519)]
question:  Why AVG(high_income) WHERE `marital-status` == "Married-civ-spouse" >= AVG(high_income) WHERE `marital-status` == "Never-married"?
The noisy group difference is  0.4005103977536667
The 95% confidence interval of the difference is  (0.39948772348837247, 0.4015329300800539)


In [37]:
# Phase 3: request an explanation for the most recently submitted question and
# print the two tables returned by the session.
k = 5  # passed to phase_3_prepare_explanation; presumably the number of predicates to report
logger.debug(f'Length of predicates is {len(predicates)}')
rho_expl = 2.0  # passed to phase_3_prepare_explanation (looks like a budget parameter; confirm)
es.phase_3_submit_explanation_request()
# Disabled earlier run with a different top-k seed, kept for reference:
# es.phase_3_prepare_explanation(k, rho_expl, split_factor = 0.9, 
#                                random_seed_topk = 12532, random_seed_influci = 1001, random_seed_rankci = 1000)
# Separate fixed seeds are supplied for the top-k, influence-CI and rank-CI steps.
es.phase_3_prepare_explanation(k, rho_expl, split_factor = 0.9, 
                               random_seed_topk = 12527, random_seed_influci = 1001, random_seed_rankci = 1000) 
# Another seed combination that was tried:
# random_seed_topk = 12531, random_seed_influci = 1001, random_seed_rankci = 1000
# Temporarily lift pandas' row/column limits and widen the display so the
# printed tables are not truncated; the options revert when the block exits.
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 
'display.max_colwidth', 1000, 'display.width', 1000):
    t1, t2 = es.phase_3_show_explanation_table()
    print(t1)
    print(t2)

2022-05-25 16:16:37,315 DEBUG    [<ipython-input-37-7a20d1e6b6df>:3] Length of predicates is 103
(`marital-status` == "Never-married")or(`marital-status` == "Married-civ-spouse")
2022-05-25 16:16:37,322 INFO     [influence_function.py:24] Dataset relative to the question has length 38496
2022-05-25 16:16:40,633 DEBUG    [meta_explanation_session.py:169] 103 predicates and their influences & scores have been loaded.
2022-05-25 16:16:40,639 DEBUG    [meta_explanation_session.py:173] 
        influence       score
count  103.000000  103.000000
mean     0.000325    7.266483
std      0.006527  146.071226
min     -0.026839 -600.661181
25%     -0.000606  -13.551499
50%     -0.000017   -0.385618
75%      0.000368    8.235890
max      0.024788  554.765178
2022-05-25 16:16:40,639 DEBUG    [meta_explanation_session.py:213] total rho_expl is 2.0
2022-05-25 16:16:40,639 DEBUG    [meta_explanation_session.py:214] rho_topk is 0.05
2022-05-25 16:16:40,640 DEBUG    [meta_explanation_session.py:215] rho

100%|██████████| 5/5 [00:00<00:00, 30.95it/s]

2022-05-25 16:16:40,806 DEBUG    [meta_explanation_session.py:259] [(-151.0609253835924, 292.42829840830456), (-98.47665861816643, 345.01256517373054), (291.01195252105214, 734.5011763129492), (127.84858892088158, 571.3378127127785), (196.51099024975343, 640.0002140416505)]
2022-05-25 16:16:40,806 INFO     [meta_explanation_session.py:267] computing rank ci



100%|██████████| 5/5 [00:00<00:00, 7758.61it/s]

2022-05-25 16:16:40,808 DEBUG    [meta_explanation_session.py:280] [(1, 87), (1, 51), (1, 5), (1, 14), (1, 8)]
                          predicates Rel Inf 90-CI L Rel Inf 90-CI R  Rnk 95-CI L  Rnk 95-CI R
0         `education` == "Bachelors"           3.25%           8.19%            1            5
1  `occupation` == "Exec-managerial"           2.19%           7.14%            1            8
2                `age` == "(40, 50]"           1.43%           6.37%            1           14
3      `relationship` == "Own-child"          -1.10%           3.85%            1           51
4      `workclass` == "Self-emp-inc"          -1.69%           3.26%            1           87
                          predicates  Inf 95-CI L  Inf 95-CI R  Rnk 95-CI L  Rnk 95-CI R
0         `education` == "Bachelors"   291.011953   734.501176            1            5
1  `occupation` == "Exec-managerial"   196.510990   640.000214            1            8
2                `age` == "(40, 50]"   127.848589   




In [22]:
# Print both explanation tables again with pandas' display limits lifted so
# that no rows, columns or cell contents are truncated.
display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with pd.option_context(*display_options):
    t1, t2 = es.phase_3_show_explanation_table()
    for table in (t1, t2):
        print(table)

                          predicates Rel Inf 90-CI L Rel Inf 90-CI R  Rnk 95-CI L  Rnk 95-CI R
0  `occupation` == "Exec-managerial"           3.33%           8.28%            1            5
1         `education` == "Bachelors"           2.26%           7.21%            1            9
2                `age` == "(40, 50]"           1.99%           6.93%            1            8
3   `occupation` == "Prof-specialty"           0.68%           5.63%            1           18
4                  `race` == "White"          -2.67%           2.27%            1           99
                          predicates  Inf 95-CI L  Inf 95-CI R  Rnk 95-CI L  Rnk 95-CI R
0  `occupation` == "Exec-managerial"   298.366748   741.855972            1            5
1         `education` == "Bachelors"   202.748395   646.237619            1            9
2                `age` == "(40, 50]"   178.066581   621.555805            1            8
3   `occupation` == "Prof-specialty"    60.916650   504.405873            

In [11]:
# Print the session's true top-k result for k = 5, with pandas' display limits
# lifted for the duration of the print.
k = 5
wide_display = pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with wide_display:
    true_top_k = es.phase_3_true_top_k(k)
    print(true_top_k)

Positive Influences:  13
                                topk rel-influence
0  `occupation` == "Exec-managerial"         6.19%
1         `education` == "Bachelors"         6.11%
2                `age` == "(40, 50]"         5.59%
3   `occupation` == "Prof-specialty"         4.84%
4           `education` == "Masters"         2.81%


In [12]:
# For every selected top-k predicate, print its score (no decimals, followed by
# a comma) and then the ratio of its score to its influence.
score_table = es.predicates_with_influences_and_scores
for predicate in es.topk_explanation_predicates:
    entry = score_table[predicate]
    score, influence = entry['score'], entry['influence']
    print(f"{score:.0f},")
    print(score / influence)

555,
22380.0
501,
22380.0
225,
22380.0
434,
22380.0
547,
22380.0


In [13]:
# For every selected top-k predicate, print its influence together with its
# 1-based position in the session's sorted predicate list.
# The list conversion does not depend on the loop variable, so it is built once
# here instead of being rebuilt on every iteration.
ranked_predicates = es.sorted_predicates.tolist()
for p in es.topk_explanation_predicates:
    influ = es.predicates_with_influences_and_scores[p]['influence']
    rank = ranked_predicates.index(p) + 1  # index() is 0-based
    print(influ, rank)

0.024788435119158524 1
0.02239452141248067 3
0.010037808294482172 6
0.019403818136007892 4
0.02445980262099675 2


In [14]:
# Interval bounds to be typeset as LaTeX, one (lower, upper) pair per interval.
x = [(659.4865258199197, 785.0208922430029), (16027.299784401126, 16152.834150824208), (9920.325556829126, 10045.859923252208), (22322.769679528385, 22448.304045951467), (507.9753327456765, 633.5096991687597), (14799.363464777114, 14924.897831200196), (7470.523050135881, 7596.057416558965), (18751.171149870457, 18876.70551629354), (14791.120183116112, 14916.654549539195), (22315.627001837274, 22441.161368260357)]


def format_interval(index, bounds):
    """Return one LaTeX snippet for an interval, with bounds shown without decimals.

    Args:
        index: Subscript to typeset on the interval symbol (1-based in this cell).
        bounds: A (lower, upper) pair of numbers.

    Returns:
        The inline-math snippet followed by a comma and a space.
    """
    lower, upper = bounds
    # Raw f-string: the backslash before "I" must reach LaTeX literally. In a
    # non-raw string it is an invalid escape sequence, which Python warns about.
    return rf'$\I_{{{index}}} = ({lower:.0f}, {upper:.0f})$, '


for i, c in enumerate(x):
    print(format_interval(i + 1, c))

$\I_{1} = (659, 785)$, 
$\I_{2} = (16027, 16153)$, 
$\I_{3} = (9920, 10046)$, 
$\I_{4} = (22323, 22448)$, 
$\I_{5} = (508, 634)$, 
$\I_{6} = (14799, 14925)$, 
$\I_{7} = (7471, 7596)$, 
$\I_{8} = (18751, 18877)$, 
$\I_{9} = (14791, 14917)$, 
$\I_{10} = (22316, 22441)$, 
