In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [77]:
import pandas as pd

In [78]:
# Load the IPUMS CPS extract into memory as a DataFrame.
ipums = pd.read_csv(filepath_or_buffer='../data/ipums/ipumscps.csv')

In [79]:
# Work on a 20% random subsample for quick exploration. The draw is seeded so
# that re-running the notebook yields the same rows (and the same statistics
# in the cells below) every time.
df = ipums.sample(frac=0.2, random_state=152636)

In [46]:
# Number of rows in the current working frame.
df.shape[0]

126038

In [4]:
# Per education level: mean of INCTOT (the column is labelled 'ratio') and the
# count of INCTOT values ('cnt'), ordered from lowest to highest mean.
(
    df.groupby('EDUC')['INCTOT']
    .agg(ratio='mean', cnt='count')
    .sort_values('ratio')
)

Unnamed: 0_level_0,ratio,cnt
EDUC,Unnamed: 1_level_1,Unnamed: 2_level_1
Grade 10,16001.63483,14585
Grade 11,16305.004841,17764
Grade 9,16900.903476,12111
None or preschool,17985.533937,1768
Grades 7 or 8,19388.422854,11044
"Grades 1, 2, 3, or 4",20226.544358,3979
Grades 5 or 6,22263.893708,8392
"12th grade, no diploma",23132.302223,9043
High school diploma or equivalent,32399.334546,175438
Some college but no degree,35568.332165,114380


In [2]:
# Add the parent directory to the module search path before the project
# imports below (presumably that is where the `privex` package lives — confirm).
import sys
sys.path.append('../')

import pandas as pd
import pprint

from privex.components.basic import Schema, Dataset, GroupbyQuery, Question
from privex.components.utils import generate_explanation_predicates
from privex.framework.solution import ExplanationSession

# Logger used by the cells below for their info/debug messages.
import logging
logger = logging.getLogger(__name__)

In [3]:
# Read the IPUMS CPS extract that the explanation session is built from.
ipums_csv = '../data/ipums/ipumscps.csv'
ipums = pd.read_csv(ipums_csv)

In [7]:
# One seed drives both the row shuffle and the explanation session, so a
# Restart & Run All reproduces the same row order and the same session inputs.
random_seed = 152636

# frac=1 keeps every row and only shuffles their order.
df = ipums.sample(frac=1, random_state=random_seed)
print(len(df))
schema = Schema.from_json('../data/ipums/ipumscps.json')
dataset = Dataset(df, schema)
gamma = 0.95  # confidence level of the intervals reported in the later phases
attributes = ['RELATE', 'SEX', 'RACE', 'CITIZEN', 'CLASSWKR', 'EDUC']#, 'OCC']
# Candidate explanation predicates are generated from the attributes above.
predicates = generate_explanation_predicates(attributes, schema, strategy='1-way marginal')
#predicates = predicates[:10]
#predicates += generate_explanation_predicates(attributes, schema, strategy='2-way marginal')
es = ExplanationSession(dataset, gamma, predicates, random_seed=random_seed)

1146552
2022-04-18 15:54:30,495 INFO     [meta_explanation_session.py:23] 111 predicates for the explanation.


In [8]:
# Phase 1
# Column names are defined once so the query submitted to the session and the
# ground-truth check below cannot drift apart.
group_attr = 'SEX'
agg_attr = 'INCTOT'
groupby_query = GroupbyQuery(
    agg = 'AVG',
    attr_agg = agg_attr,
    predicate = None,
    attr_group = group_attr,
    schema = schema
)
rho_query = 0.1
es.phase_1_submit_query(groupby_query, rho_query)
print(f'submiited queries with rho = {rho_query}')

# The 'group' entries are indexable keys (the Phase 2 cell passes 1-tuples such
# as ('Male',)); keep only their first element. `assign` builds a new frame
# rather than writing into the object returned by the session, so re-running
# this cell can never unwrap an already-unwrapped value.
nr = es.phase_1_show_query_results()
nr = nr.assign(group=nr['group'].apply(lambda key: key[0]))

# Ground-truth per-group mean computed directly on the raw data, for comparison
# against the answers returned by the session.
gt = (
    df.groupby(group_attr)
    .agg(answer=(agg_attr, 'mean'))
    .reset_index()
    .rename(columns={group_attr: 'group'})
)
print(
    nr.merge(gt, on='group')
    .rename(columns={'answer_x': 'answer', 'answer_y': 'truth (hidden)'})
    .sort_values('truth (hidden)')
)

submiited queries with rho = 0.1
    group        answer  truth (hidden)
0  Female  31135.246992    31135.778443
1    Male  45778.457476    45778.392514


In [9]:
# Phase 2
# Submit a comparison question between two groups of the Phase 1 query and
# report a confidence interval for the difference.
# Alternative questions that were tried (kept for reference):
#question = Question.from_group_comparison(groupby_query, 'Prof-school', 'Doctorate')
# question = Question.from_group_comparison(groupby_query, ('Married, spouse absent',), ('Divorced',))
question = Question.from_group_comparison(groupby_query, ('Male',), ('Female',))
# The session calls below are issued in this fixed order: submit, prepare, show.
es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()
print('question: ', question.to_natural_language())
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)
logger.info(f'question_point: {es.question_point}')
# Reference interval requested with the same rho_query and gamma as above.
groundtruth_ci = es.phase_2_ground_truth_ci(rho_query, gamma)
# Evaluate the question directly on the session's dataset and log the value.
logger.info(f'true question: {es.question.evaluation(es.dataset)}')
print(f'The {gamma*100:.0f}% confidence interval of ground truth is ', groundtruth_ci)

question:  Why AVG(INCTOT) WHERE `SEX` == "Male" >= AVG(INCTOT) WHERE `SEX` == "Female"?
The 95% confidence interval of the difference is  (14636.627916155696, 14649.79306911513)
2022-04-18 15:54:33,375 INFO     [<ipython-input-9-6b6148cc5e01>:10] question_point: 14643.210484454969
2022-04-18 15:54:33,691 INFO     [<ipython-input-9-6b6148cc5e01>:12] true question: 14642.614070936666
The 95% confidence interval of ground truth is  (14639.494036260458, 14645.74503543867)


In [10]:
# Phase 3
# Request an explanation with k predicates and print the two resulting tables.
k = 5
logger.debug(f'Length of predicates is {len(predicates)}')
es.phase_3_submit_explanation_request()

# Per-step rho values handed to the explanation preparation.
rho_expl = {'rho_topk': 0.5, 'rho_influ': 0.5, 'rho_rank': 1.0}
es.phase_3_prepare_explanation(k, rho_expl, split_factor=0.9)

# Temporarily lift pandas' display limits so the tables print untruncated.
wide_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with pd.option_context(*wide_display):
    t1, t2 = es.phase_3_show_explanation_table()
    for table in (t1, t2):
        print(table)

2022-04-18 15:54:40,434 DEBUG    [<ipython-input-10-716482da31c9>:3] Length of predicates is 111
(`SEX` == "Male")or(`SEX` == "Female")
2022-04-18 15:54:40,539 INFO     [influence_function.py:24] Dataset relative to the question has length 1146552
2022-04-18 15:55:39,805 DEBUG    [meta_explanation_session.py:155] 111 predicates and their influences & scores have been loaded.
2022-04-18 15:55:39,810 DEBUG    [meta_explanation_session.py:159] 
         influence         score
count   111.000000  1.110000e+02
mean      6.589972  3.890212e+06
std     311.344038  1.837935e+08
min   -1738.293903 -1.026155e+09
25%      -9.949549 -5.873448e+06
50%       0.000000  0.000000e+00
75%       0.000000  0.000000e+00
max    1712.471559  1.010911e+09
2022-04-18 15:55:39,811 DEBUG    [meta_explanation_session.py:208] total rho_expl is 2.0
2022-04-18 15:55:39,811 DEBUG    [meta_explanation_session.py:209] rho_topk is 0.5
2022-04-18 15:55:39,811 DEBUG    [meta_explanation_session.py:210] rho_ci is 0.5
2022

100%|██████████| 5/5 [00:04<00:00,  1.18it/s]

2022-04-18 15:55:44,56 DEBUG    [meta_explanation_session.py:237] [(1682.1324637031005, 1725.8124311163579), (977.5989753765735, 1034.1634492262472), (886.4305297554581, 922.2874998001224), (807.2432055645846, 863.3515472196193), (562.2852727038127, 616.3155294968363)]
2022-04-18 15:55:44,56 INFO     [meta_explanation_session.py:240] computing rank ci



100%|██████████| 5/5 [00:00<00:00, 3892.26it/s]

2022-04-18 15:55:44,59 DEBUG    [meta_explanation_session.py:253] [(1, 1), (2, 3), (2, 5), (2, 5), (2, 6)]
                       predicates Rel Inf 90-CI L Rel Inf 90-CI R  Rnk 95-CI L  Rnk 95-CI R
0  `RELATE` == "Head/householder"          11.49%          11.79%            1            1
1   `EDUC` == "Bachelor's degree"           6.68%           7.06%            2            3
2               `RACE` == "White"           6.05%           6.30%            2            5
3            `RELATE` == "Spouse"           5.51%           5.90%            2            5
4             `CLASSWKR` == "NIU"           3.84%           4.21%            2            6
                       predicates  Inf 95-CI L  Inf 95-CI R  Rnk 95-CI L  Rnk 95-CI R
0  `RELATE` == "Head/householder"  1682.132464  1725.812431            1            1
1   `EDUC` == "Bachelor's degree"   977.598975  1034.163449            2            3
2               `RACE` == "White"   886.430530   922.287500            2           




In [11]:
def get_influence_at_rank_i(es, i):
    """Return the influence of the predicate at rank ``i`` (1-based).

    Args:
        es: Session object exposing ``sorted_predicates`` (a sequence of
            predicates, presumably ordered best-first — confirm) and
            ``predicates_with_influences`` (a mapping keyed by predicate).
        i: 1-based rank of the predicate to look up.

    Returns:
        The value stored in ``es.predicates_with_influences`` for the
        predicate at position ``i - 1`` of ``es.sorted_predicates``.

    Raises:
        IndexError: If ``i`` is smaller than 1 or larger than the number of
            sorted predicates.
    """
    # Without this guard, i <= 0 would become a negative index and silently
    # return a predicate from the end of the ranking.
    if i < 1:
        raise IndexError(f'rank must be >= 1, got {i}')
    predicate = es.sorted_predicates[i - 1]
    return es.predicates_with_influences[predicate]

def get_score_at_rank_i(es, i):
    """Return the score of the predicate at rank ``i`` (1-based).

    Args:
        es: Session object exposing ``sorted_predicates`` (a sequence of
            predicates, presumably ordered best-first — confirm) and
            ``predicates_with_scores`` (a mapping keyed by predicate).
        i: 1-based rank of the predicate to look up.

    Returns:
        The value stored in ``es.predicates_with_scores`` for the predicate
        at position ``i - 1`` of ``es.sorted_predicates``.

    Raises:
        IndexError: If ``i`` is smaller than 1 or larger than the number of
            sorted predicates.
    """
    # Without this guard, i <= 0 would become a negative index and silently
    # return a predicate from the end of the ranking.
    if i < 1:
        raise IndexError(f'rank must be >= 1, got {i}')
    predicate = es.sorted_predicates[i - 1]
    return es.predicates_with_scores[predicate]

In [12]:
# Look up the influence at rank k, then log the influences and scores of
# ranks 1..k as read from the session.
ranks = range(1, k + 1)
true_kth_influence = get_influence_at_rank_i(es, k)

top_influences = [get_influence_at_rank_i(es, rank) for rank in ranks]
logger.info(f'topk true top_k influence: {top_influences}')

top_scores = [get_score_at_rank_i(es, rank) for rank in ranks]
logger.info(f'topk true top_k score: {top_scores}')

2022-04-18 15:55:44,94 INFO     [<ipython-input-12-8a65cf92c78b>:2] topk true top_k influence: [1712.4715591138315, 1009.6045667697219, 902.5574820521066, 829.6254586592688, 582.3596862862028]
2022-04-18 15:55:44,95 INFO     [<ipython-input-12-8a65cf92c78b>:3] topk true top_k score: [1010911348.1907544, 595992796.6692026, 532800440.4774457, 489746989.63211554, 343780317.0875301]


In [13]:
# Print the session's true top-k table with pandas' display limits lifted so
# nothing is truncated.
k = 5
wide_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with pd.option_context(*wide_display):
    true_top_k = es.phase_3_true_top_k(k)
    print(true_top_k)

Positive Influences:  9
                             topk rel-influence
0  `RELATE` == "Head/householder"        11.69%
1   `EDUC` == "Bachelor's degree"         6.89%
2               `RACE` == "White"         6.16%
3            `RELATE` == "Spouse"         5.67%
4             `CLASSWKR` == "NIU"         3.98%
