In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [44]:
# Read the IPUMS CPS extract into a DataFrame; the path is relative to the
# notebook's working directory.
ipums_csv_path = '../data/ipums/ipumscps.csv'
ipums = pd.read_csv(ipums_csv_path)

In [45]:
# Work on a 20% random sample of the rows. The fixed seed makes the sample
# (and every number derived from it) reproducible across kernel restarts;
# without it each run draws a different subset.
SAMPLE_SEED = 0
df = ipums.sample(frac=0.2, random_state=SAMPLE_SEED)

In [46]:
# Number of rows in the sampled frame.
df.shape[0]

126038

In [4]:
# Per-education-level summary of INCTOT, sorted ascending by the mean.
# Note: the column called `ratio` holds the mean INCTOT of the group, and
# `cnt` holds the number of non-null INCTOT values.
(
    df
    .groupby('EDUC')
    .agg(
        ratio=('INCTOT', 'mean'),
        cnt=('INCTOT', 'count'),
    )
    .sort_values('ratio')
)

Unnamed: 0_level_0,ratio,cnt
EDUC,Unnamed: 1_level_1,Unnamed: 2_level_1
Grade 10,16001.63483,14585
Grade 11,16305.004841,17764
Grade 9,16900.903476,12111
None or preschool,17985.533937,1768
Grades 7 or 8,19388.422854,11044
"Grades 1, 2, 3, or 4",20226.544358,3979
Grades 5 or 6,22263.893708,8392
"12th grade, no diploma",23132.302223,9043
High school diploma or equivalent,32399.334546,175438
Some college but no degree,35568.332165,114380


In [47]:
# Append the parent directory to the module search path; presumably this is
# where the `privex` package lives -- confirm against the repository layout.
import sys
sys.path.append('../')

# NOTE(review): pandas is already imported in an earlier cell; this repeat is
# harmless. `pprint` is not used in any cell visible here.
import pandas as pd
import pprint

from privex.components.basic import Schema, Dataset, GroupbyQuery, Question
from privex.components.utils import generate_explanation_predicates
from privex.framework.solution import ExplanationSession

# Logger for this notebook; the Phase 3 cell emits a debug message through it.
import logging
logger = logging.getLogger(__name__)

In [53]:
# (Re)load the full IPUMS CPS extract that the explanation session is built on.
ipums = pd.read_csv(filepath_or_buffer='../data/ipums/ipumscps.csv')

In [70]:
# Build the explanation session over the full dataset.
#
# frac=1 keeps every row but shuffles the order; the fixed seed makes that
# order identical on every run so results can be reproduced after a kernel
# restart.
SHUFFLE_SEED = 0
df = ipums.sample(frac=1, random_state=SHUFFLE_SEED)
print(len(df))

schema = Schema.from_json('../data/ipums/ipumscps.json')
dataset = Dataset(df, schema)

# Confidence level handed to the session; later cells print it as a percentage
# when reporting confidence intervals.
gamma = 0.95

# Attributes from which candidate explanation predicates are generated.
# 'OCC' was disabled in this list and is intentionally not included here.
attributes = ['RELATE', 'SEX', 'RACE', 'CITIZEN', 'CLASSWKR', 'EDUC']
predicates = generate_explanation_predicates(attributes, schema, strategy='1-way marginal')
# Possible extension (previously left commented out): also append the
# predicates generated with strategy='2-way marginal'.

es = ExplanationSession(dataset, gamma, predicates)

630191


In [71]:
# Phase 1: submit a group-by query to the session and compare the answers it
# reports with the exact per-group means computed directly from `df`.
groupby_query = GroupbyQuery(
    agg='AVG',
    attr_agg='INCTOT',
    predicate=None,
    attr_group='MARST',
    schema=schema,
)
rho_query = 0.001  # rho value submitted together with the query
es.phase_1_submit_query(groupby_query, rho_query)
print(f'submitted queries with rho = {rho_query}')

# Each `group` entry is a sequence; keep only its first element so it can be
# joined against the plain MARST values. `assign` builds a new frame instead of
# writing into the one returned by the session, so re-running this cell stays
# safe even if the session hands back the same object each time.
nr = es.phase_1_show_query_results()
nr = nr.assign(group=nr['group'].apply(lambda row: row[0]))

# Exact (non-private) per-group mean, keyed by the same `group` column.
gt = (
    df.groupby('MARST')
    .agg(answer=('INCTOT', 'mean'))
    .reset_index()
    .rename(columns={'MARST': 'group'})
)

# Both frames carry an `answer` column, so the merge suffixes them with _x/_y;
# rename them to make clear which one is the session's answer and which the truth.
comparison = (
    nr.merge(gt, on='group')
    .rename(columns={'answer_x': 'answer', 'answer_y': 'truth (hidden)'})
    .sort_values('truth (hidden)')
)
print(comparison)

submiited queries with rho = 0.001
                     group        answer  truth (hidden)
1     Never married/single  29620.368259    29787.120008
2                  Widowed  30777.590323    32017.555066
0                Separated  28600.382862    34539.103627
3   Married, spouse absent  36531.384853    42418.966090
4                 Divorced  43741.552503    43961.069032
5  Married, spouse present  54630.673945    54584.651732


In [72]:
# Phase 2: ask the session to compare two MARST groups from the Phase 1 query
# and report a confidence interval for the difference.
# NOTE(review): in the recorded run the interval is entirely negative, i.e. the
# ">=" premise printed by to_natural_language() does not hold for this pair --
# confirm whether the two groups should be swapped.
lhs_group = ('Married, spouse absent',)
rhs_group = ('Divorced',)
question = Question.from_group_comparison(groupby_query, lhs_group, rhs_group)

es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()

print('question: ', question.to_natural_language())
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)

question:  Why AVG(INCTOT) WHERE `MARST` == "Married, spouse absent" >= AVG(INCTOT) WHERE `MARST` == "Divorced"?
The 95% confidence interval of the difference is  (-7578.852479099871, -6836.038218264461)


In [73]:
# Phase 2 (second question): compare 'Married, spouse present' against
# 'Never married/single'. This replaces the question previously submitted to
# the session, so Phase 3 explains this comparison.
compared_groups = (
    ('Married, spouse present',),
    ('Never married/single',),
)
question = Question.from_group_comparison(groupby_query, *compared_groups)

es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()

print('question: ', question.to_natural_language())
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)

question:  Why AVG(INCTOT) WHERE `MARST` == "Married, spouse present" >= AVG(INCTOT) WHERE `MARST` == "Never married/single"?
The 95% confidence interval of the difference is  (24983.548844319983, 25037.055146973882)


In [75]:
# Phase 3: request the top-k explanation predicates for the current question
# and print both explanation tables without pandas truncating them.
k = 5           # number of explanation predicates to request
rho_expl = 1.0  # rho value passed to the explanation step
logger.debug(f'Length of predicates is {len(predicates)}')

es.phase_3_submit_explanation_request()
es.phase_3_prepare_explanation(k, rho_expl)

# Temporarily lift the row/column/width limits so wide predicate strings and
# all columns are shown in full.
wide_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with pd.option_context(*wide_display):
    t1, t2 = es.phase_3_show_explanation_table()
    for table in (t1, t2):
        print(table)

2022-04-10 17:18:45,811 DEBUG    [<ipython-input-75-7c846f656983>:3] Length of predicates is 111
(`MARST` == "Married, spouse present")or(`MARST` == "Never married/single")
2022-04-10 17:18:45,858 INFO     [influence_function.py:24] Dataset relative to the question has length 505962
2022-04-10 17:19:09,557 DEBUG    [meta_explanation_session.py:143] 111 predicates and their influences & scores have been loaded.
2022-04-10 17:19:09,563 DEBUG    [meta_explanation_session.py:147] 
         influence         score
count   111.000000  1.110000e+02
mean     -3.871889 -1.313697e+06
std     570.547317  1.935816e+08
min   -2918.057612 -9.900707e+08
25%     -13.020279 -4.417664e+06
50%       0.000000  0.000000e+00
75%       0.320958  1.088980e+05
max    3058.268787  1.037643e+09
2022-04-10 17:19:09,564 DEBUG    [meta_explanation_session.py:185] total rho_expl is 1.0
2022-04-10 17:19:09,564 DEBUG    [meta_explanation_session.py:186] rho_topk is 0.025
2022-04-10 17:19:09,565 DEBUG    [meta_explanat

100%|██████████| 5/5 [00:02<00:00,  2.08it/s]

2022-04-10 17:19:11,966 DEBUG    [meta_explanation_session.py:214] [(3153.592411479239, 3201.3320447693936), (-50.41319712384169, 8.878031903992582), (2613.184699378917, 2666.957601751099), (469.5574897572796, 529.0749306203855), (-129.89631784565466, -71.4531837415002)]
2022-04-10 17:19:11,967 INFO     [meta_explanation_session.py:217] computing rank ci



100%|██████████| 5/5 [00:00<00:00, 7587.38it/s]

2022-04-10 17:19:11,969 DEBUG    [meta_explanation_session.py:229] [(1, 3), (1, 108), (1, 3), (1, 106), (1, 108)]
                                     predicates Rel Inf 90-CI L Rel Inf 90-CI R  Rnk 95-CI L  Rnk 95-CI R
0                               `SEX` == "Male"          12.61%          12.80%            1            3
1                           `RELATE` == "Child"          10.45%          10.66%            1            3
2   `CLASSWKR` == "Self-employed, incorporated"           1.88%           2.12%            1          106
3                 `EDUC` == "1 year of college"          -0.20%           0.04%            1          108
4  `RELATE` == "Opposite sex unmarried partner"          -0.52%          -0.29%            1          108
                                     predicates  Inf 95-CI L  Inf 95-CI R  Rnk 95-CI L  Rnk 95-CI R
0                               `SEX` == "Male"  3153.592411  3201.332045            1            3
1                           `RELATE` == "Child"  2




In [None]:
# Originally, phase 3 took about 4 minutes to finish.