In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
# Load the CSV at ../data/ipums/ipumscps.csv into the `ipums` DataFrame.
ipums = pd.read_csv('../data/ipums/ipumscps.csv')

In [4]:
# Draw a 20% random sample for quick exploration. The sample is seeded so the
# summary table computed from it is the same on every Restart & Run All.
df = ipums.sample(frac=0.2, random_state=152636)

In [5]:
# Number of rows in the sampled frame `df`.
len(df)

229310

In [6]:
# Summarise INCTOT per EDUC level: `ratio` holds the group mean (despite its
# name) and `cnt` the number of non-null INCTOT values; rows are ordered by
# the mean, ascending.
income_by_educ = df.groupby('EDUC')['INCTOT'].agg(ratio='mean', cnt='count')
income_by_educ.sort_values('ratio')

Unnamed: 0_level_0,ratio,cnt
EDUC,Unnamed: 1_level_1,Unnamed: 2_level_1
Grade 10,14634.251852,5535
Grade 11,15313.941323,6817
Grade 9,15563.616831,4575
None or preschool,16940.367898,704
Grades 7 or 8,17849.437004,4286
"Grades 1, 2, 3, or 4",18830.79975,1603
Grades 5 or 6,20263.133229,3205
"12th grade, no diploma",21475.321429,3304
High school diploma or equivalent,29897.164417,65425
Some college but no degree,32426.085508,43025


In [7]:
import sys
# Add the parent directory to the import path so the `privex` package imported
# below can be resolved (presumably the repository root — TODO confirm).
sys.path.append('../')

# NOTE(review): pandas is already imported in an earlier cell; this re-import
# is redundant but harmless.
import pandas as pd
import pprint

from privex.components.basic import Schema, Dataset, GroupbyQuery, Question
from privex.components.utils import generate_explanation_predicates
from privex.framework.solution import ExplanationSession

import logging
# Logger used by the cells below for their info/debug messages.
logger = logging.getLogger(__name__)

In [8]:
# NOTE(review): this is the same path an earlier cell already loaded, so the
# file is read a second time and `ipums` is rebound to the fresh copy.
ipums = pd.read_csv('../data/ipums/ipumscps.csv')

In [9]:
# One seed shared by the row shuffle and the explanation session, so that the
# data order handed to the session is repeatable across kernel restarts.
SEED = 152636

# frac=1 keeps every row and only shuffles their order; without random_state
# the order would differ on every run.
df = ipums.sample(frac=1, random_state=SEED)
print(len(df))
schema = Schema.from_json('../data/ipums/ipumscps.json')
dataset = Dataset(df, schema)
# Confidence level; reported as a percentage when the question CI is printed.
gamma = 0.95
# Attributes from which candidate explanation predicates are generated.
attributes = ['RELATE', 'SEX', 'RACE', 'CITIZEN', 'CLASSWKR', 'EDUC']#, 'OCC']
predicates = generate_explanation_predicates(attributes, schema, strategy='1-way marginal')
#predicates = predicates[:10]
#predicates += generate_explanation_predicates(attributes, schema, strategy='2-way marginal')
es = ExplanationSession(dataset, gamma, predicates, random_seed=SEED)

1146552
2022-05-27 16:32:34,540 INFO     [meta_explanation_session.py:25] 111 predicates for the explanation.


In [10]:
# Phase 1
# Submit a group-by AVG query to the session and print its (noisy) answers
# next to the exact per-group means computed directly from `df`.

# Name the grouping / aggregated attributes once so the query and the
# ground-truth check below cannot silently drift apart.
group_attr = 'SEX'
agg_attr = 'INCTOT'

groupby_query = GroupbyQuery(
    agg = 'AVG',
    attr_agg = agg_attr,
    predicate = None,
    attr_group = group_attr,
    schema = schema
)
rho_query = 0.1
es.phase_1_submit_query(groupby_query, rho_query)
print(f'submitted queries with rho = {rho_query}')

# Each `group` entry is indexable and only its first element is kept
# (presumably a 1-tuple of group values — confirm against the session API).
# `assign` builds a new frame instead of writing into the object returned by
# the session, so re-running this cell cannot unwrap the keys twice.
nr = es.phase_1_show_query_results()
nr = nr.assign(group=nr['group'].map(lambda group_key: group_key[0]))

# Exact per-group mean, shaped like `nr` (columns: group, answer).
gt = (
    df.groupby(group_attr)
      .agg(answer=(agg_attr, 'mean'))
      .reset_index()
      .rename(columns={group_attr: 'group'})
)

# Both frames carry an `answer` column, so the merge suffixes them _x/_y.
comparison = (
    nr.merge(gt, on='group')
      .rename(columns={'answer_x': 'answer', 'answer_y': 'truth (hidden)'})
      .sort_values('truth (hidden)')
)
print(comparison)

2022-05-27 16:32:34,561 INFO     [utils.py:141] NumExpr defaulting to 8 threads.
submiited queries with rho = 0.1
    group        answer  truth (hidden)
0  Female  31135.246992    31135.778443
1    Male  45778.457476    45778.392514


In [12]:
# Phase 2
# Build a comparison question between two groups of the Phase 1 query, submit
# it to the session, and print the confidence interval the session reports for
# the difference. The session calls are stateful and are issued in the order
# submit -> prepare CI -> show CI.
# Alternative group comparisons tried earlier (left disabled):
#question = Question.from_group_comparison(groupby_query, 'Prof-school', 'Doctorate')
# question = Question.from_group_comparison(groupby_query, ('Married, spouse absent',), ('Divorced',))
# Groups are passed as 1-tuples of group values.
question = Question.from_group_comparison(groupby_query, ('Male',), ('Female',))
es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()
print('question: ', question.to_natural_language())
# gamma is a fraction (0.95), hence the *100 to print it as a percentage.
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)
logger.info(f'question_point: {es.question_point}')
# groundtruth_ci = es.phase_2_ground_truth_ci(rho_query, gamma)
# Evaluates the question directly on the session's dataset; logged as the
# "true" value for comparison (presumably the un-noised answer — confirm).
logger.info(f'true question: {es.question.evaluation(es.dataset)}')
# print(f'The {gamma*100:.0f}% confidence interval of ground truth is ', groundtruth_ci)

2022-05-27 16:33:30,156 INFO     [image.py:20] answers: [25463430279.621223, 556231.7230303534, 18379920387.850758, 590325.1833127717]
2022-05-27 16:33:30,157 INFO     [image.py:21] sigmas: [632455.5320336758, 3.162277660168379, 632455.5320336758, 3.162277660168379]
2022-05-27 16:33:30,157 INFO     [image.py:22] bounds: [(25461850591.97654, 25465009967.265907), (556223.82459213, 556239.6214685768), (18378340700.206074, 18381500075.49544), (590317.2848745483, 590333.0817509951)]
question:  Why AVG(INCTOT) WHERE `SEX` == "Male" >= AVG(INCTOT) WHERE `SEX` == "Female"?
The 95% confidence interval of the difference is  (14636.627916155696, 14649.79306911513)
2022-05-27 16:33:30,161 INFO     [<ipython-input-12-110d792e710a>:10] question_point: 14643.210484454969
2022-05-27 16:33:30,344 INFO     [<ipython-input-12-110d792e710a>:12] true question: 14642.614070936666


In [13]:
# Phase 3
# Request a top-k explanation for the submitted question and print the two
# tables the session returns, with pandas truncation switched off.
k = 5
logger.debug(f'Length of predicates is {len(predicates)}')

es.phase_3_submit_explanation_request()

# Budget handed to the session for the top-k, influence and rank steps.
privacy_budget = {'rho_topk': 0.5, 'rho_influ': 0.5, 'rho_rank': 1.0}
es.phase_3_prepare_explanation(k, privacy_budget, split_factor=0.9)

# Display options applied only while printing, so long predicate strings and
# wide tables are shown in full.
untruncated_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
)
with pd.option_context(*untruncated_display):
    t1, t2 = es.phase_3_show_explanation_table()
    print(t1, t2, sep='\n')

2022-05-27 16:33:30,665 DEBUG    [<ipython-input-13-716482da31c9>:3] Length of predicates is 111
(`SEX` == "Male")or(`SEX` == "Female")
2022-05-27 16:33:30,779 INFO     [influence_function.py:24] Dataset relative to the question has length 1146552
2022-05-27 16:34:37,365 DEBUG    [meta_explanation_session.py:169] 111 predicates and their influences & scores have been loaded.
2022-05-27 16:34:37,371 DEBUG    [meta_explanation_session.py:173] 
         influence         score
count   111.000000  1.110000e+02
mean      6.589972  3.890212e+06
std     311.344038  1.837935e+08
min   -1738.293903 -1.026155e+09
25%      -9.949549 -5.873448e+06
50%       0.000000  0.000000e+00
75%       0.000000  0.000000e+00
max    1712.471559  1.010911e+09
2022-05-27 16:34:37,372 DEBUG    [meta_explanation_session.py:213] total rho_expl is 2.0
2022-05-27 16:34:37,372 DEBUG    [meta_explanation_session.py:214] rho_topk is 0.5
2022-05-27 16:34:37,372 DEBUG    [meta_explanation_session.py:215] rho_ci is 0.5
2022

100%|██████████| 5/5 [00:02<00:00,  1.97it/s]

2022-05-27 16:34:39,923 DEBUG    [meta_explanation_session.py:259] [(991713473.9750044, 1019762195.2734551), (578555425.614609, 606604146.9130597), (521961769.2557782, 550010490.5542288), (464260608.63082576, 492309329.92927635), (312118632.44994235, 340167353.74839294)]
2022-05-27 16:34:39,924 INFO     [meta_explanation_session.py:267] computing rank ci



100%|██████████| 5/5 [00:00<00:00, 3870.71it/s]

2022-05-27 16:34:39,926 DEBUG    [meta_explanation_session.py:280] [(1, 1), (2, 3), (2, 5), (2, 5), (2, 6)]
                                    predicates Rel Inf 90-CI L Rel Inf 90-CI R  Rnk 95-CI L  Rnk 95-CI R
0               `RELATE` == "Head/householder"          12.18%          12.52%            1            1
1                `EDUC` == "Bachelor's degree"           7.10%           7.45%            2            3
2                            `RACE` == "White"           6.41%           6.75%            2            5
3                         `RELATE` == "Spouse"           5.70%           6.04%            2            5
4  `CLASSWKR` == "Self-employed, incorporated"           3.83%           4.18%            2            6
                                    predicates   Inf 95-CI L   Inf 95-CI R  Rnk 95-CI L  Rnk 95-CI R
0               `RELATE` == "Head/householder"  9.917135e+08  1.019762e+09            1            1
1                `EDUC` == "Bachelor's degree"  5.785554e+08




In [14]:
def get_influence_at_rank_i(es, i):
    """Return the influence of the predicate at 1-based rank ``i``.

    Args:
        es: Object exposing ``sorted_predicates`` (an indexable sequence of
            predicates) and ``predicates_with_influences`` (a mapping keyed
            by those predicates).
        i: 1-based rank; ``i == 1`` selects ``es.sorted_predicates[0]``.

    Returns:
        The entry of ``es.predicates_with_influences`` for that predicate.

    Raises:
        IndexError: If ``i`` is smaller than 1. Without this check ``i - 1``
            would become a negative index and silently return a value from
            the end of the ranking. Ranks past the end still raise whatever
            the underlying sequence raises.
    """
    if i < 1:
        raise IndexError(f'rank must be >= 1, got {i}')
    predicate = es.sorted_predicates[i - 1]
    return es.predicates_with_influences[predicate]

def get_score_at_rank_i(es, i):
    """Return the score of the predicate at 1-based rank ``i``.

    Args:
        es: Object exposing ``sorted_predicates`` (an indexable sequence of
            predicates) and ``predicates_with_scores`` (a mapping keyed by
            those predicates).
        i: 1-based rank; ``i == 1`` selects ``es.sorted_predicates[0]``.

    Returns:
        The entry of ``es.predicates_with_scores`` for that predicate.

    Raises:
        IndexError: If ``i`` is smaller than 1. Without this check ``i - 1``
            would become a negative index and silently return a value from
            the end of the ranking. Ranks past the end still raise whatever
            the underlying sequence raises.
    """
    if i < 1:
        raise IndexError(f'rank must be >= 1, got {i}')
    predicate = es.sorted_predicates[i - 1]
    return es.predicates_with_scores[predicate]

In [15]:
# Influence of the predicate sitting exactly at rank k.
true_kth_influence = get_influence_at_rank_i(es, k)

# Collect the influence and score for ranks 1..k, then log both lists.
top_k_ranks = range(1, k + 1)
top_k_influences = [get_influence_at_rank_i(es, rank) for rank in top_k_ranks]
top_k_scores = [get_score_at_rank_i(es, rank) for rank in top_k_ranks]
logger.info(f'topk true top_k influence: {top_k_influences}')
logger.info(f'topk true top_k score: {top_k_scores}')

2022-05-27 16:34:39,967 INFO     [<ipython-input-15-8a65cf92c78b>:2] topk true top_k influence: [1712.4715591138315, 1009.6045667697219, 902.5574820521066, 829.6254586592688, 582.3596862862028]
2022-05-27 16:34:39,968 INFO     [<ipython-input-15-8a65cf92c78b>:3] topk true top_k score: [1010911348.1907544, 595992796.6692026, 532800440.4774457, 489746989.63211554, 343780317.0875301]


In [16]:
k = 5

# Print the session's true top-k table with pandas truncation disabled for
# the duration of the `with` block only.
full_width_options = [
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', 1000,
]
with pd.option_context(*full_width_options):
    true_top_k = es.phase_3_true_top_k(k)
    print(true_top_k)

Positive Influences:  9
                             topk rel-influence
0  `RELATE` == "Head/householder"        12.41%
1   `EDUC` == "Bachelor's degree"         7.32%
2               `RACE` == "White"         6.54%
3            `RELATE` == "Spouse"         6.01%
4             `CLASSWKR` == "NIU"         4.22%
