In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules before
# each cell runs, so edits to the imported project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
import sys
sys.path.append('../')

import pandas as pd
import pprint

from privex.components.basic import Schema, Dataset, GroupbyQuery, Question
from privex.components.utils import generate_explanation_predicates
from privex.framework.solution import ExplanationSession

import logging
# Grab the root logger and raise its threshold to CRITICAL so that
# lower-severity log messages do not clutter the notebook output.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

In [60]:
# Load the dataset and its schema, then open an explanation session.

df = pd.read_csv('../data/adult.csv')
schema = Schema.from_json('../data/adult.json')
dataset = Dataset(df, schema)

# Confidence level handed to the session. 0.95 is the suggested default and
# it can be changed later.
gamma = 0.95

# Attributes from which the explanation predicates are generated. They are
# hard-coded per dataset for the demo; see the other notebooks
# (ipumscps.ipynb and german-credit.ipynb) for their lists.
attributes = [
    'education',
    'occupation',
    'age',
    'relationship',
    'race',
    'workclass',
    'sex',
    'native-country',
]
predicates = generate_explanation_predicates(
    attributes,
    schema,
    strategy='1-way marginal',
)
es = ExplanationSession(dataset, gamma, predicates)

In [10]:
# Phase 1

# Specify the Group-By query.
# `predicate` holds an optional WHERE clause. For a clause such as
# "WHERE X = a AND Y = b", the original note suggests:
#     predicate = Predicate('X = "a" AND Y == "b"')
# NOTE(review): that example mixes `=` and `==`, and `Predicate` is not among
# the names imported in this notebook -- confirm the expected syntax and add
# the import before using it.
groupby_query = GroupbyQuery(
    agg='AVG',
    attr_agg='high_income',
    attr_group='marital-status',
    predicate=None,
    schema=schema,
)

# Submit the query and get the answer table.
rho_query = 0.1

# This call must happen so the session updates its status and remembers the
# noisy answers.
es.phase_1_submit_query(groupby_query, rho_query)

# The answer table is what the web interface shows. The first return value is
# a dictionary, the second is its DataFrame version.
# NOTE(review): this is a separate call from phase_1_submit_query with the
# same rho_query -- confirm the noisy answers shown here are the ones the
# session stored.
answer_table, answer_table_df = groupby_query.get_answer_table(dataset, rho_query)
print(answer_table_df)

                   group  true_answer  noisy_answer
0          Never-married     0.045480      0.045338
1     Married-civ-spouse     0.446133      0.445963
2                Widowed     0.084321      0.084258
3               Divorced     0.101161      0.101266
4              Separated     0.064706      0.065926
5  Married-spouse-absent     0.092357      0.090783
6      Married-AF-spouse     0.378378      0.408071


In [19]:
# Phase 2

# Prepare the question. In the demo, replace the two group tuples below with
# the choices made on the web interface. The question asks why the first
# group is >= the second group.
first_group = ('Married-civ-spouse',)
second_group = ('Never-married',)
question = Question.from_group_comparison(groupby_query, first_group, second_group)

# Submit the question to the session.
es.phase_2_submit_question(question)

# Re-specify the confidence level.
es.gamma = 0.95

# Generate the confidence interval for the question. The second returned
# value is not used here.
ci, _ = es.phase_2_prepare_question_ci()

# Print the question in natural language, followed by its interval.
question_text = question.to_natural_language()
print('question: ', question_text)
ci_label = f'The {es.gamma*100:.0f}% confidence interval of the difference is '
print(ci_label, ci)

question:  Why AVG(high_income) WHERE `marital-status` == "Married-civ-spouse" >= AVG(high_income) WHERE `marital-status` == "Never-married"?
The 95% confidence interval of the difference is  (0.3994671429809914, 0.4015134317138519)


In [53]:
# Phase 3

# `k` is passed to phase_3_prepare_explanation; presumably the number of
# explanation predicates to report (the table printed afterwards has 5 rows).
k = 5

# Per-step rho parameters for the explanation.
# NOTE(review): 'rho_rank' is far larger than the other two entries --
# confirm this value is intended.
rho_expl = {
    'rho_topk': 0.05,
    'rho_influ': 0.05,
    'rho_rank': 1.9,
}

# Register the explanation request, then compute the explanation.
es.phase_3_submit_explanation_request()
es.phase_3_prepare_explanation(k, rho_expl)

(`marital-status` == "Never-married")or(`marital-status` == "Married-civ-spouse")


100%|████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 38.26it/s]
100%|█████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 19099.74it/s]


In [54]:
# Fetch the explanation table from the session and print it. With
# withTruth=True the printed table also carries the "True Rnk" and
# "True Rel Inf" columns next to the confidence-interval columns.
# The second returned value is discarded.
explanation_table, _ = es.phase_3_show_explanation_table(withTruth=True)
print(explanation_table)

                          predicates Rel Inf 90-CI L Rel Inf 90-CI R  \
0  `occupation` == "Exec-managerial"           5.01%          11.88%   
1   `occupation` == "Prof-specialty"           4.18%          11.05%   
2         `education` == "Bachelors"           4.09%          10.96%   
3                `age` == "(50, 60]"           1.12%           7.99%   
4         `education` == "Preschool"          -5.48%           1.39%   

   Rnk 95-CI L  Rnk 95-CI R  True Rnk True Rel Inf  
0            1            3         1        8.60%  
1            1           29         4        6.73%  
2            1            5         2        8.48%  
3            1          100         9        2.23%  
4            1          102        73       -0.12%  
