In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported modules
# are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from IPython.display import display, Markdown, Latex, HTML

In [3]:
import sys
# Add the current directory to the module search path; this has to happen before the
# `privex` imports below (presumably `privex` is a local package next to this notebook).
sys.path.append('./')

import pandas as pd
import pprint

from privex.basic import Schema, Query, Question
from privex.basic import generate_explanation_predicates
from privex.explanation_session import ExplanationSession

import logging
# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)
# Set the root logger's threshold to INFO so INFO-level records are not filtered out.
logging.getLogger().setLevel(logging.INFO)

# Motivate the suppression of small groups of people: they are sensitive and ...

# Some changes to the system ...
## .... (and they will be introduced through the experiment)
1. Limit queries to AVG queries (How about SUM and COUNT?)
2. Limit questions to comparison
3. Change impact function to normalized difference
4. Change rank bound to confidence interval of rank
5. Add a reference predicate to the explanation table

# Load dataset and schema

In [25]:
# Read the dataset, report its size, then load the matching schema description.
df = pd.read_csv('./data/adult.csv')
print(len(df))  # number of rows (null stuff)
schema = Schema.from_json('./data/adult.json')

# Preview of the raw table.
display(Markdown('## adult (first two rows)'))
display(df.head(2))

# Show the schema domain of one attribute with listed values and one with numeric bounds.
display(Markdown('## schema (attribute "education" and "high_income")'))
for attr_name in ('education', 'high_income'):
    pprint.pprint(schema.domains[attr_name].as_dict(), compact=True)

48842


## adult (first two rows)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class,high_income
0,"(20, 30]",Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K,0
1,"(30, 40]",Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K,0


## schema (attribute "education" and "high_income")

{'attr': 'education',
 'vals': ['11th', 'HS-grad', 'Assoc-acdm', 'Some-college', '10th',
          'Prof-school', '7th-8th', 'Bachelors', 'Masters', 'Doctorate',
          '5th-6th', 'Assoc-voc', '9th', '12th', '1st-4th', 'Preschool'],
 'vmax': None,
 'vmin': None}
{'attr': 'high_income', 'vals': None, 'vmax': 1, 'vmin': 0}


# Set up the parameters for the explanation session

In [5]:
# Confidence level handed to the session; the confidence intervals shown later are labelled with it.
gamma = 0.95
# Attributes from which the candidate explanation predicates are generated.
attributes = ['marital-status', 'occupation', 'age', 'relationship', 'race', 'workclass', 'sex', 'native-country']
predicates = generate_explanation_predicates(attributes, schema, strategy='1-way marginal')
es = ExplanationSession(df, schema, gamma, predicates)
# Raw f-string: `\g` is not a valid Python escape sequence, so a plain f-string emits an
# invalid-escape warning. The raw prefix keeps the LaTeX `\gamma` verbatim while `{gamma}`
# is still interpolated, so the rendered Markdown is unchanged.
display(Markdown(rf'## The confidence level $\gamma$ is set at {gamma}'))
display(Markdown(f'## We consider the following {len(predicates)} predicates for future explanation:'))
pprint.pprint(predicates)

## The confidence level $\gamma$ is set at 0.95

## We consider the following 94 predicates for future explanation:

[`marital-status` is "Never-married",
 `marital-status` is "Married-civ-spouse",
 `marital-status` is "Widowed",
 `marital-status` is "Divorced",
 `marital-status` is "Separated",
 `marital-status` is "Married-spouse-absent",
 `marital-status` is "Married-AF-spouse",
 `occupation` is "Machine-op-inspct",
 `occupation` is "Farming-fishing",
 `occupation` is "Protective-serv",
 `occupation` is "nan",
 `occupation` is "Other-service",
 `occupation` is "Prof-specialty",
 `occupation` is "Craft-repair",
 `occupation` is "Adm-clerical",
 `occupation` is "Exec-managerial",
 `occupation` is "Tech-support",
 `occupation` is "Sales",
 `occupation` is "Priv-house-serv",
 `occupation` is "Transport-moving",
 `occupation` is "Handlers-cleaners",
 `occupation` is "Armed-Forces",
 `age` is "(20, 30]",
 `age` is "(30, 40]",
 `age` is "(40, 50]",
 `age` is "(10, 20]",
 `age` is "(60, 70]",
 `age` is "(50, 60]",
 `age` is "(70, 80]",
 `age` is "(80, 90]",
 `relationship` is "Own-child",
 `relationship` 

# Phase 1: Please submit queries in the form "SELECT AVG(X) FROM R WHERE \<predicate\>"

In [6]:
# Phase 1: two AVG(high_income) queries that differ only in the education group.
q1 = Query('education == "Doctorate"', 'high_income', schema)
q2 = Query('education == "Prof-school"', 'high_income', schema)
rho_query = 10.0
es.submit_queries([q1, q2], rho_query)

# Temporarily lift pandas' display limits so the query text is not truncated.
query_display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
)
with pd.option_context(*query_display_options):
    display(Markdown('## We submitted two queries'))
    query_results_html = es.show_query_results().to_html(index=False)
    display(HTML(query_results_html))

## We submitted two queries

index,query,answer
1,"SELECT AVG(high_income) FROM R WHERE education == ""Doctorate""",0.726362
2,"SELECT AVG(high_income) FROM R WHERE education == ""Prof-school""",0.739742


# Phase 2: Ask your question in the form "Why is the result of q1 \<higher/lower\> than the result of q2?"

In [7]:
# Phase 2: ask why q1's answer is lower than q2's and fetch the CI of their difference.
question = Question(q1, 'lower', q2)
es.submit_question(question)
ci = es.question_ci()
display(Markdown(f'### Users asks: \n#### {question}'))
# Noise can only be ruled out when the confidence interval of the difference excludes 0;
# previously the "not the reason" verdict was printed unconditionally.
ci_contains_zero = ci[0] <= 0 <= ci[1]
noise_verdict = 'Noise may be the reason.' if ci_contains_zero else 'Noise is not the reason.'
display(Markdown(
    f'### The system first shows: \n'
    f'#### The {gamma*100:.0f}% confidence interval of the difference is '
    f'[{ci[0]:.2f}, {ci[1]:.2f}]. {noise_verdict}'
))

### Users asks: 
#### Why AVG(high_income) of group education == "Doctorate" is lower than the one of group education == "Prof-school"?

### The system first shows: 
#### The 95% confidence interval of the difference is [-0.02, -0.01]. Noise is not the reason.

# Phase 3: Submit your request for the explanation of this question with privacy budget $\rho$.

### Review the existing impact function, list the challenges that make it hard to use under DP, and explain why these challenges motivate the changes we make ....

### Start with Sudeepa's definition, and then motivate the scaling factor with privacy

### Two reasons for the scaling factor ... 1) Expla... 2) Privacy

### We consider the impact function as the normalized difference in the what-if-removed world.
Denote 
$$q_1(D) = q_1^{sum}(D) / q_1^{cnt}(D)$$
$$q_2(D) = q_2^{sum}(D) / q_2^{cnt}(D)$$
$$Impact(p) = (q_1(\neg p(D)) - q_2(\neg p(D))) \cdot \min(q_1^{cnt}(\neg p(D)), q_2^{cnt}(\neg p(D)))$$

### Now we show the explanation table below

In [8]:
# Phase 3: request the top-k explanation table.
# Number of top predicates requested; the table shows k+1 rows including <no selection>.
k = 5
display(Markdown(f'### There are {len(predicates)} predicates.'))
display(Markdown(f'### k = {k}.'))
# Raw string: `\<` and `\>` are Markdown escapes, not Python escapes. In the original
# placeholder-free f-string they triggered invalid-escape warnings; the rendered text is unchanged.
display(Markdown(r'### Below shows the explanation with k+1 predicates including \<no selection\>.'))
# Budget passed to the explanation request.
rho_expl = 200.0
es.submit_explanation_request(k, rho_expl)
# Temporarily lift pandas' display limits so the predicate text is shown in full.
with pd.option_context('display.max_rows', None, 'display.max_columns', None,
                       'display.max_colwidth', 1000, 'display.width', None):
    display(es.show_explanation_table())
    # TODO: drop the index column

### There are 94 predicates.

### k = 5.

### Below shows the explanation with k+1 predicates including \<no selection\>.

Unnamed: 0,predicates,Imp 95-CI L,Imp 95-CI R,Rnk 95-CI L,Rnk 95-CI R
0,"`occupation` is ""Prof-specialty""",16.653985,23.498109,1,1
1,"`age` is ""(30, 40]""",9.269621,16.113744,1,6
2,"`sex` is ""Male""",7.296179,14.140303,2,10
3,"`marital-status` is ""Married-civ-spouse""",6.29943,13.143554,2,10
4,"`relationship` is ""Husband""",4.957543,11.801666,2,10
5,<no selection>,-11.265085,-4.629051,11,91


## Now consider the following metrics for evaluating the explanation table.

#### TOP-K: Report the distance between lowest impact of noisy top-k and the true k-th largest impact.
#### Impact CI: precision (impact CI width) and accuracy (coverage probability)
#### Rank CI: precision (corresponding impact width) and accuracy (coverage probability)

In [9]:
# Compute the evaluation metrics for the explanation table (presumably the one most
# recently produced by the session) and pretty-print them.
table_metrics = es.measure_explanation_table()
pprint.pprint(table_metrics)

{'imp_ci_width': 6.809441877299036,
 'imp_coverage_prob': 1.0,
 'rnk_ci_width': 8.118084925444343,
 'rnk_coverage_prob': 1.0,
 'topk_gap': 0.0}


## Another baseline algorithm for the explanation table is given as follows
#### 1. Add Gaussian noise to each predicate impact.
#### 2. Derive the CI for each top-k predicate impact from the Gaussian CI.
#### 3. Derive the CI for each top-k predicate rank in a naive way.

In [10]:
# Build the explanation table with the baseline mechanism, using the same k and budget.
es.baseline_submit_explanation_request(k, rho_expl)

# Temporarily lift pandas' display limits so the predicate text is shown in full.
table_display_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', None,
)
with pd.option_context(*table_display_options):
    baseline_table = es.show_explanation_table()
    display(baseline_table)

Unnamed: 0,predicates,Imp 95-CI L,Imp 95-CI R,Rnk 95-CI L,Rnk 95-CI R
0,"`occupation` is ""Prof-specialty""",18.344874,22.165548,1,2
1,"`age` is ""(30, 40]""",11.864217,15.68489,1,6
2,"`sex` is ""Male""",8.25415,12.074824,2,9
3,"`marital-status` is ""Married-civ-spouse""",6.71953,10.540203,2,10
4,"`workclass` is ""Self-emp-inc""",4.529285,8.349958,2,10
5,<no selection>,-10.290361,-6.469688,11,91


In [11]:
# Metrics for the explanation table (presumably the baseline table requested just before).
baseline_metrics = es.measure_explanation_table()
pprint.pprint(baseline_metrics)

{'imp_ci_width': 3.820673176108851,
 'imp_coverage_prob': 1.0,
 'rnk_ci_width': 9.034516801297807,
 'rnk_coverage_prob': 1.0,
 'topk_gap': 0.4431853222325053}


### The noise of the baseline grows with $\sqrt{|P|}$, while the noise of our solution grows with $\log{|P|}$. Our solution is therefore better only when the set of predicates $P$ is large.

### Now consider predicates with two attributes.

In [12]:
# Candidate predicates now cover both the 1-way and the 2-way marginal strategies.
predicates = []
for marginal_strategy in ('1-way marginal', '2-way marginal'):
    predicates += generate_explanation_predicates(attributes, schema, strategy=marginal_strategy)

# A new session over the enlarged predicate set; the following cells resubmit the queries to it.
es = ExplanationSession(df, schema, gamma, predicates)
predicate_count = len(predicates)
display(Markdown(f'### There are {predicate_count} predicates.'))

### There are 3388 predicates.

In [13]:
# Phase 1: `es` was rebound to a new ExplanationSession in the previous cell, so the same
# two AVG(high_income) queries are submitted again to that session.
q1 = Query('education == "Doctorate"', 'high_income', schema)
q2 = Query('education == "Prof-school"', 'high_income', schema)
# Budget passed along with the queries (presumably the privacy budget for answering them).
rho_query = 10.0
es.submit_queries([q1, q2], rho_query)

# Phase 2: ask why q1's answer is lower than q2's, then fetch the confidence interval
# of the difference (not displayed in this cell).
question = Question(q1, 'lower', q2)
es.submit_question(question)
ci = es.question_ci()

In [14]:
# Phase 3: parameters shared by the explanation requests in the cells below.
# Number of top predicates requested for the explanation table.
k = 5
# Budget passed to each explanation request.
rho_expl = 200.0

In [15]:
#### Our solution of explanation table

In [16]:
# Request the explanation table from our mechanism for the enlarged predicate set.
es.submit_explanation_request(k, rho_expl)

# Temporarily lift pandas' display limits so the two-attribute predicates are shown in full.
full_width_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', None,
)
with pd.option_context(*full_width_options):
    solution_table = es.show_explanation_table()
    display(solution_table)

Unnamed: 0,predicates,Imp 95-CI L,Imp 95-CI R,Rnk 95-CI L,Rnk 95-CI R
0,"`occupation` is ""Prof-specialty"" and `relationship` is ""Husband""",13.81916,24.094368,1,11
1,"`occupation` is ""Prof-specialty"" and `native-country` is ""United-States""",18.313978,28.589186,1,13
2,"`occupation` is ""Prof-specialty"" and `race` is ""White""",17.687918,27.963126,1,16
3,"`occupation` is ""Prof-specialty"" and `sex` is ""Male""",16.908603,27.183811,1,16
4,"`marital-status` is ""Married-civ-spouse"" and `occupation` is ""Prof-specialty""",10.929882,21.20509,3,23
5,<no selection>,-11.279271,-4.647341,39,3381


In [17]:
# Metrics for the explanation table (presumably the one requested just before with our mechanism).
solution_metrics = es.measure_explanation_table()
pprint.pprint(solution_metrics)

{'imp_ci_width': 9.667994711719963,
 'imp_coverage_prob': 1.0,
 'rnk_ci_width': 13.978437307632717,
 'rnk_coverage_prob': 1.0,
 'topk_gap': 0.0}


In [18]:
#### Baseline solution of explanation table

In [19]:
# Request the explanation table from the baseline mechanism for the enlarged predicate set.
es.baseline_submit_explanation_request(k, rho_expl)

# Temporarily lift pandas' display limits so the two-attribute predicates are shown in full.
wide_table_options = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', None,
)
with pd.option_context(*wide_table_options):
    baseline_table_2way = es.show_explanation_table()
    display(baseline_table_2way)

Unnamed: 0,predicates,Imp 95-CI L,Imp 95-CI R,Rnk 95-CI L,Rnk 95-CI R
0,"`occupation` is ""Prof-specialty"" and `race` is ""White""",12.697342,35.517249,1,3389
1,"`occupation` is ""Prof-specialty"" and `native-country` is ""United-States""",11.424361,34.244268,1,3389
2,"`occupation` is ""Prof-specialty"" and `sex` is ""Male""",10.766359,33.586266,1,3389
3,"`age` is ""(30, 40]""",9.406059,32.225966,1,3389
4,"`relationship` is ""Husband"" and `native-country` is ""United-States""",9.388683,32.20859,1,3389
5,<no selection>,-14.214699,8.605208,1,3389


In [20]:
# Metrics for the explanation table (presumably the baseline table requested just before).
baseline_metrics_2way = es.measure_explanation_table()
pprint.pprint(baseline_metrics_2way)

{'imp_ci_width': 22.819907049181776,
 'imp_coverage_prob': 1.0,
 'rnk_ci_width': 48.169216438168284,
 'rnk_coverage_prob': 1.0,
 'topk_gap': 6.836201882402705}


In [21]:
# Sanity check on the number of scored entries held by the session. The recorded value is one
# more than the 3388 generated predicates — presumably the extra entry is `<no selection>`
# (TODO confirm).
len(es.predicates_with_scores)

3389

### Repeat 100 times

In [22]:
# Number of repetitions of the solution-vs-baseline comparison.
# NOTE(review): no random seed is set in this notebook, so the averages presumably vary between runs.
# (`pd` is already imported in the notebook's import cell, so it is not re-imported here.)
n_repeats = 100

# One metrics dict per repetition; kept under separate names from the DataFrames built below.
solution_records = []
baseline_records = []
for _ in range(n_repeats):
    # Our mechanism: request a table, then measure it.
    es.submit_explanation_request(k, rho_expl)
    solution_records.append(es.measure_explanation_table())

    # Baseline mechanism: same k and budget, measured the same way.
    es.baseline_submit_explanation_request(k, rho_expl)
    baseline_records.append(es.measure_explanation_table())

# One row per repetition, one column per metric.
solution_reports = pd.DataFrame(solution_records)
baseline_reports = pd.DataFrame(baseline_records)

In [23]:
# Average each metric over all repetitions for our solution.
solution_reports.mean()

topk_gap              0.134884
imp_coverage_prob     0.961667
imp_ci_width          9.667995
rnk_coverage_prob     1.000000
rnk_ci_width         13.941279
dtype: float64

In [24]:
# Average each metric over all repetitions for the baseline.
baseline_reports.mean()

topk_gap              7.746929
imp_coverage_prob     0.858333
imp_ci_width         22.819907
rnk_coverage_prob     1.000000
rnk_ci_width         48.169216
dtype: float64