In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the parent directory to the import search path before the `privex`
# imports below (presumably the package lives one level up — confirm).
sys.path.append('../')

import pandas as pd
import pprint

from privex.components.basic import Schema, Dataset, Predicate, GroupbyQuery, Question
from privex.components.utils import generate_explanation_predicates
from privex.framework.solution import ExplanationSession

import logging
# Logger used by this notebook's own debug messages.
logger = logging.getLogger(__name__)

In [49]:
# Session-wide constants.
gamma = 0.95  # confidence level; later printed as a percentage for the question CI
attributes = ['PU_Zone', 'PU_Hour', 'PU_WeekDay', 'DO_Zone', 'DO_Hour', 'DO_WeekDay']

# Load the trip table and its schema, then bundle them into a Dataset.
df = pd.read_csv('../data/taxi/taxi_imbalance.csv')
schema = Schema.from_json('../data/taxi/taxi.json')
dataset = Dataset(df, schema)

# Distinct pickup zones of Brooklyn trips, followed by those of Queens trips.
zones = [
    *df.query('PU_Borough == "Brooklyn"')['PU_Zone'].unique(),
    *df.query('PU_Borough == "Queens"')['PU_Zone'].unique(),
]

# One candidate predicate per zone: the trip starts or ends in that zone.
predicates = [
    Predicate(f'`PU_Zone` == "{zone_name}" or `DO_Zone` == "{zone_name}"')
    for zone_name in zones
]

es = ExplanationSession(dataset, gamma, predicates, random_seed=10523)

In [50]:
# Phase 1
# CNT aggregate over `trip_speed`, grouped by pickup and drop-off borough,
# with no filtering predicate.
groupby_query = GroupbyQuery(
    agg='CNT',
    attr_agg='trip_speed',
    predicate=None,
    attr_group=['PU_Borough', 'DO_Borough'],
    schema=schema,
)

# rho handed to the session together with the query
# (presumably the privacy budget spent on this query — confirm against privex).
rho_query = 0.1
es.phase_1_submit_query(groupby_query, rho_query)
print(f'submitted queries with rho = {rho_query}')

# Lift pandas' row/column/width limits so the printed result table is not truncated.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
):
    print(es.phase_1_show_query_results())

submiited queries with rho = 0.1
                             group         answer
4            (Manhattan, Brooklyn)      -3.644976
29                 (Bronx, Queens)      -2.840827
8                (Unknown, Queens)      -2.831044
19                   (Queens, EWR)      -2.016936
17               (Queens, Unknown)      -1.975370
9                 (Unknown, Bronx)      -1.823035
44          (Staten Island, Bronx)      -1.772644
20         (Queens, Staten Island)      -1.602404
26                 (Brooklyn, EWR)      -1.217854
2               (Manhattan, Bronx)      -1.154666
7             (Unknown, Manhattan)      -1.096609
27       (Brooklyn, Staten Island)      -1.080694
23               (Brooklyn, Bronx)      -0.560931
16                 (Queens, Bronx)      -0.531726
38                  (EWR, Unknown)      -0.268910
47            (Staten Island, EWR)      -0.246043
35                (EWR, Manhattan)      -0.018620
13        (Unknown, Staten Island)       0.125860
48  (Staten Islan

In [51]:
# Size check: display the `.shape` of the Phase 1 result table.
es.phase_1_show_query_results().shape

(49, 2)

In [52]:
# Phase 2
def weight_mapping(group):
    """Return the question weight assigned to a group.

    ``('Brooklyn', 'Queens')`` gets -10, ``('Queens', 'Brooklyn')`` gets 1,
    and every other group gets 0.
    """
    weighted_groups = (
        (('Brooklyn', 'Queens'), -10),
        (('Queens', 'Brooklyn'), 1),
    )
    # Plain equality comparison, so unhashable inputs simply fall through to 0.
    for target, weight in weighted_groups:
        if group == target:
            return weight
    return 0
# Assign a weight to every group of the Phase 1 query.
group_weights = dict(
    (grp, weight_mapping(grp)) for grp in groupby_query.groups
)
question = Question.from_group_weights(groupby_query, group_weights)

# Submit the question, prepare its confidence interval, then read back
# the interval and the point value (call order kept as-is; the session is stateful).
es.phase_2_submit_question(question)
es.phase_2_prepare_question_ci()
ci = es.phase_2_show_question_ci()
point = es.phase_2_show_question_point()

print('question: ', question.to_natural_language())
print('The noisy group difference is ', point)
print(f'The {gamma*100:.0f}% confidence interval of the difference is ', ci)

rho_expl = 1.0

question:  ([(CNT(trip_speed) WHERE `PU_Borough` == "Queens" and `DO_Borough` == "Brooklyn", 1), (CNT(trip_speed) WHERE `PU_Borough` == "Brooklyn" and `DO_Borough` == "Queens", -10)], 0)
The noisy group difference is  7624.085314090087
The 95% confidence interval of the difference is  (7580.04060153057, 7668.130026649605)


In [56]:
# Phase 3
k = 5
logger.debug(f'Length of predicates is {len(predicates)}')
rho_expl = 1.0

# Request the explanation and prepare it with `k`, `rho_expl` and a fixed seed.
es.phase_3_submit_explanation_request()
es.phase_3_prepare_explanation(k, rho_expl, random_seed=10523)

# Lift pandas' display limits so both explanation tables print without wrapping or truncation.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', 1000,
    'display.width', None,
    'display.expand_frame_repr', False,
):
    t1, t2 = es.phase_3_show_explanation_table()
    print(t1, t2, sep='\n')

2022-04-11 13:45:03,657 DEBUG    [<ipython-input-56-40f55a58d9b1>:3] Length of predicates is 127
(`PU_Borough` == "Queens" and `DO_Borough` == "Brooklyn")or(`PU_Borough` == "Brooklyn" and `DO_Borough` == "Queens")
2022-04-11 13:45:03,677 INFO     [influence_function.py:24] Dataset relative to the question has length 133372
2022-04-11 13:45:27,372 DEBUG    [meta_explanation_session.py:154] 127 predicates and their influences & scores have been loaded.
2022-04-11 13:45:27,379 DEBUG    [meta_explanation_session.py:158] 
         influence        score
count   127.000000   127.000000
mean      7.373480     7.373480
std     540.759678   540.759678
min    -709.157497  -709.157497
25%    -104.808888  -104.808888
50%     -25.253200   -25.253200
75%      16.636589    16.636589
max    5125.002280  5125.002280
2022-04-11 13:45:27,379 DEBUG    [meta_explanation_session.py:207] total rho_expl is 1.0
2022-04-11 13:45:27,380 DEBUG    [meta_explanation_session.py:208] rho_topk is 0.025
2022-04-11 13:4

100%|██████████| 5/5 [00:00<00:00,  5.28it/s]

2022-04-11 13:45:28,331 DEBUG    [meta_explanation_session.py:236] [(4209.2064264939245, 5502.78265629036), (2191.933138585652, 3485.509368382088), (-819.5937694397498, 473.98246035668603), (505.8786373426184, 1799.4548671390544), (-954.3563719948218, 339.2198578016142)]
2022-04-11 13:45:28,331 INFO     [meta_explanation_session.py:239] computing rank ci



100%|██████████| 5/5 [00:00<00:00, 7755.74it/s]

2022-04-11 13:45:28,333 DEBUG    [meta_explanation_session.py:251] [(1, 1), (1, 3), (3, 127), (3, 127), (3, 127)]
                                                             predicates Rel Inf 90-CI L Rel Inf 90-CI R  Rnk 95-CI L  Rnk 95-CI R
0              `PU_Zone` == "JFK Airport" or `DO_Zone` == "JFK Airport"          55.21%          72.18%            1            1
1  `PU_Zone` == "LaGuardia Airport" or `DO_Zone` == "LaGuardia Airport"          28.75%          45.72%            1            3
2                  `PU_Zone` == "Bay Ridge" or `DO_Zone` == "Bay Ridge"           6.64%          23.60%            3          127
3      `PU_Zone` == "Queensboro Hill" or `DO_Zone` == "Queensboro Hill"         -10.75%           6.22%            3          127
4                    `PU_Zone` == "Flushing" or `DO_Zone` == "Flushing"         -12.52%           4.45%            3          127
                                                             predicates  Inf 95-CI L  Inf 95-CI R  Rnk 95-


