In [3]:
# Off-policy evaluation (OPE) of the BernoulliTS policy using logged data generated by the Random policy
from obp.dataset import OpenBanditDataset
from obp.policy import BernoulliTS
from obp.ope import OffPolicyEvaluation, InverseProbabilityWeighting

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# (1) Data Loading and Preprocessing
# Read the logs recorded under the "random" behavior policy for the "all"
# campaign from the local "open_bandit_dataset/" directory.
dataset = OpenBanditDataset(
    behavior_policy="random",
    campaign="all",
    data_path="open_bandit_dataset/",
)
# Logged feedback as a dict; later cells read its "n_rounds" and "reward" entries.
bandit_feedback = dataset.obtain_batch_bandit_feedback()

In [5]:
# (2) Production Policy Replication
# Bernoulli Thompson Sampling sized from the loaded dataset; random_state is
# pinned to a fixed seed (presumably so that reruns give the same result).
evaluation_policy = BernoulliTS(
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,
    campaign="all",
    is_zozotown_prior=True,  # replicate the policy in the ZOZOTOWN production
    random_state=12345,
)
# Action distribution of the evaluation policy, requested for as many rounds
# as the logged feedback contains, using n_sim=100,000.
action_dist = evaluation_policy.compute_batch_action_dist(
    n_rounds=bandit_feedback["n_rounds"],
    n_sim=100_000,
)

In [6]:
# (3) Off-Policy Evaluation
# Evaluate on the logged feedback with a single estimator:
# inverse probability weighting.
ope = OffPolicyEvaluation(
    bandit_feedback=bandit_feedback,
    ope_estimators=[InverseProbabilityWeighting()],
)
# The result is indexed by estimator name; the next cell reads its "ipw" entry.
estimated_policy_value = ope.estimate_policy_values(action_dist=action_dist)

In [7]:
# Ratio of the IPW estimate for BernoulliTS to the mean logged reward of the
# Random policy (its ground-truth performance); a value above 1 means
# BernoulliTS is estimated to outperform Random.
relative_policy_value_of_bernoulli_ts = (
    estimated_policy_value["ipw"] / bandit_feedback["reward"].mean()
)
print(relative_policy_value_of_bernoulli_ts)

1.35292399328859


In [8]:
# List the entries available in the logged bandit feedback dict.
bandit_feedback.keys()

dict_keys(['n_rounds', 'n_actions', 'action', 'position', 'reward', 'pscore', 'context', 'action_context'])

In [30]:
# Check the dimensions of the "action_context" entry of the logged feedback.
bandit_feedback["action_context"].shape

(80, 4)

In [10]:
# Check the dimensions of the computed action distribution.
# NOTE(review): presumably laid out as (n_rounds, n_actions, len_list) — confirm against the obp docs.
action_dist.shape

(1374327, 80, 3)