In [15]:
import numpy as np
from obp.utils import softmax

from dataset import SyntheticBanditDatasetWithActionEmbeds
from estimator import InversePropensityScore as IPS

In [11]:
# --- Experiment configuration -------------------------------------------
# Problem size handed to the synthetic dataset constructor.
n_actions = 1000
dim_context = 5
n_category = 10

# NOTE(review): `p_e_a_param_std` and `random_state` are not referenced by
# any cell visible in this notebook — confirm whether they were meant to be
# passed to the dataset constructor.
p_e_a_param_std = 1.0
random_state = 12345

# Number of logged rounds drawn for the observed-data cell.
n_rounds = 1000

# The spelling "probabialistic" matches the keyword accepted by the dataset
# constructor as it is called below, so it is kept as-is.
is_category_probabialistic = True

# Forwarded as `return_marginal_pi_b` when sampling the observed data.
return_marginal_pi = True

In [4]:
# Ground truth: compute the evaluation policy's value from the
# `expected_reward` array of a large synthetic sample.

# Single source of truth for the ground-truth sample size. It sizes both the
# sampled feedback and the rows of the evaluation-policy matrix, which
# presumably have to line up round-for-round with `expected_reward`.
n_rounds_ground_truth = 10000

# NOTE(review): `random_state` and `p_e_a_param_std` from the config cell are
# not passed to the constructor here — confirm that relying on its defaults
# is intended.
dataset = SyntheticBanditDatasetWithActionEmbeds(
    n_actions=n_actions,
    dim_context=dim_context,
    n_category=n_category,
    is_category_probabialistic=is_category_probabialistic
)

# Marginal propensities are not requested for the ground-truth sample.
data = dataset.obtain_batch_bandit_feedback(
    n_rounds=n_rounds_ground_truth,
    return_marginal_pi_b=False
)

# Evaluation policy: softmax of Gaussian draws from the dataset's own random
# generator, shaped (rounds, actions).
evaluation_policy = softmax(
    dataset.random_.normal(size=(n_rounds_ground_truth, n_actions))
)

policy_value = dataset.calc_ground_truth_policy_value(
    expected_reward=data["expected_reward"],
    evaluation_policy=evaluation_policy
)
policy_value

0.694823794672106

In [12]:
# Observed data: rebuild the dataset with the configured settings and sample
# `n_rounds` rounds of bandit feedback.
dataset = SyntheticBanditDatasetWithActionEmbeds(
    n_actions=n_actions,
    dim_context=dim_context,
    n_category=n_category,
    is_category_probabialistic=is_category_probabialistic,
)
data = dataset.obtain_batch_bandit_feedback(
    n_rounds=n_rounds,
    return_marginal_pi_b=return_marginal_pi,
)

# Evaluation policy \pi_e: softmax of Gaussian draws shaped (rounds, actions).
policy_scores = dataset.random_.normal(size=(n_rounds, n_actions))
evaluation_policy = softmax(policy_scores)

# For each round, pick the \pi_e entry at the action stored in data["action"].
round_index = np.arange(n_rounds)
evaluation_policy_pscore = evaluation_policy[round_index, data["action"]].copy()

# Marginal counterpart of \pi_e, computed by the dataset from the policy, the
# logged actions, and the `p_e_a` / `category` arrays it returned.
marginal_pi_e, marginal_pi_e_pscore = dataset.compute_marginal_probability(
    pi=evaluation_policy,
    action=data["action"],
    p_e_a=data["p_e_a"],
    category=data["category"],
)

In [13]:
# Conventional IPS: the estimator is fed the behavior policy's "pscore" array
# and the evaluation policy's probabilities of the logged actions.
# The estimator object is named `ips` (not `mips`) so that the name left in
# the kernel matches the estimator it actually holds.
ips = IPS(estimator_name="IPS")

estimated_policy_value = ips.estimate_policy_value(
    reward=data["reward"],
    behavior_policy_pscore=data["pscore"],
    evaluation_policy_pscore=evaluation_policy_pscore
)
estimated_policy_value

0.7123163850611358

In [14]:
# MIPS: same estimator class as above, but fed the marginal propensity scores
# (data["marginal_pscore"] and `marginal_pi_e_pscore`) in place of the
# per-action ones.
mips = IPS(estimator_name="MIPS")

# Keyword names below are exactly those accepted by `estimate_policy_value`.
mips_inputs = dict(
    reward=data["reward"],
    behavior_policy_pscore=data["marginal_pscore"],
    evaluation_policy_pscore=marginal_pi_e_pscore,
)
estimated_policy_value = mips.estimate_policy_value(**mips_inputs)
estimated_policy_value

0.7108472004503967