# OPE/OPL Experiments with Synthetic Bandit Data
---
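This notebook provides an example of off-policy learning (OPL) and off-policy evaluation (OPE) with synthetic bandit feedback data: a policy is trained with `IPWLearner`, and its value is then estimated with several OPE estimators (IPS, DM, and DR).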

In [1]:
from sklearn.linear_model import LogisticRegression

# import open bandit pipeline (obp)
import obp
from obp.dataset import (
    SyntheticBanditDataset,
    logistic_reward_function,
    linear_behavior_policy,
)
from obp.policy import IPWLearner
from obp.ope import (
    OffPolicyEvaluation, 
    RegressionModel,
    InverseProbabilityWeighting as IPS,
    DirectMethod as DM,
    DoublyRobust as DR,
)

In [2]:
# obp version (printed so that readers can tell which release produced the recorded outputs)
print(obp.__version__)

0.5.4


## (1) Generate Synthetic Data

`SyntheticBanditDataset` is an easy-to-use synthetic data generator class implemented in the dataset module.

In [3]:
# configure the synthetic data generator
dataset = SyntheticBanditDataset(
    n_actions=10, # number of actions; |A|
    dim_context=5, # dimension of context vector
    reward_function=logistic_reward_function, # expected reward function; r(x,a)
    beta=1.0, # temperature parameter to control the logging policy; \pi_0
    random_state=12345, # fixed random seed
)

In [4]:
# draw two logged datasets from the same generator, in this order:
#   1) a small one (10,000 rounds) used to train the policy
#   2) a large one (1,000,000 rounds) used as the test data
training_bandit_data, test_bandit_data = (
    dataset.obtain_batch_bandit_feedback(n_rounds=size)
    for size in (10_000, 1_000_000)
)

In [5]:
# display the logged training data (a dict of arrays and metadata; see the output below)
training_bandit_data

{'n_rounds': 10000,
 'n_actions': 10,
 'context': array([[-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057],
        [ 1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474],
        [ 1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684],
        ...,
        [-1.27028221,  0.80914602, -0.45084222,  0.47179511,  1.89401115],
        [-0.68890924,  0.08857502, -0.56359347, -0.41135069,  0.65157486],
        [ 0.51204121,  0.65384817, -1.98849253, -2.14429131, -0.34186901]]),
 'action_context': array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]),
 'action': array([9, 3, 2, ..., 0, 2, 7]),
 'position': None,
 're

## (2) Train Bandit Policies (OPL)
`obp.policy.IPWLearner` is used here (IPW and IPS refer to the same inverse-propensity weighting idea), but many other choices are possible depending on your research question.

$ \hat{\pi} \in \underset{\pi \in \Pi}{\operatorname{argmax}} \, \hat{V}_{\mathrm{IPS}}\left(\pi ; \mathcal{D}_{tr}\right) $

In [6]:
# policy learner built on top of a standard classifier
ipw_learner = IPWLearner(
    n_actions=dataset.n_actions, # number of actions; |A|
    base_classifier=LogisticRegression(C=100, random_state=12345) # any sklearn classifier; seeded for reproducibility
)

In [7]:
# fit the policy on the logged training data; the learner receives, per round:
#   context -> x, action -> a, reward -> r, pscore -> propensity score pi_0(a|x)
ipw_learner.fit(
    **{
        field: training_bandit_data[field]
        for field in ("context", "action", "reward", "pscore")
    }
)

In [8]:
# predict (action dist = action distribution)
# the learned policy is applied to the test contexts; the result is what the later
# steps receive as `action_dist`
action_dist_ipw = ipw_learner.predict(
    context=test_bandit_data["context"], # context in the test data
)

In [9]:
# the array has three axes; index 0 on the last axis is shown here
# (presumably the axes are rounds x actions x slate positions -- confirm against the obp docs)
action_dist_ipw[:, :, 0] # which action to take for each context

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## (3) Approximate the Ground-truth Policy Value
$V(\pi) \approx \frac{1}{|\mathcal{D}_{te}|} \sum_{i=1}^{|\mathcal{D}_{te}|} \mathbb{E}_{a \sim \pi(a|x_i)} [r(x_i, a)], \; \, \text{where} \; \, r(x,a) := \mathbb{E}_{r \sim p(r|x,a)} [r]$

In [10]:
# approximate V(\pi) on the test data from the expected rewards and the policy's action distribution
policy_value_of_ipw = dataset.calc_ground_truth_policy_value(
    expected_reward=test_bandit_data["expected_reward"], # expected reward; r(x,a)
    action_dist=action_dist_ipw, # action distribution of IPWLearner
)

In [11]:
# ground-truth policy value of `IPWLearner`; this is the reference value that the
# OPE estimates are compared against
policy_value_of_ipw

0.7932179711565702

## (4) Off-Policy Evaluation (OPE)

### (4-1) obtain a reward estimator
`obp.ope.RegressionModel` simplifies the process of reward modeling

$r(x,a) = \mathbb{E} [r \mid x, a] \approx \hat{r}(x,a)$

In [12]:
# reward model used to produce \hat{r}(x,a) for OPE
regression_model = RegressionModel(
    n_actions=dataset.n_actions, # number of actions; |A|
    base_model=LogisticRegression(C=100, random_state=12345) # any sklearn classifier; seeded for reproducibility
)

In [13]:
# fit the reward model on the logged test data and predict \hat{r}(x,a) in one step
# (context -> x, action -> a, reward -> r); the seed keeps the result reproducible
# NOTE(review): no `n_folds` is passed, so this presumably runs without cross-fitting --
# confirm against the obp docs before reusing this cell outside the tutorial
estimated_rewards = regression_model.fit_predict(
    **{name: test_bandit_data[name] for name in ("context", "action", "reward")},
    random_state=12345,
)

In [14]:
# show the estimated rewards (index 0 on the last of three axes)
estimated_rewards[:, :, 0] # \hat{r}(x,a)

array([[0.65272354, 0.61704824, 0.586804  , ..., 0.48577246, 0.66897743,
        0.61495777],
       [0.67466355, 0.63999902, 0.61042298, ..., 0.51034758, 0.69037767,
        0.63796033],
       [0.6132436 , 0.5761462 , 0.54505335, ..., 0.44349471, 0.63029871,
        0.57398661],
       ...,
       [0.53528662, 0.49684684, 0.46533516, ..., 0.36665906, 0.55327521,
        0.49463761],
       [0.56154344, 0.52334154, 0.49179191, ..., 0.39161478, 0.57931443,
        0.52113652],
       [0.6701928 , 0.63530941, 0.60558553, ..., 0.5052746 , 0.68602237,
        0.63325939]])

### (4-2) OPE
`obp.ope.OffPolicyEvaluation` simplifies the OPE process

$V(\pi_e) \approx \hat{V} (\pi_e; \mathcal{D}_0, \theta)$ 

Here we use three estimators: the Direct Method (DM), Inverse Probability Weighting (IPS), and Doubly Robust (DR).

In [15]:
# bundle the logged data with the estimators to run;
# each `estimator_name` is the key under which that estimator's result is reported
ope = OffPolicyEvaluation(
    bandit_feedback=test_bandit_data, # test data
    ope_estimators=[
        IPS(estimator_name="IPS"), # inverse probability weighting
        DM(estimator_name="DM"), # direct method
        DR(estimator_name="DR"), # doubly robust
    ] # estimators
)

In [16]:
# estimate V(\pi_e) with every estimator; the result is a dict keyed by estimator name
estimated_policy_value = ope.estimate_policy_values(
    action_dist=action_dist_ipw, # \pi_e(a|x)
    estimated_rewards_by_reg_model=estimated_rewards, # \hat{r}(x,a)
)

In [17]:
# OPE results given by the three estimators (one estimated policy value per estimator name)
estimated_policy_value

{'IPS': 0.7888273216784351, 'DM': 0.5990759314912072, 'DR': 0.7910361006651566}

## (5) Evaluation of OPE
Now, let's evaluate the OPE performance (estimation accuracy) of the three estimators

$SE (\hat{V}; \mathcal{D}_0) := \left( V(\pi_e) - \hat{V} (\pi_e; \mathcal{D}_0, \theta) \right)^2$,     (squared error of $\hat{V}$)

In [18]:
# squared error of each estimator's estimate against the ground-truth policy value
squared_errors = ope.evaluate_performance_of_estimators(
    ground_truth_policy_value=policy_value_of_ipw, # V(\pi_e)
    action_dist=action_dist_ipw, # \pi_e(a|x)
    estimated_rewards_by_reg_model=estimated_rewards, # \hat{r}(x,a)
    metric="se", # squared error
)

In [19]:
squared_errors # DR is the most accurate, followed by IPS; DM is far less accurate here

{'IPS': 1.9277802839848613e-05,
 'DM': 0.037691131565427395,
 'DR': 4.760558841301454e-06}

We can iterate the above process several times and calculate the following MSE as an accuracy metric of an estimator

$MSE (\hat{V}) := T^{-1} \sum_{t=1}^T SE (\hat{V}; \mathcal{D}_0^{(t)}) $

where $\mathcal{D}_0^{(t)}$ is the synthetic data generated in the $t$-th iteration