# OPE/OPL Experiments with Synthetic Bandit Data
---
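This notebook provides an example of off-policy learning (OPL) and off-policy evaluation (OPE) with synthetic bandit feedback data: we generate synthetic logged data, train an evaluation policy with `IPWLearner`, and then estimate the value of that policy with several OPE estimators (IPW, DM, and DR).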

In [1]:
from sklearn.linear_model import LogisticRegression

# import open bandit pipeline (obp)
import obp
from obp.dataset import (
    SyntheticBanditDataset,
    logistic_reward_function,
    linear_reward_function,
    linear_behavior_policy
)
from obp.policy import IPWLearner, Random
from obp.ope import (
    OffPolicyEvaluation, 
    RegressionModel,
    InverseProbabilityWeighting,
    DirectMethod,
    DoublyRobust
)

In [2]:
# print the installed obp version so readers can match their environment to the recorded outputs
print(obp.__version__)

0.4.1


## (1) Generate Synthetic Data

`SyntheticBanditDataset` is an easy-to-use synthetic data generator class in the dataset module.

In [3]:
# Describe the synthetic environment first, then build the data generator from it.
synthetic_data_config = dict(
    n_actions=10,  # number of actions; |A|
    dim_context=5,  # number of dimensions of the context vector x
    reward_function=logistic_reward_function,  # mean reward function; q(x,a)
    behavior_policy_function=linear_behavior_policy,  # behavior policy; \pi_b
    random_state=12345,  # fixed seed passed to the generator
)
dataset = SyntheticBanditDataset(**synthetic_data_config)

In [4]:
# Two batches of logged bandit feedback from the same generator:
# a small one used to fit the policy and a large one used for prediction and evaluation.
n_rounds_train, n_rounds_test = 10_000, 1_000_000
training_bandit_data = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds_train)
test_bandit_data = dataset.obtain_batch_bandit_feedback(n_rounds=n_rounds_test)

In [5]:
# inspect the logged training data: a dict holding the contexts, actions, and related fields
training_bandit_data

{'n_rounds': 10000,
 'n_actions': 10,
 'context': array([[-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057],
        [ 1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474],
        [ 1.00718936, -1.29622111,  0.27499163,  0.22891288,  1.35291684],
        ...,
        [-1.27028221,  0.80914602, -0.45084222,  0.47179511,  1.89401115],
        [-0.68890924,  0.08857502, -0.56359347, -0.41135069,  0.65157486],
        [ 0.51204121,  0.65384817, -1.98849253, -2.14429131, -0.34186901]]),
 'action_context': array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]),
 'action': array([6, 3, 2, ..., 9, 3, 6]),
 'position': None,
 're

## (2) Train Bandit Policies (OPL)
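We train an evaluation policy on the training data. `obp.policy.IPWLearner`, which wraps any scikit-learn classifier, is a reasonable first choice.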

In [6]:
# Base classifier used inside the learner; any sklearn classifier can be plugged in here.
ipw_base_classifier = LogisticRegression(C=100, random_state=12345)
ipw_learner = IPWLearner(
    n_actions=dataset.n_actions,  # number of actions; |A|
    base_classifier=ipw_base_classifier,
)

In [7]:
# Fit the learner on the logged training data:
# context x, action a, reward r, and propensity score pi_b(a|x).
fit_inputs = {
    key: training_bandit_data[key]
    for key in ("context", "action", "reward", "pscore")
}
ipw_learner.fit(**fit_inputs)

In [8]:
# Action distribution ("action dist") chosen by the trained policy for the test contexts.
test_context = test_bandit_data["context"]
action_dist_ipw = ipw_learner.predict(context=test_context)

In [9]:
# which action to take for each context: one row per test context,
# and the 1 in each row of the recorded output marks the chosen action
# NOTE(review): index 0 on the last axis presumably selects the first (only) position — confirm against the obp docs
action_dist_ipw[:, :, 0]

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

## (3) Approximate the Ground-truth Policy Value
$V(\pi) \approx \frac{1}{|\mathcal{D}_{te}|} \sum_{i=1}^{|\mathcal{D}_{te}|} \mathbb{E}_{a \sim \pi(a|x_i)} [q(x_i, a)], \quad \text{where} \quad q(x,a) := \mathbb{E}_{r \sim p(r|x,a)} [r]$

In [10]:
# Ground-truth value of the learned policy, computed from the expected rewards q(x,a)
# stored in the test data and the policy's action distribution.
test_expected_reward = test_bandit_data["expected_reward"]
policy_value_of_ipw = dataset.calc_ground_truth_policy_value(
    expected_reward=test_expected_reward,
    action_dist=action_dist_ipw,
)

In [11]:
# ground-truth policy value of `IPWLearner`, approximated on the test data;
# this is the target that the OPE estimators are compared against later
policy_value_of_ipw

0.7687767099367703

## (4) Off-Policy Evaluation (OPE)

### (4-1) obtain a reward estimator
$q(x,a) \approx \hat{q}(x,a)$ with cross-fitting

In [None]:
# Base model used to estimate the mean reward function; any sklearn classifier can be used.
reward_base_model = LogisticRegression(C=100, random_state=12345)
regression_model = RegressionModel(
    n_actions=dataset.n_actions,  # number of actions; |A|
    base_model=reward_base_model,
)

In [None]:
# Fit the reward model on the logged test data (context x, action a, reward r)
# and predict \hat{q}(x,a) with 2-fold cross-fitting.
logged_test_inputs = {
    key: test_bandit_data[key] for key in ("context", "action", "reward")
}
estimated_rewards = regression_model.fit_predict(
    **logged_test_inputs,
    n_folds=2,
    random_state=12345,
)

In [None]:
# \hat{q}(x,a): the reward estimates produced by the regression model
# NOTE(review): index 0 on the last axis presumably selects the first (only) position — confirm against the obp docs
estimated_rewards[:, :, 0]

### (4-2) conduct OPE
$V(\pi_e) \approx \hat{V} (\pi_e; \mathcal{D}_b, \theta)$ using DM, IPW, and DR

In [15]:
# Estimators to compare: IPW, DM, and DR.
estimators_to_compare = [
    InverseProbabilityWeighting(),
    DirectMethod(),
    DoublyRobust(),
]
# obp.ope.OffPolicyEvaluation bundles the logged test data with the chosen estimators.
ope = OffPolicyEvaluation(
    bandit_feedback=test_bandit_data,
    ope_estimators=estimators_to_compare,
)

In [16]:
# Inputs shared by every estimator: the evaluation policy \pi_e(a|x) and the reward estimates \hat{q}.
estimation_inputs = dict(
    action_dist=action_dist_ipw,
    estimated_rewards_by_reg_model=estimated_rewards,
)
estimated_policy_value = ope.estimate_policy_values(**estimation_inputs)

In [17]:
# OPE results given by the three estimators, keyed by estimator name ('ipw', 'dm', 'dr')
estimated_policy_value

{'ipw': 0.772086551899346, 'dm': 0.6482802006214271, 'dr': 0.7694161359613515}

## (5) Evaluation of OPE
$SE (\hat{V}; \mathcal{D}_b) := \left( V(\pi_e) - \hat{V} (\pi_e; \mathcal{D}_b, \theta) \right)^2$ (squared error of $\hat{V}$)

In [18]:
# Compare each estimate against the ground-truth value V(\pi_e) using the squared error.
evaluation_inputs = dict(
    ground_truth_policy_value=policy_value_of_ipw,  # V(\pi_e)
    action_dist=action_dist_ipw,  # \pi_e(a|x)
    estimated_rewards_by_reg_model=estimated_rewards,  # \hat{q}(x,a)
    metric="se",  # squared error
)
squared_errors = ope.evaluate_performance_of_estimators(**evaluation_inputs)

In [19]:
# squared error of each estimator; in the recorded output DR has the smallest error, so it is the most accurate
squared_errors

{'ipw': 1.095505381722747e-05,
 'dm': 0.014519408757182583,
 'dr': 4.0886564091174864e-07}

We can iterate the above process several times and calculate the following mean squared error (MSE):

$MSE (\hat{V}) := T^{-1} \sum_{t=1}^T SE (\hat{V}; \mathcal{D}_b^{(t)}) $

where $T$ is the number of iterations and $\mathcal{D}_b^{(t)}$ is the synthetic data in the $t$-th iteration.