In [None]:
from obp.dataset import OpenBanditDataset
import numpy as np
from obp.ope import OffPolicyEvaluation, InverseProbabilityWeighting as IPW, DirectMethod as DM, DoublyRobust as DR
from obp.ope import RegressionModel
from sklearn.linear_model import LogisticRegression
from pathlib import Path

In [47]:
# -------------------------
# 0. Build the dataset
# -------------------------
# Load the logged data collected under the "random" behavior policy for the
# "all" campaign from the local dataset directory.
data_root = Path("./open_bandit_dataset")
dataset = OpenBanditDataset(behavior_policy="random", campaign="all", data_path=data_root)

# Logged bandit feedback, returned as a dict-like object keyed by field name.
bandit_data = dataset.obtain_batch_bandit_feedback()

# Pull out the fields used throughout the rest of the notebook.
contexts, actions, rewards, pscores = (
    bandit_data[field] for field in ("context", "action", "reward", "pscore")
)

In [41]:
# Chronological train/test split of the logged feedback.
# The split has to be requested from the dataset object: `bandit_data` is the
# feedback dict returned earlier and has no `obtain_batch_bandit_feedback` method.
bandit_feedback_train, bandit_feedback_test = dataset.obtain_batch_bandit_feedback(
    is_timeseries_split=True
)

In [48]:
# -------------------------
# 1. Define Discounted LinUCB
# -------------------------
class DiscountedLinUCB:
    """Per-action LinUCB whose sufficient statistics are exponentially discounted.

    Each action ``a`` keeps its own design matrix ``A[a]`` (dim x dim) and
    response vector ``b[a]`` (dim,). On every update of an action, its past
    statistics are multiplied by ``gamma`` so that older observations weigh
    less than recent ones.

    Parameters
    ----------
    n_actions : int
        Number of actions (arms).
    dim : int
        Dimension of the context vector.
    alpha : float, default 1.0
        Multiplier of the exploration bonus.
    gamma : float, default 0.99
        Discount factor applied to an action's statistics when it is updated.
    """

    def __init__(self, n_actions, dim, alpha=1.0, gamma=0.99):
        self.n_actions = n_actions
        self.dim = dim
        self.alpha = alpha
        self.gamma = gamma
        # A starts as the identity (unit ridge regularizer), b as zeros.
        self.A = [np.identity(self.dim) for _ in range(n_actions)]
        self.b = [np.zeros(self.dim) for _ in range(n_actions)]

    def select_action(self, x):
        """Return the index of the action with the highest upper confidence bound.

        Parameters
        ----------
        x : array-like of shape (dim,)
            Context vector of the current round.

        Returns
        -------
        int
            Index of the selected action; ties resolve to the lowest index.
        """
        x = np.asarray(x, dtype=float)
        scores = np.empty(self.n_actions)
        for a in range(self.n_actions):
            # One linear solve replaces the explicit inverse. Because A[a] is
            # symmetric, theta @ x == b @ (A^-1 x), so A^-1 x serves both the
            # mean estimate and the confidence width.
            A_inv_x = np.linalg.solve(self.A[a], x)
            mean = self.b[a] @ A_inv_x
            # Guard against tiny negative values from round-off before sqrt.
            width = np.sqrt(max(float(x @ A_inv_x), 0.0))
            scores[a] = mean + self.alpha * width
        return int(np.argmax(scores))

    def update(self, action, x, reward):
        """Discount the statistics of ``action`` and add the new observation.

        The ``(1 - gamma) * I`` term replenishes the ridge regularizer that the
        discount would otherwise shrink towards zero; without it ``A[action]``
        turns singular once the contexts seen for that action do not span the
        whole space, and ``select_action`` breaks down.

        Parameters
        ----------
        action : int
            Index of the action whose statistics are updated.
        x : array-like of shape (dim,)
            Context vector observed with the reward.
        reward : float
            Observed reward for ``action`` under context ``x``.
        """
        x = np.asarray(x, dtype=float)
        ridge = (1.0 - self.gamma) * np.identity(self.dim)
        self.A[action] = self.gamma * self.A[action] + np.outer(x, x) + ridge
        self.b[action] = self.gamma * self.b[action] + reward * x

In [None]:
# -------------------------
# 2. Run Discounted LinUCB on OBD
# -------------------------
# Replay the logged rounds in order. The policy's choice is recorded for every
# round, but the policy only learns from rounds where its choice coincides with
# the logged action: that is the only case in which the reward of the chosen
# action was actually observed. Feeding a reward of 0 on mismatching rounds
# would treat an unknown outcome as a failure and bias the model.
agent = DiscountedLinUCB(n_actions=dataset.n_actions, dim=contexts.shape[1], alpha=1.0, gamma=0.99)
n_rounds = contexts.shape[0]
pi_actions = np.empty(n_rounds, dtype=int)
for t in range(n_rounds):
    x = contexts[t]
    chosen_action = agent.select_action(x)
    pi_actions[t] = chosen_action
    if chosen_action == actions[t]:
        agent.update(chosen_action, x, rewards[t])

In [None]:
# -------------------------
# 3. Prepare regression model for OPE
# -------------------------
# The logged feedback carries a slate position for every round, so the reward
# model is built with the dataset's `len_list` and fitted with the logged
# positions. The resulting predictions then have one slice per position, which
# is the layout the OPE estimators index with `position`.
reg_model = RegressionModel(
    n_actions=dataset.n_actions,
    len_list=dataset.len_list,
    base_model=LogisticRegression(),
)
estimated_rewards_by_reg_model = reg_model.fit_predict(
    context=contexts,
    action=actions,
    reward=rewards,
    position=bandit_data["position"],
)

In [None]:
# -------------------------
# 4. Compute action distribution (deterministic LinUCB)
# -------------------------
# OPE expects a 3-D array of shape (n_rounds, n_actions, len_list). The policy
# is deterministic, so for every round the chosen action gets probability 1 at
# each slate position and all other actions get 0.
n_rounds = contexts.shape[0]
action_dist = np.zeros((n_rounds, dataset.n_actions, dataset.len_list))
action_dist[np.arange(n_rounds), pi_actions, :] = 1.0

In [None]:
# -------------------------
# 5. Off-Policy Evaluation
# -------------------------
# Estimate the value of the evaluation policy from the logged feedback with
# three estimators (IPW, DM, DR) and plot the resulting estimates.
estimators = [IPW(), DM(), DR()]
ope = OffPolicyEvaluation(bandit_feedback=bandit_data, ope_estimators=estimators)
ope.visualize_off_policy_estimates(
    action_dist=action_dist,
    estimated_rewards_by_reg_model=estimated_rewards_by_reg_model,
)