In [1]:
import numpy as np
from obp.ope import RegressionModel
from sklearn.tree import RandomForestRegressor as RFR

from tqdm import tqdm

from dataset.synthetic import SyntheticBanditDatasetWithCluster
from ope.regression import PairWiseRegression
from ope.meta import OffPolicyEvaluation
from ope.estimator import InversePropensityScore as IPS
from ope.estimator import MarginalizedIPS as MIPS
from ope.estimator import DoublyRobust as DR
from ope.estimator import OFFCEM
from utils.common import visualize_mean_squared_error
from utils.common import aggregate_simulation_results
from policy.function import gen_eps_greedy

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'RandomForestRegressor' from 'sklearn.tree' (/usr/local/lib/python3.9/site-packages/sklearn/tree/__init__.py)

In [2]:
# setting
n_users=100
dim_context=10
n_actions=100
n_cat_per_dim=3
n_cat_dim=10
n_clusters=30
beta=-1.0
eps=0.3
reward_noise=1.0
random_state=12345
n_sim=2

In [3]:
# MSE with varying sample sizes in logged data

sample_sizes = [1000, 3000, 6000, 12000]
ope_estimators = [
    IPS(estimator_name="IPS"),
    DR(estimator_name="DR"),
    MIPS(estimator_name="MIPS (true)"),
    OFFCEM(estimator_name="OFFCEM"),
    OFFCEM(estimator_name="OFFCEM + 1-step reg"),
    OFFCEM(estimator_name="OFFCEM (LC)")
]

In [None]:
dataset = SyntheticBanditDatasetWithCluster(
    n_users=n_users,
    dim_context=dim_context,
    n_actions=n_actions,
    n_cat_per_dim=n_cat_per_dim,
    n_cat_dim=n_cat_dim,
    n_clusters=n_clusters,
    beta=beta,
    reward_noise=reward_noise,
    random_state=random_state
)
n_clusters = dataset.n_clusters

test_data = dataset.obtain_batch_bandit_feedback(n_rounds=30000)
policy_value = dataset.calc_ground_truth_policy_value(
    q_x_a=test_data["expected_reward"],
    pi_e=gen_eps_greedy(expected_reward=test_data["expected_reward"], eps=eps)
)

result_df_list = []
for val_size in sample_sizes:
    
    result_list = []
    for _ in tqdm(range(n_sim), desc=f"val_size={val_size}"):
        val_data = dataset.obtain_batch_bandit_feedback(n_rounds=val_size)
        
        pi_e = gen_eps_greedy(
            expected_reward=val_data["expected_reward"],
            eps=eps,
        )
        # off policy evaluation
        ope = OffPolicyEvaluation(
            bandit_feedback=val_data,
            ope_estimators=ope_estimators,
        )
        
        ## train_reward_via_two_stage
        ### 1st-stage

        pairwise_model = PairWiseRegression(
            dim_context=dim_context,
            n_actions=n_actions,
            n_clusters=n_clusters,
            verbose=False
        )

        h_hat = pairwise_model.fit_predict(bandit_data=val_data)
        
        ### 2st-stage
        reward = val_data["reward"]
        reward_residual = reward - h_hat[np.arange(val_size), val_data["action"]]
        cluster, phi_x_a = val_data["cluster"], val_data["phi_x_a"]

        reg_model = RegressionModel(
            n_actions=n_clusters,
            action_context=np.eye(n_clusters),
            base_model=MLP(hidden_layer_sizes=(10, 10, 10), random_state=random_state)
        )

        g_hat = reg_model.fit_predict(
            context=val_data["context"],
            action=cluster,
            reward=reward_residual
        )[:, :, 0]

        f_hat_x_a_e = h_hat + g_hat[np.arange(val_size)[:, None], phi_x_a]
        
        ## one-step reward regression
        reg_model = RegressionModel(
            n_actions=n_actions,
            action_context=val_data["action_context_one_hot"],
            base_model=MLP(hidden_layer_sizes=(10, 10, 10), random_state=random_state),
        )
        q_hat_x_a = reg_model.fit_predict(
            context=val_data["context"],
            action=val_data["action"],
            reward=val_data["reward"],
        )
        
        q_hat_dict = {
            "DR": q_hat_x_a,
            "OFFCEM": f_hat_x_a_e,
            "OFFCEM + 1-step reg": q_hat_x_a,
            "OFFCEM (LC)": val_data["expected_reward"]
        }
        
        estimated_policy_values = ope.estimate_policy_values(action_dist=pi_e, estimated_rewards=q_hat_dict)
        result_list.append(estimated_policy_values)
    
    # calculate MSE
    result_df = aggregate_simulation_results(
        simulation_result_list=result_list, policy_value=policy_value, x_value=alpha
    )
    result_df_list.append(result_df)

result_df = pd.concat(result_df_list).reset_index(level=0)
visualize_mean_squared_error(
    result_df=result_df,
    xlabel="sample sizes in logged data"
)

