# Long-term OPL Synthetic Simulation: Varying data sizes

In this notebook, we compare the long-term effectiveness of OPL methods as the size of the historical data varies.

In [1]:
import time
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import check_random_state

from dataset import SyntheticDataset
from learning import run_all_policy_learning_methods

In [2]:
# Plot styling: a white seaborn theme, plus four method labels and four hex
# colours (presumably matched by position when plotting -- confirm at the plot call).
sns.set_theme(style="white")
legend = [
    "Reg-based",
    "IPS-PG",
    "DR-PG",
    "Long-term OPL (Ours)",
]
palette = [
    "#FF6437",
    "#509BF5",
    "#F037A5",
    "#4B917D",
]

In [13]:
### experiment configurations ###

# -- simulation scale --
n_sims = 200  # how many simulation runs are performed per setting
n_test_data = 1_000  # sample size used to approximate the policies' ground-truth performance

# -- synthetic environment --
n_actions = 30  # size of the action set
x_dim = 5  # dimensionality of the features
a_dim = 5  # dimensionality of the action features
s_dim = 3  # dimensionality of the short-term reward (i.e., the number of short-term metrics)
lambda_ = 0.5  # how much the short-term rewards contribute to the expected long-term reward function
reward_type = "continuous"  # either "binary" or "continuous"
reward_std = 7.0  # noise level on the long-term reward
beta = 1.0  # parameter of the baseline policy (passed as `beta` to the dataset methods)

# -- swept experiment parameter: training data sizes --
n_train_data_list = [500, 1_000, 2_000, 4_000]

In [14]:
# For every training-data size: build a fresh synthetic dataset, draw the test
# data, record the baseline policy's true value, then run `n_sims` independent
# learn-and-evaluate simulations and collect the outcomes in long format
# (one row per simulation run and method).
result_df_list = []
for n_train_data in n_train_data_list:
    dataset = SyntheticDataset(
        n_actions=n_actions,
        x_dim=x_dim,
        a_dim=a_dim,
        s_dim=s_dim,
        lambda_=lambda_,
        reward_type=reward_type,
        reward_std=reward_std,
    )
    # test data shared by all simulation runs of this setting, and the
    # reference value that the learned policies are later normalised by
    D_test = dataset.generate_dataset(n_data=n_test_data, beta=beta, baseline=True)
    true_value_of_baseline_policy = dataset.calc_policy_value_beta(beta=beta)
    print(f"true_value_of_baseline_policy: {np.round(true_value_of_baseline_policy, 3)}")

    test_policy_value_list = []
    for _ in tqdm(range(n_sims), desc=f"n_train_data={n_train_data}..."):
        # historical data generated by a baseline policy
        D_H = dataset.generate_dataset(n_data=n_train_data, beta=beta)
        # short-term experiment data generated by a baseline policy
        D_E_0 = dataset.generate_dataset(n_data=n_train_data, beta=beta, baseline=True)

        true_value_of_learned_policies = run_all_policy_learning_methods(D_H, D_E_0, D_test)
        test_policy_value_list.append(true_value_of_learned_policies)

    ## summarize results
    # wide (one column per method) -> long (one row per run/method pair)
    sim_values_long = DataFrame(test_policy_value_list).stack()
    result_df = sim_values_long.reset_index(1).rename(
        columns={"level_1": "method", 0: "value"}
    )
    # attach the setting, the baseline value, and the value relative to the baseline
    result_df = result_df.assign(
        n_train_data=n_train_data,
        log_policy_value=true_value_of_baseline_policy,
        rel_value=lambda frame: frame["value"] / true_value_of_baseline_policy,
    )
    result_df_list.append(result_df)

# merge all settings; the simulation index becomes an ordinary column
result_df = pd.concat(result_df_list)
result_df = result_df.reset_index(level=0)

true_value_of_baseline_policy: 0.791


n_train_data=500...:   0%|          | 0/200 [00:00<?, ?it/s]

n_train_data=500...: 100%|██████████| 200/200 [34:00<00:00, 10.20s/it]


true_value_of_baseline_policy: 0.814


n_train_data=1000...: 100%|██████████| 200/200 [1:05:28<00:00, 19.64s/it]


true_value_of_baseline_policy: 0.796


n_train_data=2000...: 100%|██████████| 200/200 [2:07:59<00:00, 38.40s/it]  


true_value_of_baseline_policy: 0.798


n_train_data=4000...: 100%|██████████| 200/200 [4:09:29<00:00, 74.85s/it]  
