# Long-term OPL Synthetic Simulation: Varying noise levels

In this notebook, we compare how effectively each off-policy learning (OPL) method optimizes the long-term reward as the noise level (standard deviation) of the long-term reward varies.

In [1]:
import time
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import check_random_state

from dataset import SyntheticDataset
from learning import run_all_policy_learning_methods

In [2]:
# Plot styling: pair every compared method with the color used for it in figures.
sns.set_theme(style="white")
method_colors = {
    "Reg-based": "#FF6437",
    "IPS-PG": "#509BF5",
    "DR-PG": "#F037A5",
    "Long-term OPL (Ours)": "#4B917D",
}
legend = list(method_colors.keys())  # display names, in plotting order
palette = list(method_colors.values())  # hex colors, aligned with `legend`

In [5]:
### experiment configurations ###

# --- simulation size ---
# number of simulation runs per parameter value
n_sims = 200
# sample size of the historical data and of the short-term experiment data
n_train_data = 1_000
# number of samples used to approximate the ground-truth performance of the policies
n_test_data = 1_000

# --- synthetic environment ---
# number of actions
n_actions = 30
# dimension of the context features
x_dim = 5
# dimension of the action features
a_dim = 5
# dimension of the short-term reward (i.e., number of short-term metrics)
s_dim = 3
# contribution of the short-term rewards in the expected long-term reward function
lambda_ = 0.5
# type of the reward: "binary" or "continuous"
reward_type = "continuous"
# parameter of the baseline policy
beta = 1.0

# --- experiment parameter range ---
# values of `reward_std` swept over by the experiment
reward_std_list = [1.0, 3.0, 5.0, 7.0, 9.0]

In [6]:
# Dataset settings shared by every run; only `reward_std` changes between runs.
dataset_kwargs = dict(
    n_actions=n_actions,
    x_dim=x_dim,
    a_dim=a_dim,
    s_dim=s_dim,
    lambda_=lambda_,
    reward_type=reward_type,
)

# NOTE(review): no random seed is set anywhere in this cell, so the numbers will
# presumably differ between executions -- confirm how `SyntheticDataset` draws its
# randomness before adding one.
result_df_list = []
for reward_std in reward_std_list:
    dataset = SyntheticDataset(reward_std=reward_std, **dataset_kwargs)

    # Test data used to evaluate every learned policy for this `reward_std`.
    D_test = dataset.generate_dataset(
        n_data=n_test_data,
        beta=beta,
        baseline=True,
    )
    true_value_of_baseline_policy = dataset.calc_policy_value_beta(beta=beta)
    print(f"true_value_of_baseline_policy: {np.round(true_value_of_baseline_policy, 3)}")

    # One entry per simulation run, as returned by `run_all_policy_learning_methods`.
    test_policy_value_list = []
    for _ in tqdm(range(n_sims), desc=f"reward_std={reward_std}..."):
        # historical data generated by a baseline policy
        D_H = dataset.generate_dataset(n_data=n_train_data, beta=beta)
        # short-term experiment data generated by a baseline policy
        D_E_0 = dataset.generate_dataset(
            n_data=n_train_data,
            beta=beta,
            baseline=True,
        )

        true_value_of_learned_policies = run_all_policy_learning_methods(
            D_H, D_E_0, D_test
        )
        test_policy_value_list.append(true_value_of_learned_policies)

    ## summarize results
    # Reshape to long format: one row per (simulation run, method) pair.
    stacked_values = pd.DataFrame(test_policy_value_list).stack()
    long_format = pd.DataFrame(stacked_values).reset_index(level=1)
    long_format = long_format.rename(columns={"level_1": "method", 0: "value"})

    # Attach the swept parameter, the baseline value, and the value relative to it.
    result_df = long_format.assign(
        reward_std=reward_std,
        log_policy_value=true_value_of_baseline_policy,
        rel_value=lambda frame: frame["value"] / true_value_of_baseline_policy,
    )
    result_df_list.append(result_df)

# Combine the per-`reward_std` tables; the remaining index level becomes a column.
result_df = pd.concat(result_df_list).reset_index(level=0)

true_value_of_baseline_policy: 0.795


reward_std=1.0...: 100%|██████████| 200/200 [1:04:28<00:00, 19.34s/it]


true_value_of_baseline_policy: 0.809


reward_std=3.0...: 100%|██████████| 200/200 [1:03:34<00:00, 19.07s/it]


true_value_of_baseline_policy: 0.806


reward_std=5.0...: 100%|██████████| 200/200 [1:02:51<00:00, 18.86s/it]


true_value_of_baseline_policy: 0.786


reward_std=7.0...: 100%|██████████| 200/200 [1:03:09<00:00, 18.95s/it]


true_value_of_baseline_policy: 0.817


reward_std=9.0...: 100%|██████████| 200/200 [1:03:41<00:00, 19.11s/it]
