In [1]:
# (Re)load IPython's autoreload extension and select mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import datetime
import time

import torch
import pandas as pd
from pandas import DataFrame
from sklearn.utils import check_random_state

from synthetic_time import SyntheticBanditWithTimeDataset
from utils import show_hyperparameters
import conf
from opl import OPL
from logging import getLogger
from pathlib import Path

In [3]:
# Notebook-level logger; the first record states the working directory, which
# is what the relative output paths below are resolved against.
logger = getLogger(__name__)
current_working_dir = Path.cwd()
logger.info(f"The current working directory is {current_working_dir}")

# Output locations: everything produced by this notebook goes under
# ./varying_lambda_data, with result tables in its "df" sub-directory.
log_path = Path("./varying_lambda_data")
df_path = log_path.joinpath("df")
# Create the directory tree on first run; a pre-existing tree is not an error.
df_path.mkdir(parents=True, exist_ok=True)

INFO:__main__:The current working directory is /Users/s23599/document/research/Sony-Non-Stationary-OPE-OPL/icml2024-opfv-change-name/src/synthetic/F-OPL


In [4]:
# Wall-clock start of the experiment; the report at the end of this cell
# subtracts this value to print the total execution time.
start_time = time.time()

# The swept variable. `x` is used both as the result-table column name and in
# the progress-bar description; `xlabel` / `xticklabels` are not used in this
# cell (presumably consumed by later plotting cells -- confirm).
x = "lambda_ratio"
xlabel = r"$\lambda$"
xticklabels = conf.lambda_ratio_list


# Seed torch's global RNG with the configured random state.
torch.manual_seed(conf.random_state)

# --- Evaluation (test) window ---
# Unix time at which the evaluation of a target policy starts.
time_at_evaluation_start = conf.time_at_evaluation_start

# Number of days in one cycle of the given time structure function \phi(t).
NUM_DAYS_IN_ONE_CYCLE = 365

# Unix time at which the evaluation of a target policy ends: the start time
# plus the configured number of whole cycles.
# NOTE: `fromtimestamp` yields a naive datetime in the machine's local time
# zone and `.timestamp()` converts it back using the same zone, so the window
# length is a whole number of local calendar days.
evaluation_window = datetime.timedelta(
    days=NUM_DAYS_IN_ONE_CYCLE * conf.num_cycles_in_evaluation_period
)
time_at_evaluation_end_datetime = (
    datetime.datetime.fromtimestamp(time_at_evaluation_start) + evaluation_window
)
time_at_evaluation_end = int(time_at_evaluation_end_datetime.timestamp())

# Print the experiment's hyperparameters, including the evaluation window.
show_hyperparameters(
    time_at_evaluation_start=time_at_evaluation_start,
    time_at_evaluation_end=time_at_evaluation_end,
    flag_show_time_at_evaluation=True,
    time_at_evaluation_list=None,
)

# Sweep over lambda_ratio. For every value: build the synthetic environment,
# draw one fixed test set, then repeat "sample training data -> run OPL" for
# each seed and collect the resulting policy values into a long-format table.

# `pi_0_value` is only bound inside the seed loop, so at least one seed is
# required for the summary below; fail early with a clear message otherwise.
if conf.n_seeds < 1:
    raise ValueError("conf.n_seeds must be at least 1 to run the experiment")

result_df_list = []
for lambda_ratio in conf.lambda_ratio_list:
    # One entry per seed: the value(s) returned by OPL for the learned policies.
    test_policy_value_list = []

    dataset = SyntheticBanditWithTimeDataset(
        n_actions=conf.n_actions,  # number of actions |A|
        dim_context=conf.dim_context,  # dimension of the context d_x
        n_users=conf.n_users,  # number of users
        t_oldest=conf.t_oldest,  # time when we start collecting the logged data
        t_now=conf.t_now,  # time when we finish collecting the logged data
        t_future=conf.t_future,  # future time
        beta=conf.beta,  # optimality of the behavior policy
        reward_std=conf.reward_std,  # standard deviation of reward
        num_time_structure=conf.num_time_structure_for_logged_data,  # the true number of time structures for reward
        num_time_structure_for_context=conf.num_time_structure_for_context,
        lambda_ratio=lambda_ratio,  # strength of the influence of the time structure for reward
        alpha_ratio=conf.alpha_ratio,  # strength of the influence of the time structure for context
        flag_simple_reward=conf.flag_simple_reward,  # whether the expected reward function is simple
        sample_non_stationary_context=False,  # whether the context is non-stationary
        g_coef=conf.g_coef,  # parameter for generating g(x, phi(t), a)
        h_coef=conf.h_coef,  # parameter for generating h(x, t, a)
        p_1_coef=conf.p_1_coef,  # parameter for the part of the non-stationary context affected by the time structure for context
        p_2_coef=conf.p_2_coef,  # parameter for the part of the non-stationary context not affected by the time structure for context
        random_state=conf.random_state,  # random state
    )

    # Sample the evaluation times uniformly over the evaluation window. The
    # generator is re-created from the same configured state on every
    # iteration of the sweep.
    random_ = check_random_state(conf.random_state)
    time_at_evaluation_vec = random_.uniform(
        time_at_evaluation_start, time_at_evaluation_end, size=conf.num_test
    ).astype(int)

    # Test bandit data, used to approximate the ground-truth policy value.
    dataset_test = dataset.obtain_batch_bandit_feedback(
        n_rounds=conf.num_test,
        evaluation_mode=True,
        time_at_evaluation_vec=time_at_evaluation_vec,
        random_state_for_sampling=conf.random_state,
    )

    # The loop index doubles as the sampling seed and the OPL round number, so
    # it gets a real name (`_` would also shadow IPython's last-output variable).
    for seed in tqdm(range(conf.n_seeds), desc=f"{x}={lambda_ratio}"):
        # Generate training data for this seed.
        dataset_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=conf.num_train,
            evaluation_mode=False,
            random_state_for_sampling=seed,
        )

        true_value_of_learned_policies, pi_0_value = OPL(
            dataset=dataset,
            dataset_test=dataset_test,
            dataset_train=dataset_train,
            time_at_evaluation_start=time_at_evaluation_start,
            time_at_evaluation_end=time_at_evaluation_end,
            round=seed,
            flag_plot_loss=conf.flag_plot_loss,
            flag_plot_value=conf.flag_plot_value,
            num_time_structure_for_OPFV_reward=conf.num_true_time_structure_for_OPFV_reward,
            max_iter=conf.max_iter,
            batch_size=conf.batch_size,
            num_time_learn=conf.num_time_learn,
        )

        test_policy_value_list.append(true_value_of_learned_policies)

    # Summarize results: one row per (seed, method) with the policy value.
    # NOTE(review): `pi_0_value` here is the one returned for the LAST seed;
    # this assumes it does not depend on the training seed -- confirm in OPL.
    result_df = (
        DataFrame(test_policy_value_list)
        .stack()
        .reset_index(1)
        .rename(columns={"level_1": "method", 0: "value"})
    )
    result_df[x] = lambda_ratio
    result_df["pi_0_value"] = pi_0_value
    result_df["rel_value"] = result_df["value"] / pi_0_value
    result_df_list.append(result_df)

# Combine all lambda_ratio settings; the per-seed row index becomes a regular
# column before the table is written to disk.
result_df_data = pd.concat(result_df_list).reset_index(level=0)
result_df_data.to_csv(df_path / "result_df_data.csv")

# Report how long the whole experiment cell took, in minutes of wall-clock time.
end_time = time.time()
elapsed_time = end_time - start_time
elapsed_minutes = elapsed_time / 60

print(f"execution time: {elapsed_minutes} mins")


################# START hyperparameters #################
### About Seeds and Number of Samples ###
number of seeds = 20
number of training samples (n) = 8000
number of test samples = 10000

### About Time Structure ###
number of true time structures for reward (|C_r|) = 8
strength of time structure for reward (lambda) = 0.5

### About OPL ###
number of epochs = 25
batch size = 32
number of the samples of time when we learn a policy for each batch = 50

### About Prognosticator ###
list of time features for Prognosticator = [<function fourier_scalar at 0x298fe65e0>]
optimality of the data driven feature selection for Prognosticator = True
number of time features for Prognosticator = 3
list of the numbers of time features for Prognosticator = range(3, 8, 2)

### About Logged Data Collection Period and evaluation Period ###
time when we start collecting the logged data = 2022-01-01 00:00:00
time when we finish collecting the logged data = 2022-12-31 23:59:59
time when we start evaluation

lambda_ratio=0.0: 100%|██████████| 20/20 [8:05:30<00:00, 1456.51s/it]  
lambda_ratio=0.2: 100%|██████████| 20/20 [5:36:51<00:00, 1010.57s/it]  
lambda_ratio=0.4: 100%|██████████| 20/20 [2:33:41<00:00, 461.10s/it]  
lambda_ratio=0.6: 100%|██████████| 20/20 [1:16:22<00:00, 229.14s/it]
lambda_ratio=0.8: 100%|██████████| 20/20 [59:36<00:00, 178.80s/it]
lambda_ratio=1.0: 100%|██████████| 20/20 [1:00:20<00:00, 181.03s/it]

execution time: 1173.4567830522856 mins



