In [1]:
# Development convenience: (re)load IPython's autoreload extension and set it to
# mode 2 so that edits to imported project modules are picked up without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import datetime
import time
import torch
import pandas as pd
from pandas import DataFrame
from sklearn.utils import check_random_state
from synthetic_time import SyntheticBanditWithTimeDataset
from utils import show_hyperparameters
import conf
from opl import OPL
from logging import getLogger
from pathlib import Path

In [3]:
# Notebook-level logger; record the working directory because every output path
# below is resolved relative to it.
logger = getLogger(__name__)
# `cwd` is a classmethod, so call it on the class rather than on a throwaway
# instance, and let the logging framework do the (lazy) string formatting.
logger.info("The current working directory is %s", Path.cwd())

# log path
# Results of this experiment are written under ./varying_target_time_data/df.
log_path = Path("./varying_target_time_data")
df_path = log_path / "df"
# Create the whole directory chain; a no-op when it already exists.
df_path.mkdir(exist_ok=True, parents=True)

INFO:__main__:The current working directory is /Users/s23599/document/research/Sony-Non-Stationary-OPE-OPL/icml2024-opfv-change-name/src/synthetic/F-OPL


In [4]:
# Experiment: learn policies from logged bandit data and measure their value on
# test data sampled from successive future time windows. For every window the
# inner loop repeats data generation + OPL `conf.n_seeds` times; all results are
# concatenated and written to `df_path / "result_df_data.csv"`.
start_time = time.time()

x = "time_at_evaluation"
xlabel = "target time (days later)"

NUM_DAYS_IN_ONE_CYCLE = 365

time_at_evaluation_list = []  # upper bound of each evaluation window (unix timestamp, int)
x_ticks_list_single = []      # the same upper bounds expressed as "days after conf.t_now"

# NOTE(review): the number of windows comes from `conf.num_time_at_evaluation`
# while the window width is derived from `conf.num_time_structure_for_logged_data`;
# the windows tile one cycle exactly only when the two agree -- confirm in conf.
for i in range(conf.num_time_at_evaluation):
    # Single source of truth for the offset, used for both the timestamp and the tick.
    days_later = (i + 1) * NUM_DAYS_IN_ONE_CYCLE // conf.num_time_structure_for_logged_data
    t_at_evaluation_datetime = datetime.datetime.fromtimestamp(conf.t_now) + datetime.timedelta(days=days_later)
    t_at_evaluation = int(datetime.datetime.timestamp(t_at_evaluation_datetime))
    time_at_evaluation_list.append(t_at_evaluation)
    x_ticks_list_single.append(days_later)

# Human-readable window labels "first~last" (in days); the first window starts at day 1.
x_ticks_list = []
for i, last_day in enumerate(x_ticks_list_single):
    first_day = 1 if i == 0 else x_ticks_list_single[i - 1] + 1
    x_ticks_list.append(f"{first_day}~{last_day}")

# Set seed
torch.manual_seed(conf.random_state)

# Show hyperparameters
show_hyperparameters(
    time_at_evaluation_start=None,
    time_at_evaluation_end=None,
    flag_show_time_at_evaluation=False,
    time_at_evaluation_list=time_at_evaluation_list,
)

result_df_list = []  # one long-format DataFrame per evaluation window
for i in tqdm(range(len(time_at_evaluation_list))):
    test_policy_value_list = []

    # The dataset is deliberately re-created for every window with identical
    # arguments: whether sampling mutates internal state of the dataset object
    # is not visible here, so construction is not hoisted out of the loop.
    dataset = SyntheticBanditWithTimeDataset(
        n_actions=conf.n_actions,  # Number of Actions |A|
        dim_context=conf.dim_context,  # Dimension of the context d_x
        n_users=conf.n_users,  # number of users
        t_oldest=conf.t_oldest,  # time when we start collecting the logged data
        t_now=conf.t_now,  # time when we finish collecting the logged data
        t_future=conf.t_future,  # Future time
        beta=conf.beta,  # optimality of the behavior policy
        reward_std=conf.reward_std,  # standard deviation of reward
        num_time_structure=conf.num_time_structure_for_logged_data,  # the true number of time structure for reward
        num_time_structure_for_context=conf.num_time_structure_for_context,
        lambda_ratio=conf.lambda_ratio,  # strength of the influence of the time structure for reward
        alpha_ratio=conf.alpha_ratio,  # strength of the influence of the time structure for context
        flag_simple_reward=conf.flag_simple_reward,  # if expected reward function is simple or not
        sample_non_stationary_context=False,  # if the context is non-stationary or not
        g_coef=conf.g_coef,  # parameter for generating g(x, phi(t), a)
        h_coef=conf.h_coef,  # parameter for generating h(x, t, a)
        p_1_coef=conf.p_1_coef,  # parameter for generating the part of non-stationary context affected by time structure for context
        p_2_coef=conf.p_2_coef,  # parameter for generating the part of non-stationary context not affected by time structure for context
        random_state=conf.random_state,  # random state
    )

    # Evaluation window: starts one second after the previous window's end
    # (or after the end of the logging period for the first window).
    if i == 0:
        time_at_evaluation_start = dataset.t_now + 1
    else:
        time_at_evaluation_start = time_at_evaluation_list[i - 1] + 1
    time_at_evaluation_end = time_at_evaluation_list[i]

    random_ = check_random_state(conf.random_state + i)

    # Sample the time at evaluation from given distribution (uniform)
    time_at_evaluation_vec = random_.uniform(time_at_evaluation_start, time_at_evaluation_end, size=conf.num_test).astype(int)

    ### test bandit data is used to approximate the ground-truth policy value
    dataset_test = dataset.obtain_batch_bandit_feedback(
        n_rounds=conf.num_test,
        evaluation_mode=True,
        time_at_evaluation_vec=time_at_evaluation_vec,
        random_state_for_sampling=conf.random_state + i,
    )

    # `seed` (not `_`): the value is actually used, and `_` is IPython's
    # "last output" variable, which should not be shadowed.
    for seed in tqdm(range(conf.n_seeds), desc=f"{x}={x_ticks_list[i]}"):
        ## generate training data
        dataset_train = dataset.obtain_batch_bandit_feedback(
            n_rounds=conf.num_train,
            evaluation_mode=False,
            random_state_for_sampling=seed,
        )

        true_value_of_learned_policies, pi_0_value = OPL(
            dataset=dataset,
            dataset_test=dataset_test,
            dataset_train=dataset_train,
            time_at_evaluation_start=time_at_evaluation_start,
            time_at_evaluation_end=time_at_evaluation_end,
            round=seed,
            flag_plot_loss=conf.flag_plot_loss,
            flag_plot_value=conf.flag_plot_value,
            num_time_structure_for_OPFV_reward=conf.num_true_time_structure_for_OPFV_reward,
            n_actions=conf.n_actions,
            dim_context=conf.dim_context,
            max_iter=conf.max_iter,
            batch_size=conf.batch_size,
            num_time_learn=conf.num_time_learn,
        )

        test_policy_value_list.append(true_value_of_learned_policies)

    ## summarize results
    # Long format: one row per (seed, method) with the learned policy's value.
    result_df = DataFrame(test_policy_value_list).stack().reset_index(1)\
        .rename(columns={"level_1": "method", 0: "value"})
    result_df[x] = x_ticks_list_single[i]
    # `pi_0_value` is whatever the LAST seed's OPL call returned for this window.
    # NOTE(review): presumably it is identical across seeds because it depends
    # only on the shared test data -- confirm against OPL.
    result_df["pi_0_value"] = pi_0_value
    result_df["rel_value"] = result_df["value"] / pi_0_value
    result_df_list.append(result_df)
result_df_data = pd.concat(result_df_list).reset_index(level=0)
result_df_data.to_csv(df_path / "result_df_data.csv")

end_time = time.time()
elapsed_time = end_time - start_time

print(f"execution time: {elapsed_time / 60} mins")


################# START hyperparameters #################
### About Seeds and Number of Samples ###
number of seeds = 20
number of training samples (n) = 8000
number of test samples = 10000

### About Time Structure ###
number of true time structures for reward (|C_r|) = 8
strength of time structure for reward (lambda) = 0.5

### About OPL ###
number of epochs = 25
batch size = 32
number of the samples of time when we learn a policy for each batch = 50

### About Prognosticator ###
list of time features for Prognosticator = [<function fourier_scalar at 0x15ac975e0>]
optimality of the data driven feature selection for Prognosticator = True
number of time features for Prognosticator = 3
list of the numbers of time features for Prognosticator = range(3, 8, 2)

### About Logged Data Collection Period and evaluation Period ###
time when we start collecting the logged data = 2022-01-01 00:00:00
time when we finish collecting the logged data = 2022-12-31 23:59:59
future time = 2024-01-01 00:0

  0%|          | 0/8 [00:00<?, ?it/s]

time_at_evaluation=1~45: 100%|██████████| 20/20 [7:57:55<00:00, 1433.76s/it]
time_at_evaluation=46~91: 100%|██████████| 20/20 [5:32:29<00:00, 997.49s/it]
time_at_evaluation=92~136: 100%|██████████| 20/20 [2:34:09<00:00, 462.49s/it]
time_at_evaluation=137~182: 100%|██████████| 20/20 [1:16:30<00:00, 229.55s/it]
time_at_evaluation=183~228: 100%|██████████| 20/20 [55:18<00:00, 165.92s/it]
time_at_evaluation=229~273: 100%|██████████| 20/20 [55:15<00:00, 165.76s/it]
time_at_evaluation=274~319: 100%|██████████| 20/20 [52:47<00:00, 158.39s/it]
time_at_evaluation=320~365: 100%|██████████| 20/20 [52:28<00:00, 157.41s/it]
100%|██████████| 8/8 [20:58:08<00:00, 9436.06s/it]  

execution time: 1258.1422306338945 mins



