In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import datetime
import time
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.utils import check_random_state
from tqdm import tqdm

import conf
from synthetic_time import (
    SyntheticBanditWithTimeDataset, 
    SECONDS_PER_DAY
)
from policy import gen_eps_greedy
from ope import run_ope
from utils import show_hyperparameters
from logging import getLogger
from pathlib import Path

In [3]:
# Module-level logger; the working directory is logged so relative output paths can be traced.
logger = getLogger(__name__)
logger.info(f"The current working directory is {Path().cwd()}")

# Output layout: result tables are stored under <log_path>/df (created when missing).
log_path = Path("./varying_n_trains_data")
df_path = log_path.joinpath("df")
df_path.mkdir(parents=True, exist_ok=True)

INFO:__main__:The current working directory is c:\Users\taish\kdd2025-opfv\src\synthetic\F-OPE


In [4]:
start_time = time.time()

# Unix time at which the evaluation of a target policy starts
time_at_evaluation_start = conf.time_at_evaluation
# Number of days in one cycle of the given time structure function \phi(t)
NUM_DAYS_IN_ONE_CYCLE = 365
# Unix time at which the evaluation ends: start + (whole cycles) - 1 second
time_at_evaluation_end_datetime = (
    datetime.datetime.fromtimestamp(time_at_evaluation_start)
    + datetime.timedelta(days=NUM_DAYS_IN_ONE_CYCLE * conf.num_cycles_in_evaluation_period)
    - datetime.timedelta(seconds=1)
)
time_at_evaluation_end = int(time_at_evaluation_end_datetime.timestamp())

# Show hyperparameters
show_hyperparameters(
    time_at_evaluation_start=time_at_evaluation_start,
    time_at_evaluation_end=time_at_evaluation_end,
    flag_show_time_at_evaluation=True,
)

# `x` names the result column holding the sample size; `xlabel` is reused in the
# progress-bar text; `xticklabels` is presumably consumed by later plotting code.
x, xlabel = "n_rounds", "number of samples in the logged data"
xticklabels = conf.n_rounds_list

# One result DataFrame per (h, n_rounds, s) combination is appended to this list
result_df_list = list()

# Main experiment: for every outer replicate `h`, logged-data size `n_rounds`, and
# sampled evaluation time (index `s`), run the OPE estimators `conf.n_seeds` times
# and record squared error / squared bias / variance terms per estimator.
for h in range(conf.n_seeds_all):
    for n_rounds in conf.n_rounds_list:

        # Build the synthetic environment; its seed depends only on the outer replicate h
        dataset = SyntheticBanditWithTimeDataset(
            n_actions=conf.n_actions,
            dim_context=conf.dim_context,
            n_users=conf.n_users,
            t_oldest=conf.t_oldest,
            t_now=conf.t_now,
            t_future=conf.t_future,
            beta=conf.beta,
            reward_std=conf.reward_std,
            num_time_structure=conf.num_time_structure_for_logged_data,
            lambda_ratio=conf.lambda_ratio,
            flag_simple_reward=conf.flag_simple_reward,
            g_coef=conf.g_coef,
            h_coef=conf.h_coef,
            random_state=conf.random_state + h * 10,
        )

        for s in range(conf.n_seeds_for_time_eval_sampling):

            # Handed to run_ope on every inner iteration; it is expected to append
            # one record of estimates per call (the list is turned into a frame below).
            estimated_policy_value_list = []

            # Obtain random state
            random_ = check_random_state(s + h * 10)
            # Sample the time at evaluation from the given distribution (uniform).
            # Note: this is a length-1 integer array, not a Python int.
            time_at_evaluation = random_.uniform(
                time_at_evaluation_start, time_at_evaluation_end, size=1
            ).astype(int)

            # Test bandit data is used to approximate the ground-truth policy value
            test_bandit_data = dataset.obtain_batch_bandit_feedback(
                n_rounds=conf.num_test,
                evaluation_mode=True,
                time_at_evaluation=time_at_evaluation,
                random_state_for_sampling=s + h * 10,
            )

            # Generate an evaluation policy via the epsilon-greedy rule
            action_dist_test = gen_eps_greedy(
                expected_reward=test_bandit_data["expected_reward"],
                is_optimal=True,
                eps=conf.eps,
            )

            # Actual (ground-truth) policy value
            policy_value = dataset.calc_ground_truth_policy_value(
                expected_reward=test_bandit_data["expected_reward"],
                action_dist=action_dist_test,
            )

            # A named loop variable is used instead of `_`, because the index feeds the seeds.
            for seed_idx in tqdm(
                range(conf.n_seeds),
                desc=f"h = {h}, {xlabel} = {n_rounds}, n_seeds_for_time_eval_sampling = {s}",
            ):

                # Generate validation (logged) data.
                # NOTE(review): `seed_idx + s * 10` yields the same seed for different
                # (s, seed_idx) pairs once conf.n_seeds exceeds 10 (e.g. (0, 10) and (1, 0)).
                # The formula is kept unchanged so existing results stay reproducible.
                val_bandit_data = dataset.obtain_batch_bandit_feedback(
                    n_rounds=n_rounds,
                    evaluation_mode=False,
                    random_state_for_sampling=seed_idx + s * 10 + n_rounds + h * 100,
                )

                # Make decisions on validation data
                action_dist_val = gen_eps_greedy(
                    expected_reward=val_bandit_data["expected_reward"],
                    is_optimal=True,
                    eps=conf.eps,
                )

                # Whole days elapsed between the end of the logged data and the evaluation time
                days_after_logged_data = (time_at_evaluation - dataset.t_now) // SECONDS_PER_DAY

                # Length in days of one time-structure segment within a cycle
                days_per_time_structure = NUM_DAYS_IN_ONE_CYCLE / dataset.num_time_structure

                # Number of time-structure segments spanned from t_now to the evaluation time
                num_time_structure_from_t_now_to_time_at_evaluation = np.ceil(
                    days_after_logged_data / days_per_time_structure
                ).astype(int)

                run_ope(
                    dataset=dataset,
                    round=seed_idx + s * 10 + h * 100,
                    time_at_evaluation=time_at_evaluation,
                    estimated_policy_value_list=estimated_policy_value_list,
                    val_bandit_data=val_bandit_data,
                    action_dist_val=action_dist_val,
                    num_true_time_structure_for_OPFV_reward=conf.num_true_time_structure_for_OPFV_reward,
                    num_true_time_structure_for_OPFV_for_context=None,
                    num_episodes_for_Prognosticator=conf.num_episodes_for_Prognosticator,
                    num_time_structure_from_t_now_to_time_at_evaluation=num_time_structure_from_t_now_to_time_at_evaluation,
                    eps=conf.eps,
                    true_policy_value=policy_value,
                    flag_Prognosticator_optimality=conf.flag_Prognosticator_optimality,
                    num_features_for_Prognosticator_list=conf.num_features_for_Prognosticator_list,
                    flag_include_DM=conf.flag_include_DM,
                    flag_calculate_data_driven_OPFV=conf.flag_calculate_data_driven_OPFV,
                    candidate_num_time_structure_list=conf.candidate_num_time_structure_list,
                )

            # Long format: one row per (run, estimator) with columns "est" and "value"
            result_df = (
                DataFrame(DataFrame(estimated_policy_value_list).stack())
                .reset_index(1)
                .rename(columns={"level_1": "est", 0: "value"})
            )
            result_df[x] = n_rounds
            result_df["se"] = (result_df.value - policy_value) ** 2
            # Initialise as float: the columns receive float arrays below, and writing
            # floats into an integer column is deprecated silent upcasting in pandas.
            result_df["bias"] = 0.0
            result_df["variance"] = 0.0
            # Per-estimator mean of the estimates (the ground-truth rows "V_t" are excluded)
            sample_mean = DataFrame(
                result_df[result_df["est"] != "V_t"].groupby(["est"]).mean().value
            ).reset_index()
            for est_ in sample_mean["est"]:
                estimates = result_df.loc[result_df["est"] == est_, "value"].values
                mean_estimates = sample_mean.loc[sample_mean["est"] == est_, "value"].values
                # Broadcast the single mean to the length of `estimates`
                mean_estimates = np.ones_like(estimates) * mean_estimates
                result_df.loc[result_df["est"] == est_, "bias"] = (
                    policy_value - mean_estimates
                ) ** 2
                result_df.loc[result_df["est"] == est_, "variance"] = (
                    estimates - mean_estimates
                ) ** 2
            result_df_list.append(result_df)


# Aggregate all results and persist them
result_df = pd.concat(result_df_list).reset_index(level=0)
result_df.to_csv(df_path / "result_df.csv")

end_time = time.time()
elapsed_time = end_time - start_time

print(f"execution time: {elapsed_time / 60} mins")

################# START hyperparameters #################
### About Seeds and Number of Samples ###
number of seeds = 20
number of seeds for time at evaluation = 20
number of training samples (n) = 1000
number of test samples = 10000

### About Time Structure ###
number of true time structures for reward (|C_r|) = 8
strength of time structure for reward (lambda) = 0.5

### About Prognosticator ###
list of time features for Prognosticator = [<function fourier_scalar at 0x0000018774942A60>]
optimality of the data driven feature selection for Prognosticator = True
number of time features for Prognosticator = 3
list of the numbers of time features for Prognosticator = range(3, 8, 2)

### About Logged Data Collection Period and Evaluation Period ###
time when we start collecting the logged data = 2022-01-01 00:00:00
time when we finish collecting the logged data = 2022-12-31 23:59:59
time when we start evaluating a target policy = 2023-01-01 00:00:00
time when we finish evaluating a target 

  g1 = mu3 / np.power(mu2, 1.5)
  g1 = mu3 / np.power(mu2, 1.5)
  g1 = mu3 / np.power(mu2, 1.5)
  g1 = mu3 / np.power(mu2, 1.5)
  g1 = mu3 / np.power(mu2, 1.5)
  g1 = mu3 / np.power(mu2, 1.5)
  g1 = mu3 / np.power(mu2, 1.5)
h = 0, number of samples in the logged data = 500, n_seeds_for_time_eval_sampling = 0:  30%|███       | 6/20 [00:07<00:16,  1.20s/it]


KeyboardInterrupt: 

In [None]:
# ================== Imports ==================
import time, datetime
from pathlib import Path

import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from sklearn.utils import check_random_state

import conf
from utils import show_hyperparameters
# Existing (unmasked) epsilon-greedy policy. The original line ended with a trailing
# comma, which is a SyntaxError for a non-parenthesised `from ... import`.
from policy import gen_eps_greedy
# Use the masked variant when it is available; otherwise the name is set to None
# and downstream code falls back to `gen_eps_greedy`.
try:
    from policy import gen_eps_greedy_masked
except Exception as exc:
    print("gen_eps_greedy_masked is unavailable; using gen_eps_greedy. Reason:", repr(exc))
    gen_eps_greedy_masked = None

# Datasets: existing & dynamic
from synthetic_time import SyntheticBanditWithTimeDataset
try:
    # The new dynamic-action class (adjust the import to match the file layout)
    from synthetic_time_dynamic import DynamicActionBanditWithTime  # example location
except Exception:
    try:
        # Picked up from here when the class is defined inside synthetic_time
        from synthetic_time import DynamicActionBanditWithTime
    except Exception as exc:
        print("DynamicActionBanditWithTime is unavailable; using the static dataset. Reason:", repr(exc))
        DynamicActionBanditWithTime = None  # marker for the fallback path

# Runner: prefer the custom (dynamic-action) runner, then the existing one
try:
    from ope_dynamic import run_ope_masked as run_ope_fn
except Exception as exc:
    print("Falling back to standard run_ope. Reason:", exc)
    from ope import run_ope as run_ope_fn  # fall back to the existing runner

[autoreload of policy failed: Traceback (most recent call last):
  File "c:\Users\taish\anaconda3\envs\cfml\lib\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "c:\Users\taish\anaconda3\envs\cfml\lib\site-packages\IPython\extensions\autoreload.py", line 475, in superreload
    module = reload(module)
  File "c:\Users\taish\anaconda3\envs\cfml\lib\importlib\__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 613, in _exec
  File "<frozen importlib._bootstrap_external>", line 850, in exec_module
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "c:\Users\taish\kdd2025-opfv\src\synthetic\F-OPE\policy.py", line 38, in <module>
    available_actions: np.ndarray | None = None,
TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'
]
[autoreload of utils failed: Traceback (most recent call last):
  File "c:\Users

In [18]:
# Runner selection: prefer the custom (dynamic-action) runner, then the existing one
import importlib

try:
    import ope_dynamic                      # module name must match the file name
    importlib.reload(ope_dynamic)           # pick up edits made while the notebook is running
    from ope_dynamic import run_ope_masked as run_ope_fn
except Exception as exc:
    print("Falling back to standard run_ope. Reason:", exc)
    from ope import run_ope as run_ope_fn       # existing runner


In [None]:
# ================== Configs & helpers ==================
SECONDS_PER_DAY = 24 * 60 * 60
NUM_DAYS_IN_ONE_CYCLE = 365  # length in days of one cycle of φ(t); replace as needed

# Directory that receives the result CSV (created when missing)
df_path = Path("./results")
df_path.mkdir(exist_ok=True, parents=True)

# Helper that builds the dataset (dynamic-action variant when enabled)
def build_dataset(seed_offset: int = 0):
    """Create the bandit dataset for one replicate.

    Returns a `DynamicActionBanditWithTime` when `conf.use_dynamic_action_dataset`
    is truthy and that class was imported successfully; otherwise returns the
    baseline `SyntheticBanditWithTimeDataset`.

    Parameters
    ----------
    seed_offset : int
        Added to `conf.random_state` to derive the dataset's random state.
    """
    wants_dynamic = (
        getattr(conf, "use_dynamic_action_dataset", False)
        and DynamicActionBanditWithTime is not None
    )

    # Constructor arguments that both dataset classes receive
    init_kwargs = {
        "n_actions": conf.n_actions,
        "dim_context": conf.dim_context,
        "n_users": conf.n_users,
        "t_oldest": conf.t_oldest,
        "t_now": conf.t_now,
        "t_future": conf.t_future,
        "beta": conf.beta,
        "reward_std": conf.reward_std,
        "num_time_structure": conf.num_time_structure_for_logged_data,
        "lambda_ratio": conf.lambda_ratio,
        "flag_simple_reward": conf.flag_simple_reward,
        "g_coef": conf.g_coef,
        "h_coef": conf.h_coef,
        "random_state": conf.random_state + seed_offset,
    }

    if not wants_dynamic:
        # Existing baseline dataset (time-aware, fixed action set)
        return SyntheticBanditWithTimeDataset(**init_kwargs)

    # Availability specification (uses the extra parameters defined in conf)
    if getattr(conf, "use_availability_func", False):
        init_kwargs["availability_func"] = conf.availability_func_weekly_stair
    else:
        # Forward birth/death times from conf when they are defined
        if hasattr(conf, "action_birth_time"):
            init_kwargs["action_birth_time"] = conf.action_birth_time
        if getattr(conf, "action_death_time", None) is not None:
            init_kwargs["action_death_time"] = conf.action_death_time

    return DynamicActionBanditWithTime(**init_kwargs)

# Safely obtain the action-availability mask
def get_avail_from_feedback_or_dataset(dataset, times_vec, feedback=None, fallback_shape=None):
    """Return a boolean availability mask, trying three sources in order.

    1. `feedback["available_actions"]` when the feedback dict carries it.
    2. `dataset._availability(times_vec)` when the dataset exposes that callable.
    3. An all-True array of `fallback_shape` (defaulting to the shape of
       `feedback["expected_reward"]` when that is present).

    Raises
    ------
    ValueError
        If none of the sources applies and no shape can be determined.
    """
    has_feedback = feedback is not None
    if has_feedback and "available_actions" in feedback:
        return feedback["available_actions"].astype(bool)

    # Use the dataset's own availability function when there is one
    availability_fn = getattr(dataset, "_availability", None)
    if callable(availability_fn):
        return availability_fn(times_vec).astype(bool)

    # All-True fallback
    shape = fallback_shape
    if shape is None:
        if has_feedback and "expected_reward" in feedback:
            shape = feedback["expected_reward"].shape
        else:
            raise ValueError("fallback_shape is required when neither feedback nor dataset provide availability.")
    return np.ones(shape, dtype=bool)


# ================== Experiment ==================
# Full experiment for the (optionally dynamic-action) setting: for every outer replicate,
# logged-data size and sampled evaluation time, run the selected OPE runner conf.n_seeds
# times, then derive squared error / squared bias / variance terms and save them as CSV.
start_time = time.time()

# Start and end of the future evaluation period
time_at_evaluation_start = conf.time_at_evaluation
time_at_evaluation_end_datetime = datetime.datetime.fromtimestamp(time_at_evaluation_start) \
    + datetime.timedelta(days=NUM_DAYS_IN_ONE_CYCLE * conf.num_cycles_in_evaluation_period) \
    - datetime.timedelta(seconds=1)
time_at_evaluation_end = int(datetime.datetime.timestamp(time_at_evaluation_end_datetime))

# Show hyperparameters
show_hyperparameters(
    time_at_evaluation_start=time_at_evaluation_start,
    time_at_evaluation_end=time_at_evaluation_end,
    flag_show_time_at_evaluation=True,
)

# `x` names the result column that stores the sample size; `xlabel` is reused in the
# progress-bar description below.
x = "n_rounds"
xlabel = "number of samples in the logged data"
xticklabels = conf.n_rounds_list

# One result DataFrame per (h, n_rounds, s) combination is appended here
result_df_list = []

for h in range(conf.n_seeds_all):
    for n_rounds in conf.n_rounds_list:

        # Dataset (the seed is shifted per outer replicate)
        dataset = build_dataset(seed_offset=h * 10)

        for s in range(conf.n_seeds_for_time_eval_sampling):

            # Passed to the runner on every inner iteration; converted to a frame below
            estimated_policy_value_list = []

            # Sample the evaluation time t' uniformly
            random_ = check_random_state(s + h * 10)
            time_at_evaluation = random_.uniform(
                time_at_evaluation_start, time_at_evaluation_end, size=1
            ).astype(int)
            time_at_evaluation_int = int(time_at_evaluation.item())  # cast to a plain int for safety

            # === Test data used to approximate the ground truth (all rounds at t') ===
            test_bandit_data = dataset.obtain_batch_bandit_feedback(
                n_rounds=conf.num_test,
                evaluation_mode=True,
                time_at_evaluation=time_at_evaluation_int,
                random_state_for_sampling=s + h * 10,
            )

            # Availability mask at the future time (provided by the dynamic environment
            # when present; otherwise an all-True mask of the expected-reward shape)
            avail_test = get_avail_from_feedback_or_dataset(
                dataset=dataset,
                times_vec=np.full(conf.num_test, time_at_evaluation_int, dtype=int),
                feedback=test_bandit_data,
                fallback_shape=test_bandit_data["expected_reward"].shape,
            )

            # Evaluation policy at the future time t' — the masked variant is preferred when usable
            if gen_eps_greedy_masked is not None and getattr(conf, "use_masked_policy", True):
                action_dist_test = gen_eps_greedy_masked(
                    expected_reward=test_bandit_data["expected_reward"],
                    eps=conf.eps,
                    is_optimal=True,
                    available_actions=avail_test,
                )
            else:
                action_dist_test = gen_eps_greedy(
                    expected_reward=test_bandit_data["expected_reward"],
                    is_optimal=True,
                    eps=conf.eps,
                )

            # Ground truth V_t (the mask is forwarded when the dataset accepts it).
            # NOTE(review): `except TypeError` also swallows a TypeError raised inside
            # calc_ground_truth_policy_value itself, not only an unexpected-keyword error.
            try:
                policy_value = dataset.calc_ground_truth_policy_value(
                    expected_reward=test_bandit_data["expected_reward"],
                    action_dist=action_dist_test,
                    available_actions=avail_test,  # passed when the implementation accepts it
                )
            except TypeError:
                # Fallback: per-round weighted average of expected rewards, then the mean.
                # assumes expected_reward and action_dist have the same shape — TODO confirm
                policy_value = np.average(
                    test_bandit_data["expected_reward"],
                    weights=action_dist_test,
                    axis=1,
                ).mean()

            # === Repeat the estimation over several seeds ===
            for _ in tqdm(range(conf.n_seeds), desc=f"h = {h}, {xlabel} = {n_rounds}, n_seeds_for_time_eval_sampling = {s}"):

                # Validation data (logged data).
                # NOTE(review): `_ + s * 10` produces the same seed for different (s, _) pairs
                # once conf.n_seeds exceeds 10 (e.g. (0, 10) and (1, 0)) — confirm this is intended.
                val_bandit_data = dataset.obtain_batch_bandit_feedback(
                    n_rounds=n_rounds,
                    evaluation_mode=False,
                    random_state_for_sampling=_ + s * 10 + n_rounds + h * 100,
                )

                # Evaluation policy on the validation data (unmasked, same as the existing code)
                action_dist_val = gen_eps_greedy(
                    expected_reward=val_bandit_data["expected_reward"],
                    is_optimal=True,
                    eps=conf.eps,
                )

                # Number of φ segments spanned between t_now and t'
                days_after_logged_data = (time_at_evaluation_int - dataset.t_now) // SECONDS_PER_DAY
                days_per_time_structure = NUM_DAYS_IN_ONE_CYCLE / dataset.num_time_structure
                num_time_structure_from_t_now_to_time_at_evaluation = int(
                    np.ceil(days_after_logged_data / days_per_time_structure)
                )

                # Runner call (run_ope_fn is the custom masked runner when it was importable,
                # otherwise the existing run_ope)
                run_ope_fn(
                    dataset=dataset,
                    round=_ + s * 10 + h * 100,
                    time_at_evaluation=time_at_evaluation_int,
                    estimated_policy_value_list=estimated_policy_value_list,
                    val_bandit_data=val_bandit_data,
                    action_dist_val=action_dist_val,
                    num_true_time_structure_for_OPFV_reward=conf.num_true_time_structure_for_OPFV_reward,
                    num_true_time_structure_for_OPFV_for_context=None,
                    num_episodes_for_Prognosticator=conf.num_episodes_for_Prognosticator,
                    num_time_structure_from_t_now_to_time_at_evaluation=num_time_structure_from_t_now_to_time_at_evaluation,
                    eps=conf.eps,
                    true_policy_value=policy_value,
                    flag_Prognosticator_optimality=conf.flag_Prognosticator_optimality,
                    num_features_for_Prognosticator_list=conf.num_features_for_Prognosticator_list,
                    flag_include_DM=conf.flag_include_DM,
                    flag_calculate_data_driven_OPFV=conf.flag_calculate_data_driven_OPFV,
                    candidate_num_time_structure_list=conf.candidate_num_time_structure_list,
                )

            # Turn the estimates into a long-format DataFrame: one row per (run, estimator)
            result_df = (
                DataFrame(DataFrame(estimated_policy_value_list).stack())
                .reset_index(1)
                .rename(columns={"level_1": "est", 0: "value"})
            )
            result_df[x] = n_rounds
            result_df["se"] = (result_df.value - policy_value) ** 2
            # Float initialisation so the float assignments below keep the column dtype
            result_df["bias"] = 0.0
            result_df["variance"] = 0.0

            # Per-estimator mean of the estimates (ground-truth rows "V_t" are excluded)
            sample_mean = (
                DataFrame(result_df[result_df["est"] != "V_t"]
                          .groupby(["est"])
                          .mean()
                          .value)
                .reset_index()
            )
            for est_ in sample_mean["est"]:
                estimates = result_df.loc[result_df["est"] == est_, "value"].values
                mean_est = sample_mean.loc[sample_mean["est"] == est_, "value"].values[0]
                # Repeat the scalar mean to the length of `estimates`
                mean_vec = np.ones_like(estimates) * mean_est
                result_df.loc[result_df["est"] == est_, "bias"] = (policy_value - mean_vec) ** 2
                result_df.loc[result_df["est"] == est_, "variance"] = (estimates - mean_vec) ** 2

            result_df_list.append(result_df)

# Aggregate & save
result_df = pd.concat(result_df_list).reset_index(level=0)
result_df.to_csv(df_path / "result_df.csv", index=False)

elapsed_time = time.time() - start_time
print(f"execution time: {elapsed_time / 60:.2f} mins")
print(f"Saved: {df_path / 'result_df.csv'}")

In [12]:
import inspect


In [19]:
# ===== Tiny configuration (conf itself is not overwritten; plain notebook variables are used) =====
SECONDS_PER_DAY = 24 * 60 * 60
NUM_DAYS_IN_ONE_CYCLE = 365

# Settings for a single run
n_rounds_min, num_test_min = 200, 1000
n_inner_seeds = 1     # counterpart of conf.n_seeds in the existing code
seed_offset = 0

# Start and end of the future evaluation period
time_at_evaluation_start = conf.time_at_evaluation
time_at_evaluation_end_datetime = (
    datetime.datetime.fromtimestamp(time_at_evaluation_start)
    + datetime.timedelta(days=NUM_DAYS_IN_ONE_CYCLE * conf.num_cycles_in_evaluation_period)
    - datetime.timedelta(seconds=1)
)
time_at_evaluation_end = int(time_at_evaluation_end_datetime.timestamp())

show_hyperparameters(
    time_at_evaluation_start=time_at_evaluation_start,
    time_at_evaluation_end=time_at_evaluation_end,
    flag_show_time_at_evaluation=True,
)

# ===== Helpers =====
def build_dataset(seed_offset: int = 0):
    """Create the bandit dataset, adapting the arguments to the dynamic class's signature.

    When `conf.use_dynamic_action_dataset` is truthy and `DynamicActionBanditWithTime`
    was imported, candidate constructor arguments are collected from `conf`, entries
    whose value is None are removed, and only arguments accepted by the class's
    `__init__` are passed. Otherwise the baseline `SyntheticBanditWithTimeDataset`
    is returned.

    Parameters
    ----------
    seed_offset : int
        Added to `conf.random_state` to derive the dataset's random state.
    """
    if getattr(conf, "use_dynamic_action_dataset", False) and DynamicActionBanditWithTime is not None:
        # Candidate arguments (None values are removed below).
        # flag_simple_reward / g_coef / h_coef are candidates too, so a dynamic class
        # that accepts them gets the same reward configuration as the baseline dataset;
        # they are dropped by the signature filter when the class does not accept them.
        cand = dict(
            n_actions=conf.n_actions,
            dim_context=conf.dim_context,
            n_users=conf.n_users,
            t_oldest=conf.t_oldest,
            t_now=conf.t_now,
            t_future=conf.t_future,
            beta=conf.beta,
            reward_std=conf.reward_std,
            num_time_structure=getattr(conf, "num_time_structure_for_logged_data", None),
            lambda_ratio=getattr(conf, "lambda_ratio", None),
            flag_simple_reward=getattr(conf, "flag_simple_reward", None),
            g_coef=getattr(conf, "g_coef", None),
            h_coef=getattr(conf, "h_coef", None),
            random_state=conf.random_state + seed_offset,
        )

        # Availability: the function takes priority; otherwise birth/death times
        if getattr(conf, "use_availability_func", False) and hasattr(conf, "availability_func_weekly_stair"):
            cand["availability_func"] = conf.availability_func_weekly_stair
        else:
            if hasattr(conf, "action_birth_time"):
                cand["action_birth_time"] = conf.action_birth_time
            if hasattr(conf, "action_death_time") and conf.action_death_time is not None:
                cand["action_death_time"] = conf.action_death_time

        # Remove None values
        cand = {k: v for k, v in cand.items() if v is not None}

        # Keep only keys present in the __init__ signature. If __init__ declares
        # **kwargs, every candidate is acceptable and nothing may be filtered out.
        params = inspect.signature(DynamicActionBanditWithTime.__init__).parameters
        accepts_any_keyword = any(
            p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
        )
        if accepts_any_keyword:
            kwargs = cand
        else:
            kwargs = {k: v for k, v in cand.items() if k in params}

        return DynamicActionBanditWithTime(**kwargs)

    # Fallback: the existing synthetic dataset
    return SyntheticBanditWithTimeDataset(
        n_actions=conf.n_actions,
        dim_context=conf.dim_context,
        n_users=conf.n_users,
        t_oldest=conf.t_oldest,
        t_now=conf.t_now,
        t_future=conf.t_future,
        beta=conf.beta,
        reward_std=conf.reward_std,
        num_time_structure=conf.num_time_structure_for_logged_data,
        lambda_ratio=conf.lambda_ratio,
        flag_simple_reward=conf.flag_simple_reward,
        g_coef=conf.g_coef,
        h_coef=conf.h_coef,
        random_state=conf.random_state + seed_offset,
    )

def get_avail(dataset, times_vec, feedback=None, fallback_shape=None):
    """Safely obtain a boolean action-availability mask.

    Sources are tried in order: `feedback["available_actions"]`, then
    `dataset._availability(times_vec)`, then an all-True array of
    `fallback_shape` (or of `feedback["expected_reward"].shape` when no shape
    is given). Raises ValueError when no shape can be determined.
    """
    feedback_given = feedback is not None
    if feedback_given and "available_actions" in feedback:
        return feedback["available_actions"].astype(bool)

    dataset_mask_fn = getattr(dataset, "_availability", None)
    if callable(dataset_mask_fn):
        return dataset_mask_fn(times_vec).astype(bool)

    mask_shape = fallback_shape
    if mask_shape is None:
        if feedback_given and "expected_reward" in feedback:
            mask_shape = feedback["expected_reward"].shape
        else:
            raise ValueError("fallback_shape を指定してください。")
    return np.ones(mask_shape, dtype=bool)


# ===== Experiment (minimal) =====
# Smoke test of the pipeline: one dataset, one sampled evaluation time, one small logged
# data set, and n_inner_seeds runner calls; prints a small table of estimates.
start_time = time.time()
dataset = build_dataset(seed_offset=seed_offset)

# Sample a single future evaluation time
rng = check_random_state(42)
t_eval = int(rng.uniform(time_at_evaluation_start, time_at_evaluation_end))

# --- Test data used to approximate the ground truth (all rounds at t_eval) ---
test = dataset.obtain_batch_bandit_feedback(
    n_rounds=num_test_min,
    evaluation_mode=True,
    time_at_evaluation=t_eval,
    random_state_for_sampling=123
)
avail_test = get_avail(dataset, np.full(num_test_min, t_eval, dtype=int),
                       feedback=test, fallback_shape=test["expected_reward"].shape)

# Future evaluation policy (the masked variant is used when available)
if gen_eps_greedy_masked is not None and getattr(conf, "use_masked_policy", True):
    pi_e_test = gen_eps_greedy_masked(
        expected_reward=test["expected_reward"],
        eps=conf.eps, is_optimal=True,
        available_actions=avail_test,
    )
else:
    pi_e_test = gen_eps_greedy(
        expected_reward=test["expected_reward"], eps=conf.eps, is_optimal=True
    )

# Ground truth V_t (whether the mask can be passed is detected via TypeError).
# NOTE(review): `except TypeError` also hides a TypeError raised inside the method itself.
try:
    V_true = dataset.calc_ground_truth_policy_value(
        expected_reward=test["expected_reward"],
        action_dist=pi_e_test,
        available_actions=avail_test,
    )
except TypeError:
    # assumes expected_reward and pi_e_test have the same shape — TODO confirm
    V_true = np.average(test["expected_reward"], weights=pi_e_test, axis=1).mean()

# --- Create a minimal validation (logged) data set ---
val = dataset.obtain_batch_bandit_feedback(
    n_rounds=n_rounds_min, evaluation_mode=False,
    random_state_for_sampling=999
)
# Epsilon-greedy action distribution on the logged data; passed to the runner as action_dist_val
pi_b_like = gen_eps_greedy(val["expected_reward"], eps=conf.eps, is_optimal=True)

# Number of φ segments from t_now to t_eval (an argument the existing run_ope requires)
days_after = (t_eval - dataset.t_now) // SECONDS_PER_DAY
days_per_phi = NUM_DAYS_IN_ONE_CYCLE / dataset.num_time_structure
num_phi_steps = int(np.ceil(days_after / days_per_phi))

# --- Run OPE (masked custom runner when it was importable, otherwise the existing one) ---
# Every iteration reuses the same `val` and `pi_b_like`; only `round=r` changes.
est_list = []
for r in range(n_inner_seeds):
    run_ope_fn(
        dataset=dataset,
        round=r,
        time_at_evaluation=t_eval,
        estimated_policy_value_list=est_list,
        val_bandit_data=val,
        action_dist_val=pi_b_like,
        num_true_time_structure_for_OPFV_reward=conf.num_true_time_structure_for_OPFV_reward,
        num_true_time_structure_for_OPFV_for_context=None,
        num_episodes_for_Prognosticator=conf.num_episodes_for_Prognosticator,
        num_time_structure_from_t_now_to_time_at_evaluation=num_phi_steps,
        eps=conf.eps,
        true_policy_value=V_true,
        flag_Prognosticator_optimality=conf.flag_Prognosticator_optimality,
        num_features_for_Prognosticator_list=conf.num_features_for_Prognosticator_list,
        flag_include_DM=conf.flag_include_DM,
        flag_calculate_data_driven_OPFV=conf.flag_calculate_data_driven_OPFV,
        candidate_num_time_structure_list=conf.candidate_num_time_structure_list,
    )

# === Show results (small table only) ===
# Long format: one row per (run, estimator); "SE" is the squared error against V_true
df = DataFrame(DataFrame(est_list).stack()).reset_index(1).rename(columns={"level_1": "est", 0: "value"})
df["SE"] = (df["value"] - V_true) ** 2
print("t_eval:", datetime.datetime.fromtimestamp(t_eval), "| n_rounds:", n_rounds_min, "| num_test:", num_test_min)
print("True V_t:", V_true)
display(df)

elapsed = time.time() - start_time
print(f"elapsed: {elapsed:.2f}s")

################# START hyperparameters #################
### About Seeds and Number of Samples ###
number of seeds = 20
number of seeds for time at evaluation = 20
number of training samples (n) = 1000
number of test samples = 10000

### About Time Structure ###
number of true time structures for reward (|C_r|) = 8
strength of time structure for reward (lambda) = 0.5

### About Prognosticator ###
list of time features for Prognosticator = [<function fourier_scalar at 0x0000018774942A60>]
optimality of the data driven feature selection for Prognosticator = True
number of time features for Prognosticator = 3
list of the numbers of time features for Prognosticator = range(3, 8, 2)

### About Logged Data Collection Period and Evaluation Period ###
time when we start collecting the logged data = 2022-01-01 00:00:00
time when we finish collecting the logged data = 2022-12-31 23:59:59
time when we start evaluating a target policy = 2023-01-01 00:00:00
time when we finish evaluating a target 

  g1 = mu3 / np.power(mu2, 1.5)


t_eval: 2023-05-17 16:58:16 | n_rounds: 200 | num_test: 1000
True V_t: 14.229388574845641


Unnamed: 0,est,value,SE
0,IPS,13.048208,1.395187
0,DR,13.698106,0.282262
0,Prognosticator,10.126804,16.831198
0,OPFV-masked,13.802855,0.181931
0,OPFV,13.802855,0.181931
0,data-driven OPFV,12.778959,2.103745
0,V_t,14.229389,0.0


elapsed: 1.68s


In [20]:
# Fraction of rounds at the future time t' in which at least one action is unavailable
# (a value > 0 means the available action set has changed)
eval_unavail_frac = (avail_test.sum(axis=1) < dataset.n_actions).mean()

# Largest per-round probability mass the future policy places on unavailable actions (ideally 0)
mass_on_unavail = float((pi_e_test * (~avail_test)).sum(axis=1).max())

# Same fraction on the logged data; reported as 0.0 when the dataset has no `_availability`
if hasattr(dataset, "_availability"):
    log_unavail_frac = (dataset._availability(val["time"]).sum(axis=1) < dataset.n_actions).mean()
else:
    log_unavail_frac = 0.0

print(eval_unavail_frac, mass_on_unavail, log_unavail_frac)


0.0 0.0 0.155


In [22]:
# Diagnostic run at an early evaluation time, where some actions are still unavailable.
# Assumes dataset / val / run_ope_fn already exist (otherwise run the earlier part of the
# minimal-experiment cell first).
SECONDS_PER_DAY = 86400

# 1) Fix t' at a moment when actions are still being introduced (here: oldest time + 2 weeks)
t_eval_early = int(dataset.t_oldest + 2 * 7 * SECONDS_PER_DAY)

# Future availability mask and future policy (masked)
test_early = dataset.obtain_batch_bandit_feedback(
    n_rounds=1000, evaluation_mode=True, time_at_evaluation=t_eval_early, random_state_for_sampling=123
)
# The literal 1000 must match n_rounds above; all-True mask when the dataset has no `_availability`
avail_test_early = dataset._availability(np.full(1000, t_eval_early)).astype(bool) if hasattr(dataset, "_availability") else np.ones_like(test_early["expected_reward"], bool)

# NOTE(review): unlike the guarded import in the imports cell, this import raises if
# policy does not provide gen_eps_greedy_masked.
from policy import gen_eps_greedy, gen_eps_greedy_masked
pi_e_test_early = gen_eps_greedy_masked(
    expected_reward=test_early["expected_reward"], eps=conf.eps, is_optimal=True, available_actions=avail_test_early
)

# How much is "still unavailable"?
# eval_unavail_frac: fraction of rounds with at least one unavailable action
# mass_on_unavail: largest per-round policy mass on unavailable actions (ideally 0)
eval_unavail_frac = float(np.mean(avail_test_early.sum(axis=1) < dataset.n_actions))
mass_on_unavail   = float(np.max(np.sum(pi_e_test_early * (~avail_test_early), axis=1)))
print("eval_unavail_frac:", eval_unavail_frac, "| mass_on_unavail:", mass_on_unavail)

# 2) Create a minimal logged data set and run the runner
val_small = dataset.obtain_batch_bandit_feedback(n_rounds=200, evaluation_mode=False, random_state_for_sampling=999)

# For reference: ground-truth value at the early time
# NOTE(review): `except TypeError` also hides a TypeError raised inside the method itself.
try:
    V_true_early = dataset.calc_ground_truth_policy_value(
        expected_reward=test_early["expected_reward"],
        action_dist=pi_e_test_early,
        available_actions=avail_test_early,
    )
except TypeError:
    # assumes expected_reward and pi_e_test_early have the same shape — TODO confirm
    V_true_early = np.average(test_early["expected_reward"], weights=pi_e_test_early, axis=1).mean()

from pandas import DataFrame
# Rebinds est_list (and df below), replacing the values left by the minimal-experiment cell
est_list = []
run_ope_fn(
    dataset=dataset,
    round=0,
    time_at_evaluation=t_eval_early,
    estimated_policy_value_list=est_list,
    val_bandit_data=val_small,
    action_dist_val=gen_eps_greedy(val_small["expected_reward"], is_optimal=True, eps=conf.eps),
    num_true_time_structure_for_OPFV_reward=conf.num_true_time_structure_for_OPFV_reward,
    num_true_time_structure_for_OPFV_for_context=None,
    num_episodes_for_Prognosticator=conf.num_episodes_for_Prognosticator,
    # NOTE(review): hard-coded to 1 here, whereas the other cells derive this value from
    # (t_eval - dataset.t_now); confirm it is appropriate for t_eval_early.
    num_time_structure_from_t_now_to_time_at_evaluation=1,
    eps=conf.eps,
    true_policy_value=V_true_early,
    flag_Prognosticator_optimality=conf.flag_Prognosticator_optimality,
    num_features_for_Prognosticator_list=conf.num_features_for_Prognosticator_list,
    flag_include_DM=conf.flag_include_DM,
    flag_calculate_data_driven_OPFV=conf.flag_calculate_data_driven_OPFV,
    candidate_num_time_structure_list=conf.candidate_num_time_structure_list,
)
# Long format: one row per estimator; "SE" is the squared error against V_true_early
df = DataFrame(DataFrame(est_list).stack()).reset_index(1).rename(columns={"level_1":"est", 0:"value"})
df["SE"] = (df["value"] - V_true_early) ** 2
display(df.sort_values("est"))


  g1 = mu3 / np.power(mu2, 1.5)


eval_unavail_frac: 1.0 | mass_on_unavail: 0.0


Unnamed: 0,est,value,SE
0,DR,13.698106,16.638305
0,IPS,13.048208,11.758799
0,OPFV,9.81302,0.037606
0,OPFV-masked,9.81302,0.037606
0,Prognosticator,13.048208,11.758799
0,V_t,9.619098,0.0
0,data-driven OPFV,10.42705,0.652787
