# In [None]:  (Jupyter cell prompt left over from the notebook export)
import numpy as np
from sklearn.linear_model import LogisticRegression
from time import time
from tqdm import tqdm   # ⬅ tqdm 추가

from obp.dataset import OpenBanditDataset
from obp.policy import LinUCB, LinTS
from obp.ope import (
    OffPolicyEvaluation,
    InverseProbabilityWeighting,
    DoublyRobust,
    SwitchDoublyRobust,
    SubGaussianDoublyRobust,
    DoublyRobustTuning,
    RegressionModel,
)

#############################################################
# 1) Load Open Bandit Dataset
#############################################################

def load_obd(
    behavior_policy="random",
    campaign="all",
    data_path="C:\\Users\\lyms0\\Downloads\\open_bandit_dataset\\open_bandit_dataset",
):
    """Load the Open Bandit Dataset and its logged bandit feedback.

    The defaults reproduce the original hard-coded call, so ``load_obd()``
    behaves as before; pass other values to run on another machine or on
    another behavior policy / campaign.

    Args:
        behavior_policy: Forwarded to ``OpenBanditDataset(behavior_policy=...)``.
        campaign: Forwarded to ``OpenBanditDataset(campaign=...)``.
        data_path: Forwarded to ``OpenBanditDataset(data_path=...)``.
            NOTE(review): the default is an absolute path on one developer's
            Windows machine; callers elsewhere should override it.

    Returns:
        A ``(dataset, fb)`` tuple: the ``OpenBanditDataset`` instance and the
        feedback object returned by ``obtain_batch_bandit_feedback()``.
    """
    dataset = OpenBanditDataset(
        behavior_policy=behavior_policy,
        campaign=campaign,
        data_path=data_path,
    )
    fb = dataset.obtain_batch_bandit_feedback()
    print(f"[INFO] Loaded rounds={fb['n_rounds']} actions={fb['n_actions']}")
    return dataset, fb


#############################################################
# 2) Train LinUCB / LinTS using logged data (with tqdm)
#############################################################

def train_policy(policy, fb):
    """Replay every logged round through ``policy.update_params``.

    Args:
        policy: Object exposing ``policy_name`` and
            ``update_params(action=, reward=, context=)``.
        fb: Logged feedback providing ``"context"``, ``"action"`` and
            ``"reward"``, iterated in lockstep.

    Returns:
        The same ``policy`` object, updated in place.
    """
    contexts = fb["context"]
    actions = fb["action"]
    rewards = fb["reward"]

    print(f"[TRAIN] Training {policy.policy_name} ...")
    logged_rounds = zip(contexts, actions, rewards)
    # zip() has no len(), so tell tqdm how many rounds to expect.
    for ctx, chosen, payoff in tqdm(logged_rounds, total=len(rewards)):
        # Each context row is handed over as a (1, dim) matrix.
        policy.update_params(
            action=int(chosen),
            reward=float(payoff),
            context=ctx.reshape(1, -1),
        )
    return policy


#############################################################
# 3) deterministic action_dist (with tqdm)
#############################################################

def compute_action_dist(policy, X, n_actions):
    """Build a one-hot action distribution from the policy's selections.

    For every context row the policy is asked once for its selection and the
    chosen action receives probability 1.0 in each slot of the slate.

    The slate length is read from ``policy.len_list`` (1 when the attribute
    is missing), so a single-slot policy yields exactly the original
    ``(n, n_actions, 1)`` array with the first selected action marked.

    NOTE(review): this is only a true action distribution when
    ``select_action`` is deterministic. For a policy that samples internally
    (Thompson sampling presumably does) the result is a single-draw
    realization, not the policy's action probabilities - confirm that this
    is acceptable for the downstream estimators.

    Args:
        policy: Object exposing ``policy_name`` and
            ``select_action(context)``, which returns the selected action
            indices for one ``(1, dim)`` context.
        X: 2-D array of contexts, one row per round.
        n_actions: Number of available actions.

    Returns:
        ``np.ndarray`` of shape ``(n, n_actions, len_list)`` holding 0.0/1.0.
    """
    n = X.shape[0]
    len_list = int(getattr(policy, "len_list", 1))
    dist = np.zeros((n, n_actions, len_list))
    slots = np.arange(len_list)

    print(f"[ACTION_DIST] Computing action_dist for {policy.policy_name} ...")
    for i, x in enumerate(tqdm(X, total=n)):
        ranked = np.asarray(policy.select_action(x.reshape(1, -1)))
        # The k-th selected action fills slot k of the slate.
        dist[i, ranked[:len_list], slots] = 1.0

    return dist


#############################################################
# 4) Reward model shared by the doubly-robust estimators (DR, Switch-DR, SG-DR, tuned DR)
#############################################################

def build_reward_model(fb, n_actions, len_list=1):
    """Fit a logistic-regression reward model and predict expected rewards.

    ``RegressionModel.fit_predict`` takes the context, action and reward
    arrays as separate arguments; passing the whole feedback dict (as the
    previous version did) fails because ``action`` and ``reward`` are missing.

    Args:
        fb: Logged feedback providing ``"context"``, ``"action"`` and
            ``"reward"``; ``"position"`` is forwarded when present.
        n_actions: Number of available actions.
        len_list: Slate length handed to ``RegressionModel``. The default of
            1 keeps the original single-slot model.

    Returns:
        The array returned by ``RegressionModel.fit_predict`` (documented by
        obp as shape ``(n_rounds, n_actions, len_list)``).
    """
    print("[REWARD MODEL] Fitting logistic regression models ...")
    start = time()

    model = RegressionModel(
        n_actions=n_actions,
        len_list=len_list,
        base_model=LogisticRegression(max_iter=500),
    )
    # NOTE(review): the model is fit and evaluated on the same rounds; obp's
    # fit_predict also offers cross-fitting through `n_folds` - consider it.
    pred = model.fit_predict(
        context=fb["context"],
        action=fb["action"],
        reward=fb["reward"],
        position=fb.get("position"),
    )

    print(f"[DONE] Reward model training took {time() - start:.2f} sec\n")
    return pred


#############################################################
# 5) OPE
#############################################################

def _pool_positions_for_single_slot(fb, action_dist):
    """Return feedback whose ``position`` fits a single-slot ``action_dist``."""
    position = fb.get("position")
    if position is None or action_dist.shape[2] != 1:
        return fb
    position = np.asarray(position)
    if not position.any():
        return fb
    # Shallow copy so the caller's feedback keeps its original positions.
    pooled = dict(fb)
    pooled["position"] = np.zeros_like(position)
    return pooled


def _add_plain_bounds(interval):
    """Copy an interval dict, adding plain ``lower`` / ``upper`` aliases."""
    enriched = dict(interval)
    for key, value in interval.items():
        if key.endswith("(lower)"):
            enriched.setdefault("lower", value)
        elif key.endswith("(upper)"):
            enriched.setdefault("upper", value)
    return enriched


def run_ope(fb, action_dist, reward_model):
    """Estimate policy values and confidence intervals with several estimators.

    Fixes over the previous version:

    * ``OffPolicyEvaluation`` exposes ``estimate_intervals`` (plural); the
      singular name does not exist on that class.
    * When ``action_dist`` has a single slot but the logged ``position``
      contains non-zero values, the positions are pooled into slot 0 for the
      evaluation. obp rejects positions that are not smaller than
      ``action_dist.shape[2]``.
      NOTE(review): pooling treats all logged positions as interchangeable -
      confirm this matches the intended evaluation protocol.
    * Each interval dict additionally carries plain ``"lower"`` / ``"upper"``
      keys next to obp's ``"... CI (lower)"`` / ``"... CI (upper)"`` keys.

    Args:
        fb: Logged bandit feedback dict.
        action_dist: Evaluation-policy action distribution, indexed as
            ``(round, action, slot)``.
        reward_model: Estimated rewards from the regression model, passed as
            ``estimated_rewards_by_reg_model``.

    Returns:
        ``(values, intervals)``: both dicts are keyed by estimator name.
    """
    ope = OffPolicyEvaluation(
        bandit_feedback=_pool_positions_for_single_slot(fb, action_dist),
        ope_estimators=[
            InverseProbabilityWeighting(),
            DoublyRobust(),
            SwitchDoublyRobust(),
            SubGaussianDoublyRobust(),
            # NOTE(review): this is DoublyRobust with a tuned hyperparameter,
            # reported under the label "mrdr"; it is not the MRDR estimator.
            # The label is kept so result keys stay stable - consider renaming.
            # NOTE(review): obp documents `lambdas` here as candidate clipping
            # thresholds for importance weights; confirm a grid capped at 1.0
            # is intended.
            DoublyRobustTuning(
                lambdas=[0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
                tuning_method="slope",
                estimator_name="mrdr",
            ),
        ],
    )

    print("[OPE] Estimating policy values ...")
    start = time()

    values = ope.estimate_policy_values(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=reward_model,
    )
    raw_intervals = ope.estimate_intervals(
        action_dist=action_dist,
        estimated_rewards_by_reg_model=reward_model,
        alpha=0.05,
    )
    intervals = {
        name: _add_plain_bounds(interval)
        for name, interval in raw_intervals.items()
    }

    print(f"[DONE] OPE took {time() - start:.2f} sec\n")
    return values, intervals


#############################################################
# 6) Full Experiment Runner
#############################################################

def _interval_bounds(interval):
    """Extract ``(lower, upper)`` from a confidence-interval dict.

    Accepts plain ``"lower"`` / ``"upper"`` keys as well as keys that end in
    ``"(lower)"`` / ``"(upper)"`` (obp labels its bounds like
    ``"95.0% CI (lower)"``).

    Raises:
        KeyError: if either bound cannot be found.
    """
    bounds = {}
    for key, value in interval.items():
        for side in ("lower", "upper"):
            if key == side or key.endswith(f"({side})"):
                bounds.setdefault(side, value)
    missing = [side for side in ("lower", "upper") if side not in bounds]
    if missing:
        raise KeyError(
            f"confidence interval has no {missing} bound; keys={list(interval)}"
        )
    return bounds["lower"], bounds["upper"]


def _print_policy_report(title, values, intervals):
    """Print one line per estimator: its value and confidence interval."""
    print(f"\n========= {title} =========")
    for name, v in values.items():
        lower, upper = _interval_bounds(intervals[name])
        print(f"{name:<25} {v:.4f}   CI=({lower:.4f}, {upper:.4f})")


def run_experiment():
    """Run the full pipeline: load data, train policies, evaluate, report.

    Steps: load the logged feedback, print the mean logged reward as a
    baseline, train LinUCB and LinTS on the logged rounds, build their action
    distributions, fit the reward model, run the off-policy estimators for
    both policies, and print the estimates together with wall-clock timings.
    """
    total_start = time()  # start timing the whole experiment

    _, fb = load_obd()
    X, n_actions = fb["context"], fb["n_actions"]

    baseline = fb["reward"].mean()
    print(f"\n[Random Baseline Reward] {baseline:.4f}\n")

    # Train LinUCB and LinTS
    t0 = time()
    linucb = train_policy(LinUCB(dim=X.shape[1], n_actions=n_actions, len_list=1), fb)
    print(f"[TIME] LinUCB training took {time() - t0:.2f} sec\n")

    t1 = time()
    lints = train_policy(LinTS(dim=X.shape[1], n_actions=n_actions, len_list=1), fb)
    print(f"[TIME] LinTS training took {time() - t1:.2f} sec\n")

    # Compute action distributions
    t2 = time()
    dist_ucb = compute_action_dist(linucb, X, n_actions)
    print(f"[TIME] LinUCB action_dist took {time() - t2:.2f} sec\n")

    t3 = time()
    dist_ts = compute_action_dist(lints, X, n_actions)
    print(f"[TIME] LinTS action_dist took {time() - t3:.2f} sec\n")

    # Reward model (shared by both evaluations)
    reward_model = build_reward_model(fb, n_actions)

    # OPE
    v_ucb, ci_ucb = run_ope(fb, dist_ucb, reward_model)
    v_ts, ci_ts = run_ope(fb, dist_ts, reward_model)

    _print_policy_report("LinUCB", v_ucb, ci_ucb)
    _print_policy_report("LinTS", v_ts, ci_ts)

    print(f"\n[TOTAL TIME] Entire experiment took {time() - total_start:.2f} sec")


# Run the experiment when executed as a script or notebook cell, but not when
# this module is merely imported (the run loads data and trains two policies).
if __name__ == "__main__":
    run_experiment()
