In [None]:
# Mount Google Drive inside the Colab runtime so files stored there become
# visible under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import os
# The training cell reads this variable (os.environ.get("OULAD_DIR", "."))
# to locate the OULAD CSV files.
os.environ['OULAD_DIR'] = '/content/drive/MyDrive/OULAD_data'


Mounted at /content/drive


In [None]:
import os
import math
import random
from dataclasses import dataclass
from typing import Dict, Tuple, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time

# Seed every random number generator the script uses (Python, NumPy, PyTorch)
# so that the data split, weight initialisation and sampling are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
try:
    import torch.backends.cudnn as cudnn
    # Request deterministic cuDNN kernels and switch off its autotuner.
    cudnn.deterministic = True
    cudnn.benchmark = False
except (ImportError, AttributeError, RuntimeError) as exc:
    # Still best effort (the run continues), but no longer silent: a failure
    # here means results may not be reproducible, so say so.
    print(f"[warn] could not apply cuDNN determinism settings: {exc}")

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sliding-window geometry; all lengths are counted in weeks.
INPUT_LEN = 12     # observe the past 12 weeks
PRED_LEN  = 6      # predict the next 6 weeks
MIN_WEEKS = INPUT_LEN + PRED_LEN
MAX_WEEKS_CAP = 45  # keep at most 45 weeks so that very long courses do not stretch the sequences

# Model and optimisation hyperparameters.
FEAT_DIM = 4         # [clicks, has_submit, avg_score_sofar, clicks_diff1]
HIDDEN_SIZE = 64
LATENT_SIZE = 32
NUM_LAYERS = 1
BATCH_SIZE  = 128
LR = 1e-3
EPOCHS_LSTM = 12
EPOCHS_VAE  = 14
BETA_KL = 0.05       # weight of the KL term in the VAE training loss
VAE_SAMPLES = 40     # number of VAE samples drawn per input at evaluation time
COVER_EPS_RATIO = 0.20  # coverage tolerance, as a fraction of the training std

ACT_NAME = "tanh"    # activation name resolved through get_act()
import torch.nn.functional as F

def get_act(name: str):
    """Resolve an activation function from its case-insensitive name.

    Recognised names are "tanh", "relu", "elu" and "leaky_relu" (with a
    negative slope of 0.1). Any other name falls back to ``torch.tanh``.
    """
    registry = {
        "tanh": torch.tanh,
        "relu": F.relu,
        "elu": F.elu,
        "leaky_relu": lambda x: F.leaky_relu(x, negative_slope=0.1),
    }
    return registry.get(name.lower(), torch.tanh)

# Activation selected by ACT_NAME; the VAE decoder applies it to its initial state.
ACT = get_act(ACT_NAME)

# Directory that receives the model checkpoints.
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Dataset location: the OULAD_DIR environment variable, or the current directory.
DATA_DIR = os.getenv("OULAD_DIR", ".")

def fpath(name: str) -> str:
    """Return the path of dataset file ``name`` inside DATA_DIR."""
    return os.path.join(DATA_DIR, name)

# Fail fast, naming the first missing file, before any CSV is parsed.
REQ_FILES = ["studentInfo.csv", "studentVle.csv", "studentAssessment.csv"]
for f in REQ_FILES:
    if os.path.exists(fpath(f)):
        continue
    raise FileNotFoundError(f"找不到 {f}，請把檔案放到 {DATA_DIR} 或設定 os.environ['OULAD_DIR'] 再執行。")

# REQ_FILES is ordered as (info, vle, sasm).
info, vle, sasm = (pd.read_csv(fpath(csv_name)) for csv_name in REQ_FILES)

# Enrolment table: keys plus the final result.
info = info.loc[:, ["id_student", "code_module", "code_presentation", "final_result"]].copy()

# Click log: complete rows only, and rows with a negative `date` are dropped.
vle = (
    vle.loc[:, ["id_student", "code_module", "code_presentation", "date", "sum_click"]]
       .copy()
       .dropna(subset=["id_student", "code_module", "code_presentation", "date", "sum_click"])
)
vle = vle.loc[vle["date"] >= 0]

# Submissions: student and submission day are required (score may be missing),
# and rows with a negative `date_submitted` are dropped.
sasm = (
    sasm.loc[:, ["id_student", "date_submitted", "score"]]
        .copy()
        .dropna(subset=["id_student", "date_submitted"])
)
sasm = sasm.loc[sasm["date_submitted"] >= 0]


# Keep only students that have exactly one row in studentInfo, so each remaining
# student is tied to a single (code_module, code_presentation) pair.
infocnt = info.groupby("id_student").size().rename("n").reset_index()
unique_students = set(infocnt.loc[infocnt["n"] == 1, "id_student"].tolist())
info_u = info.loc[info["id_student"].isin(unique_students)].copy()

# student id -> (code_module, code_presentation)
mp_map: Dict[int, Tuple[str,str]] = (
    info_u.set_index("id_student")
          .loc[:, ["code_module", "code_presentation"]]
          .apply(tuple, axis=1)
          .to_dict()
)

# Restrict both activity tables to those students and tag every row with its pair.
vle = vle.loc[vle["id_student"].isin(unique_students)].copy()
sasm = sasm.loc[sasm["id_student"].isin(unique_students)].copy()
vle["mp"] = vle["id_student"].map(mp_map)
sasm["mp"] = sasm["id_student"].map(mp_map)

# Convert day offsets into 0-based week indices.
vle["week"] = vle["date"].floordiv(7).astype(int)
sasm["week"] = sasm["date_submitted"].floordiv(7).astype(int)

# Sequence length: last observed week + 1, capped at MAX_WEEKS_CAP.
max_week = max(
    (int(frame["week"].max()) for frame in (vle, sasm) if len(frame)),
    default=0,
)
WEEKS = int(min(MAX_WEEKS_CAP, max_week + 1))
if WEEKS < MIN_WEEKS:
    raise RuntimeError(f"可用週數不足（WEEKS={WEEKS} < {MIN_WEEKS}），請調整 INPUT_LEN/PRED_LEN 或擴大資料。")

# Dense skeleton: one row per student for every week 0 .. WEEKS-1.
wk = pd.DataFrame(list(mp_map.items()), columns=["id_student","mp"])
week_axis = pd.DataFrame({"week": np.arange(WEEKS, dtype=int)})
skeleton = wk.merge(week_axis, how="cross")

join_keys = ["id_student", "mp", "week"]

# Weekly click totals, compressed with log1p.
clicks_wk = vle.groupby(join_keys, as_index=False)["sum_click"].sum()
clicks_wk["clicks"] = np.log1p(clicks_wk["sum_click"])
clicks_wk = clicks_wk[["id_student", "mp", "week", "clicks"]]

# Weekly submission counts and score totals; a missing score counts as 0.
sasm2 = sasm.assign(score=sasm["score"].fillna(0.0))
by_week = sasm2.groupby(join_keys, as_index=False)
subs_cnt_wk = by_week["date_submitted"].count().rename(columns={"date_submitted": "submit_cnt"})
score_sum_wk = by_week["score"].sum().rename(columns={"score": "score_sum"})
subs_score_wk = (
    subs_cnt_wk.merge(score_sum_wk, on=join_keys, how="outer")
               .fillna({"submit_cnt":0, "score_sum":0.0})
)

# Join everything onto the skeleton; weeks without activity become zeros.
X_clicks = skeleton.merge(clicks_wk, on=join_keys, how="left")
X_clicks["clicks"] = X_clicks["clicks"].fillna(0.0)
X_tmp = (
    X_clicks.merge(subs_score_wk, on=join_keys, how="left")
            .fillna({"submit_cnt":0, "score_sum":0.0})
)

# 1.0 for weeks with at least one submission, else 0.0.
X_tmp["has_submit"] = X_tmp["submit_cnt"].gt(0).astype(np.float32)

# Per-student running features, computed with vectorised groupby operations.
# The previous groupby(...).apply(lambda g: g.assign(...)) form depended on the
# grouping columns being handed to the applied function -- behaviour pandas has
# deprecated -- and executed a Python lambda once per student.
X_tmp = X_tmp.sort_values(["id_student","mp","week"])
_per_student = X_tmp.groupby(["id_student","mp"], sort=False)

# All grouped series are derived before X_tmp is extended with new columns.
_cum_cnt = _per_student["submit_cnt"].cumsum()
_cum_score = _per_student["score_sum"].cumsum()
# First week of every student has no predecessor, so its difference is 0.
_clicks_diff1 = _per_student["clicks"].diff().fillna(0.0)

X_tmp = X_tmp.assign(
    cum_cnt=_cum_cnt,
    cum_score=_cum_score,
    # Mean score of all submissions so far; 0 until the first submission.
    avg_score_sofar=np.where(_cum_cnt > 0, _cum_score / np.maximum(_cum_cnt, 1e-8), 0.0),
    clicks_diff1=_clicks_diff1,
)

# Final long-format feature table: keys plus the FEAT_DIM model features.
X = X_tmp[["id_student","mp","week","clicks","has_submit","avg_score_sofar","clicks_diff1"]].copy()

# (student id, (module, presentation)) -> float32 matrix of shape (WEEKS, FEAT_DIM).
seqs: Dict[Tuple[int,Tuple[str,str]], np.ndarray] = {}
feature_cols = ["clicks","has_submit","avg_score_sofar","clicks_diff1"]
for (sid, mp), g in X.groupby(["id_student","mp"], sort=False):
    g = g.sort_values("week")
    mat = g[feature_cols].to_numpy(dtype=np.float32)
    # Only sequences that span exactly WEEKS rows are kept.
    if len(mat) != WEEKS:
        continue
    seqs[(sid, mp)] = mat

print(f"可用學生序列數：{len(seqs)} ；每條長度（週）={WEEKS}，特徵數={FEAT_DIM}")

# Shuffle students, then split 70% / 15% / remainder into train / val / test.
# Splitting by student keeps all windows of one student in the same subset.
all_keys = list(seqs)
random.shuffle(all_keys)
N = len(all_keys)
tr_n = int(N*0.7)
va_n = int(N*0.15)
va_end = tr_n + va_n
tr_keys = set(all_keys[:tr_n])
va_keys = set(all_keys[tr_n:va_end])
te_keys = set(all_keys[va_end:])

@dataclass
class WindowedSet:
    """Model inputs and matching targets, stored as two stacked arrays."""
    X_in: np.ndarray  # (N, T_in, D)
    Y_out: np.ndarray # (N, T_out, D)

def make_windows(keys: List[Tuple[int,Tuple[str,str]]]) -> WindowedSet:
    """Cut every sequence in ``keys`` into sliding input/target windows.

    For each start week, the first INPUT_LEN rows form the input and the
    following PRED_LEN rows (up to MIN_WEEKS in total) form the target.
    Windows advance one week at a time. Returns the stacked float32 arrays
    wrapped in a WindowedSet.
    """
    n_starts = WEEKS - MIN_WEEKS + 1
    inputs, targets = [], []
    for key in keys:
        series = seqs[key]  # (WEEKS, D)
        for begin in range(n_starts):
            cut = begin + INPUT_LEN
            inputs.append(series[begin:cut])
            targets.append(series[cut:begin + MIN_WEEKS])
    return WindowedSet(
        np.stack(inputs, axis=0).astype(np.float32),
        np.stack(targets, axis=0).astype(np.float32),
    )

# Build the windowed datasets for the three student subsets.
ws_tr = make_windows(list(tr_keys))
ws_va = make_windows(list(va_keys))
ws_te = make_windows(list(te_keys))

print(f"Train windows: {ws_tr.X_in.shape[0]}  | Val: {ws_va.X_in.shape[0]} | Test: {ws_te.X_in.shape[0]}")

# Per-feature statistics come from the training inputs only.
_train_inputs_flat = ws_tr.X_in.reshape(-1, FEAT_DIM)
mu = _train_inputs_flat.mean(axis=0)
sig = _train_inputs_flat.std(axis=0) + 1e-8

def z(x):
    """Standardise ``x`` feature-wise with the training mean and std."""
    return (x - mu) / sig

# Apply the same transform to inputs and targets of every subset.
ws_tr = WindowedSet(X_in=z(ws_tr.X_in), Y_out=z(ws_tr.Y_out))
ws_va = WindowedSet(X_in=z(ws_va.X_in), Y_out=z(ws_va.Y_out))
ws_te = WindowedSet(X_in=z(ws_te.X_in), Y_out=z(ws_te.Y_out))

class OULADDataset(Dataset):
    """Torch dataset serving (input window, target window) tensor pairs."""

    def __init__(self, ws: WindowedSet):
        # from_numpy shares memory with the arrays held by `ws`.
        self.X, self.Y = (torch.from_numpy(arr) for arr in (ws.X_in, ws.Y_out))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.Y[i]

# Training batches are shuffled and the last partial batch is dropped;
# validation and test batches keep dataset order and every sample.
ld_tr = DataLoader(
    OULADDataset(ws_tr),
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
)
ld_va = DataLoader(OULADDataset(ws_va), batch_size=BATCH_SIZE, shuffle=False)
ld_te = DataLoader(OULADDataset(ws_te), batch_size=BATCH_SIZE, shuffle=False)

# Coverage tolerance: a fixed fraction of the pooled std of all standardised
# training values (inputs and targets, every feature).
train_std = float(
    np.concatenate(
        [ws_tr.X_in.reshape(-1, FEAT_DIM), ws_tr.Y_out.reshape(-1, FEAT_DIM)],
        axis=0,
    ).std()
)
cover_eps = COVER_EPS_RATIO * train_std

# Model: Seq2Seq LSTM
class EncoderLSTM(nn.Module):
    """LSTM encoder that reduces an input sequence to its final states."""

    def __init__(self, in_dim, hidden, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Return the final hidden and cell states ``(h, c)`` for batch-first ``x``."""
        _outputs, final_state = self.lstm(x)
        h, c = final_state
        return h, c

class DecoderLSTM(nn.Module):
    """LSTM decoder followed by a linear projection to the output features."""

    def __init__(self, in_dim, hidden, out_dim, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.proj = nn.Linear(hidden, out_dim)

    def forward(self, y_prev, h, c):
        """Advance the decoder on ``y_prev`` from state ``(h, c)``.

        Returns the projected outputs together with the updated hidden and
        cell states.
        """
        hidden_seq, (h_next, c_next) = self.lstm(y_prev, (h, c))
        return self.proj(hidden_seq), h_next, c_next

class Seq2SeqLSTM(nn.Module):
    """Deterministic encoder-decoder forecaster with optional teacher forcing."""

    def __init__(self, feat_dim, hidden, num_layers=1):
        super().__init__()
        self.enc = EncoderLSTM(feat_dim, hidden, num_layers)
        self.dec = DecoderLSTM(feat_dim, hidden, feat_dim, num_layers)

    def forward(self, x_in, y_future=None, teacher_ratio=0.0):
        """Predict the steps that follow ``x_in``.

        Decoding starts from the last input step. When ``y_future`` is given
        its length sets the horizon (otherwise PRED_LEN), and at every step
        the ground truth replaces the model's own prediction as the next
        decoder input with probability ``teacher_ratio``; that draw is made
        once per step for the whole batch. Fed-back predictions are detached.
        """
        h, c = self.enc(x_in)
        horizon = y_future.size(1) if y_future is not None else PRED_LEN
        dec_in = x_in[:, -1:, :]
        preds = []
        for t in range(horizon):
            y_t, h, c = self.dec(dec_in, h, c)
            preds.append(y_t)
            use_truth = (y_future is not None) and (torch.rand(1).item() < teacher_ratio)
            dec_in = y_future[:, t:t+1, :] if use_truth else y_t.detach()
        return torch.cat(preds, dim=1)

# Model: Seq2Seq VAE
class VAEEncoder(nn.Module):
    """LSTM encoder that outputs the mean and log-variance of the latent code."""

    def __init__(self, in_dim, hidden, latent, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_dim,
            hidden_size=hidden,
            num_layers=num_layers,
            batch_first=True,
        )
        self.mu = nn.Linear(hidden, latent)
        self.logvar = nn.Linear(hidden, latent)

    def forward(self, x):
        """Return ``(mu, logvar)`` computed from the last layer's final hidden state."""
        _, (h_n, _c_n) = self.lstm(x)
        summary = h_n[-1]
        return self.mu(summary), self.logvar(summary)

def reparameterize(mu, logvar):
    """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``.

    Writing the sample this way keeps it differentiable with respect to
    ``mu`` and ``logvar``.
    """
    std = (0.5 * logvar).exp()
    noise = torch.randn_like(std)
    return mu + noise * std

class VAEDecoder(nn.Module):
    """Autoregressive LSTM decoder conditioned on a latent code.

    The latent vector initialises the LSTM state and is also concatenated to
    the decoder input at every step.
    """

    def __init__(self, feat_dim, hidden, latent, num_layers=1):
        super().__init__()
        self.in_dim = feat_dim + latent
        self.lstm = nn.LSTM(self.in_dim, hidden, num_layers=num_layers, batch_first=True)
        self.proj = nn.Linear(hidden, feat_dim)
        self.h0 = nn.Linear(latent, hidden)
        self.c0 = nn.Linear(latent, hidden)

    def forward(self, y0, z, T_out):
        """Generate ``T_out`` steps starting from ``y0``.

        Args:
            y0: first decoder input, a single step per batch element.
            z: latent code, one vector per batch element.
            T_out: number of steps to generate.

        Returns:
            The generated steps concatenated along dimension 1.
        """
        # nn.LSTM expects an initial state with one slice per layer. The same
        # latent-derived state is replicated to every layer; previously only
        # one slice was supplied, which made num_layers > 1 fail.
        n_layers = self.lstm.num_layers
        h = ACT(self.h0(z)).unsqueeze(0).repeat(n_layers, 1, 1)
        c = ACT(self.c0(z)).unsqueeze(0).repeat(n_layers, 1, 1)
        z_step = z.unsqueeze(1)  # loop-invariant conditioning input
        y_prev = y0
        outs = []
        for _ in range(T_out):
            dec_in = torch.cat([y_prev, z_step], dim=-1)
            o, (h, c) = self.lstm(dec_in, (h, c))
            y_t = self.proj(o)
            outs.append(y_t)
            # Unlike Seq2SeqLSTM, the fed-back prediction is not detached.
            y_prev = y_t
        return torch.cat(outs, dim=1)

class Seq2SeqVAE(nn.Module):
    """Sequence VAE: encodes the input window, decodes a stochastic forecast."""

    def __init__(self, feat_dim, hidden, latent, num_layers=1):
        super().__init__()
        self.enc = VAEEncoder(feat_dim, hidden, latent, num_layers)
        self.dec = VAEDecoder(feat_dim, hidden, latent, num_layers)

    def forward(self, x_in, y_future=None):
        """Return ``(y_pred, mu, logvar)`` using one sampled latent code.

        ``y_future`` is used only for its length, which sets the prediction
        horizon (PRED_LEN when it is omitted).
        """
        mu, logvar = self.enc(x_in)
        latent = reparameterize(mu, logvar)
        horizon = y_future.size(1) if y_future is not None else PRED_LEN
        y_pred = self.dec(x_in[:, -1:, :], latent, horizon)
        return y_pred, mu, logvar

    @torch.no_grad()
    def sample(self, x_in, n_samples):
        """Draw ``n_samples`` forecasts of PRED_LEN steps for each input.

        The encoder runs once; each draw re-samples the latent code. The
        draws are stacked along a new leading dimension.
        """
        mu, logvar = self.enc(x_in)
        y0 = x_in[:, -1:, :]
        draws = [
            self.dec(y0, reparameterize(mu, logvar), PRED_LEN)
            for _ in range(n_samples)
        ]
        return torch.stack(draws, dim=0)

# Training
def train_lstm(model, ld_tr, ld_va):
    """Train the Seq2Seq LSTM with MSE loss and decaying teacher forcing.

    Runs EPOCHS_LSTM epochs of Adam (learning rate LR). The teacher-forcing
    ratio starts at 0.7 and is multiplied by 0.9 after each epoch. Validation
    decodes without teacher forcing, and the weights with the lowest
    validation loss are saved to OUTPUT_DIR/lstm_best.pt.

    Args:
        model: the Seq2SeqLSTM to train (already on `device`).
        ld_tr: training DataLoader yielding (x, y) batches.
        ld_va: validation DataLoader yielding (x, y) batches.

    Returns:
        Total wall-clock training time in seconds.
    """
    opt = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = nn.MSELoss()
    best_va = float('inf')
    teacher = 0.7
    start_time = time.perf_counter()
    for ep in range(1, EPOCHS_LSTM + 1):
        model.train()
        tr_loss, tr_seen = 0.0, 0
        for x, y in ld_tr:
            x = x.to(device); y = y.to(device)
            opt.zero_grad()
            y_hat = model(x, y_future=y, teacher_ratio=teacher)
            loss = loss_fn(y_hat, y)
            loss.backward(); opt.step()
            tr_loss += loss.item() * x.size(0)
            tr_seen += x.size(0)
        # Average over the samples actually processed: a loader built with
        # drop_last=True skips the final partial batch, so len(dataset)
        # would overcount and under-report the loss.
        tr_loss /= max(tr_seen, 1)
        # Validation
        model.eval(); va_loss = 0.0
        with torch.no_grad():
            for x, y in ld_va:
                x = x.to(device); y = y.to(device)
                y_hat = model(x)
                va_loss += loss_fn(y_hat, y).item() * x.size(0)
        va_loss /= len(ld_va.dataset)
        # Log the ratio this epoch was trained with, then decay it.
        print(f"[LSTM] Epoch {ep:02d} | train {tr_loss:.4f} | val {va_loss:.4f} | teacher {teacher:.2f}")
        teacher = max(0.0, teacher * 0.9)
        if va_loss < best_va:
            best_va = va_loss
            torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "lstm_best.pt"))
    total_time = time.perf_counter() - start_time
    print(f"[LSTM] Total training time: {total_time:.2f}s")
    return total_time


def kld_loss(mu, logvar):
    """KL divergence from N(mu, exp(logvar)) to the standard normal.

    The per-element terms are summed over dimension 1 and the result is
    averaged over the remaining (batch) dimension.
    """
    per_dim = logvar.exp() + mu.pow(2) - 1.0 - logvar
    return 0.5 * per_dim.sum(dim=1).mean()


def train_vae(model, ld_tr, ld_va):
    """Train the Seq2Seq VAE on MSE reconstruction plus a weighted KL term.

    Runs EPOCHS_VAE epochs of Adam (learning rate LR) on the loss
    ``rec + BETA_KL * kl``. Model selection uses the validation
    reconstruction loss only; the best weights are saved to
    OUTPUT_DIR/vae_best.pt.

    Args:
        model: the Seq2SeqVAE to train (already on `device`).
        ld_tr: training DataLoader yielding (x, y) batches.
        ld_va: validation DataLoader yielding (x, y) batches.

    Returns:
        Total wall-clock training time in seconds.
    """
    opt = torch.optim.Adam(model.parameters(), lr=LR)
    rec_loss_fn = nn.MSELoss()
    best_va = float('inf')
    start_time = time.perf_counter()
    for ep in range(1, EPOCHS_VAE + 1):
        model.train(); tr_rec, tr_kl, tr_seen = 0.0, 0.0, 0
        for x, y in ld_tr:
            x = x.to(device); y = y.to(device)
            opt.zero_grad()
            y_hat, mu, logvar = model(x, y_future=y)
            rec = rec_loss_fn(y_hat, y)
            kl  = kld_loss(mu, logvar)
            loss = rec + BETA_KL * kl
            loss.backward(); opt.step()
            tr_rec += rec.item() * x.size(0)
            tr_kl  += kl.item()  * x.size(0)
            tr_seen += x.size(0)
        # Average over the samples actually processed: a loader built with
        # drop_last=True skips the final partial batch, so len(dataset)
        # would overcount and under-report both metrics.
        tr_denom = max(tr_seen, 1)
        tr_rec /= tr_denom
        tr_kl  /= tr_denom
        # Validation (reconstruction only). The forward pass still samples the
        # latent code, so this value is itself a stochastic estimate.
        model.eval(); va_rec = 0.0
        with torch.no_grad():
            for x, y in ld_va:
                x = x.to(device); y = y.to(device)
                y_hat, _, _ = model(x)
                va_rec += rec_loss_fn(y_hat, y).item() * x.size(0)
        va_rec /= len(ld_va.dataset)
        print(f"[VAE ] Epoch {ep:02d} | train_rec {tr_rec:.4f} | train_kl {tr_kl:.4f} | val_rec {va_rec:.4f}")
        if va_rec < best_va:
            best_va = va_rec
            torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "vae_best.pt"))
    total_time = time.perf_counter() - start_time
    print(f"[VAE ] Total training time: {total_time:.2f}s")
    return total_time

@torch.no_grad()
def eval_and_report(lstm, vae, ld_te, cover_eps):
    """Compare the LSTM forecast with best-of-N VAE samples on the clicks feature.

    For every test window this records the LSTM MSE, the lowest MSE among
    VAE_SAMPLES VAE draws, their difference, the de-standardised trajectories,
    the spread of the VAE draws and the share of steps where at least one draw
    lies within ``cover_eps`` (z-scale) of the truth. It then prints a bucketed
    win-rate table, the five windows where the VAE loses by the most, and
    summary means. MSE and spread are rescaled with the module-level ``sig``
    of feature 0. Returns None.
    """

    def _show(table):
        # Rich rendering when IPython is usable, plain text otherwise.
        try:
            from IPython.display import display
            display(table)
        except Exception:
            print(table.to_string(index=False))

    lstm.eval(); vae.eval()
    sig_click = float(sig[0])
    mu_click  = float(mu[0])
    var_click = sig_click ** 2

    def _raw_rows(z_vals):
        # Undo the standardisation for a (B, T) block, rounded to 3 decimals.
        return (z_vals * sig_click + mu_click).detach().cpu().numpy().round(3).tolist()

    per_rows = []
    for x, y in ld_te:
        x = x.to(device)
        y = y.to(device)
        # Single LSTM path first, then the VAE draws: (S, B, T, D).
        y_lstm = lstm(x)
        Ys = vae.sample(x, n_samples=VAE_SAMPLES)

        # Clicks dimension only (z-scale).
        y_true_z = y[..., 0]
        y_lstm_z = y_lstm[..., 0]
        Ys_z     = Ys[..., 0]  # (S, B, T)

        # MSE in de-standardised units: z-scale MSE times sigma^2.
        lstm_mse_seq = (y_lstm_z - y_true_z).pow(2).mean(dim=1) * var_click   # (B,)
        per_sample_mse = (Ys_z - y_true_z.unsqueeze(0)).pow(2).mean(dim=2) * var_click  # (S, B)
        best_mse_vals, best_idx = per_sample_mse.min(dim=0)  # (B,)

        # Diversity: std across draws, averaged over time, de-standardised.
        diversity_std_seq = Ys_z.std(dim=0).mean(dim=1) * sig_click  # (B,)

        # Coverage: share of steps where any draw is within cover_eps (z-scale).
        within = (Ys_z - y_true_z.unsqueeze(0)).abs() <= cover_eps
        cov_seq = within.any(dim=0).float().mean(dim=1)  # (B,)

        # Pick each window's best draw, then convert whole batches at once.
        batch_ids = torch.arange(x.size(0), device=Ys_z.device)
        batch_columns = zip(
            lstm_mse_seq.tolist(),
            best_mse_vals.tolist(),
            _raw_rows(y_true_z),
            _raw_rows(y_lstm_z),
            _raw_rows(Ys_z[best_idx, batch_ids]),
            diversity_std_seq.tolist(),
            cov_seq.tolist(),
        )
        for l_mse, v_mse, true_raw, lstm_raw, vae_raw, spread, coverage in batch_columns:
            per_rows.append({
                "idx": len(per_rows),
                "LSTM_MSE": l_mse,
                "VAE_best_MSE": v_mse,
                "delta": l_mse - v_mse,
                "y_true": true_raw,
                "y_LSTM": lstm_raw,
                "y_VAE_best": vae_raw,
                "Diversity_std": spread,
                "Coverage_seq": coverage,
            })

    df = pd.DataFrame(per_rows)

    # === Win-rate by improvement bucket ===
    print("=== Win-rate by improvement bucket (Δ = LSTM MSE - VAE best MSE) ===")
    bins = [-np.inf, -1000, -200, -50, -10, 10, 50, 200, 1000, np.inf]
    labels = [
        "VAE<<劣(>1000)",
        "VAE劣(200~1000)",
        "VAE劣(50~200)",
        "VAE劣(10~50)",
        "持平(-10~10)",
        "VAE勝(10~50)",
        "VAE勝(50~200)",
        "VAE勝(200~1000)",
        "VAE>>大勝(>1000)",
    ]
    df["bucket"] = pd.cut(df["delta"], bins=bins, labels=labels)
    bucket_counts = df["bucket"].value_counts().reindex(labels, fill_value=0)
    win_table = pd.DataFrame({
        "Improvement bucket": bucket_counts.index,
        "count": bucket_counts.values,
        "ratio": (bucket_counts / len(df)).round(4).values,
    })
    _show(win_table)

    # === Top-5 regressed (VAE clearly worse than the LSTM) ===
    print("=== Top-5 Regressed (VAE best >> LSTM) ===")
    top5_cols = ["idx", "LSTM_MSE", "VAE_best_MSE", "delta", "y_true", "y_LSTM", "y_VAE_best", "Diversity_std"]
    _show(df.nsmallest(5, "delta")[top5_cols])

    # === Summary (de-standardised scale) ===
    print("========== 評估結果（原始尺度） ==========")
    print(f"LSTM  MSE（整體） : {df['LSTM_MSE'].mean():8.4f}")
    print(f"VAE   Best-of-N MSE: {df['VAE_best_MSE'].mean():8.4f}  (N={VAE_SAMPLES})")
    print(f"VAE   Diversity (std): {df['Diversity_std'].mean():6.4f}")
    print(f"VAE   Coverage（比例）: {df['Coverage_seq'].mean():6.4f}")

@torch.no_grad()
def eval_oulad_tables(lstm, vae, ld_te, mu, sig, cover_eps, n_samples=20, feature_index=0):
    """Evaluate LSTM vs. best-of-N VAE on one feature and build summary tables.

    Args:
        lstm: trained deterministic model; ``lstm(x)`` returns the forecast.
        vae: trained VAE; ``vae.sample(x, n_samples=...)`` returns the draws
            with the sample dimension first.
        ld_te: DataLoader yielding (x, y) test batches.
        mu, sig: per-feature mean / std used to standardise the data.
        cover_eps: coverage tolerance, applied in standardised units.
        n_samples: number of VAE draws per window.
        feature_index: which feature to evaluate.

    Returns:
        ``(df, win_table, top5)``: the per-window metrics, the bucketed
        win-rate table, and the five windows where the VAE loses by the most.

    Raises:
        ValueError: if ``ld_te`` yields no samples (previously this surfaced
            as an opaque KeyError on the empty DataFrame).
    """
    import pandas as pd
    import numpy as np
    import torch
    lstm.eval(); vae.eval()
    sig_f = float(sig[feature_index]); mu_f = float(mu[feature_index])

    def _raw_rows(z_vals):
        # Undo the standardisation for a (B, T) block, rounded to 3 decimals.
        return (z_vals * sig_f + mu_f).detach().cpu().numpy().round(3).tolist()

    rows = []
    for x, y in ld_te:
        x = x.to(device); y = y.to(device)
        y_l = lstm(x)
        Ys = vae.sample(x, n_samples=n_samples)
        y_true = y[:, :, feature_index]
        y_l = y_l[:, :, feature_index]
        Ys_f = Ys[:, :, :, feature_index]
        # z-scale MSE times sigma^2 gives the MSE in de-standardised units.
        lstm_mse = ((y_l - y_true)**2).mean(dim=1) * (sig_f**2)
        per_s_mse = ((Ys_f - y_true.unsqueeze(0))**2).mean(dim=2) * (sig_f**2)
        best_mse, best_idx = per_s_mse.min(dim=0)
        if Ys_f.size(0) > 1:
            diversity = Ys_f.std(dim=0).mean(dim=1) * sig_f
        else:
            # The std of a single draw is NaN; one sample has no spread.
            diversity = torch.zeros_like(best_mse)
        cover = (torch.abs(Ys_f - y_true.unsqueeze(0)) <= cover_eps).any(dim=0).float().mean(dim=1)
        # Gather each window's best draw, then move whole batches to the host
        # once instead of synchronising with the device per element.
        best_paths = Ys_f[best_idx, torch.arange(Ys_f.size(1), device=Ys_f.device)]
        batch_columns = zip(
            lstm_mse.tolist(),
            best_mse.tolist(),
            (lstm_mse - best_mse).tolist(),
            _raw_rows(y_true),
            _raw_rows(y_l),
            _raw_rows(best_paths),
            diversity.tolist(),
            cover.tolist(),
        )
        for l_mse, v_mse, delta, true_raw, lstm_raw, vae_raw, spread, coverage in batch_columns:
            rows.append({
                'idx': len(rows),
                'LSTM_MSE': l_mse,
                'VAE_best_MSE': v_mse,
                'Δ(LSTM-VAE)': delta,
                'y_true': true_raw,
                'y_LSTM': lstm_raw,
                'y_VAE_best': vae_raw,
                'Diversity_std': spread,
                'Coverage_seq': coverage
            })
    if not rows:
        raise ValueError("eval_oulad_tables: ld_te yielded no samples to evaluate")
    df = pd.DataFrame(rows)
    bins = [-np.inf,-1000,-200,-50,-10,10,50,200,1000,np.inf]
    labels = ["VAE<<劣(>1000)","VAE劣(200~1000)","VAE劣(50~200)","VAE劣(10~50)","持平(-10~10)","VAE勝(10~50)","VAE勝(50~200)","VAE勝(200~1000)","VAE>>大勝(>1000)"]
    df['bucket'] = pd.cut(df['Δ(LSTM-VAE)'], bins=bins, labels=labels)
    win_counts = df['bucket'].value_counts().reindex(labels, fill_value=0)
    win_table = win_counts.rename_axis('Improvement bucket').reset_index(name='count')
    win_table['ratio'] = (win_table['count']/len(df)).round(4)
    try:
        from IPython.display import display
        display(win_table)
    except Exception:
        print(win_table.to_string(index=False))
    # Most negative difference first: windows where the best VAE draw is worst
    # relative to the LSTM.
    top5 = df.nsmallest(5,'Δ(LSTM-VAE)')[['idx','LSTM_MSE','VAE_best_MSE','Δ(LSTM-VAE)','y_true','y_LSTM','y_VAE_best','Diversity_std']]
    try:
        from IPython.display import display
        display(top5)
    except Exception:
        print(top5.to_string(index=False))
    print('========== 評估結果（原始尺度） ==========')
    print(f'LSTM  MSE（整體） : {df["LSTM_MSE"].mean():.4f}')
    print(f'VAE   Best-of-N MSE: {df["VAE_best_MSE"].mean():.4f}  (N={n_samples})')
    print(f'VAE   Diversity (std): {df["Diversity_std"].mean():.4f}')
    print(f'VAE   Coverage（比例）: {df["Coverage_seq"].mean():.4f}')
    return df, win_table, top5

# ===============
# Main program
# ===============
if __name__ == "__main__":
    for config_line in (
        "=== Training Config ===",
        f"Seed={SEED}, Batch={BATCH_SIZE}, Optim=Adam(lr={LR})",
        f"Epochs: LSTM={EPOCHS_LSTM}, VAE={EPOCHS_VAE}",
        f"Hidden={HIDDEN_SIZE}, Latent={LATENT_SIZE}, Activation={ACT_NAME}",
    ):
        print(config_line)

    # LSTM: train, then restore the checkpoint with the best validation loss.
    lstm = Seq2SeqLSTM(FEAT_DIM, HIDDEN_SIZE, NUM_LAYERS).to(device)
    print(">>> 訓練 LSTM...")
    _ = train_lstm(lstm, ld_tr, ld_va)
    lstm_ckpt = os.path.join(OUTPUT_DIR, "lstm_best.pt")
    lstm.load_state_dict(torch.load(lstm_ckpt, map_location=device))

    # VAE: same procedure.
    vae = Seq2SeqVAE(FEAT_DIM, HIDDEN_SIZE, LATENT_SIZE, NUM_LAYERS).to(device)
    print(">>> 訓練 VAE...")
    _ = train_vae(vae, ld_tr, ld_va)
    vae_ckpt = os.path.join(OUTPUT_DIR, "vae_best.pt")
    vae.load_state_dict(torch.load(vae_ckpt, map_location=device))

    # Emit only the two summary tables (win-rate buckets and top-5 regressed).
    print(">>> 生成彙總表...")
    _ = eval_oulad_tables(
        lstm, vae, ld_te, mu, sig, cover_eps,
        n_samples=VAE_SAMPLES, feature_index=0,
    )


  .apply(lambda g: g.assign(
  .apply(lambda g: g.assign(clicks_diff1 = g["clicks"].diff().fillna(0.0)))


可用學生序列數：25247 ；每條長度（週）=45，特徵數=4
Train windows: 494816  | Val: 106036 | Test: 106064
=== Training Config ===
Seed=42, Batch=128, Optim=Adam(lr=0.001)
Epochs: LSTM=12, VAE=14
Hidden=64, Latent=32, Activation=tanh
>>> 訓練 LSTM...
[LSTM] Epoch 01 | train 0.3784 | val 0.4234 | teacher 0.63
[LSTM] Epoch 02 | train 0.3604 | val 0.4145 | teacher 0.57
[LSTM] Epoch 03 | train 0.3594 | val 0.4109 | teacher 0.51
[LSTM] Epoch 04 | train 0.3602 | val 0.4082 | teacher 0.46
[LSTM] Epoch 05 | train 0.3621 | val 0.4072 | teacher 0.41
[LSTM] Epoch 06 | train 0.3641 | val 0.4051 | teacher 0.37
[LSTM] Epoch 07 | train 0.3661 | val 0.4030 | teacher 0.33
[LSTM] Epoch 08 | train 0.3676 | val 0.4022 | teacher 0.30
[LSTM] Epoch 09 | train 0.3702 | val 0.4028 | teacher 0.27
[LSTM] Epoch 10 | train 0.3715 | val 0.4005 | teacher 0.24
[LSTM] Epoch 11 | train 0.3738 | val 0.4003 | teacher 0.22
[LSTM] Epoch 12 | train 0.3744 | val 0.4005 | teacher 0.20
[LSTM] Total training time: 1416.38s
>>> 訓練 VAE...
[VAE ] Epoch 01

Unnamed: 0,Improvement bucket,count,ratio
0,VAE<<劣(>1000),0,0.0
1,VAE劣(200~1000),0,0.0
2,VAE劣(50~200),0,0.0
3,VAE劣(10~50),4,0.0
4,持平(-10~10),105999,0.9994
5,VAE勝(10~50),61,0.0006
6,VAE勝(50~200),0,0.0
7,VAE勝(200~1000),0,0.0
8,VAE>>大勝(>1000),0,0.0


Unnamed: 0,idx,LSTM_MSE,VAE_best_MSE,Δ(LSTM-VAE),y_true,y_LSTM,y_VAE_best,Diversity_std
66220,66220,1.560592,15.836799,-14.276206,"[4.059999942779541, 3.8289999961853027, 4.3689...","[2.5739998817443848, 2.9730000495910645, 3.101...","[0.31200000643730164, 0.24199999868869781, 0.2...",0.027685
34972,34972,2.316921,14.858671,-12.541751,"[3.8289999961853027, 3.638000011444092, 5.2829...","[2.2850000858306885, 2.5309998989105225, 2.611...","[0.24199999868869781, 0.24799999594688416, 0.2...",0.028501
30689,30689,2.34324,13.452154,-11.108914,"[5.914000034332275, 4.6539998054504395, 3.4660...","[2.861999988555908, 3.3970000743865967, 3.5729...","[0.9860000014305115, 0.9490000009536743, 0.952...",0.06031
57680,57680,1.954371,12.85912,-10.904749,"[5.497000217437744, 4.263000011444092, 1.94599...","[3.2269999980926514, 3.6419999599456787, 3.375...","[0.34299999475479126, 0.21299999952316284, 0.2...",0.02822
1960,1960,1.998083,11.617039,-9.618956,"[2.3980000019073486, 5.302999973297119, 1.0989...","[2.4200000762939453, 2.5899999141693115, 2.525...","[0.22599999606609344, 0.19300000369548798, 0.1...",0.019325


LSTM  MSE（整體） : 1.3934
VAE   Best-of-N MSE: 1.1169  (N=40)
VAE   Diversity (std): 0.1951
VAE   Coverage（比例）: 0.6348
