In [1]:
from __future__ import annotations
import sys
import os

sys.path.append(os.path.abspath("../.."))

import argparse
import math
import random
from pathlib import Path
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, Subset
import matplotlib.pyplot as plt

from vol_predict.models.mixture.tm_pred2 import (
    TM_N_Predictor,
    TM_LN_Predictor,
    TM_IG_Predictor,
    TM_W_Predictor,
    TM_HN_W_Predictor,
    TM_HN_IG_Predictor
)
from vol_predict.loss.tm_loss import (
    MixtureNormalNLL,
    HingeNormalMixtureNLL,
    MixtureLogNormalNLL,
    MixtureInverseGaussianNLL,
    MixtureWeibullNLL,
    GenericMixtureNLL,
    MixtureHingeNormalWeibullNLL,
    MixtureHingeNormalInvGaussianNLL
)

# --------------------------------------------------------------------------- #
#  Reproducibility                                                            #
# --------------------------------------------------------------------------- #
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True  # might slow down but ensures reproducibility
    torch.backends.cudnn.benchmark = False
    
set_seed()

# --------------------------------------------------------------------------- #
#  Data utilities                                                             #
# --------------------------------------------------------------------------- #
def preprocess_csv(csv_path: Path) -> pd.DataFrame:
    """
    Load, clean and standardise the raw CSV exactly as required.
    """
    data = pd.read_csv(csv_path)

    # --- drop unneeded columns ------------------------------------------------
    data.dropna(inplace=True)
    cols_to_drop = [c for c in data.columns if c.startswith("ask_depth_t")]
    data.drop(columns=cols_to_drop, inplace=True)
    cols_to_drop = [c for c in data.columns if c.startswith("ret")]
    data.drop(columns=cols_to_drop, inplace=True)
    data.drop(columns='datetime', inplace=True)
    # --- z-score for *all* features except timestamp / vol --------------------
    vol_std = data["vol"].std()

    for col in data.columns[1:]:
        m, s = data[col].mean(), data[col].std()
        data[col] = (data[col] - m) / s

    data["vol"] = data["vol"] / vol_std
    return data


class TMDataset(Dataset):
    """Dataset producing (past_vol, features_flat, target_vol)."""

    def __init__(self, df: pd.DataFrame, ar_order: int = 16):
        super().__init__()
        self.ar_order = ar_order

        self.vol_array = df["vol"].values.astype(np.float32)
        self.feat_array = df.iloc[:, 1:].values.astype(np.float32)

        self.valid_idx = range(ar_order, len(df))

    def __len__(self) -> int:
        return len(self.valid_idx)

    def __getitem__(self, idx: int):
        i = self.valid_idx[idx]
        past_vol = self.vol_array[i - self.ar_order : i]           # [ar_order]
        feats = self.feat_array[i]                                 # [n*lb]
        target = self.vol_array[i]

        return (torch.tensor(past_vol, dtype=torch.float32),
                torch.tensor(feats, dtype=torch.float32),
                torch.tensor(target, dtype=torch.float32))


# --------------------------------------------------------------------------- #
#  Model / loss factory                                                       #
# --------------------------------------------------------------------------- #
def model_and_loss(model_type: str,
                   ar_order: int,
                   n: int,
                   lb: int,
                   penalty_coef: float = 1.0,
                   kl_weight: float = 0,
                   delta: float = 0.0,
                   l2_coef: float = 0.0,
                   eps: float = 1e-12) -> Tuple[nn.Module, nn.Module]:

    if model_type == "normal":
        model = TM_N_Predictor(ar_order, n, lb)
        loss = MixtureNormalNLL(eps=eps, l2_coef=l2_coef)
    elif model_type == "hinge":
        model = TM_N_Predictor(ar_order, n, lb)
        loss = HingeNormalMixtureNLL(
            penalty_coef=penalty_coef, delta=delta,
            eps=eps, l2_coef=l2_coef
        )
    elif model_type == "lognormal":
        model = TM_LN_Predictor(ar_order, n, lb)
        loss = GenericMixtureNLL(crps_weight=0.1,kl_weight = kl_weight, l2_coef=l2_coef)
        loss.set_model(model)
    elif model_type == "inverse_gaussian":
        model = TM_IG_Predictor(ar_order, n, lb)
        loss = GenericMixtureNLL(crps_weight=0.1,kl_weight = kl_weight, l2_coef=l2_coef)
        loss.set_model(model)
    elif model_type == "weibull":
        model = TM_W_Predictor(ar_order, n, lb)
        loss = GenericMixtureNLL(crps_weight=0.1, kl_weight = kl_weight, l2_coef=l2_coef)
        loss.set_model(model)
    elif model_type == "hinge_weibull":
        model = TM_HN_W_Predictor(ar_order, n, lb)
        loss = MixtureHingeNormalWeibullNLL(
            penalty_coef=penalty_coef, delta=delta,kl_weight = kl_weight,
            eps=eps, l2_coef=l2_coef
        )
    elif model_type == "hinge_inverse_gaussian":
        model = TM_HN_IG_Predictor(ar_order, n, lb)
        loss = MixtureHingeNormalInvGaussianNLL(
            penalty_coef=penalty_coef, delta=delta,kl_weight = kl_weight,
            eps=eps, l2_coef=l2_coef
        )
    else:
        raise ValueError(f"Unknown model_type '{model_type}'")

    if hasattr(loss, "set_model"):
        loss.set_model(model)
    return model, loss


# --------------------------------------------------------------------------- #
#  Core training helpers (one epoch & eval)                                   #
# --------------------------------------------------------------------------- #
def epoch_step(loader: DataLoader,
               model: nn.Module,
               loss_fn: nn.Module,
               optimiser: torch.optim.Optimizer | None,
               device: torch.device) -> float:
    is_train = optimiser is not None
    model.train(mode=is_train)

    total, n = 0.0, 0
    for vol_hist, feats, target in loader:
        vol_hist, feats, target = (vol_hist.to(device),
                                   feats.to(device),
                                   target.to(device))

        if is_train:
            optimiser.zero_grad()

        roll_mean = vol_hist.mean(dim=1)
        out = model(vol_hist, feats, roll_mean=roll_mean)
        loss = loss_fn(target, out, model)

        if is_train:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            optimiser.step()

        total += loss.item() * target.size(0)
        n += target.size(0)

    return total / n


@torch.no_grad()
def compute_metrics(loader: DataLoader,
                    model: nn.Module,
                    loss_fn: nn.Module,
                    device: torch.device) -> Dict[str, float]:

    model.eval()
    tot_sq, tot_abs, n = 0.0, 0.0, 0
    for vol_hist, feats, target in loader:
        vol_hist, feats, target = (vol_hist.to(device),
                                   feats.to(device),
                                   target.to(device))
        roll_mean = vol_hist.mean(dim=1)
        out = model(vol_hist, feats, roll_mean=roll_mean)
        pred = out["mixture_mean"]
        diff = pred - target
        tot_sq += (diff ** 2).sum().item()
        tot_abs += diff.abs().sum().item()
        n += target.size(0)

    return dict(rmse=math.sqrt(tot_sq / n), mae=tot_abs / n, count=n)

def winsorize_subset(df_base: pd.DataFrame,
                     rows: List[int],
                     thresh: float) -> pd.DataFrame:
    df_w = df_base.iloc[rows].copy()
    df_w = df_w.clip(lower=-thresh, upper=thresh)
    return df_w.reset_index(drop=True)
    
# --------------------------------------------------------------------------- #
#  Single-split training (original pipeline)                                  #
# --------------------------------------------------------------------------- #
def run_single_strategy(df: pd.DataFrame,
                        args) -> None:
    """Keeps the behaviour of the original `run_training` pathway."""
    device = torch.device(args.device)
    dataset = TMDataset(df, ar_order=args.ar_order)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_ds = Subset(dataset, range(train_size))
    val_ds = Subset(dataset, range(train_size, train_size + val_size))

    n_features = df.shape[1] - 2
    if n_features % args.lb != 0:
        raise ValueError("lb must divide total feature dimension.")
    n_feat = n_features // args.lb

    model, loss_fn = model_and_loss(
        args.model_type, args.ar_order, n_feat, args.lb,
        args.penalty_coef, args.delta, args.l2_coef, args.eps
    )
    model.to(device)

    opt = torch.optim.Adam(model.parameters(), lr=args.lr)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=5, factor=0.5)

    train_loader = DataLoader(train_ds, batch_size=args.batch, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=args.batch, shuffle=False)

    best_val = float("inf")
    for epoch in range(1, args.epochs + 1):
        tr = epoch_step(train_loader, model, loss_fn, opt, device)
        vl = epoch_step(val_loader, model, loss_fn, None, device)
        sched.step(vl)
        if vl < best_val:
            best_val = vl
            best_state = model.state_dict()
        print(f"[single] epoch {epoch:02d}/{args.epochs}  "
              f"train={tr:.4f}  val={vl:.4f}")

    model.load_state_dict(best_state)
    metrics = compute_metrics(val_loader, model, loss_fn, device)
    print(f"[single] final  RMSE={metrics['rmse']:.4f}  "
          f"MAE={metrics['mae']:.4f}")


# --------------------------------------------------------------------------- #
#  Rolling / Incremental strategies                                           #
# --------------------------------------------------------------------------- #
def split_intervals(total_len: int, interval: int) -> List[Tuple[int, int]]:
    return [(i, min(i + interval, total_len))
            for i in range(0, total_len, interval)]


def run_rolling_incremental(df: pd.DataFrame,
                            args,
                            strategy: str) -> None:
    """
    Either a *rolling* or *incremental* evaluation loop.

    rolling:
        – test interval has `interval_hours` points
        – training uses `back_intervals` previous intervals
    incremental:
        – test interval as above
        – training window grows with each iteration
    """
    device = torch.device(args.device)
    dataset = TMDataset(df, ar_order=args.ar_order)
    full_len = len(dataset)
    intervals = split_intervals(full_len, args.interval_hours)

    n_features = df.shape[1] - 1
    if n_features % args.lb != 0:
        raise ValueError("lb must divide total feature dimension.")
    n_feat = n_features // args.lb

    history: List[Dict[str, float]] = []
    accumulated_train: List[int] = []

    for idx, (beg, end) in enumerate(intervals):
        if strategy == "rolling":
            train_beg = max(0, beg - args.back_intervals * args.interval_hours)
            train_idx = list(range(train_beg, beg))
        else:  # incremental
            if idx == 0:
                train_idx = list(range(0, beg))
            else:
                accumulated_train.extend(range(intervals[idx - 1][0],
                                               intervals[idx - 1][1]))
                train_idx = accumulated_train.copy()

        if not train_idx:
            # first interval for incremental – skip evaluation
            accumulated_train.extend(range(beg, end))
            continue

        train_ds = Subset(dataset, train_idx)
        test_ds = Subset(dataset, range(beg, end))
        tr_loader = DataLoader(train_ds, batch_size=args.batch, shuffle=True)
        te_loader = DataLoader(test_ds, batch_size=args.batch, shuffle=False)

        model, loss_fn = model_and_loss(
            args.model_type, args.ar_order, n_feat, args.lb,
            args.penalty_coef, args.delta, args.l2_coef, args.eps
        )
        model.to(device)
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)

        # --- quick training (few epochs) --------------------------------------
        best = float("inf")
        for epoch in range(1, args.epochs + 1):
            tr_loss = epoch_step(tr_loader, model, loss_fn, opt, device)
            if tr_loss < best:
                best = tr_loss
                best_state = model.state_dict()

        model.load_state_dict(best_state)
        metrics = compute_metrics(te_loader, model, loss_fn, device)
        print(f"[{strategy}] interval {idx:02d}  "
              f"RMSE={metrics['rmse']:.4f}  MAE={metrics['mae']:.4f}")
        history.append(dict(interval=idx, **metrics))

        if strategy == "incremental":
            accumulated_train.extend(range(beg, end))

    print(f"[{strategy}] done – evaluated {len(history)} intervals.")

# --- poprawiona funkcja quick_plot – zgodna z wymiarem gate_weights ---------
def quick_plot_separate(loader: DataLoader,
                        mdl: nn.Module,
                        dev: torch.device,
                        title: str,
                        train_intervals,
                        test_interval: int,
                        points: int = 100):
    """
    Rysuje dwa **oddzielne** wykresy:
        • fig_pred  – True vs Predicted volatility
        • fig_gate  – Gate weights (stacked-area), jeżeli model je zwraca

    Zwraca krotkę (fig_pred, fig_gate).  `fig_gate` == None,
    gdy model nie zwraca 'gate_weights'.
    """
    preds, trues, gates = [], [], []
    mdl.eval()
    with torch.no_grad():
        for vhist, feats, tgt in loader:
            vhist, feats = vhist.to(dev), feats.to(dev)
            roll_mean = vhist.mean(dim=1)
            out = mdl(vhist, feats, roll_mean=roll_mean)
            preds.append(out["mixture_mean"].cpu().numpy())
            trues.append(tgt.cpu().numpy())
            if "gate_weights" in out:
                gates.append(out["gate_weights"].cpu().numpy())

    preds  = np.concatenate(preds)[:points]
    trues  = np.concatenate(trues)[:points]
    gates  = np.concatenate(gates)[:points] if gates else None
    if gates is not None and gates.ndim > 1:
        gates = gates[:, 0]          # redukcja do 1-D

    # ---------- (1) True vs Pred ------------------------------------
    fig_pred, ax = plt.subplots(1, 1, figsize=(12, 4))
    ax.plot(trues, label="True", linewidth=2)
    ax.plot(preds, label="Predicted", linewidth=2)
    ax.set_ylabel("Std-scaled volatility")
    ax.set_xlabel("Time step")
    ax.set_title(f"{title}\n[train: {train_intervals}] → [test: {test_interval}]")
    ax.grid(True)
    ax.legend()
    plt.show()

    # ---------- (2) Gate weights ------------------------------------
    fig_gate = None
    if gates is not None:
        fig_gate, axg = plt.subplots(1, 1, figsize=(12, 2.5))
        x = np.arange(len(gates))
        axg.fill_between(x, 0, gates, color="#d9d96f")   # volatility (żółte)
        axg.fill_between(x, gates, 1.0, color="#7f7f7f") # order book (szare)
        axg.set_ylim(0, 1)
        axg.set_ylabel("gate g(t)")
        axg.set_xlabel("Time step")
        axg.grid(True)
        axg.legend(["Volatility", "Order book"],
                   loc="upper right", framealpha=0.9)
        plt.show()

    return fig_pred, fig_gate

In [2]:
print(torch.__version__)
print(torch.__file__)

2.6.0
/opt/anaconda3/envs/volatilityenv/lib/python3.10/site-packages/torch/__init__.py


In [3]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True  # might slow down but ensures reproducibility
    torch.backends.cudnn.benchmark = False

def results(csv_path = "../../data/spx/btc-2/data_df.csv",
strategy = "rolling" ,
model_type = "hinge",
ar_order = 32,
lb = 120,
lr = 0.0001,
batch = 32,
epochs = 70,
penalty_coef = 1.0,
delta = 0.0,
l2_coef = 0.001,
kl_weight = 0.05,
eps = 1e-12,
device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
interval_hours = 720,
back_intervals = 2,
return_history = False,
do_plot = False,
do_print = False):
    df = preprocess_csv(csv_path)
    n_features = df.shape[1] - 1
    assert n_features % lb == 0, "lb must divide total feature dimension."
    n_feat = n_features // lb

    dataset = TMDataset(df, ar_order=ar_order)
    full_len = len(dataset)

    if strategy == "single":
        train_size = int(0.8 * len(dataset))
        train_indices = list(range(train_size))
        outlier_thresh = 500
        train_df_win = winsorize_subset(df, train_indices, outlier_thresh)
        train_ds = TMDataset(train_df_win, ar_order=ar_order)
        #train_ds = Subset(dataset, range(train_size))
        val_ds = Subset(dataset, range(train_size, len(dataset)))

        train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch, shuffle=False)

        model, loss_fn = model_and_loss(model_type, ar_order, n_feat, lb,
                                        penalty_coef, delta, l2_coef, eps)
        model.to(device)
        opt = torch.optim.Adam(model.parameters(), lr=lr)
        sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=5, factor=0.5)

        best_val = float("inf")
        for epoch in range(1, epochs + 1):
            tr = epoch_step(train_loader, model, loss_fn, opt, device)
            vl = epoch_step(val_loader, model, loss_fn, None, device)
            sched.step(vl)
            if vl < best_val:
                best_val = vl
                best_state = model.state_dict()
            print(f"[single] epoch {epoch:02d}/{epochs}  train={tr:.4f}  val={vl:.4f}")

        model.load_state_dict(best_state)
        metrics = compute_metrics(val_loader, model, loss_fn, device)
        print(f"[single] final RMSE={metrics['rmse']:.4f}  MAE={metrics['mae']:.4f}")
        if do_plot:
            quick_plot_separate(val_loader, model, device)

    else:
        intervals = split_intervals(full_len, interval_hours)
        history: List[Dict[str, float]] = []
        accumulated_train: List[int] = []

        for idx, (beg, end) in enumerate(intervals):
            if strategy == "rolling":
                train_beg = max(0, beg - back_intervals * interval_hours)
                train_idx = list(range(train_beg, beg))
            else:  # incremental
                if idx == 0:
                    train_idx = list(range(0, beg))
                else:
                    accumulated_train.extend(range(intervals[idx - 1][0], intervals[idx - 1][1]))
                    train_idx = accumulated_train.copy()

            if not train_idx:
                accumulated_train.extend(range(beg, end))
                continue

            outlier_thresh = 500
            train_df_win = winsorize_subset(df, train_idx, outlier_thresh)
            train_ds = TMDataset(train_df_win, ar_order=ar_order)
            test_ds = Subset(dataset, range(beg, end))
            tr_loader = DataLoader(train_ds, batch_size=batch, shuffle=True)
            te_loader = DataLoader(test_ds, batch_size=batch, shuffle=False)

            model, loss_fn = model_and_loss(model_type, ar_order, n_feat, lb, kl_weight,
                                            penalty_coef, delta, l2_coef, eps)
            model.to(device)
            opt = torch.optim.Adam(model.parameters(), lr=lr)

            best = float("inf")
            for epoch in range(1, epochs + 1):
                tr_loss = epoch_step(tr_loader, model, loss_fn, opt, device)
                if tr_loss < best:
                    best = tr_loss
                    best_state = model.state_dict()

            model.load_state_dict(best_state)
            metrics = compute_metrics(te_loader, model, loss_fn, device)
            if do_print:
                print(f"[{strategy}] interval {idx:02d}  RMSE={metrics['rmse']:.4f}  MAE={metrics['mae']:.4f}")
            history.append(dict(interval=idx, **metrics))
            if do_plot:
                quick_plot_separate(te_loader, model, device,
                    title=f"{strategy.capitalize()} – interval {idx:02d}",
                    train_intervals=[train_beg // interval_hours + i
                                     for i in range((beg - train_beg) // interval_hours)]
                                   if strategy == "rolling"
                                   else list(range(0, idx)),
                    test_interval=idx)


            if strategy == "incremental":
                accumulated_train.extend(range(beg, end))

        print(f"[{strategy}] for model {model_type} finished {len(history)} intervals.")
        if return_history:
            return history

In [4]:
model_list = ["normal", "hinge", "lognormal", "inverse_gaussian", "weibull","hinge_weibull","hinge_inverse_gaussian"]

all_histories = {}
all_preds = {}

for model_name in model_list:
    print(f"\n=== Training model: {model_name.upper()} ===\n")
    history = results(csv_path="../../data/spx/btc-2/data_df.csv",
                      strategy="rolling",
                      model_type=model_name,
                      ar_order=64,
                      lb=120,
                      lr=0.001,
                      batch=32,
                      epochs=70,
                      penalty_coef=1.0,
                      delta=0.0,
                      kl_weight=0.05,
                      l2_coef=0.01,
                      eps=1e-8,
                      device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                      interval_hours=720,
                      back_intervals=2,
                      return_history=True,
                      do_plot=False)
    all_histories[model_name] = history



=== Training model: NORMAL ===

[rolling] for model normal finished 3 intervals.

=== Training model: HINGE ===

[rolling] for model hinge finished 3 intervals.

=== Training model: LOGNORMAL ===

[rolling] for model lognormal finished 3 intervals.

=== Training model: INVERSE_GAUSSIAN ===

[rolling] for model inverse_gaussian finished 3 intervals.

=== Training model: WEIBULL ===

[rolling] for model weibull finished 3 intervals.

=== Training model: HINGE_WEIBULL ===

[rolling] for model hinge_weibull finished 3 intervals.

=== Training model: HINGE_INVERSE_GAUSSIAN ===

[rolling] for model hinge_inverse_gaussian finished 3 intervals.


In [5]:

vol_std = pd.read_csv("../../data/spx/btc-2/data_df.csv")["vol"].std()
rmse_table = {}
mae_table = {}

for model_name, hist in all_histories.items():
    df = pd.DataFrame(hist).set_index('interval')
    rmse_table[model_name] = df['rmse']
    mae_table[model_name] = df['mae']

rmse_df = pd.DataFrame(rmse_table).T 
mae_df = pd.DataFrame(mae_table).T

rmse_df = rmse_df.reindex(sorted(rmse_df.columns), axis=1)
mae_df = mae_df.reindex(sorted(mae_df.columns), axis=1)

rmse_best = rmse_df.idxmin(axis=0)
mae_best = mae_df.idxmin(axis=0)

rmse_df.loc["best"] = rmse_best
mae_df.loc["best"] = mae_best

rmse_df_unscaled = rmse_df.drop(index="best").astype(float) * vol_std
mae_df_unscaled = mae_df.drop(index="best").astype(float) * vol_std

rmse_df_unscaled.loc["best"] = rmse_df_unscaled.idxmin(axis=0)
mae_df_unscaled.loc["best"] = mae_df_unscaled.idxmin(axis=0)

combined = pd.concat(
    [rmse_df_unscaled, mae_df_unscaled],
    keys=['RMSE', 'MAE'],
    axis=1
)

combined = combined.reorder_levels([0,1], axis=1).sort_index(axis=1, level=0)

display(combined)


Unnamed: 0_level_0,MAE,MAE,MAE,RMSE,RMSE,RMSE
interval,1,2,3,1,2,3
normal,0.000112,0.000087,0.00005,0.000273,0.000257,0.000099
hinge,0.000108,0.000088,0.000062,0.000277,0.000262,0.000102
lognormal,0.000305,0.000313,0.000263,0.000428,0.000384,0.000274
inverse_gaussian,0.000156,0.00015,0.000137,0.000293,0.000281,0.000156
weibull,0.000125,0.000151,0.000055,0.000535,0.000296,0.000098
hinge_weibull,0.000105,0.0001,0.000057,0.000272,0.000265,0.000103
hinge_inverse_gaussian,0.000123,0.000127,0.00008,0.000283,0.000288,0.000116
best,hinge_weibull,normal,normal,hinge_weibull,normal,weibull


In [6]:
lr_list = [0.1,0.01,0.001,0.0001]
if False:
    for lr in lr_list:
        model_list = ["normal", "hinge", "lognormal", "inverse_gaussian", "weibull","hinge_weibull","hinge_inverse_gaussian"]

        all_histories = {}
        all_preds = {}

        for model_name in model_list:
            print(f"\n=== Training model: {model_name.upper()} ===\n")
            history = results(csv_path="../../data/spx/btc-2/data_df.csv",
                            strategy="rolling",
                            model_type=model_name,
                            ar_order=64,
                            lb=120,
                            lr=0.001,
                            batch=32,
                            epochs=70,
                            penalty_coef=1.0,
                            delta=0.0,
                            kl_weight=0.05,
                            l2_coef=0.01,
                            eps=1e-8,
                            device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                            interval_hours=720,
                            back_intervals=2,
                            return_history=True,
                            do_plot=False)
            all_histories[model_name] = history

        vol_std = pd.read_csv("../../data/spx/btc-2/data_df.csv")["vol"].std()
        rmse_table = {}
        mae_table = {}

        for model_name, hist in all_histories.items():
            df = pd.DataFrame(hist).set_index('interval')
            rmse_table[model_name] = df['rmse']
            mae_table[model_name] = df['mae']

        rmse_df = pd.DataFrame(rmse_table).T 
        mae_df = pd.DataFrame(mae_table).T

        rmse_df = rmse_df.reindex(sorted(rmse_df.columns), axis=1)
        mae_df = mae_df.reindex(sorted(mae_df.columns), axis=1)

        rmse_best = rmse_df.idxmin(axis=0)
        mae_best = mae_df.idxmin(axis=0)

        rmse_df.loc["best"] = rmse_best
        mae_df.loc["best"] = mae_best

        rmse_df_unscaled = rmse_df.drop(index="best").astype(float) * vol_std
        mae_df_unscaled = mae_df.drop(index="best").astype(float) * vol_std

        rmse_df_unscaled.loc["best"] = rmse_df_unscaled.idxmin(axis=0)
        mae_df_unscaled.loc["best"] = mae_df_unscaled.idxmin(axis=0)

        combined = pd.concat(
            [rmse_df_unscaled, mae_df_unscaled],
            keys=['RMSE', 'MAE'],
            axis=1
        )

        combined = combined.reorder_levels([0,1], axis=1).sort_index(axis=1, level=0)
        print('The current learning rate is: ',lr)
        display(combined)
