# Flint — Local Forecasting Benchmark (Multi-Model)

This notebook runs locally (macOS/Linux/Windows) and will:
1. Load the contract CSVs (`df_h.csv`, `df_m.csv`, `df_u.csv`, `df_z.csv`) from the folder that contains this notebook
2. Build leakage-safe train/val/test splits
3. Benchmark multiple models (quick tune + shorter train)
4. Pick the best model and train it longer
5. Report: train/val curves, test curves, and direction confusion matrix + accuracy/precision/recall/F1

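Recommended: run from the folder that contains this notebook (or any folder beneath it) so paths auto-resolve; the notebook is located by searching the working directory and then its parents.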

In [None]:
# Imports + device
import json, math, random
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import DataLoader, ConcatDataset
from sklearn.metrics import confusion_matrix

# Per-row feature columns, in the column order used for every data array built from the CSVs.
FEATURE_COLS = ['open', 'high', 'low', 'close', 'volume']
# Position of the 'close' column inside FEATURE_COLS (used by the direction metrics).
CLOSE_IDX = FEATURE_COLS.index('close')

def seed_all(seed: int = 42) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    The same `seed` is applied to `random`, `numpy.random` and `torch`; when
    CUDA is available the CUDA generators are seeded as well.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Run on CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
USE_GPU = (device.type == 'cuda')
# Pinned host memory is only requested when batches will be copied to a GPU.
PIN_MEMORY = USE_GPU
NUM_WORKERS = 0  # macOS: keep 0 for stability; increase on Linux if desired

if USE_GPU:
    # Speed knobs (safe defaults)
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    try:
        torch.set_float32_matmul_precision('high')
    except Exception:
        # Deliberate best-effort: any failure here is ignored.
        # NOTE(review): presumably guards PyTorch builds without this setter — confirm.
        pass

# Report the selected device (and the name of GPU 0 when running on CUDA).
print('device:', device)
if USE_GPU:
    print('gpu:', torch.cuda.get_device_name(0))

def make_loader(dataset, *, batch_size: int, shuffle: bool, drop_last: bool) -> DataLoader:
    """Wrap `dataset` in a DataLoader configured from the module-level settings.

    `NUM_WORKERS` and `PIN_MEMORY` are always forwarded. Persistent workers
    and a prefetch factor of 2 are requested only when worker processes are
    enabled (`NUM_WORKERS > 0`).
    """
    worker_options = {}
    if NUM_WORKERS > 0:
        worker_options = {'persistent_workers': True, 'prefetch_factor': 2}
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        **worker_options,
    )

In [None]:
# === CONFIG (local) ===
# Seed the Python, NumPy and PyTorch RNGs before anything else in this cell runs.
seed_all(42)

def find_notebook_dir(notebook_name: str = 'forecasting_local.ipynb', start: Optional[Path] = None) -> Path:
    """Locate the folder holding `notebook_name`, searching upwards from `start`.

    `start` defaults to the current working directory. The resolved start
    directory is checked first, then each of its parents in turn; the first
    directory containing `notebook_name` is returned. When no directory
    matches, the resolved start directory itself is returned as a fallback.
    """
    origin = (start if start else Path.cwd()).resolve()
    candidates = (origin, *origin.parents)
    match = next((folder for folder in candidates if (folder / notebook_name).exists()), None)
    return origin if match is None else match

# Resolve working folders relative to the notebook location (falls back to the CWD).
NOTEBOOK_DIR = find_notebook_dir()
DATA_DIR = NOTEBOOK_DIR  # CSVs are expected to live next to this notebook
ARTIFACTS_DIR = NOTEBOOK_DIR / 'artifacts_local'
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
print('NOTEBOOK_DIR:', NOTEBOOK_DIR)
print('DATA_DIR:', DATA_DIR)
print('ARTIFACTS_DIR:', ARTIFACTS_DIR)

# Preprocessing
LOG_VOLUME = True  # when True, volume is replaced by log1p(max(volume, 0)) at load time

# Target mode: delta-from-last is usually much easier than absolute OHLCV
# Only the exact value 'delta_last' enables delta targets; any other value is treated as absolute.
TARGET_MODE = 'delta_last'  # 'delta_last' or 'absolute'

# Data/windowing
BATCH_SIZE = 32
HORIZON = 60  # number of trailing rows of each sample used as the prediction target
DAYS_PER_SAMPLE = 7  # consecutive day segments concatenated into one sample
STRIDE_DAYS = 1  # step, in day segments, between consecutive sample starts
MIN_DAY_LEN_RATIO = 0.9  # days with fewer than int(day_len * ratio) rows are skipped
# NOTE(review): TEST_FRAC is passed to build_datasets_for_contract but never read there;
# the test split is whatever remains after the train and val day ranges.
TRAIN_FRAC, VAL_FRAC, TEST_FRAC = 0.8, 0.1, 0.1

# Benchmark settings (fast-ish)
BENCH_MODELS = ['direct_lstm', 'attn_lstm', 'cnn_lstm', 'seq2seq_lstm']
BENCH_TUNE_TRIALS = 8  # random hyper-parameter draws per model
BENCH_TUNE_EPOCHS = 3  # epochs per tuning trial
BENCH_FINAL_EPOCHS = 20  # epochs for the per-model benchmark run with the best trial's params

# Best-model long training
FINAL_EPOCHS = 80
EARLY_STOPPING_PATIENCE = 10
MAX_TRAIN_BATCHES_PER_EPOCH = None  # None = full epoch; set int to cap

# Fail early, with an actionable message, when any contract CSV is missing.
required = ['df_h.csv','df_m.csv','df_u.csv','df_z.csv']
missing = [f for f in required if not (DATA_DIR / f).exists()]
if missing:
    raise FileNotFoundError(
        f"Missing files in DATA_DIR={DATA_DIR}: {missing}\n"
        "Put df_h.csv/df_m.csv/df_u.csv/df_z.csv next to forecasting_local.ipynb, or change DATA_DIR."
    )

In [None]:
# Data loading + datasets
def load_contract_csv(path: Path) -> pd.DataFrame:
    """Read one contract CSV and return a cleaned, time-sorted frame.

    Steps, in order: check the required columns, parse `ts_event` (rows whose
    timestamp cannot be parsed are dropped), sort by `ts_event`, coerce the
    feature columns to numeric (rows with a non-numeric or missing feature are
    dropped), optionally replace `volume` by `log1p(max(volume, 0))` when
    `LOG_VOLUME` is set, cast the features to float32 and `contract_month`
    to `str`.

    Raises:
        ValueError: if any required column is absent from the file.
    """
    frame = pd.read_csv(path)
    required_cols = {'ts_event', 'open', 'high', 'low', 'close', 'volume', 'contract_month'}
    missing = required_cols.difference(frame.columns)
    if missing:
        raise ValueError(f"{path.name} missing columns: {sorted(missing)}")

    frame = frame.copy()
    frame['ts_event'] = pd.to_datetime(frame['ts_event'], errors='coerce')
    frame = frame.dropna(subset=['ts_event'])
    frame = frame.sort_values('ts_event').reset_index(drop=True)

    # Unparseable feature values become NaN and their rows are removed.
    for name in FEATURE_COLS:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')
    frame = frame.dropna(subset=FEATURE_COLS)

    if LOG_VOLUME:
        raw_volume = frame['volume'].to_numpy(dtype=np.float64)
        non_negative = np.maximum(raw_volume, 0.0)  # negative volumes are clamped to 0 before the log
        frame['volume'] = np.log1p(non_negative).astype(np.float32)

    for name in FEATURE_COLS:
        frame[name] = frame[name].astype(np.float32)
    frame['contract_month'] = frame['contract_month'].astype(str)
    return frame

def compute_median_trading_minutes_per_day(df: pd.DataFrame) -> int:
    """Return the median number of rows per calendar date of `ts_event`.

    The median is truncated to an int.

    Raises:
        ValueError: if there are no dates at all, or the truncated median is
            not positive.
    """
    rows_per_day = df.groupby(df['ts_event'].dt.date).size().astype(int)
    if len(rows_per_day) == 0:
        raise ValueError('No daily rows found')
    typical = int(rows_per_day.median())
    if typical < 1:
        raise ValueError('Median daily count is <= 0')
    return typical

def build_day_segments(
    df: pd.DataFrame,
    *,
    day_len: int,
    min_day_len_ratio: float = 0.9,
) -> Tuple[np.ndarray, List[Tuple[int,int]], List[pd.Timestamp]]:
    """Cut a contract frame into fixed-length, per-day row ranges.

    The frame is sorted by `ts_event` and re-indexed, then grouped by calendar
    date. A day is kept only if it has at least `int(day_len * min_day_len_ratio)`
    rows and its index span covers at least `day_len` rows; for a kept day the
    segment is the LAST `day_len` rows of that day.

    Returns:
        A tuple `(data, day_segments, day_starts)`: the float32 feature matrix
        of the sorted frame, the `(start, end)` row ranges (end exclusive) into
        that matrix, and the first `ts_event` of each kept day.

    Raises:
        ValueError: if `day_len <= 60`, or if no day qualifies.
    """
    if day_len <= 60:
        raise ValueError('day_len must be > 60 to hold a 1-hour target')

    ordered = df.sort_values('ts_event').reset_index(drop=True)
    values = ordered[FEATURE_COLS].to_numpy(dtype=np.float32, copy=True)
    shortest_allowed = int(day_len * min_day_len_ratio)

    segments: List[Tuple[int,int]] = []
    first_stamps: List[pd.Timestamp] = []
    for _day, rows in ordered.groupby(ordered['ts_event'].dt.date, sort=True):
        if len(rows) < shortest_allowed:
            continue
        first_row = int(rows.index.min())
        stop_row = int(rows.index.max()) + 1
        if stop_row - first_row < day_len:
            continue
        # Anchor the segment on the end of the day so every segment has exactly day_len rows.
        segments.append((stop_row - day_len, stop_row))
        first_stamps.append(pd.Timestamp(rows['ts_event'].iloc[0]))

    if not segments:
        raise ValueError('No valid day segments were created; check day_len/min_day_len_ratio')
    return values, segments, first_stamps

class WeekToHourDataset(torch.utils.data.Dataset):
    """Samples made of `days_per_sample` consecutive day segments of one contract.

    Each sample concatenates the rows of `days_per_sample` day segments; the
    last `horizon` rows are the target and everything before them is the
    input. When both `mean` and `std` are set, input and target are
    standardised as `(value - mean) / std`.
    """

    def __init__(
        self,
        *,
        contract_name: str,
        data: np.ndarray,
        day_segments: List[Tuple[int,int]],
        week_starts: List[int],
        day_len: int,
        days_per_sample: int = 7,
        horizon: int = 60,
        mean: Optional[np.ndarray] = None,
        std: Optional[np.ndarray] = None,
    ):
        """Store the shared data and window layout.

        `week_starts` holds indices into `day_segments`; each one is the first
        day segment of a sample.

        Raises:
            ValueError: if `days_per_sample * day_len` is not larger than `horizon`.
        """
        rows_per_sample = days_per_sample * day_len
        if rows_per_sample <= horizon:
            raise ValueError('Total sample length must be > horizon')

        self.contract_name = contract_name
        self.data = data
        self.day_segments = day_segments
        self.week_starts = week_starts
        self.day_len = day_len
        self.days_per_sample = days_per_sample
        self.horizon = horizon
        self.mean = mean
        self.std = std
        self.total_len = rows_per_sample
        self.input_len = rows_per_sample - horizon

    def __len__(self) -> int:
        """Number of samples, i.e. number of window start positions."""
        return len(self.week_starts)

    def set_scaler(self, mean: np.ndarray, std: np.ndarray) -> None:
        """Install the standardisation statistics, stored as float32."""
        self.mean = mean.astype(np.float32)
        self.std = std.astype(np.float32)

    def _get_week_array(self, start_day_idx: int) -> np.ndarray:
        """Return the raw (unscaled) rows of the sample starting at `start_day_idx`."""
        stop_day_idx = start_day_idx + self.days_per_sample
        pieces = [self.data[lo:hi] for lo, hi in self.day_segments[start_day_idx:stop_day_idx]]
        return np.concatenate(pieces, axis=0)

    def __getitem__(self, idx: int):
        """Return `(input, target)` float tensors for sample `idx`."""
        window = self._get_week_array(self.week_starts[idx])
        cut = self.input_len
        past, future = window[:cut], window[cut:]
        if not (self.mean is None or self.std is None):
            past = (past - self.mean) / self.std
            future = (future - self.mean) / self.std
        return torch.from_numpy(past).float(), torch.from_numpy(future).float()

def compute_global_scaler_from_train(train_datasets: Sequence[WeekToHourDataset], eps: float = 1e-6) -> Tuple[np.ndarray, np.ndarray]:
    """Compute per-feature mean and standard deviation over all training inputs.

    Only the input part of each training window (the first `input_len` rows)
    contributes, so target rows never influence the statistics. Windows that
    overlap contribute their shared rows once per window.

    All accumulation is done in float64: the variance is obtained as
    `E[x^2] - mean^2`, which loses most of its significant digits when the
    sums (or the mean) are held in float32 and the values are large relative
    to their spread.

    Args:
        train_datasets: training datasets whose windows are scanned.
        eps: standard deviations below this threshold are replaced by 1.0 so
            that constant features are not divided by (almost) zero.

    Returns:
        `(mean, std)` as float32 arrays with one entry per feature column.

    Raises:
        ValueError: if the datasets contain no rows at all.
    """
    n_features = len(FEATURE_COLS)
    sum_vec = np.zeros((n_features,), dtype=np.float64)
    sumsq_vec = np.zeros((n_features,), dtype=np.float64)
    count = 0
    for ds in train_datasets:
        for start_day in ds.week_starts:
            # Promote before reducing so neither the sums nor the squares round in float32.
            x = ds._get_week_array(start_day)[: ds.input_len].astype(np.float64)
            sum_vec += x.sum(axis=0)
            sumsq_vec += np.square(x).sum(axis=0)
            count += x.shape[0]
    if count == 0:
        raise ValueError('No training rows to compute scaler')
    mean64 = sum_vec / count
    # Use the unrounded float64 mean here; rounding it first perturbs mean^2.
    var = (sumsq_vec / count) - mean64 ** 2
    # Clamp tiny negative values caused by rounding before taking the root.
    std = np.sqrt(np.maximum(var, 0.0)).astype(np.float32)
    std = np.where(std < eps, np.float32(1.0), std)
    return mean64.astype(np.float32), std

def build_datasets_for_contract(
    contract_name: str,
    df: pd.DataFrame,
    *,
    day_len_override: Optional[int] = None,
    days_per_sample: int = 7,
    stride_days: int = 1,
    horizon: int = 60,
    min_day_len_ratio: float = 0.9,
    train_frac: float = 0.8,
    val_frac: float = 0.1,
    test_frac: float = 0.1,
) -> Dict[str, WeekToHourDataset]:
    """Build train/val/test datasets for one contract from chronological day ranges.

    Day segments are split into three consecutive, non-overlapping ranges and
    every sample window lies entirely inside one range, so no window straddles
    a split boundary. `test_frac` is accepted but not used: the test range is
    whatever follows the validation range.

    Raises:
        ValueError: if fewer than 5 training windows, or no validation or test
            window, can be formed.
    """
    day_len = day_len_override or compute_median_trading_minutes_per_day(df)
    data, day_segments, _ = build_day_segments(df, day_len=day_len, min_day_len_ratio=min_day_len_ratio)

    n_days = len(day_segments)
    train_day_end = max(days_per_sample, int(n_days * train_frac))
    val_day_end = min(
        n_days,
        max(train_day_end + days_per_sample, int(n_days * (train_frac + val_frac))),
    )

    def window_starts(first_day: int, stop_day: int) -> List[int]:
        # A window starting at s uses days [s, s + days_per_sample), which must end by stop_day.
        final_start = stop_day - days_per_sample
        if final_start >= first_day:
            return list(range(first_day, final_start + 1, stride_days))
        return []

    day_ranges = {
        'train': (0, train_day_end),
        'val': (train_day_end, val_day_end),
        'test': (val_day_end, n_days),
    }
    starts = {split: window_starts(lo, hi) for split, (lo, hi) in day_ranges.items()}
    n_train, n_val, n_test = (len(starts[split]) for split in ('train', 'val', 'test'))
    if n_train < 5 or n_val < 1 or n_test < 1:
        raise ValueError(f'Not enough samples after split for {contract_name}: train={n_train} val={n_val} test={n_test}')

    return {
        split: WeekToHourDataset(
            contract_name=contract_name,
            data=data,
            day_segments=day_segments,
            week_starts=starts[split],
            day_len=day_len,
            days_per_sample=days_per_sample,
            horizon=horizon,
        )
        for split in ('train', 'val', 'test')
    }

In [None]:
# Build splits + global scaler
# One CSV per contract, keyed by a single-letter contract code.
files = {
    'H': DATA_DIR / 'df_h.csv',
    'M': DATA_DIR / 'df_m.csv',
    'U': DATA_DIR / 'df_u.csv',
    'Z': DATA_DIR / 'df_z.csv',
}

# Load every contract and record its median number of rows per day.
contract_dfs: Dict[str, pd.DataFrame] = {}
median_day_lens: Dict[str, int] = {}
for c, p in files.items():
    df = load_contract_csv(p)
    contract_dfs[c] = df
    median_day_lens[c] = compute_median_trading_minutes_per_day(df)

# Use the smallest per-contract median as the common day length for all contracts.
global_day_len = int(min(median_day_lens.values()))
if global_day_len <= HORIZON:
    raise ValueError(f'global_day_len ({global_day_len}) must be > horizon ({HORIZON})')
print('median_day_lens:', median_day_lens, 'global_day_len:', global_day_len)

# Build the train/val/test datasets per contract with the shared day length.
contract_splits: Dict[str, Dict[str, WeekToHourDataset]] = {}
for c, df in contract_dfs.items():
    contract_splits[c] = build_datasets_for_contract(
        c, df,
        day_len_override=global_day_len,
        days_per_sample=DAYS_PER_SAMPLE, stride_days=STRIDE_DAYS, horizon=HORIZON,
        min_day_len_ratio=MIN_DAY_LEN_RATIO,
        train_frac=TRAIN_FRAC, val_frac=VAL_FRAC, test_frac=TEST_FRAC,
    )
    ds = contract_splits[c]['train']
    print(f"{c}: train={len(contract_splits[c]['train'])} val={len(contract_splits[c]['val'])} test={len(contract_splits[c]['test'])} input_len={ds.input_len}")

# Fit one scaler on the training inputs of all contracts, then apply it to every split.
train_datasets = [contract_splits[c]['train'] for c in contract_splits]
mean, std = compute_global_scaler_from_train(train_datasets)
for c in contract_splits:
    for split_name in ('train','val','test'):
        contract_splits[c][split_name].set_scaler(mean, std)
print('global_scaler_mean:', {k: float(v) for k, v in zip(FEATURE_COLS, mean)})

In [None]:
# --- Model zoo (modular) ---
def build_seq2seq_lstm_forecaster(*, input_size: int = 5, hidden_size: int = 192, num_layers: int = 2, dropout: float = 0.1) -> nn.Module:
    """Create an encoder/decoder LSTM forecaster.

    The encoder reads the input sequence and its final state initialises the
    decoder. Decoder outputs are projected back to `input_size` features.
    """
    # Dropout is passed to the LSTMs only for stacked (num_layers > 1) configurations.
    lstm_dropout = dropout if num_layers > 1 else 0.0

    def _new_lstm() -> nn.LSTM:
        return nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )

    class Seq2SeqLSTMForecaster(nn.Module):
        def __init__(self):
            super().__init__()
            self.encoder = _new_lstm()
            self.decoder = _new_lstm()
            self.proj = nn.Linear(hidden_size, input_size)

        def forward(self, x, y=None, *, horizon: int = 60, teacher_forcing: bool = True):
            """Forecast from `x`.

            With `teacher_forcing` and a target `y`, the decoder is fed the
            last input step followed by `y` shifted right by one step, and one
            output per target step is returned. Otherwise the decoder runs
            for `horizon` steps, feeding each prediction back as the next input.
            """
            _, state = self.encoder(x)
            first_input = x[:, -1:, :]

            if y is not None and teacher_forcing:
                shifted = torch.cat((first_input, y[:, :-1, :]), dim=1)
                decoded, _ = self.decoder(shifted, state)
                return self.proj(decoded)

            outputs = []
            current = first_input
            for _ in range(horizon):
                decoded, state = self.decoder(current, state)
                current = self.proj(decoded)
                outputs.append(current)
            return torch.cat(outputs, dim=1)

    return Seq2SeqLSTMForecaster()

def build_direct_lstm_forecaster(*, input_size: int = 5, hidden_size: int = 128, num_layers: int = 2, dropout: float = 0.1, horizon: int = 60, output_size: int = 5) -> nn.Module:
    """Create an LSTM forecaster that emits the whole horizon in one shot.

    The final hidden state of the top LSTM layer goes through an MLP head
    that outputs `horizon * output_size` values, reshaped to
    `(batch, horizon, output_size)`.
    """
    class DirectLSTMForecaster(nn.Module):
        def __init__(self):
            super().__init__()
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.0)
            self.head = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.ReLU(), nn.Linear(hidden_size, horizon * output_size))
            self.horizon = horizon
            self.output_size = output_size

        def forward(self, x, y=None, *, horizon: Optional[int] = None, teacher_forcing: bool = True):
            """Forecast from `x`; `y` and `teacher_forcing` are accepted but ignored.

            `horizon` may be at most the horizon the model was built with; a
            shorter value returns the first `horizon` steps of the forecast.

            Raises:
                ValueError: if `horizon` is outside `[1, self.horizon]`.
            """
            hzn = self.horizon if horizon is None else int(horizon)
            # The head has a fixed output width, so longer horizons cannot be served.
            if not 1 <= hzn <= self.horizon:
                raise ValueError(f'horizon must be in [1, {self.horizon}], got {hzn}')
            _out, (h, _c) = self.lstm(x)
            steps = self.head(h[-1]).view(x.shape[0], self.horizon, self.output_size)
            return steps[:, :hzn, :]
    return DirectLSTMForecaster()

def build_attn_lstm_forecaster(*, input_size: int = 5, hidden_size: int = 192, num_layers: int = 2, dropout: float = 0.1, horizon: int = 60, output_size: int = 5) -> nn.Module:
    """Create an LSTM forecaster with attention pooling over time.

    Every LSTM output step gets a scalar score; the softmax of the scores over
    the time axis weights the outputs into a single context vector, from which
    an MLP head produces `horizon * output_size` values, reshaped to
    `(batch, horizon, output_size)`.
    """
    class AttnLSTMForecaster(nn.Module):
        def __init__(self):
            super().__init__()
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.0)
            self.attn = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.Tanh(), nn.Linear(hidden_size, 1))
            self.head = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.ReLU(), nn.Dropout(dropout), nn.Linear(hidden_size, horizon * output_size))
            self.horizon = horizon
            self.output_size = output_size

        def forward(self, x, y=None, *, horizon: Optional[int] = None, teacher_forcing: bool = True):
            """Forecast from `x`; `y` and `teacher_forcing` are accepted but ignored.

            `horizon` may be at most the horizon the model was built with; a
            shorter value returns the first `horizon` steps of the forecast.

            Raises:
                ValueError: if `horizon` is outside `[1, self.horizon]`.
            """
            hzn = self.horizon if horizon is None else int(horizon)
            # The head has a fixed output width, so longer horizons cannot be served.
            if not 1 <= hzn <= self.horizon:
                raise ValueError(f'horizon must be in [1, {self.horizon}], got {hzn}')
            out, _ = self.lstm(x)
            scores = self.attn(out).squeeze(-1)
            w = torch.softmax(scores, dim=1).unsqueeze(-1)  # weights sum to 1 along the time axis
            ctx = (w * out).sum(dim=1)
            steps = self.head(ctx).view(x.shape[0], self.horizon, self.output_size)
            return steps[:, :hzn, :]
    return AttnLSTMForecaster()

def build_cnn_lstm_forecaster(*, input_size: int = 5, conv_channels: int = 64, hidden_size: int = 160, num_layers: int = 2, dropout: float = 0.1, horizon: int = 60, output_size: int = 5) -> nn.Module:
    """Create a Conv1d front-end + LSTM forecaster.

    Three stride-2 convolutions shorten the sequence along the time axis
    before the LSTM; the final hidden state of the top LSTM layer goes through
    an MLP head that outputs `horizon * output_size` values, reshaped to
    `(batch, horizon, output_size)`.
    """
    class CNNLSTMForecaster(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv = nn.Sequential(
                nn.Conv1d(input_size, conv_channels, kernel_size=7, stride=2, padding=3), nn.ReLU(),
                nn.Conv1d(conv_channels, conv_channels, kernel_size=5, stride=2, padding=2), nn.ReLU(),
                nn.Conv1d(conv_channels, conv_channels, kernel_size=5, stride=2, padding=2), nn.ReLU(),
            )
            self.lstm = nn.LSTM(input_size=conv_channels, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0.0)
            self.head = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.ReLU(), nn.Dropout(dropout), nn.Linear(hidden_size, horizon * output_size))
            self.horizon = horizon
            self.output_size = output_size

        def forward(self, x, y=None, *, horizon: Optional[int] = None, teacher_forcing: bool = True):
            """Forecast from `x`; `y` and `teacher_forcing` are accepted but ignored.

            `horizon` may be at most the horizon the model was built with; a
            shorter value returns the first `horizon` steps of the forecast.

            Raises:
                ValueError: if `horizon` is outside `[1, self.horizon]`.
            """
            hzn = self.horizon if horizon is None else int(horizon)
            # The head has a fixed output width, so longer horizons cannot be served.
            if not 1 <= hzn <= self.horizon:
                raise ValueError(f'horizon must be in [1, {self.horizon}], got {hzn}')
            # Conv1d convolves over the last axis, so move time there and back again.
            z = self.conv(x.transpose(1, 2)).transpose(1, 2)
            _out, (h, _c) = self.lstm(z)
            steps = self.head(h[-1]).view(x.shape[0], self.horizon, self.output_size)
            return steps[:, :hzn, :]
    return CNNLSTMForecaster()

def build_model(*, name: str, horizon: int, output_size: int, hidden_size: int, num_layers: int, dropout: float) -> nn.Module:
    """Instantiate a forecaster by name (case-insensitive, surrounding spaces ignored).

    `horizon` and `output_size` are forwarded to the direct, attention and
    CNN forecasters; the seq2seq forecaster does not take them.

    Raises:
        ValueError: if `name` is not one of the known model names.
    """
    key = name.strip().lower()
    shared = dict(
        input_size=len(FEATURE_COLS),
        hidden_size=hidden_size,
        num_layers=num_layers,
        dropout=dropout,
    )
    fixed_horizon_builders = {
        'direct_lstm': build_direct_lstm_forecaster,
        'attn_lstm': build_attn_lstm_forecaster,
        'cnn_lstm': build_cnn_lstm_forecaster,
    }
    builder = fixed_horizon_builders.get(key)
    if builder is not None:
        return builder(horizon=horizon, output_size=output_size, **shared)
    if key == 'seq2seq_lstm':
        return build_seq2seq_lstm_forecaster(**shared)
    raise ValueError(f'Unknown model: {name}')

In [None]:
# Training + eval utilities (delta targets + metrics)
def _make_delta_targets(xb: torch.Tensor, yb_abs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    last_x = xb[:, -1:, :]
    return (yb_abs - last_x), last_x

def _delta_to_abs(last_x: torch.Tensor, y_delta: torch.Tensor) -> torch.Tensor:
    return y_delta + last_x

def _to_device(x: torch.Tensor) -> torch.Tensor:
    """Move `x` to the module-level `device`.

    The copy is requested as non-blocking exactly when `PIN_MEMORY` is set.
    """
    moved = x.to(device, non_blocking=PIN_MEMORY)
    return moved

def evaluate_abs_mse(model: nn.Module, loader: DataLoader, device: torch.device, max_batches: Optional[int] = None) -> float:
    """Return the mean per-batch MSE between absolute predictions and targets.

    The model is evaluated the way it would be used at inference time: it
    receives no targets and teacher forcing is off. The seq2seq forecaster in
    this file feeds `y` to its decoder under teacher forcing, so passing the
    targets here would leak the future into the validation/test scores.

    In `'delta_last'` target mode the model output is interpreted as a delta
    from the last input step and converted back to absolute values before
    scoring.

    Args:
        model: forecaster called as `model(x, y, horizon=..., teacher_forcing=...)`.
        loader: yields `(input, absolute_target)` batches.
        device: device the batches are moved to; should match the model's device.
        max_batches: optional cap on the number of batches evaluated.

    Returns:
        The average of the per-batch MSE values, or NaN when no batch was seen.
    """
    model.eval()
    loss_fn = nn.MSELoss()
    losses: List[float] = []
    with torch.no_grad():
        for i, (xb, yb_abs) in enumerate(loader):
            if max_batches is not None and i >= max_batches:
                break
            # Honour the `device` argument instead of the module-level default.
            xb = xb.to(device, non_blocking=PIN_MEMORY)
            yb_abs = yb_abs.to(device, non_blocking=PIN_MEMORY)
            # Targets are used only for scoring below, never as model input.
            pred = model(xb, None, horizon=yb_abs.shape[1], teacher_forcing=False)
            if TARGET_MODE == 'delta_last':
                pred = _delta_to_abs(xb[:, -1:, :], pred)
            losses.append(loss_fn(pred, yb_abs).item())
    return float(np.mean(losses)) if losses else float('nan')

def train_model(
    model: nn.Module,
    *,
    train_loader: DataLoader,
    val_loaders: Dict[str, DataLoader],
    epochs: int,
    lr: float,
    device: torch.device,
    patience: int,
    max_train_batches_per_epoch: Optional[int] = None,
    min_delta: float = 1e-5,
) -> Dict[str, object]:
    """Train `model` with AdamW, LR-on-plateau scheduling and early stopping.

    Each epoch trains on `train_loader` (optionally capped at
    `max_train_batches_per_epoch` batches), then evaluates every loader in
    `val_loaders` with `evaluate_abs_mse`. The mean of those validation scores
    drives both the scheduler and early stopping. On return the model holds
    the weights of the best validation epoch, if any epoch improved.

    Args:
        model: forecaster called as `model(x, y, horizon=..., teacher_forcing=True)`.
        train_loader: yields `(input, absolute_target)` batches.
        val_loaders: named validation loaders; one history series is kept per name.
        epochs: maximum number of epochs.
        lr: initial AdamW learning rate.
        device: forwarded to `evaluate_abs_mse`. Training batches are moved
            with `_to_device`, which uses the module-level `device` instead.
        patience: number of consecutive non-improving epochs before stopping.
        max_train_batches_per_epoch: optional cap on batches per epoch.
        min_delta: minimum decrease of the validation mean that counts as an improvement.

    Returns:
        History dict with keys 'train_delta_mse', 'val_abs_mse', 'val_abs_mean',
        'lr', 'best_val', 'best_epoch' and 'best_state_dict'.
    """
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=3)
    loss_fn = nn.MSELoss()

    hist = {
        'train_delta_mse': [],
        'val_abs_mse': {k: [] for k in val_loaders.keys()},
        'val_abs_mean': [],
        'lr': [],
        'best_val': float('inf'),
        'best_epoch': None,
        'best_state_dict': None,
    }
    bad = 0  # consecutive epochs without a sufficient validation improvement
    for epoch in range(1, epochs + 1):
        model.train()
        batch_losses: List[float] = []
        for bi, (xb, yb_abs) in enumerate(train_loader):
            if max_train_batches_per_epoch is not None and bi >= max_train_batches_per_epoch:
                break
            xb = _to_device(xb)
            yb_abs = _to_device(yb_abs)
            # The training target is either the delta from the last input step or the absolute value.
            if TARGET_MODE == 'delta_last':
                yb_delta, _last_x = _make_delta_targets(xb, yb_abs)
                y_train = yb_delta
            else:
                y_train = yb_abs
            opt.zero_grad(set_to_none=True)
            pred = model(xb, y_train, horizon=y_train.shape[1], teacher_forcing=True)
            loss = loss_fn(pred, y_train)
            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            batch_losses.append(loss.item())
        train_delta_mse = float(np.mean(batch_losses)) if batch_losses else float('nan')
        hist['train_delta_mse'].append(train_delta_mse)
        # Recorded before scheduler.step, so this is the rate used during this epoch.
        hist['lr'].append(float(opt.param_groups[0]['lr']))

        val_report = {k: evaluate_abs_mse(model, v, device) for k, v in val_loaders.items()}
        for k, v in val_report.items():
            hist['val_abs_mse'][k].append(float(v))
        # With no validation loaders the mean is +inf, so no epoch ever counts as an improvement.
        val_mean = float(np.mean(list(val_report.values()))) if val_report else float('inf')
        hist['val_abs_mean'].append(val_mean)
        scheduler.step(val_mean)

        msg = ' '.join([f"{k}={v:.6f}" for k, v in val_report.items()])
        print(f"epoch={epoch:03d}/{epochs} lr={opt.param_groups[0]['lr']:.2e} train_delta_mse={train_delta_mse:.6f} val_abs_mean={val_mean:.6f} {msg}")

        if val_mean + min_delta < hist['best_val']:
            hist['best_val'] = val_mean
            hist['best_epoch'] = epoch
            # Snapshot on the CPU so later updates to the live weights do not alter it.
            hist['best_state_dict'] = {kk: vv.detach().cpu().clone() for kk, vv in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print(f"early_stop: epoch={epoch} best_epoch={hist['best_epoch']} best_val={hist['best_val']:.6f}")
                break

    # Restore the best weights seen during training.
    if hist['best_state_dict'] is not None:
        model.load_state_dict(hist['best_state_dict'])
    return hist

def plot_curves(hist: Dict[str, object], title: str) -> None:
    """Plot the training loss and every validation series against the epoch number.

    Reads `hist['train_delta_mse']` and the per-loader series in
    `hist['val_abs_mse']`; validation labels have 'val_' replaced by 'val_abs_'.
    """
    train_series = hist['train_delta_mse']
    val_series_by_name = hist['val_abs_mse']
    epochs_axis = np.arange(1, len(train_series) + 1)

    plt.figure(figsize=(10, 4))
    plt.plot(epochs_axis, train_series, label='train_delta_mse', linewidth=2)
    for name, values in val_series_by_name.items():
        legend_label = name.replace('val_', 'val_abs_')
        plt.plot(epochs_axis, values, label=legend_label, linewidth=2)

    plt.xlabel('epoch')
    plt.ylabel('MSE (normalized)')
    plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.show()

def predict_one_unscaled(model: nn.Module, dataset: WeekToHourDataset, idx: int, device: torch.device) -> Tuple[np.ndarray, np.ndarray]:
    """Predict one sample and return `(prediction, actual)` in original units.

    The model is run as at inference time: it receives no targets and teacher
    forcing is off. The seq2seq forecaster in this file feeds `y` to its
    decoder under teacher forcing, so passing the targets would leak the
    future into the prediction.

    In `'delta_last'` target mode the model output is interpreted as a delta
    from the last input step. When the dataset has a scaler, both arrays are
    mapped back with `value * std + mean`.

    Args:
        model: forecaster called as `model(x, y, horizon=..., teacher_forcing=...)`.
        dataset: source of the `(input, target)` pair.
        idx: sample index within `dataset`.
        device: device the input is moved to; should match the model's device.

    Returns:
        Two float32 arrays with one row per target step.
    """
    model.eval()
    xb, yb_abs = dataset[idx]
    # Honour the `device` argument instead of the module-level default.
    xb_b = xb.unsqueeze(0).to(device, non_blocking=PIN_MEMORY)
    with torch.no_grad():
        # The target only supplies the number of steps to forecast.
        pred = model(xb_b, None, horizon=yb_abs.shape[0], teacher_forcing=False)
        if TARGET_MODE == 'delta_last':
            pred = _delta_to_abs(xb_b[:, -1:, :], pred)
        pred_abs = pred.squeeze(0).cpu().numpy().astype(np.float32)
    actual = yb_abs.cpu().numpy().astype(np.float32)
    if dataset.mean is not None and dataset.std is not None:
        pred_abs = pred_abs * dataset.std + dataset.mean
        actual = actual * dataset.std + dataset.mean
    return pred_abs, actual

def compute_test_feature_metrics(model: nn.Module, test_sets: Dict[str, WeekToHourDataset], device: torch.device, max_samples_per_contract: int = 200) -> pd.DataFrame:
    """Compute MAE, RMSE and MSE per contract and feature, in original units.

    For each contract the first `max_samples_per_contract` samples are
    predicted and all of their target rows are pooled before averaging.
    Contracts without samples are skipped.

    Returns:
        A frame with columns 'contract', 'feature', 'mae', 'rmse' and 'mse'.
    """
    records = []
    for contract, ds in test_sets.items():
        n_eval = min(len(ds), max_samples_per_contract)
        if n_eval <= 0:
            continue
        pairs = [predict_one_unscaled(model, ds, i, device) for i in range(n_eval)]
        predicted = np.concatenate([p for p, _ in pairs], axis=0)
        truth = np.concatenate([t for _, t in pairs], axis=0)
        diff = predicted - truth
        mse = (diff ** 2).mean(axis=0)
        mae = np.abs(diff).mean(axis=0)
        rmse = np.sqrt(mse)
        records.extend(
            {'contract': contract, 'feature': col, 'mae': float(mae[j]), 'rmse': float(rmse[j]), 'mse': float(mse[j])}
            for j, col in enumerate(FEATURE_COLS)
        )
    return pd.DataFrame(records)

def compute_direction_metrics(model: nn.Module, test_sets: Dict[str, WeekToHourDataset], device: torch.device, max_samples_per_contract: int = 200) -> Dict[str, object]:
    """Score minute-to-minute close direction (up vs. down/flat) of the forecasts.

    Each target step is labelled 1 when its close is strictly above the
    previous step's close, else 0; the first step is compared with the last
    input close. Predicted and actual labels are pooled over the first
    `max_samples_per_contract` samples of every contract.

    Actual closes are read from the raw, unscaled window. Taking them from the
    dataset item would put them through a float32 normalise/denormalise round
    trip while the last input close stays raw, so a first target minute that
    is exactly flat could be mislabelled as up or down.

    Returns:
        Dict with the 2x2 confusion matrix under 'cm' (rows = actual,
        columns = predicted, label order [down/flat, up]), 'accuracy',
        'precision_up', 'recall_up', 'f1_up', 'n_labels' and the raw counts
        'tp', 'tn', 'fp', 'fn'.
    """
    y_true: List[int] = []
    y_pred: List[int] = []
    for c, ds in test_sets.items():
        n = min(len(ds), max_samples_per_contract)
        for i in range(n):
            pred, _actual = predict_one_unscaled(model, ds, i, device)
            # direction relative to previous minute (seeded with last input close)
            closes = ds._get_week_array(ds.week_starts[i])[:, CLOSE_IDX]
            last_close = float(closes[ds.input_len - 1])
            act_close = closes[ds.input_len:]
            act_prev = closes[ds.input_len - 1 : -1]  # last input close, then targets shifted by one
            pred_close = pred[:, CLOSE_IDX]
            pred_prev = np.concatenate([[last_close], pred_close[:-1]])
            pred_dir = (pred_close - pred_prev > 0).astype(np.int32)
            act_dir = (act_close - act_prev > 0).astype(np.int32)
            y_true.extend(act_dir.tolist())
            y_pred.extend(pred_dir.tolist())
    yt = np.array(y_true)
    yp = np.array(y_pred)
    cm = confusion_matrix(yt, yp, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    # max(1, ...) keeps the ratios at 0 instead of dividing by zero on empty classes.
    acc = (tp + tn) / max(1, (tp + tn + fp + fn))
    precision = tp / max(1, (tp + fp))
    recall = tp / max(1, (tp + fn))
    f1 = 2 * precision * recall / max(1e-12, (precision + recall))
    return {
        'cm': cm,
        'accuracy': float(acc),
        'precision_up': float(precision),
        'recall_up': float(recall),
        'f1_up': float(f1),
        'n_labels': int(tp + tn + fp + fn),
        'tp': int(tp), 'tn': int(tn), 'fp': int(fp), 'fn': int(fn),
    }

def plot_confusion_matrix(cm: np.ndarray, title: str) -> None:
    """Draw `cm` as a heat map with the count written in every cell.

    Both axes are labelled 'down/flat' (index 0) and 'up' (index 1).
    """
    class_names = ['down/flat','up']
    tick_positions = [0,1]

    plt.figure(figsize=(4,4))
    plt.imshow(cm, cmap='Blues')
    plt.title(title)
    plt.xticks(tick_positions, class_names)
    plt.yticks(tick_positions, class_names)
    for position, count in np.ndenumerate(cm):
        row, col = position
        # imshow puts the column index on the x axis and the row index on the y axis.
        plt.text(col, row, str(count), ha='center', va='center', color='black')
    plt.colorbar()
    plt.show()

In [None]:
# Benchmark models -> pick best -> train longer -> report metrics
def tune_params_for_model(model_name: str, rng_seed: int = 42) -> Dict[str, object]:
    """Random-search hyper-parameters for one model type.

    Runs `BENCH_TUNE_TRIALS` trials. Each trial draws hidden size, layer
    count, dropout and learning rate from fixed candidate lists, trains a
    fresh model for up to `BENCH_TUNE_EPOCHS` epochs (at most 200 batches per
    epoch) and scores it by its best mean validation loss.

    Args:
        model_name: name understood by `build_model`.
        rng_seed: seed of the private RNG that draws the trial parameters.

    Returns:
        `{'best': {'score', 'params', 'trial'}, 'trials_df': DataFrame}` where
        `trials_df` lists every trial sorted by ascending score. `best['params']`
        stays `None` if no trial scored below +inf.
    """
    # A private RNG keeps the parameter draws independent of the global random state.
    rng = random.Random(rng_seed)
    hidden_sizes = [128, 192, 256]
    num_layers_list = [1, 2, 3]
    dropouts = [0.0, 0.1, 0.2]
    lrs = [1e-3, 3e-4, 1e-4]

    # All contracts are trained together; validation is reported per contract.
    train_concat = ConcatDataset([contract_splits[c]['train'] for c in contract_splits])
    train_loader = make_loader(train_concat, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    val_loaders = {f"val_{c}": make_loader(contract_splits[c]['val'], batch_size=BATCH_SIZE, shuffle=False, drop_last=False) for c in contract_splits}

    trials = []
    best = {'score': float('inf'), 'params': None, 'trial': None}

    for trial in range(1, BENCH_TUNE_TRIALS + 1):
        # The order of these draws determines which configurations a given seed produces.
        params = {
            'hidden_size': rng.choice(hidden_sizes),
            'num_layers': rng.choice(num_layers_list),
            'dropout': rng.choice(dropouts),
            'lr': rng.choice(lrs),
        }
        model = build_model(name=model_name, horizon=HORIZON, output_size=len(FEATURE_COLS), **params).to(device)
        hist = train_model(
            model,
            train_loader=train_loader,
            val_loaders=val_loaders,
            epochs=BENCH_TUNE_EPOCHS,
            lr=params['lr'],
            device=device,
            patience=max(2, min(4, BENCH_TUNE_EPOCHS)),
            max_train_batches_per_epoch=200,
        )
        score = float(hist['best_val'])
        trials.append({'trial': trial, 'score': score, **params})
        print(f"tune {model_name}: trial={trial}/{BENCH_TUNE_TRIALS} score={score:.6f} params={params}")
        # Strict comparison: on ties the earliest trial is kept.
        if score < best['score']:
            best = {'score': score, 'params': params, 'trial': trial}

    df_trials = pd.DataFrame(trials).sort_values('score').reset_index(drop=True)
    return {'best': best, 'trials_df': df_trials}

def run_benchmark(models: List[str]) -> Tuple[pd.DataFrame, Dict[str, object]]:
    """Tune, train and score every model in `models`, then pick a winner.

    For each model: random-search its hyper-parameters, train a fresh model
    with the best ones for up to `BENCH_FINAL_EPOCHS` epochs (at most 200
    batches per epoch), and record its best mean validation loss together with
    its per-contract test MSE.

    The winner is chosen on the validation score alone. Test scores are
    reported but never take part in the ranking, so the test split stays
    untouched by model selection; ties keep the order of `models`.

    Args:
        models: model names understood by `build_model`.

    Returns:
        `(results_df, winner)` where `results_df` is sorted by ascending
        validation score and `winner` holds 'winner_name', 'best_params',
        'hist', 'trials_df' and 'model_state' of the top-ranked model.

    Raises:
        ValueError: if `models` is empty.
        RuntimeError: if tuning produced no usable parameters for a model.
    """
    if not models:
        raise ValueError('run_benchmark needs at least one model name')

    # The loaders do not depend on the model, so build them once for all models.
    train_concat = ConcatDataset([contract_splits[c]['train'] for c in contract_splits])
    train_loader = make_loader(train_concat, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    val_loaders = {f"val_{c}": make_loader(contract_splits[c]['val'], batch_size=BATCH_SIZE, shuffle=False, drop_last=False) for c in contract_splits}
    test_loaders = {f"test_{c}": make_loader(contract_splits[c]['test'], batch_size=BATCH_SIZE, shuffle=False, drop_last=False) for c in contract_splits}

    results = []
    artifacts = {}
    for m in models:
        print('\n' + '='*80)
        print('BENCH:', m)
        print('='*80)
        tune_out = tune_params_for_model(m)
        best_params = tune_out['best']['params']
        # Explicit check rather than `assert`, which is removed under `python -O`.
        if best_params is None:
            raise RuntimeError(f'Tuning produced no usable parameters for model {m!r}')

        model = build_model(name=m, horizon=HORIZON, output_size=len(FEATURE_COLS), **best_params).to(device)
        hist = train_model(
            model,
            train_loader=train_loader,
            val_loaders=val_loaders,
            epochs=BENCH_FINAL_EPOCHS,
            lr=best_params['lr'],
            device=device,
            patience=min(EARLY_STOPPING_PATIENCE, 6),
            max_train_batches_per_epoch=200,
        )
        val_score = float(hist['best_val'])

        test_report = {k: evaluate_abs_mse(model, v, device) for k, v in test_loaders.items()}
        test_mean = float(np.mean(list(test_report.values())))
        print('bench_test:', test_report, 'mean=', test_mean)

        results.append({
            'model': m,
            'val_best_mean': val_score,
            'test_mean': test_mean,
            **{k: float(v) for k, v in test_report.items()},
        })
        artifacts[m] = {
            'best_params': best_params,
            'hist': hist,
            'trials_df': tune_out['trials_df'],
            'model_state': {k: v.detach().cpu().clone() for k, v in model.state_dict().items()},
        }

    # Rank on validation only; a stable sort keeps the input order on ties.
    df = pd.DataFrame(results).sort_values('val_best_mean', kind='stable').reset_index(drop=True)
    winner_name = str(df.iloc[0]['model'])
    return df, {'winner_name': winner_name, **artifacts[winner_name]}

# Benchmark every configured model, show the results table and report the winner.
bench_df, winner = run_benchmark(BENCH_MODELS)
# NOTE(review): `display` is not imported in this file; presumably the notebook built-in — confirm.
display(bench_df)
print('WINNER:', winner['winner_name'], 'params:', winner['best_params'])

In [None]:
# Train the best model longer + full reporting
best_model_name = winner['winner_name']
best_params = winner['best_params']

# A fresh model is built here; the benchmark weights in winner['model_state'] are not loaded.
model = build_model(name=best_model_name, horizon=HORIZON, output_size=len(FEATURE_COLS), **best_params).to(device)

# Train on all contracts together; validate per contract.
train_concat = ConcatDataset([contract_splits[c]['train'] for c in contract_splits])
train_loader = make_loader(train_concat, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
val_loaders = {f"val_{c}": make_loader(contract_splits[c]['val'], batch_size=BATCH_SIZE, shuffle=False, drop_last=False) for c in contract_splits}

# Long run: up to FINAL_EPOCHS epochs with the full early-stopping patience.
final_hist = train_model(
    model,
    train_loader=train_loader,
    val_loaders=val_loaders,
    epochs=FINAL_EPOCHS,
    lr=best_params['lr'],
    device=device,
    patience=EARLY_STOPPING_PATIENCE,
    max_train_batches_per_epoch=MAX_TRAIN_BATCHES_PER_EPOCH,
)

plot_curves(final_hist, title=f"Final Train/Val Curves — {best_model_name} (target={TARGET_MODE})")

# Test curves + metrics
test_sets = {c: contract_splits[c]['test'] for c in contract_splits}

def per_sample_mse(model: nn.Module, dataset: WeekToHourDataset, device: torch.device, max_samples: int = 200) -> np.ndarray:
    """Return the MSE of each of the first `max_samples` samples, in original units.

    The error of one sample is averaged over all of its target steps and
    features. The result is a float32 array with one entry per sample.
    """
    limit = min(len(dataset), max_samples)

    def _sample_error(i: int) -> float:
        predicted, truth = predict_one_unscaled(model, dataset, i, device)
        return float(np.mean((predicted - truth) ** 2))

    return np.array([_sample_error(i) for i in range(limit)], dtype=np.float32)

# One per-sample MSE curve per contract, over at most the first 200 test samples.
plt.figure(figsize=(10,4))
for c, ds in test_sets.items():
    mses = per_sample_mse(model, ds, device, max_samples=200)
    plt.plot(np.arange(len(mses)), mses, label=f"{c} per-sample MSE")
plt.xlabel('sample index')
plt.ylabel('MSE (original units)')
plt.title('Test Per-Sample MSE Curves (subset)')
plt.grid(True, alpha=0.3)
plt.legend()
plt.show()

# Per-feature error table, averaged across contracts and sorted by RMSE.
metrics_df = compute_test_feature_metrics(model, test_sets, device, max_samples_per_contract=150)
display(metrics_df.groupby('feature')[['mae','rmse','mse']].mean().sort_values('rmse'))

# Close-direction classification metrics and confusion matrix.
dir_metrics = compute_direction_metrics(model, test_sets, device, max_samples_per_contract=150)
print('direction_metrics:', {k: v for k, v in dir_metrics.items() if k != 'cm'})
plot_confusion_matrix(dir_metrics['cm'], title=f"Close Direction Confusion Matrix — {best_model_name}\nacc={dir_metrics['accuracy']:.3f} f1_up={dir_metrics['f1_up']:.3f}")

# Save artifacts
# The checkpoint bundles the weights with what is needed to rebuild the model and rescale data.
# NOTE(review): 'mean' and 'std' are stored as NumPy arrays; verify that the code loading
# this file can unpickle them (e.g. torch.load's weights_only setting) — TODO confirm.
ckpt_path = ARTIFACTS_DIR / 'best_forecaster.pt'
torch.save({
    'model_state_dict': model.state_dict(),
    'model_name': best_model_name,
    'best_params': best_params,
    'feature_cols': FEATURE_COLS,
    'mean': mean,
    'std': std,
    'config': {
        'target_mode': TARGET_MODE,
        'log_volume': LOG_VOLUME,
        'horizon': HORIZON,
        'days_per_sample': DAYS_PER_SAMPLE,
        'stride_days': STRIDE_DAYS,
    }
}, ckpt_path)
print('saved:', ckpt_path)