In [None]:
# ===== CONFIGURATION =====
# Numeric IDs of the markets to train on. The order is kept as-is because
# MARKET_IDS is derived from it element by element.
market_ids = [
    253591, 253592, 253593, 253594, 253595, 253596, 253598, 253597,
    253634, 253639, 253640, 253635, 253641, 253642, 253643,
    253609, 253610,
    253730, 253728, 253727, 253706, 253722, 253723, 253726, 253725, 253724,
    253729, 253731, 253733, 253734, 253732, 253735, 253736, 253737,
    511755, 511753, 511754,
    538928, 538929, 538930, 538931, 538932, 538939, 538933, 538937,
    538934, 538935, 538936, 538938,
    253842, 253843,
    253697, 253698, 253699, 253701, 253702, 253703,
    253874,
]

# String form of the IDs (one string per numeric ID, same order).
MARKET_IDS = list(map(str, market_ids))

# Training hyperparameters
D_MODEL = 128          # hidden width
DROPOUT = 0.08
LEARNING_RATE = 2e-4
WEIGHT_DECAY = 3e-3
BATCH_SIZE = 256
EPOCHS = 30            # upper bound on epochs
PATIENCE = 7           # early-stopping patience, in epochs
GRAD_CLIP = 0.7        # gradient-norm clipping value

# Data path (relative, so it depends on the working directory)
BASE_DIR = "../IEOR4212/exploration/datasets"

# Model selection (set to False to skip)
TRAIN_MAMBA = True
TRAIN_LSTM = True

print(f"Will train on {len(MARKET_IDS)} market(s)")
print(f"Markets: {MARKET_IDS}")
print(f"Train Mamba: {TRAIN_MAMBA} | Train LSTM: {TRAIN_LSTM}")

Will train on 58 market(s)
Markets: ['253591', '253592', '253593', '253594', '253595', '253596', '253598', '253597', '253634', '253639', '253640', '253635', '253641', '253642', '253643', '253609', '253610', '253730', '253728', '253727', '253706', '253722', '253723', '253726', '253725', '253724', '253729', '253731', '253733', '253734', '253732', '253735', '253736', '253737', '511755', '511753', '511754', '538928', '538929', '538930', '538931', '538932', '538939', '538933', '538937', '538934', '538935', '538936', '538938', '253842', '253843', '253697', '253698', '253699', '253701', '253702', '253703', '253874']
Train Mamba: True | Train LSTM: True


In [43]:
import os
import json
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from mamba_ssm import Mamba

import sys
import pandas as pd

# Add both datacollection and IEOR4212 to path.
# Both locations are resolved relative to the current working directory
# (two levels up for 'datacollection', one level up for 'IEOR4212'), so this
# cell only works when the kernel is started from the expected folder.
datacollection_path = str(Path.cwd().parent.parent / 'datacollection')
ieor4212_path = str(Path.cwd().parent / 'IEOR4212')

# insert(0, ...) puts these directories ahead of everything else on sys.path;
# the IEOR4212 entry is inserted last and therefore ends up first.
sys.path.insert(0, datacollection_path)
sys.path.insert(0, ieor4212_path)
# This import must stay below the sys.path edits above, which is why it is not
# grouped with the other imports.
# NOTE(review): load_dataset and pnl_report_for_split are not referenced in the
# cells visible here — confirm they are still needed.
from src.preprocessing.data_prep import load_dataset, pnl_report_for_split

# Report the runtime so saved outputs record which torch build / device was used.
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

PyTorch version: 2.9.1+cu128
Device: cuda


## Model Definition

In [44]:
class MambaRegressor(nn.Module):
    """Sequence regressor: linear projection -> Mamba block -> linear head.

    The prediction is read from the representation of the final position of
    the sequence only.

    Args:
        d_in: Number of input features per time step.
        d_model: Width of the projected representation fed to the Mamba block.
        dropout: Dropout probability applied before the output head.
    """

    def __init__(self, d_in: int, d_model: int = 64, dropout: float = 0.1):
        super().__init__()
        # Sub-modules are created in this order (projection, Mamba, dropout, head).
        self.inp = nn.Linear(d_in, d_model)
        self.mamba = Mamba(d_model=d_model, d_state=16, d_conv=4, expand=2)
        self.drop = nn.Dropout(dropout)
        self.head = nn.Linear(d_model, 1)

    def forward(self, x):
        """Map a batch of sequences (B, L, F) to one scalar per sequence (B,)."""
        hidden = self.mamba(self.inp(x))   # (B, L, d_model)
        final_step = hidden[:, -1, :]      # keep only the last position
        scores = self.head(self.drop(final_step))
        return scores.squeeze(-1)          # (B,)

print("✓ Model class defined")

✓ Model class defined


In [45]:
class LSTMRegressor(nn.Module):
    """Stacked-LSTM baseline that regresses one scalar per input sequence.

    Args:
        d_in: Number of input features per time step.
        d_model: LSTM hidden size.
        dropout: Dropout probability before the head; also used between LSTM
            layers when there is more than one layer.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, d_in: int, d_model: int = 64, dropout: float = 0.1, num_layers: int = 2):
        super().__init__()
        # nn.LSTM only applies dropout between layers, so pass 0.0 for a single layer.
        between_layers = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            input_size=d_in,
            hidden_size=d_model,
            num_layers=num_layers,
            dropout=between_layers,
            batch_first=True,
        )
        self.drop = nn.Dropout(dropout)
        self.head = nn.Linear(d_model, 1)

    def forward(self, x):
        """Map a batch of sequences (B, L, F) to one scalar per sequence (B,)."""
        sequence_out, _ = self.lstm(x)          # (B, L, d_model); final states unused
        final_step = sequence_out[:, -1, :]     # output at the last time step
        return self.head(self.drop(final_step)).squeeze(-1)  # (B,)

print("✓ LSTM model class defined")

✓ LSTM model class defined


## Data Loading

In [46]:
def load_market_dataset(ds_dir: Path):
    """Load the pre-split arrays, event table and metadata stored in ``ds_dir``.

    Files read from ``ds_dir``:
      * ``X_{train,val,test}.npy`` / ``y_{train,val,test}.npy`` -> float32 arrays
      * ``end_idx_{train,val,test}.npy`` -> int64 arrays
      * ``ev.csv`` -> DataFrame (first column as index, dates parsed)
      * ``metadata.json`` -> dict, optional (``{}`` when the file is absent)

    Args:
        ds_dir: Directory that directly contains the files listed above.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, meta, ev,
        end_idx_train, end_idx_val, end_idx_test)``.

    Raises:
        FileNotFoundError: If ``ds_dir`` does not exist.
    """
    data_path = Path(ds_dir)

    if not data_path.exists():
        raise FileNotFoundError(f"Dataset path not found: {data_path}")

    def _load(stem, dtype):
        # All arrays live side by side as "<stem>.npy".
        return np.load(data_path / f"{stem}.npy").astype(dtype)

    X_train = _load("X_train", np.float32)
    y_train = _load("y_train", np.float32)
    X_val   = _load("X_val", np.float32)
    y_val   = _load("y_val", np.float32)
    X_test  = _load("X_test", np.float32)
    y_test  = _load("y_test", np.float32)

    # Index arrays are kept as integers: float32 cannot represent every
    # integer above 2**24 and NumPy rejects float arrays as indices.
    # NOTE(review): presumably each entry is the row of ev.csv where a sample's
    # window ends — confirm against the dataset builder.
    end_idx_train = _load("end_idx_train", np.int64)
    end_idx_val   = _load("end_idx_val", np.int64)
    end_idx_test  = _load("end_idx_test", np.int64)

    ev = pd.read_csv(data_path / "ev.csv", index_col=0, parse_dates=True)

    # Metadata is optional; callers fall back to their own defaults.
    meta = {}
    meta_path = data_path / "metadata.json"
    if meta_path.exists():
        with open(meta_path, "r") as f:
            meta = json.load(f)

    return (X_train, y_train, X_val, y_val, X_test, y_test, meta, ev, end_idx_train, end_idx_val, end_idx_test)

print("✓ Data loading function defined")

✓ Data loading function defined


## Training Functions

In [47]:
@torch.no_grad()
def eval_regression(model, loader, device):
    """Score a regressor on every batch of ``loader``.

    Returns:
        ``(mae, sign_acc)`` as Python floats: the mean absolute error and the
        fraction of samples where ``np.sign`` of prediction and target agree.
    """
    model.eval()

    # Collect (prediction, target) pairs batch by batch as NumPy arrays.
    per_batch = [
        (
            model(features.to(device)).detach().cpu().numpy(),
            targets.to(device).detach().cpu().numpy(),
        )
        for features, targets in loader
    ]
    predicted = np.concatenate([pair[0] for pair in per_batch])
    actual = np.concatenate([pair[1] for pair in per_batch])

    abs_error = np.abs(predicted - actual)
    same_sign = np.sign(predicted) == np.sign(actual)
    return float(np.mean(abs_error)), float(np.mean(same_sign))

print("✓ Evaluation function defined")

✓ Evaluation function defined


In [48]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, element-wise via NumPy."""
    return 1.0 / (1.0 + np.exp(-x))

def pnl_from_dz_preds(
    ev: pd.DataFrame,
    dz_pred: np.ndarray,
    n_train: int,
    n_val: int,
    n_test: int,
    seq_len: int,
    split: str,
    thr_dp: float = 0.0,
    cost_per_trade: float = 0.0,   # absolute cost in price units (e.g. 0.0005)
    end_idx=None,
):
    """
    Computes 1-step mark-to-market PnL:
      position = sign(predicted dp) if |predicted dp| >= thr_dp else 0
      pnl      = position * (p_next - p_now) - cost_per_trade * |position|

    dz_pred is model output (predicted z_{t+1}-z_t) for the chosen split.

    Mapping samples to rows of ``ev``:
      * ``end_idx is None`` (default): assumes dataset windows were made
        sequentially, i.e. sample i ends at event index (seq_len - 1) + i,
        with the split offsets derived from n_train / n_val / n_test.
      * ``end_idx`` given: one event index per prediction of this split
        (the row where that sample's window ends). Use this when samples
        are not contiguous in ``ev``. Values are cast to int64.
      In both cases the next price is read at end_idx + 1, and samples whose
      next row does not exist are dropped.

    dz_pred and the index list are aligned on their common leading length, so
    a prediction array that is longer or shorter than the split is tolerated.

    Args:
        ev: Event table with columns "price" and "z".
        dz_pred: Predicted change in z for each sample of the split.
        n_train, n_val, n_test: Split sizes (used for the sequential mapping).
        seq_len: Window length (used for the sequential mapping).
        split: "train", "val" or "test".
        thr_dp: Minimum |predicted price change| required to take a position.
        cost_per_trade: Cost charged per non-zero position, in price units.
        end_idx: Optional explicit window-end event indices (see above).

    Returns:
        dict with keys n_samples, n_trades, trade_rate, avg_pnl, sum_pnl,
        median_pnl, hit_rate_on_trades.

    Raises:
        ValueError: If ``split`` is not one of train / val / test.
    """

    # choose split ranges in "sample index space"
    if split == "train":
        start_i, end_i = 0, n_train
    elif split == "val":
        start_i, end_i = n_train, n_train + n_val
    elif split == "test":
        start_i, end_i = n_train + n_val, n_train + n_val + n_test
    else:
        raise ValueError("split must be one of: train, val, test")

    dz_pred = np.asarray(dz_pred).reshape(-1)

    # map samples -> event indices (where window ends)
    explicit = end_idx is not None
    if explicit:
        end_idx = np.asarray(end_idx).reshape(-1).astype(np.int64)
    else:
        end_idx = (seq_len - 1) + np.arange(start_i, end_i)

    # Align predictions and indices on their common prefix. Without this a
    # too-short dz_pred fails later with a broadcasting error.
    n_common = min(len(dz_pred), len(end_idx))
    dz_pred = dz_pred[:n_common]
    end_idx = end_idx[:n_common]

    # we need end_idx+1 to exist; drop the matching predictions as well
    usable = end_idx + 1 < len(ev)
    if explicit:
        # Negative values would silently index from the end of ev.
        usable &= end_idx >= 0
    dz_pred = dz_pred[usable]
    end_idx = end_idx[usable]

    if len(end_idx) == 0:
        return {"n_samples": 0, "n_trades": 0, "trade_rate": 0.0,
                "avg_pnl": 0.0, "sum_pnl": 0.0, "median_pnl": 0.0, "hit_rate_on_trades": 0.0}

    p_now  = ev["price"].values[end_idx]
    p_next = ev["price"].values[end_idx + 1]
    z_now  = ev["z"].values[end_idx]

    # convert predicted dz -> predicted dp in price space
    p_pred_next = sigmoid(z_now + dz_pred)
    dp_pred = p_pred_next - p_now

    # trading rule: follow the predicted direction, stay flat below the threshold
    pos = np.sign(dp_pred)
    pos[np.abs(dp_pred) < thr_dp] = 0.0

    dp_real = (p_next - p_now)
    pnl = pos * dp_real - cost_per_trade * np.abs(pos)

    traded = pos != 0.0
    n_trades = float(np.sum(traded))
    trade_rate = n_trades / float(len(end_idx))

    # "hit rate" only on executed trades (gross of costs)
    hit = (pos * dp_real) > 0
    hit_rate = float(np.mean(hit[traded])) if n_trades > 0 else 0.0

    return {
        "n_samples": float(len(end_idx)),
        "n_trades": n_trades,
        "trade_rate": float(trade_rate),
        "avg_pnl": float(np.mean(pnl)),
        "sum_pnl": float(np.sum(pnl)),
        "median_pnl": float(np.median(pnl)),
        "hit_rate_on_trades": hit_rate,
    }

def pick_best_threshold_on_val(ev, dz_pred_val, n_train, n_val, n_test, seq_len,
                               thr_grid=(0.0, 0.0005, 0.001, 0.002, 0.005),
                               cost_per_trade=0.0):
    """Return the validation PnL stats of the threshold with the highest sum_pnl.

    Every value in ``thr_grid`` is scored with ``pnl_from_dz_preds`` on the
    "val" split. The returned dict is that call's stats plus a "thr_dp" key.
    On ties the earliest threshold in the grid wins; an empty grid gives None.
    """
    def _score(threshold):
        # Copy the stats so the winning threshold can be attached to them.
        scored = dict(
            pnl_from_dz_preds(ev, dz_pred_val, n_train, n_val, n_test, seq_len,
                              split="val", thr_dp=threshold, cost_per_trade=cost_per_trade)
        )
        scored["thr_dp"] = threshold
        return scored

    # max() keeps the first maximal element, matching a strict ">" update rule.
    return max(
        (_score(threshold) for threshold in thr_grid),
        key=lambda scored: scored["sum_pnl"],
        default=None,
    )

print("✓ PnL computation functions defined")

✓ PnL computation functions defined


In [49]:
def train_one_market(
    market_id: str,
    base_dir: Path,
    *,
    d_model: int = 64,
    dropout: float = 0.15,
    lr: float = 1e-4,
    weight_decay: float = 5e-2,
    batch_size: int = 256,
    epochs: int = 30,
    patience: int = 5,
    grad_clip: float = 1.0,
):
    """Train, evaluate and checkpoint a MambaRegressor for a single market.

    Steps:
      1. Load the dataset from ``base_dir / f"market_{market_id}" / "move_reg"``.
      2. Train with AdamW and SmoothL1Loss, clipping the gradient norm, and
         stop early when validation MAE has not improved by more than 1e-6
         for ``patience`` consecutive epochs.
      3. Restore the best-validation weights and report MAE / sign accuracy
         on the validation and test splits.
      4. Choose the trading threshold that maximises validation ``sum_pnl``
         and report PnL statistics for validation and test with it
         (trading cost fixed at 0.0).
      5. Write ``checkpoints/mamba_regressor.pt`` and ``checkpoints/metrics.json``
         inside the dataset directory.

    Args:
        market_id: Market identifier used to build the dataset directory name.
        base_dir: Root directory containing the ``market_<id>`` folders.
        d_model: Model width passed to MambaRegressor.
        dropout: Dropout probability passed to MambaRegressor.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        batch_size: Batch size for all three data loaders.
        epochs: Maximum number of training epochs.
        patience: Early-stopping patience, in epochs.
        grad_clip: Maximum gradient norm.

    Returns:
        ``(val_mae, test_mae, val_sign, test_sign, val_stats, test_stats)``
        where the two ``*_stats`` values are the dicts from ``pnl_from_dz_preds``.

    Raises:
        ValueError: If the training or validation split is empty.
    """
    ds_dir = base_dir / f"market_{market_id}" / "move_reg"
    # NOTE(review): end_idx_train / end_idx_val / end_idx_test are loaded here
    # but never used below; the PnL step assumes consecutive windows instead.
    # Confirm that assumption holds for the "move_reg" datasets.
    X_train, y_train, X_val, y_val, X_test, y_test, meta, ev, end_idx_train, end_idx_val, end_idx_test = load_market_dataset(ds_dir)

    # NOTE(review): only train/val are checked; an empty test split is not
    # rejected here.
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError(f"Empty train/val in {ds_dir}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    d_in = X_train.shape[-1]

    # Get seq_len from metadata (falls back to 64 when the key is missing)
    seq_len = meta.get("seq_len", 64)
    n_train = X_train.shape[0]
    n_val = X_val.shape[0]
    n_test = X_test.shape[0]

    model = MambaRegressor(d_in=d_in, d_model=d_model, dropout=dropout).to(device)

    # All loaders iterate in stored order (shuffle=False), including training.
    train_loader = DataLoader(
        TensorDataset(torch.tensor(X_train), torch.tensor(y_train)),
        batch_size=batch_size,
        shuffle=False,
        drop_last=False
    )
    val_loader = DataLoader(
        TensorDataset(torch.tensor(X_val), torch.tensor(y_val)),
        batch_size=batch_size,
        shuffle=False
    )
    test_loader = DataLoader(
        TensorDataset(torch.tensor(X_test), torch.tensor(y_test)),
        batch_size=batch_size,
        shuffle=False
    )

    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.SmoothL1Loss()  # Huber loss

    # Early-stopping bookkeeping: best validation MAE so far, a CPU copy of the
    # weights that achieved it, and the count of epochs without improvement.
    best_val = float("inf")
    best_state = None
    bad = 0

    print(f"\n{'='*80}")
    print(f"Training Market: {market_id}")
    print(f"{'='*80}")
    print(f"Shapes: X_train={X_train.shape} X_val={X_val.shape} X_test={X_test.shape}")
    print(f"Features: {d_in} | seq_len: {seq_len}")
    print(f"Device: {device}")
    
    for ep in range(1, epochs + 1):
        model.train()
        losses = []

        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            opt.zero_grad(set_to_none=True)
            pr = model(xb)
            loss = loss_fn(pr, yb)
            loss.backward()
            # Clip the total gradient norm before the optimizer step.
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            opt.step()
            losses.append(loss.item())

        # Training loss is SmoothL1; model selection uses validation MAE.
        val_mae, val_sign = eval_regression(model, val_loader, device)
        tr_loss = float(np.mean(losses))

        # end="" so the "← best" marker can be appended to the same line.
        print(f"Epoch {ep:02d}/{epochs} | train_loss={tr_loss:.6f} | val_MAE={val_mae:.6f} | val_sign_acc={val_sign:.3f}", end="")

        # An improvement must exceed 1e-6 to reset the patience counter.
        if val_mae < best_val - 1e-6:
            best_val = val_mae
            # Snapshot on CPU so later updates do not overwrite the saved weights.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
            print(" ← best")
        else:
            bad += 1
            print()
            if bad >= patience:
                print(f"Early stopping (no val MAE improvement for {patience} epochs).")
                break

    # Roll back to the best-validation weights before the final evaluation.
    if best_state is not None:
        model.load_state_dict(best_state)
        model.to(device)

    # Final eval
    val_mae, val_sign = eval_regression(model, val_loader, device)
    test_mae, test_sign = eval_regression(model, test_loader, device)

    print(f"\n{'='*80}")
    print(f"FINAL RESULTS:")
    print(f"  Val:  MAE={val_mae:.6f} | Sign Acc={val_sign:.3f}")
    print(f"  Test: MAE={test_mae:.6f} | Sign Acc={test_sign:.3f}")
    print(f"{'='*80}")

    # Get predictions (dz predictions from model)
    # NOTE(review): the whole split is sent through the model in one forward
    # pass (no batching), so memory use grows with the split size.
    @torch.no_grad()
    def predict_numpy(model, X):
        model.eval()
        X_t = torch.tensor(X, dtype=torch.float32, device=device)
        return model(X_t).detach().cpu().numpy()
    
    yhat_val  = predict_numpy(model, X_val)
    yhat_test = predict_numpy(model, X_test)
    
    # ===== CORRECT PnL COMPUTATION IN PRICE SPACE =====
    print(f"\n{'='*80}")
    print(f"PnL Analysis (price-space, mark-to-market)")
    print(f"{'='*80}")
    
    # Find best threshold on validation set
    best = pick_best_threshold_on_val(
        ev,
        dz_pred_val=yhat_val,
        n_train=n_train, n_val=n_val, n_test=n_test,
        seq_len=seq_len,
        thr_grid=(0.0, 0.0005, 0.001, 0.002, 0.005, 0.01),
        cost_per_trade=0.0,   # set to e.g. 0.0005 to model trading costs
    )
    
    thr = best["thr_dp"]
    print(f"\nBest VAL threshold (price space): {thr:.6f}")
    print(f"  sum_pnl={best['sum_pnl']:.6f} | trade_rate={best['trade_rate']:.3f} | hit_rate={best['hit_rate_on_trades']:.3f}")
    
    # Evaluate with best threshold on val and test.
    # The threshold was selected on validation, so only the test numbers are
    # out-of-sample with respect to it.
    val_stats = pnl_from_dz_preds(
        ev, yhat_val, n_train, n_val, n_test, seq_len, 
        "val", thr_dp=thr, cost_per_trade=0.0
    )
    test_stats = pnl_from_dz_preds(
        ev, yhat_test, n_train, n_val, n_test, seq_len, 
        "test", thr_dp=thr, cost_per_trade=0.0
    )
    
    print(f"\nVAL  PnL: {val_stats}")
    print(f"TEST PnL: {test_stats}")
    print(f"{'='*80}")

    # Save model (weights plus the constructor arguments and metrics)
    out_dir = ds_dir / "checkpoints"
    out_dir.mkdir(parents=True, exist_ok=True)
    ckpt_path = out_dir / "mamba_regressor.pt"
    
    torch.save({
        "state_dict": model.state_dict(),
        "d_in": d_in,
        "d_model": d_model,
        "dropout": dropout,
        "meta": meta,
        "metrics": {
            "best_val_mae": val_mae,
            "val_sign_acc": val_sign,
            "test_mae": test_mae,
            "test_sign_acc": test_sign,
            "val_pnl": val_stats,
            "test_pnl": test_stats,
            "best_threshold_dp": thr,
        }
    }, ckpt_path)

    # Same metrics again as plain JSON, with the market id added.
    with open(out_dir / "metrics.json", "w") as f:
        json.dump({
            "best_val_mae": val_mae,
            "val_sign_acc": val_sign,
            "test_mae": test_mae,
            "test_sign_acc": test_sign,
            "market_id": market_id,
            "val_pnl": val_stats,
            "test_pnl": test_stats,
            "best_threshold_dp": thr,
        }, f, indent=2)
    
    print(f"✓ Model saved to: {ckpt_path}")

    return val_mae, test_mae, val_sign, test_sign, val_stats, test_stats

print("✓ Training function defined")

✓ Training function defined


In [50]:
def train_lstm_baseline(
    market_id: str,
    base_dir: Path,
    *,
    d_model: int = 64,
    dropout: float = 0.15,
    lr: float = 1e-4,
    weight_decay: float = 5e-2,
    batch_size: int = 256,
    epochs: int = 30,
    patience: int = 5,
    grad_clip: float = 1.0,
):
    """Train, evaluate and checkpoint a 2-layer LSTMRegressor for one market.

    Uses the same data, optimizer (AdamW), loss (SmoothL1Loss), gradient
    clipping, early-stopping rule (validation MAE, minimum improvement 1e-6)
    and PnL evaluation as the Mamba training routine. Differences visible in
    this function: the model is ``LSTMRegressor(num_layers=2)``, epoch lines
    are printed only for epoch 1 and every 5th epoch, and outputs go to
    ``checkpoints/lstm_regressor.pt`` and ``checkpoints/lstm_metrics.json``.

    Args:
        market_id: Market identifier used to build the dataset directory name.
        base_dir: Root directory containing the ``market_<id>`` folders.
        d_model: LSTM hidden size.
        dropout: Dropout probability passed to LSTMRegressor.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        batch_size: Batch size for all three data loaders.
        epochs: Maximum number of training epochs.
        patience: Early-stopping patience, in epochs.
        grad_clip: Maximum gradient norm.

    Returns:
        ``(val_mae, test_mae, val_sign, test_sign, val_stats, test_stats)``
        where the two ``*_stats`` values are the dicts from ``pnl_from_dz_preds``.

    Raises:
        ValueError: If the training or validation split is empty.
    """
    ds_dir = base_dir / f"market_{market_id}" / "move_reg"
    # NOTE(review): end_idx_train / end_idx_val / end_idx_test are loaded here
    # but never used below; the PnL step assumes consecutive windows instead.
    # Confirm that assumption holds for the "move_reg" datasets.
    X_train, y_train, X_val, y_val, X_test, y_test, meta, ev, end_idx_train, end_idx_val, end_idx_test = load_market_dataset(ds_dir)

    # NOTE(review): only train/val are checked; an empty test split is not
    # rejected here.
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError(f"Empty train/val in {ds_dir}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    d_in = X_train.shape[-1]

    # Get seq_len from metadata (falls back to 64 when the key is missing)
    seq_len = meta.get("seq_len", 64)
    n_train = X_train.shape[0]
    n_val = X_val.shape[0]
    n_test = X_test.shape[0]

    model = LSTMRegressor(d_in=d_in, d_model=d_model, dropout=dropout, num_layers=2).to(device)

    # All loaders iterate in stored order (shuffle=False), including training.
    train_loader = DataLoader(
        TensorDataset(torch.tensor(X_train), torch.tensor(y_train)),
        batch_size=batch_size,
        shuffle=False,
        drop_last=False
    )
    val_loader = DataLoader(
        TensorDataset(torch.tensor(X_val), torch.tensor(y_val)),
        batch_size=batch_size,
        shuffle=False
    )
    test_loader = DataLoader(
        TensorDataset(torch.tensor(X_test), torch.tensor(y_test)),
        batch_size=batch_size,
        shuffle=False
    )

    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.SmoothL1Loss()  # Huber loss

    # Early-stopping bookkeeping: best validation MAE so far, a CPU copy of the
    # weights that achieved it, and the count of epochs without improvement.
    best_val = float("inf")
    best_state = None
    bad = 0

    print(f"\n{'='*80}")
    print(f"Training LSTM Baseline: {market_id}")
    print(f"{'='*80}")
    print(f"Shapes: X_train={X_train.shape} X_val={X_val.shape} X_test={X_test.shape}")
    print(f"Features: {d_in} | seq_len: {seq_len}")
    print(f"Device: {device}")
    
    for ep in range(1, epochs + 1):
        model.train()
        losses = []

        for xb, yb in train_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            opt.zero_grad(set_to_none=True)
            pr = model(xb)
            loss = loss_fn(pr, yb)
            loss.backward()
            # Clip the total gradient norm before the optimizer step.
            nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            opt.step()
            losses.append(loss.item())

        # Training loss is SmoothL1; model selection uses validation MAE.
        val_mae, val_sign = eval_regression(model, val_loader, device)
        tr_loss = float(np.mean(losses))

        if ep % 5 == 0 or ep == 1:  # Print every 5 epochs to reduce clutter
            print(f"Epoch {ep:02d}/{epochs} | train_loss={tr_loss:.6f} | val_MAE={val_mae:.6f} | val_sign_acc={val_sign:.3f}", end="")

            # Display only: this check runs before best_val is updated below,
            # so it uses the same condition as the real update.
            if val_mae < best_val - 1e-6:
                print(" ← best")
            else:
                print()
        
        # Early-stopping update runs every epoch, whether or not it was printed.
        if val_mae < best_val - 1e-6:
            best_val = val_mae
            # Snapshot on CPU so later updates do not overwrite the saved weights.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print(f"Early stopping (no val MAE improvement for {patience} epochs).")
                break

    # Roll back to the best-validation weights before the final evaluation.
    if best_state is not None:
        model.load_state_dict(best_state)
        model.to(device)

    # Final eval
    val_mae, val_sign = eval_regression(model, val_loader, device)
    test_mae, test_sign = eval_regression(model, test_loader, device)

    print(f"\n{'='*80}")
    print(f"LSTM FINAL RESULTS:")
    print(f"  Val:  MAE={val_mae:.6f} | Sign Acc={val_sign:.3f}")
    print(f"  Test: MAE={test_mae:.6f} | Sign Acc={test_sign:.3f}")
    print(f"{'='*80}")

    # Get predictions (dz predictions from model)
    # NOTE(review): the whole split is sent through the model in one forward
    # pass (no batching), so memory use grows with the split size.
    @torch.no_grad()
    def predict_numpy(model, X):
        model.eval()
        X_t = torch.tensor(X, dtype=torch.float32, device=device)
        return model(X_t).detach().cpu().numpy()
    
    yhat_val  = predict_numpy(model, X_val)
    yhat_test = predict_numpy(model, X_test)
    
    # ===== PnL COMPUTATION =====
    print(f"\n{'='*80}")
    print(f"LSTM PnL Analysis")
    print(f"{'='*80}")
    
    # Find best threshold on validation set
    best = pick_best_threshold_on_val(
        ev,
        dz_pred_val=yhat_val,
        n_train=n_train, n_val=n_val, n_test=n_test,
        seq_len=seq_len,
        thr_grid=(0.0, 0.0005, 0.001, 0.002, 0.005, 0.01),
        cost_per_trade=0.0,
    )
    
    thr = best["thr_dp"]
    print(f"\nBest VAL threshold: {thr:.6f}")
    print(f"  sum_pnl={best['sum_pnl']:.6f} | trade_rate={best['trade_rate']:.3f} | hit_rate={best['hit_rate_on_trades']:.3f}")
    
    # The threshold was selected on validation, so only the test numbers are
    # out-of-sample with respect to it.
    val_stats = pnl_from_dz_preds(
        ev, yhat_val, n_train, n_val, n_test, seq_len, 
        "val", thr_dp=thr, cost_per_trade=0.0
    )
    test_stats = pnl_from_dz_preds(
        ev, yhat_test, n_train, n_val, n_test, seq_len, 
        "test", thr_dp=thr, cost_per_trade=0.0
    )
    
    print(f"\nVAL  PnL: {val_stats}")
    print(f"TEST PnL: {test_stats}")
    print(f"{'='*80}")

    # Save LSTM model
    # NOTE(review): num_layers (fixed to 2 above) is not written to the
    # checkpoint, so a loader has to know it to rebuild the model.
    out_dir = ds_dir / "checkpoints"
    out_dir.mkdir(parents=True, exist_ok=True)
    ckpt_path = out_dir / "lstm_regressor.pt"
    
    torch.save({
        "state_dict": model.state_dict(),
        "d_in": d_in,
        "d_model": d_model,
        "dropout": dropout,
        "meta": meta,
        "metrics": {
            "best_val_mae": val_mae,
            "val_sign_acc": val_sign,
            "test_mae": test_mae,
            "test_sign_acc": test_sign,
            "val_pnl": val_stats,
            "test_pnl": test_stats,
            "best_threshold_dp": thr,
        }
    }, ckpt_path)

    # Same metrics again as plain JSON, with the market id added.
    with open(out_dir / "lstm_metrics.json", "w") as f:
        json.dump({
            "best_val_mae": val_mae,
            "val_sign_acc": val_sign,
            "test_mae": test_mae,
            "test_sign_acc": test_sign,
            "market_id": market_id,
            "val_pnl": val_stats,
            "test_pnl": test_stats,
            "best_threshold_dp": thr,
        }, f, indent=2)
    
    print(f"✓ LSTM model saved to: {ckpt_path}")

    return val_mae, test_mae, val_sign, test_sign, val_stats, test_stats

print("✓ LSTM training function defined")

✓ LSTM training function defined


## Train All Markets

In [51]:
import traceback  # used only for the per-market failure reports below

base_dir = Path(BASE_DIR)
mamba_results = []
lstm_results = []

# Both architectures are trained with the same hyperparameters from the
# configuration cell, so they are collected once and reused for every call.
SHARED_TRAIN_KWARGS = dict(
    d_model=D_MODEL,
    dropout=DROPOUT,
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    patience=PATIENCE,
    grad_clip=GRAD_CLIP,
)

# Metrics recorded for a market whose training was skipped or failed.
_NO_METRICS = (None, None, None, None, None, None)


def _train_and_record(label, trainer, enabled, market_id, results):
    """Run ``trainer`` for one market and append a 7-tuple to ``results``.

    The appended tuple is ``(market_id, val_mae, test_mae, val_sign,
    test_sign, val_pnl, test_pnl)``. When ``enabled`` is False, or when the
    trainer raises, the six metrics are None so that every market still has
    exactly one row per model. Failures are reported with their traceback and
    do not stop the remaining markets.
    """
    if not enabled:
        results.append((market_id, *_NO_METRICS))
        return

    try:
        val_mae, test_mae, val_sign, test_sign, val_pnl, test_pnl = trainer(
            market_id=market_id,
            base_dir=base_dir,
            **SHARED_TRAIN_KWARGS,
        )
    except Exception as e:
        # Best effort by design: log the failure and move on to the next run.
        print(f"\n❌ [SKIP {label}] {market_id}")
        print(f"Error: {e}")
        print(f"Traceback:\n{traceback.format_exc()}")
        results.append((market_id, *_NO_METRICS))
    else:
        results.append((market_id, val_mae, test_mae, val_sign, test_sign, val_pnl, test_pnl))


print(f"\nTraining {len(MARKET_IDS)} market(s)...")
print(f"Base directory: {base_dir}")
print(f"Mamba: {'ENABLED' if TRAIN_MAMBA else 'DISABLED'} | LSTM: {'ENABLED' if TRAIN_LSTM else 'DISABLED'}\n")

for market_id in MARKET_IDS:
    print(f"\n{'#'*80}")
    print(f"# Market: {market_id}")
    print(f"{'#'*80}")

    # Mamba first, then the LSTM baseline, for each market.
    _train_and_record("MAMBA", train_one_market, TRAIN_MAMBA, market_id, mamba_results)
    _train_and_record("LSTM", train_lstm_baseline, TRAIN_LSTM, market_id, lstm_results)

print(f"\n{'='*80}")
print("ALL TRAINING COMPLETE")
print(f"{'='*80}")
print(f"Mamba: {sum(1 for x in mamba_results if x[1] is not None)}/{len(MARKET_IDS)} successful")
print(f"LSTM:  {sum(1 for x in lstm_results if x[1] is not None)}/{len(MARKET_IDS)} successful")


Training 58 market(s)...
Base directory: ../IEOR4212/exploration/datasets
Mamba: ENABLED | LSTM: ENABLED


################################################################################
# Market: 253591
################################################################################

Training Market: 253591
Shapes: X_train=(644, 64, 8) X_val=(3985, 64, 8) X_test=(11762, 64, 8)
Features: 8 | seq_len: 64
Device: cuda
Epoch 01/30 | train_loss=0.003122 | val_MAE=0.068760 | val_sign_acc=0.507 ← best
Epoch 02/30 | train_loss=0.002978 | val_MAE=0.065880 | val_sign_acc=0.507 ← best
Epoch 01/30 | train_loss=0.003122 | val_MAE=0.068760 | val_sign_acc=0.507 ← best
Epoch 02/30 | train_loss=0.002978 | val_MAE=0.065880 | val_sign_acc=0.507 ← best
Epoch 03/30 | train_loss=0.002812 | val_MAE=0.063099 | val_sign_acc=0.507 ← best
Epoch 04/30 | train_loss=0.002705 | val_MAE=0.060374 | val_sign_acc=0.507 ← best
Epoch 03/30 | train_loss=0.002812 | val_MAE=0.063099 | val_sign_acc=0.507 ← best
Epoch 04/30

## Summary Results

In [52]:
# Filter out failed runs (failed or skipped runs carry None for every metric)
mamba_valid = [(m, vm, tm, vs, ts, vp, tp) for m, vm, tm, vs, ts, vp, tp in mamba_results if vm is not None]
lstm_valid = [(m, vm, tm, vs, ts, vp, tp) for m, vm, tm, vs, ts, vp, tp in lstm_results if vm is not None]

# Lookup: market_id -> (val_mae, test_mae, val_sign, test_sign, val_pnl, test_pnl)
mamba_dict = {m: (vm, tm, vs, ts, vp, tp) for m, vm, tm, vs, ts, vp, tp in mamba_valid}
lstm_dict = {m: (vm, tm, vs, ts, vp, tp) for m, vm, tm, vs, ts, vp, tp in lstm_valid}

# Only markets where BOTH models produced metrics can be compared. This can be
# empty even when each model succeeded somewhere, so it is checked explicitly
# to keep the averages and percentages below from dividing by zero.
common_markets = sorted(set(mamba_dict.keys()) & set(lstm_dict.keys()))

if common_markets:
    print("\n" + "="*120)
    print("COMPARISON: Mamba vs LSTM")
    print("="*120)
    print(f"{'Market':<12s} | {'Mamba MAE':>10s} | {'LSTM MAE':>10s} | {'Mamba Sign':>11s} | {'LSTM Sign':>11s} | {'Mamba PnL':>10s} | {'LSTM PnL':>10s} | {'Winner':>8s}")
    print("-"*120)

    mamba_wins_mae = 0
    lstm_wins_mae = 0
    mamba_wins_pnl = 0
    lstm_wins_pnl = 0

    # All columns in the table are TEST-split metrics.
    for mid in common_markets:
        m_val_mae, m_test_mae, m_val_sign, m_test_sign, m_val_pnl, m_test_pnl = mamba_dict[mid]
        l_val_mae, l_test_mae, l_val_sign, l_test_sign, l_val_pnl, l_test_pnl = lstm_dict[mid]

        # Extract test sum_pnl
        m_test_sum_pnl = m_test_pnl.get('sum_pnl', 0.0)
        l_test_sum_pnl = l_test_pnl.get('sum_pnl', 0.0)

        # Winner based on test sum_pnl (higher is better); ties count for LSTM
        winner = "Mamba" if m_test_sum_pnl > l_test_sum_pnl else "LSTM"
        if m_test_mae < l_test_mae:
            mamba_wins_mae += 1
        else:
            lstm_wins_mae += 1

        if m_test_sum_pnl > l_test_sum_pnl:
            mamba_wins_pnl += 1
        else:
            lstm_wins_pnl += 1

        print(f"{mid:<12s} | {m_test_mae:10.6f} | {l_test_mae:10.6f} | {m_test_sign:11.3f} | {l_test_sign:11.3f} | {m_test_sum_pnl:10.5f} | {l_test_sum_pnl:10.5f} | {winner:>8s}")

    print("-"*120)

    # Overall statistics (tuple positions: 1 = test MAE, 3 = test sign acc, 5 = test PnL dict)
    mamba_avg_test_mae = np.mean([mamba_dict[m][1] for m in common_markets])
    lstm_avg_test_mae = np.mean([lstm_dict[m][1] for m in common_markets])

    mamba_avg_sign = np.mean([mamba_dict[m][3] for m in common_markets])
    lstm_avg_sign = np.mean([lstm_dict[m][3] for m in common_markets])

    mamba_avg_pnl = np.mean([mamba_dict[m][5].get('sum_pnl', 0.0) for m in common_markets])
    lstm_avg_pnl = np.mean([lstm_dict[m][5].get('sum_pnl', 0.0) for m in common_markets])

    print(f"{'MAMBA AVG':<12s} | {mamba_avg_test_mae:10.6f} | {'---':>10s} | {mamba_avg_sign:11.3f} | {'---':>11s} | {mamba_avg_pnl:10.5f} | {'---':>10s} | {'':<8s}")
    print(f"{'LSTM AVG':<12s} | {'---':>10s} | {lstm_avg_test_mae:10.6f} | {'---':>11s} | {lstm_avg_sign:11.3f} | {'---':>10s} | {lstm_avg_pnl:10.5f} | {'':<8s}")
    print("="*120)

    print(f"\n📊 SUMMARY:")
    print(f"\n  Test MAE (lower is better):")
    print(f"    Mamba wins: {mamba_wins_mae}/{len(common_markets)} markets ({100*mamba_wins_mae/len(common_markets):.1f}%)")
    print(f"    LSTM wins:  {lstm_wins_mae}/{len(common_markets)} markets ({100*lstm_wins_mae/len(common_markets):.1f}%)")
    print(f"    Average improvement: {100*(lstm_avg_test_mae - mamba_avg_test_mae)/lstm_avg_test_mae:+.2f}% (Mamba vs LSTM)")

    print(f"\n  Test PnL (higher is better):")
    print(f"    Mamba wins: {mamba_wins_pnl}/{len(common_markets)} markets ({100*mamba_wins_pnl/len(common_markets):.1f}%)")
    print(f"    LSTM wins:  {lstm_wins_pnl}/{len(common_markets)} markets ({100*lstm_wins_pnl/len(common_markets):.1f}%)")
    print(f"    Mamba avg test PnL: {mamba_avg_pnl:+.5f}")
    print(f"    LSTM avg test PnL:  {lstm_avg_pnl:+.5f}")

    print(f"\n  Test sign accuracy:")
    print(f"    Mamba: {mamba_avg_sign:.3f}")
    print(f"    LSTM:  {lstm_avg_sign:.3f}")

    # Detailed PnL metrics
    mamba_avg_hit = np.mean([mamba_dict[m][5].get('hit_rate_on_trades', 0.0) for m in common_markets])
    lstm_avg_hit = np.mean([lstm_dict[m][5].get('hit_rate_on_trades', 0.0) for m in common_markets])
    mamba_avg_trade_rate = np.mean([mamba_dict[m][5].get('trade_rate', 0.0) for m in common_markets])
    lstm_avg_trade_rate = np.mean([lstm_dict[m][5].get('trade_rate', 0.0) for m in common_markets])

    print(f"\n  Trading metrics:")
    print(f"    Mamba: hit_rate={mamba_avg_hit:.3f} | trade_rate={mamba_avg_trade_rate:.3f}")
    print(f"    LSTM:  hit_rate={lstm_avg_hit:.3f} | trade_rate={lstm_avg_trade_rate:.3f}")

elif mamba_valid and lstm_valid:
    # Both models succeeded somewhere, but never on the same market.
    print("\n⚠ Mamba and LSTM succeeded on disjoint sets of markets; nothing to compare")
    print(f"Mamba trained {len(mamba_valid)} market(s), LSTM trained {len(lstm_valid)} market(s)")
elif mamba_valid:
    print("\n⚠ Only Mamba results available (LSTM training failed or skipped)")
    print(f"Trained {len(mamba_valid)} markets successfully with Mamba")
elif lstm_valid:
    print("\n⚠ Only LSTM results available (Mamba training failed or skipped)")
    print(f"Trained {len(lstm_valid)} markets successfully with LSTM")
else:
    print("\n⚠ No markets were successfully trained")


COMPARISON: Mamba vs LSTM
Market       |  Mamba MAE |   LSTM MAE |  Mamba Sign |   LSTM Sign |  Mamba PnL |   LSTM PnL |   Winner
------------------------------------------------------------------------------------------------------------------------
253592       |   0.073317 |   0.078690 |       0.509 |       0.504 |    0.61050 |    0.15000 |    Mamba
253593       |   0.331463 |   0.364242 |       0.754 |       0.465 |    0.05450 |   -0.00250 |    Mamba
253594       |   0.064360 |   0.069023 |       0.529 |       0.536 |   -0.08250 |    0.08150 |     LSTM
253595       |   0.217240 |   0.220299 |       0.499 |       0.476 |   -0.00200 |   -0.01000 |    Mamba
253596       |   1.426464 |   1.458275 |       0.750 |       0.500 |    0.00200 |   -0.00100 |    Mamba
253598       |   0.611074 |   0.631006 |       0.796 |       0.490 |    0.01750 |    0.00000 |    Mamba
253609       |   0.115777 |   0.119860 |       0.501 |       0.495 |   -0.00700 |   -0.00500 |     LSTM
253610       |   0.6

## Interpretation

**Models:**
- **Mamba**: Selective state-space model (input-dependent state updates, no attention mechanism; faster, more parameter efficient)
- **LSTM**: Standard 2-layer LSTM baseline

**Metrics:**
- **MAE (Mean Absolute Error)**: Average absolute difference between predicted and actual logit changes (lower is better)
- **Sign Accuracy**: Percentage of times the model correctly predicts the direction (up/down)
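- **PnL (Profit & Loss)**: One-step mark-to-market profit in price units, trading only when the predicted price move reaches the threshold that maximized `sum_pnl` on the validation set; trading costs are set to zero in this notebook (`cost_per_trade=0.0`)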

**Good performance:**
- Sign accuracy > 0.55 (better than random 0.50)
- Positive sum_pnl with reasonable trade_rate (0.3-0.7)
- hit_rate_on_trades > 0.55

**What to look for:**
- If Mamba consistently wins: State-space modeling captures price dynamics better than LSTM
- If LSTM wins: The LSTM's gated recurrent memory may capture the sequential dependencies in this data better than Mamba's selective state updates
- Similar performance: Both architectures adequate, consider simpler model for production

**PnL interpretation:**
- `sum_pnl > 0`: Model has directional edge
- `hit_rate_on_trades > 0.55`: Better than random coin flip
- `trade_rate`: Fraction of samples where |predicted_dp| >= threshold (`thr_dp`), i.e. where a position is taken (too low = undertrading, too high = overtrading)