# 📘 Multimodal Option Price Prediction Workbook

This notebook implements the full pipeline for multimodal option price prediction, including data ingestion, feature engineering, model training (with hyperparameter search), and evaluation. It follows a modular, professional, and academically-aligned coding structure and served as the main development and experimentation environment for this project.

---

### ⚠️ **Note on Data Access**

This notebook references input files stored in a private Google Drive directory. These files include licensed datasets (from **OptionMetrics IvyDB USA**) that **cannot be redistributed or publicly shared** due to legal and licensing restrictions, as clarified by the library. As a result, this notebook will not execute end-to-end without access to those files. To comply with the licensing terms, the data is kept private and is **not included in this repository**. Only derived, non-reversible processed data (such as `inference_eval.csv`) is made available for safe model evaluation.

However, the code is still included for **transparency and inspection**.

### **Key Design Principles**

1. **Centralized configuration**: All hyperparameters and constants are defined in a `Config` class.
2. **Modular functions**: Data processing and modeling logic are encapsulated in reusable, testable functions.
3. **Unified logging**: Consistent use of logging throughout, with optional progress bars.
4. **Structured layout**: Section headers follow the IMRaD convention (Introduction → Methods → Results → Discussion) for clarity and organization.

> 💡 **To reproduce final results**, please run the streamlined evaluation notebook: `Inference_Demo.ipynb`.  
> This notebook is intended for experimentation and **not optimized for CPU-only environments**.


## 0  Environment & Dependencies

In [None]:
!pip install -qU fredapi yfinance pytrends pandas requests beautifulsoup4 matplotlib seaborn transformers sentencepiece tqdm optuna ray[tune] torch torchvision

In [None]:
# ==== Google Drive mount (Colab) ===========================================
# Run this once per session; it will prompt you to grant Colab access.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)   # force_remount avoids "already mounted" errors

# (optional) quick sanity-check that the data files are visible
!ls -lh /content/drive/MyDrive | grep -E 'spx_price|vix_price|spx_options' || echo "Files not found – double-check paths"


## 1  Imports & Global Configuration

In [None]:
import os
import gc
import logging
import time
from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Dict, Tuple
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import yfinance as yf
from fredapi import Fred

import optuna
from tqdm.auto import tqdm
from ray import tune, train

# ---------- Configuration ----------
@dataclass
class Config:
    """Central configuration: paths, credentials, date range, hardware, features.

    The derived path defaults (``FIN_NEWS_PATH`` and friends) are computed from
    the class-level ``DRIVE_ROOT`` once, when the class body executes. Passing
    a different ``DRIVE_ROOT`` to the constructor does NOT re-derive them.
    """

    # Paths
    DRIVE_ROOT: Path = Path('/content/drive/MyDrive')
    FIN_NEWS_PATH: Path = DRIVE_ROOT / 'FinSen_US_Categorized_Timestamp.csv'
    OPTIONS_CSV_PATH: Path = DRIVE_ROOT / 'spx_options.csv'
    SPX_CSV_PATH: Path = DRIVE_ROOT / 'spx_price.csv'
    VIX_CSV_PATH: Path = DRIVE_ROOT / 'vix_price.csv'
    OUTPUT_DIR: Path = Path('/content/option_price_plots')
    # Was a bare str despite the Path annotation; now a real Path.
    SAVED_MODEL_DIR: Path = Path('trained_models')

    # FRED
    # The FRED_API_KEY environment variable takes precedence. The literal is
    # kept only as a backward-compatible fallback.
    # NOTE(review): a key committed to source should be treated as leaked —
    # rotate it and drop the fallback.
    FRED_API_KEY: str = os.environ.get('FRED_API_KEY',
                                       'b73d6a590f43c3d04bde3404b960c821')

    # Time‑window
    START_DATE: str = '2018-01-01'
    END_DATE:   str = '2023-12-31'
    WINDOW: int = 10

    # Hardware (resolved once, when the class body is executed)
    DEVICE: str = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Features
    MACRO_COLS:  List[str] = field(default_factory=lambda: [
        '10y_treasury'
    ])
    SENTIMENT_COLS: List[str] = field(default_factory=lambda: ['sentiment_score', 'market_sentiment'])
    OPTION_COLS: List[str] = field(default_factory=lambda: [
        'vix_price', 'spx_price', 'impl_volatility',
        'moneyness', 'open_interest', 'greeks_signal_1'
    ])

    # Use cache? (for faster running time, set False for Black Scholes check)
    # Annotated so it is a real dataclass field (settable per instance), and
    # declared last so the positional order of the earlier fields is unchanged.
    USE_CACHED: bool = True

# One shared configuration instance, read by the code below.
CFG = Config()
# Make sure the plot output directory exists before anything writes to it.
Path(CFG.OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# ---------- Logging ----------
_LOG_FORMAT = '[%(asctime)s] %(levelname)s — %(message)s'
_LOG_DATEFMT = '%H:%M:%S'
logging.basicConfig(format=_LOG_FORMAT, datefmt=_LOG_DATEFMT, level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info('Using device → %s', CFG.DEVICE)


## 2  Utility Functions

In [None]:
def scale_series(series: pd.DataFrame) -> np.ndarray:
    """Fit a fresh ``MinMaxScaler`` on *series* and return the scaled values.

    The scaler is fitted on exactly the data passed in and is not returned,
    so the transform cannot be re-applied to other data later.
    """
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(series)
    return scaled

def forward_pass(model: nn.Module, batch):
    """Run *model* on one batch and return ``(preds, y)`` on ``CFG.DEVICE``.

    Two batch layouts are accepted:

    * 2 items ``(x, y)`` — the model is called as ``model(x)``;
    * 4 items ``(m, s, o, y)`` — the model is called as ``model(m, s, o)``.

    Raises:
        ValueError: if the batch has any other number of items.
    """
    n_items = len(batch)
    if n_items not in (2, 4):
        raise ValueError(f'Unexpected batch size {n_items}')

    # Every tensor, target included, is moved to the device exactly once.
    *inputs, y = (t.to(CFG.DEVICE) for t in batch)
    preds = model(*inputs)

    # Only drop a trailing axis when one exists. The models in this file
    # already squeeze inside ``forward``; squeezing again would collapse a
    # one-sample batch of shape (1,) to a 0-d tensor that no longer matches
    # ``y`` and cannot be iterated when results are collected.
    if preds.dim() > 1:
        preds = preds.squeeze(-1)
    return preds, y

def split_dataset(df, window, train_frac=0.7, val_frac=0.15):
    """Split *df* chronologically into train / validation / test frames.

    The split points are computed on ``len(df) - window`` rows. The validation
    and test frames each start ``window`` rows before their split point, so
    they carry the leading context rows for their first window.

    Args:
        df: frame ordered by time (rows are sliced positionally).
        window: look-back length in rows.
        train_frac: fraction of the usable rows assigned to training.
        val_frac: fraction of the usable rows assigned to validation.

    Returns:
        ``(train_df, val_df, test_df)``.
    """
    # Clamp so a frame shorter than the window yields empty splits, not
    # negative indices.
    total = max(0, len(df) - window)
    t_idx = int(train_frac * total)
    v_idx = int((train_frac + val_frac) * total)

    # A negative start would make ``iloc`` slice from the END of the frame.
    # When there are not enough earlier rows, start at the beginning instead.
    val_start = max(0, t_idx - window)
    test_start = max(0, v_idx - window)

    train_df = df.iloc[:t_idx]
    val_df = df.iloc[val_start:v_idx]
    test_df = df.iloc[test_start:]

    return train_df, val_df, test_df

def scale_train_test(train_df, test_df, cols):
    """Standardise *cols* with statistics fitted on the training frame only.

    Both inputs are copied, so the callers' frames are left untouched.

    Returns:
        ``(train_scaled, test_scaled, scaler)`` — the fitted scaler is
        returned so the transform can be inverted or reused later.
    """
    scaler = StandardScaler().fit(train_df[cols])

    train_scaled, test_scaled = train_df.copy(), test_df.copy()
    train_scaled[cols] = scaler.transform(train_df[cols])
    test_scaled[cols] = scaler.transform(test_df[cols])
    return train_scaled, test_scaled, scaler

def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs with *seed*.

    Also sets cuDNN to deterministic mode and turns off its benchmark mode.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

## 3  Data Acquisition & Feature Engineering

In [None]:
# --- 3.1 Macro‑economic time‑series ---
def fetch_fast_macro_data(start: str, end: str) -> pd.DataFrame:
    """Download three FRED series and return them on a daily, forward-filled grid.

    Columns: ``10y_treasury`` (DGS10), ``2y_treasury`` (DGS2),
    ``high_yield_spread`` (BAMLH0A0HYM2EY) and the derived
    ``yield_curve_slope`` (10y minus 2y).
    """
    client = Fred(api_key=CFG.FRED_API_KEY)
    series_ids = {
        '10y_treasury': 'DGS10',
        '2y_treasury': 'DGS2',
        'high_yield_spread': 'BAMLH0A0HYM2EY',
    }
    frame = pd.DataFrame({
        column: client.get_series(series_id, start, end)
        for column, series_id in series_ids.items()
    })
    frame.index = pd.to_datetime(frame.index)
    frame['yield_curve_slope'] = frame['10y_treasury'] - frame['2y_treasury']

    daily = frame.resample('D').ffill()
    return daily

# --- 3.3 FinBERT pipeline (news sentiment) ---
# Tokenizer and classifier are loaded from the same checkpoint; the classifier
# is moved to CFG.DEVICE before being wrapped in a pipeline.
_FINBERT_CHECKPOINT = 'ProsusAI/finbert'
tokenizer = AutoTokenizer.from_pretrained(_FINBERT_CHECKPOINT)
finbert = AutoModelForSequenceClassification.from_pretrained(_FINBERT_CHECKPOINT)
finbert = finbert.to(CFG.DEVICE)
sentiment_pipeline = pipeline(
    'sentiment-analysis',
    model=finbert,
    tokenizer=tokenizer,
    device=0 if CFG.DEVICE == 'cuda' else -1,  # pipeline device index: 0 on CUDA, -1 otherwise
    batch_size=32,
)

def fetch_news_sentiment(csv_path: Path, start: str, end: str) -> pd.DataFrame:
    """Score news headlines with ``sentiment_pipeline`` and aggregate per day.

    Args:
        csv_path: CSV with at least a ``Time`` column (parsed day-first) and a
            title column; column names are stripped and lower-cased on load.
        start: inclusive lower bound on the headline timestamp.
        end: inclusive upper bound on the headline timestamp.

    Returns:
        DataFrame indexed by calendar day with one column, ``sentiment_score``:
        mean positive minus (mean negative + 0.5 * mean neutral). It is empty
        when no headline falls inside the window.
    """
    raw = pd.read_csv(csv_path, parse_dates=['Time'], dayfirst=True)
    raw.columns = [c.strip().lower() for c in raw.columns]

    in_window = (raw['time'] >= start) & (raw['time'] <= end)
    # reset_index gives an independent frame (no chained-assignment warning)
    # whose row positions line up with the order of the pipeline outputs.
    raw = raw[in_window].dropna(subset=['title']).reset_index(drop=True)
    if raw.empty:
        return pd.DataFrame(columns=['sentiment_score'],
                            index=pd.DatetimeIndex([], name='time'),
                            dtype=float)

    raw['title'] = raw['title'].str[:512]  # cap headline length (characters)

    titles = raw['title'].tolist()
    scores = []
    for i in range(0, len(titles), 32):
        scores.extend(sentiment_pipeline(titles[i:i + 32]))

    # Map back. Each headline yields one (label, score) pair; the score goes to
    # the matching column and any other label leaves both columns at zero.
    probs = pd.DataFrame(scores)
    mapping = {'positive': 'positive', 'LABEL_1': 'positive',
               'negative': 'negative', 'LABEL_0': 'negative'}
    label = probs['label'].map(mapping)
    # Assign positionally via to_numpy(): `probs` carries a fresh 0..n-1 index,
    # so index-aligned assignment onto a filtered frame would scatter values
    # onto the wrong rows or produce NaN.
    raw['positive'] = ((label == 'positive') * probs['score']).to_numpy()
    raw['negative'] = ((label == 'negative') * probs['score']).to_numpy()
    raw['neutral'] = 1 - (raw['positive'] + raw['negative'])

    daily = raw.groupby(raw['time'].dt.floor('D'))[['positive', 'negative', 'neutral']].mean()
    daily['sentiment_score'] = daily['positive'] - (daily['negative'] + 0.5 * daily['neutral'])
    return daily[['sentiment_score']]

# --- 3.4 Market‑sentiment (VIX & SPX) ---
def calculate_market_sentiment(market: pd.DataFrame) -> pd.DataFrame:
    """Combine VIX level and SPX daily return into one score in [-1, 1].

    Both components are min-max scaled over the whole frame passed in. VIX is
    inverted (a high VIX gives a low score), the two are averaged, and the
    average is rescaled to ``(-1, 1)``. The result is resampled to daily
    frequency with forward then backward fill.

    NOTE(review): the scalers are fitted on the full input range, so the value
    for any day depends on later data — confirm this is acceptable for the
    train/test protocol.
    """
    frame = market.copy().rename(columns={'vix_price': 'vix', 'spx_price': 'spx'})

    spx_returns = frame['spx'].pct_change().fillna(0).to_frame()
    frame['vix_s'] = 1 - scale_series(frame[['vix']])
    frame['spx_s'] = scale_series(spx_returns)

    composite = (frame['vix_s'] + frame['spx_s']) / 2
    rescaler = MinMaxScaler(feature_range=(-1, 1))
    frame['market_sentiment'] = rescaler.fit_transform(composite.values.reshape(-1, 1))

    daily = frame[['market_sentiment']].resample('D').ffill().bfill()
    return daily

# --- 3.5 Options processing (ATM calls) ---
def process_options(path: Path, price_df: pd.DataFrame) -> pd.DataFrame:
    """Build daily features from near-the-money, short-dated call options.

    Steps: keep calls (``cp_flag == 'C'``), divide strikes by 1000, attach the
    same-day ``spx_price`` from *price_df*, keep contracts with |delta| in
    [0.4, 0.6] and 10–30 days to expiry, then average each feature per date.
    The per-date mean mid price becomes ``target_option_price`` and is shifted
    one row up, so each row's target is the next available date's mean mid
    price. Rows containing any NaN (including the last date) are dropped.
    """
    chain = pd.read_csv(path, parse_dates=['date', 'exdate'])
    calls = chain.loc[chain['cp_flag'] == 'C'].copy()
    calls['days_to_expiry'] = (calls['exdate'] - calls['date']).dt.days
    calls['mid_price'] = (calls['best_bid'] + calls['best_offer']) / 2
    calls['strike_price'] = calls['strike_price'] / 1000

    spot = price_df[['spx_price']]
    joined = calls.merge(spot, left_on='date', right_index=True, how='left')
    joined['moneyness'] = joined['spx_price'] / joined['strike_price']

    near_the_money = joined['delta'].abs().between(0.4, 0.6)
    short_dated = joined['days_to_expiry'].between(10, 30)
    selected = joined[near_the_money & short_dated]

    averaged_columns = [
        'impl_volatility', 'delta', 'gamma',
        'vega', 'theta', 'open_interest',
        'volume', 'mid_price', 'days_to_expiry',
        'moneyness',
    ]
    daily = selected.groupby('date').agg(dict.fromkeys(averaged_columns, 'mean'))
    daily = daily.rename(columns={'mid_price': 'target_option_price'})

    # Next-date target: row t is paired with the mean mid price of row t+1.
    daily['target_option_price'] = daily['target_option_price'].shift(-1)
    return daily.dropna()


In [None]:
# ---------- 3.6 Run data pipeline ----------
# The flag lives on the configuration object; the bare name `USE_CACHED` was
# never defined at module level and raised NameError.
if not CFG.USE_CACHED:
    logger.info('Fetching data…')

    macro_df   = fetch_fast_macro_data(CFG.START_DATE, CFG.END_DATE)

    # NOTE(review): with `names=` supplied, the line selected by `header=3` is
    # consumed as the header row and replaced. Confirm it is not the first
    # data row of these CSVs.
    spx = pd.read_csv(CFG.SPX_CSV_PATH, header=3,
                      names=['Date','spx_price','High','Low','Open','Volume'],
                      parse_dates=['Date']).set_index('Date')
    vix = pd.read_csv(CFG.VIX_CSV_PATH, header=3,
                      names=['Date','vix_price','High','Low','Open','Volume'],
                      parse_dates=['Date']).set_index('Date')

    # Daily grid of SPX and VIX closes; non-trading days are forward-filled.
    market_df  = spx[['spx_price']].join(vix[['vix_price']], how='outer').loc[CFG.START_DATE:CFG.END_DATE].resample('D').ffill()

    news_sent  = fetch_news_sentiment(CFG.FIN_NEWS_PATH, CFG.START_DATE, CFG.END_DATE)
    market_sent= calculate_market_sentiment(market_df)
    options_df = process_options(CFG.OPTIONS_CSV_PATH, market_df)

    # Composite DF: the inner join with the options frame restricts the rows
    # to dates that have option features.
    data_base = (macro_df
                .join(market_df, how='left')
                .join(news_sent, how='left')
                .join(market_sent, how='left')
                .join(options_df, how='inner')
                .ffill().bfill().dropna())

    # Greeks PCA: collapse the four Greeks into a single component.
    # NOTE(review): the PCA is fitted on the full date range, before any
    # train/test split — confirm this is intended.
    greeks = ['delta','gamma','vega','theta']
    pca = PCA(n_components=1)
    data_base['greeks_signal_1'] = pca.fit_transform(data_base[greeks])

else:
    # Cached path: load the previously processed frame from Drive.
    data_base = pd.read_csv(CFG.DRIVE_ROOT / 'processed_data.csv', index_col=0, parse_dates=True)


In [None]:
# Columns to keep: the model feature groups in macro → sentiment → options
# order, followed by the regression target.
# NOTE(review): the original comment says these are the scaled features and
# target; `test_full` is built outside this cell — confirm it is scaled.
inference_cols = [
    *CFG.MACRO_COLS,
    *CFG.SENTIMENT_COLS,
    *CFG.OPTION_COLS,
    'target_option_price',
]

# Save to inference_eval.csv (index=False, so the row index is not written)
inference_frame = test_full[inference_cols]
inference_frame.to_csv('inference_eval.csv', index=False)
print("✅ Saved safe inference_eval.csv")


## 4  PyTorch Dataset Construction

In [None]:
class UnifiedSequenceDataset(Dataset):
    """Sliding-window dataset with flat, early-fusion or multimodal outputs.

    Sample ``i`` uses rows ``i .. i+window-1`` as features and the
    ``target_option_price`` of row ``i+window`` as the target, giving
    ``len(df) - window`` samples.

    Modes:
        * ``'flat'`` — ``(x, y)`` with ``x`` flattened to ``window * n_cols``;
        * ``'early_fusion'`` — ``(x, y)`` with ``x`` of shape ``(window, n_cols)``;
        * ``'multimodal'`` — ``(macro, sentiment, options, y)``, one
          ``(window, n_group_cols)`` array per modality.

    Args:
        df: frame holding the feature columns and ``target_option_price``.
        window: look-back length in rows.
        mode: one of the modes above (case-insensitive).
        feature_groups: subset of ``'macro'``, ``'sentiment'``, ``'options'``
            selecting which ``CFG`` column lists are used.

    Raises:
        ValueError: if *mode* is not supported. This is now checked up front
            rather than only while iterating windows, so a bad mode is no
            longer hidden when the frame is shorter than the window.
        KeyError: if a feature group or a required column is missing.
    """

    _MODES = ('flat', 'early_fusion', 'multimodal')

    def __init__(self, df: pd.DataFrame, window: int, mode: str,
                 feature_groups: List[str]):
        self.window = window
        self.mode   = mode.lower()
        if self.mode not in self._MODES:
            raise ValueError(f'Unsupported mode {self.mode}')

        fmap = {'macro': CFG.MACRO_COLS,
                'sentiment': CFG.SENTIMENT_COLS,
                'options': CFG.OPTION_COLS}
        self.cols = sum([fmap[g] for g in feature_groups], [])
        self.modal_cols = {k: [c for c in self.cols if c in v] for k, v in fmap.items()}

        n_samples = max(0, len(df) - window)
        # Convert to NumPy once. Slicing the DataFrame and re-selecting columns
        # for every window is loop-invariant work and dominates build time.
        # copy=True detaches the targets from later mutation of `df`.
        targets = df['target_option_price'].to_numpy(copy=True)

        if self.mode == 'multimodal':
            blocks = [df[self.modal_cols[k]].to_numpy()
                      for k in ('macro', 'sentiment', 'options')]
            self.rows = [
                (*(b[i:i + window] for b in blocks), targets[i + window])
                for i in range(n_samples)
            ]
        else:
            feats = df[self.cols].to_numpy()
            if self.mode == 'flat':
                # flatten() is row-major: all columns of step 0, then step 1, …
                self.rows = [(feats[i:i + window].flatten(), targets[i + window])
                             for i in range(n_samples)]
            else:  # 'early_fusion'
                self.rows = [(feats[i:i + window], targets[i + window])
                             for i in range(n_samples)]

    def __len__(self): return len(self.rows)

    def __getitem__(self, idx):
        # Every element (features and target) becomes a float32 tensor; the
        # tuple has 2 items in flat/early-fusion mode and 4 in multimodal mode.
        return tuple(torch.tensor(part, dtype=torch.float32)
                     for part in self.rows[idx])


## 5  Model Architectures

In [None]:
# ---- Helper to build MLP ----
def build_mlp(in_dim, hidden, drop):
    """Assemble a ``Linear → ReLU [→ Dropout]`` stack ending in one output unit.

    ``hidden`` and ``drop`` are walked in lockstep: one hidden width and one
    dropout probability per layer. A Dropout module is added only when its
    probability is positive. If the two sequences differ in length the extra
    entries of the longer one are ignored.
    """
    stack = []
    width = in_dim
    for units, p_drop in zip(hidden, drop):
        stack.append(nn.Linear(width, units))
        stack.append(nn.ReLU())
        if p_drop > 0:
            stack.append(nn.Dropout(p_drop))
        width = units
    stack.append(nn.Linear(width, 1))
    return nn.Sequential(*stack)

# ---- Sub‑modules ----
class LSTMSubNet(nn.Module):
    """Sequence encoder: LSTM, then a linear layer on the final hidden state.

    Maps ``(batch, time, in_size)`` to ``(batch, h_size)``. Inter-layer LSTM
    dropout is applied only when there is more than one layer.
    """

    def __init__(self, in_size, h_size, n_layers, dropout):
        super().__init__()
        inter_layer_dropout = dropout if n_layers > 1 else 0.
        self.lstm = nn.LSTM(
            in_size,
            h_size,
            num_layers=n_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.fc = nn.Linear(h_size, h_size)

    def forward(self, x):
        _, (hidden_state, _) = self.lstm(x)
        top_layer = hidden_state[-1]  # final hidden state of the last layer
        return self.fc(top_layer)

# ---- 1) Feed‑forward (options only) ----
class OptionsOnlyFNN(nn.Module):
    """Feed-forward regressor over a flattened feature window.

    The network comes from ``build_mlp``; the trailing singleton output axis
    is squeezed away in ``forward``.
    """

    def __init__(self, in_dim, hidden, drop):
        super().__init__()
        self.net = build_mlp(in_dim, hidden, drop)

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(-1)

# ---- 2) Feed‑forward (all features) ----
class EarlyFusionFNN(nn.Module):
    """Feed-forward regressor over a flattened window of concatenated features.

    Same structure as ``OptionsOnlyFNN``; only the input width passed by the
    caller differs.
    """

    def __init__(self, in_dim, hidden, drop):
        super().__init__()
        self.net = build_mlp(in_dim, hidden, drop)

    def forward(self, x):
        out = self.net(x)
        return out.squeeze(-1)

# ---- 3) LSTM (options only) ----
class LSTMOptionsOnly(nn.Module):
    """LSTM regressor: final hidden state → BatchNorm → ReLU → Dropout → Linear.

    Input is ``(batch, time, in_size)``; output is ``(batch,)``.
    """

    def __init__(self, in_size, h, layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            in_size,
            h,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.,  # inter-layer dropout needs >1 layer
        )
        head = [
            nn.BatchNorm1d(h),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(h, 1),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        _, (hidden_state, _) = self.lstm(x)
        scores = self.fc(hidden_state[-1])
        return scores.squeeze(-1)

# ---- 4) Bidirectional LSTM (early fusion) ----
class LSTMEarlyFusion(nn.Module):
    """Bidirectional LSTM regressor.

    The last two entries of the final hidden-state tensor are concatenated and
    passed through a two-layer head. Input is ``(batch, time, in_size)``;
    output is ``(batch,)``.
    """

    def __init__(self, in_size, h, layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            in_size,
            h,
            num_layers=layers,
            batch_first=True,
            dropout=dropout if layers > 1 else 0.,
            bidirectional=True,
        )
        head = [nn.Linear(h * 2, h), nn.ReLU(), nn.Linear(h, 1)]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        _, (hidden_state, _) = self.lstm(x)
        second_last, last = hidden_state[-2], hidden_state[-1]
        merged = torch.cat((second_last, last), dim=1)
        return self.fc(merged).squeeze(-1)

# ---- 5) Multimodal LSTM ----
class LSTMMultimodal(nn.Module):
    """Late-fusion model: one ``LSTMSubNet`` per modality, then an MLP.

    ``sizes`` holds the input widths for the macro, sentiment and options
    streams, in that order. The three encodings are concatenated and reduced
    to one value per sample.
    """

    def __init__(self, sizes, h, layers, drop):
        super().__init__()
        self.mac = LSTMSubNet(sizes[0], h, layers, drop)
        self.sent = LSTMSubNet(sizes[1], h, layers, drop)
        self.opt = LSTMSubNet(sizes[2], h, layers, drop)
        fusion_layers = [
            nn.Linear(h * 3, 128),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(128, h),
            nn.ReLU(),
            nn.Linear(h, 1),
        ]
        self.fuse = nn.Sequential(*fusion_layers)

    def forward(self, m, s, o):
        encoded = [self.mac(m), self.sent(s), self.opt(o)]
        fused = self.fuse(torch.cat(encoded, dim=1))
        return fused.squeeze(-1)

# ---- 6) Attention‑fusion LSTM ----
class AttentionFusionLSTM(nn.Module):
    """Fuse three modality encodings with single-head self-attention.

    Each modality is encoded by an ``LSTMSubNet``. The three encodings are
    stacked as a length-3 token sequence, offset by a learned per-token
    parameter, passed through self-attention with a residual connection and
    LayerNorm, mean-pooled over the tokens and mapped to one value per sample.
    """

    def __init__(self, sizes, h, layers, drop):
        super().__init__()
        self.mac = LSTMSubNet(sizes[0], h, layers, drop)
        self.sent = LSTMSubNet(sizes[1], h, layers, drop)
        self.opt = LSTMSubNet(sizes[2], h, layers, drop)

        # Learned offset added to the (macro, sentiment, options) tokens.
        self.pos = nn.Parameter(torch.randn(1, 3, h))
        self.attn = nn.MultiheadAttention(h, 1, batch_first=True)
        self.norm = nn.LayerNorm(h)

        head = [
            nn.Linear(h, h // 2),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(h // 2, 1),
        ]
        self.out = nn.Sequential(*head)

    def forward(self, m, s, o, return_attention=False):
        tokens = torch.stack([self.mac(m), self.sent(s), self.opt(o)], dim=1) + self.pos
        attended, attn_weights = self.attn(tokens, tokens, tokens, need_weights=True)
        pooled = self.norm(attended + tokens).mean(1)
        out = self.out(pooled).squeeze(-1)

        if not return_attention:
            return out
        # With the default head averaging, the weights are
        # (batch, query tokens=3, key tokens=3).
        return out, attn_weights


# ---- 7) Cross‑attention LSTM ----
class CrossAttentionLSTM(nn.Module):
    """Three per-modality LSTMs with macro/sentiment cross-attention.

    ``sizes`` gives the input widths for the macro, sentiment and options
    streams, in that order. Sentiment queries attend over the macro sequence
    and macro queries attend over the sentiment sequence. Those two outputs
    and the options sequence are mean-pooled over time, concatenated and fed
    to the fusion head. Output shape is ``(batch,)``.
    """

    def __init__(self, sizes, h, layers, drop):
        super().__init__()
        lstm_kwargs = dict(
            num_layers=layers,
            batch_first=True,
            dropout=drop if layers > 1 else 0.,
        )
        self.mac_lstm = nn.LSTM(sizes[0], h, **lstm_kwargs)
        self.sent_lstm = nn.LSTM(sizes[1], h, **lstm_kwargs)
        self.opt_lstm = nn.LSTM(sizes[2], h, **lstm_kwargs)

        # One attention block per direction (sentiment→macro, macro→sentiment).
        self.sent2mac_attn = nn.MultiheadAttention(h, 1, batch_first=True)
        self.mac2sent_attn = nn.MultiheadAttention(h, 1, batch_first=True)

        fusion_layers = [
            nn.LayerNorm(h * 3),  # input is [options, sent→mac, mac→sent]
            nn.Linear(h * 3, 128),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(128, 1),
        ]
        self.fuse = nn.Sequential(*fusion_layers)

    def forward(self, mac, sent, opt):
        # Per-step outputs of each encoder: (B, T, H)
        mac_seq = self.mac_lstm(mac)[0]
        sent_seq = self.sent_lstm(sent)[0]
        opt_seq = self.opt_lstm(opt)[0]

        # query=sentiment, key/value=macro — and the reverse direction
        sent2mac = self.sent2mac_attn(sent_seq, mac_seq, mac_seq)[0]
        mac2sent = self.mac2sent_attn(mac_seq, sent_seq, sent_seq)[0]

        # Summarise every sequence by its mean over the time axis.
        pooled = [seq.mean(1) for seq in (opt_seq, sent2mac, mac2sent)]
        return self.fuse(torch.cat(pooled, dim=1)).squeeze(-1)

## 6  Data Loaders & Model Registry

In [None]:
# Registry spec: model key → (dataset mode, feature groups, (train, val, test) frames).
# The feed-forward and early-fusion models read the *_full frames; the three
# multimodal models read the *_red frames.
_ALL_GROUPS = ['macro', 'sentiment', 'options']
_FULL_FRAMES = (train_full, val_full, test_full)
_RED_FRAMES = (train_red, val_red, test_red)

_DATASET_SPECS = {
    # Feedforward models (flat)
    'OptionsOnlyFNN':      ('flat',         ['options'], _FULL_FRAMES),
    'FNN_AllFeatures':     ('flat',         _ALL_GROUPS, _FULL_FRAMES),
    # LSTM models (early fusion)
    'LSTM_OptionsOnly':    ('early_fusion', ['options'], _FULL_FRAMES),
    'LSTM_EarlyFusion':    ('early_fusion', _ALL_GROUPS, _FULL_FRAMES),
    # LSTM models (multimodal)
    'Multimodal_LSTM':     ('multimodal',   _ALL_GROUPS, _RED_FRAMES),
    'LSTM_AttnFusion':     ('multimodal',   _ALL_GROUPS, _RED_FRAMES),
    'CrossAttention_LSTM': ('multimodal',   _ALL_GROUPS, _RED_FRAMES),
}

# Each key gets its own (train, val, test) triple of freshly built datasets.
datasets = {
    key: tuple(
        UnifiedSequenceDataset(frame, CFG.WINDOW, mode, list(groups))
        for frame in frames
    )
    for key, (mode, groups, frames) in _DATASET_SPECS.items()
}



# --- 6.4 Build loaders ---
# Keys containing 'Cross' or 'Early' use batch size 32, all others 64.
# Only the training loader shuffles.
loaders = {}
for key, (train_ds, val_ds, test_ds) in datasets.items():
    bs = 32 if any(tag in key for tag in ('Cross', 'Early')) else 64
    loaders[key] = (
        DataLoader(train_ds, batch_size=bs, shuffle=True),
        *(DataLoader(ds, batch_size=bs, shuffle=False) for ds in (val_ds, test_ds)),
    )

## 7  Training & Evaluation Utilities

In [None]:
# --- 7  Training & Evaluation Utilities  (updated) --------------------------

def train_model(model, train_loader, val_loader, epochs:int, lr:float,
                patience:int=8, restore_best:bool=True):
    """Train *model* with Adam + MSE, LR-on-plateau scheduling and early stopping.

    Args:
        model: network compatible with ``forward_pass``.
        train_loader: batches used for optimisation.
        val_loader: batches used for the validation loss.
        epochs: maximum number of epochs.
        lr: initial Adam learning rate.
        patience: stop after this many consecutive epochs without a new best
            validation loss.
        restore_best: when True (default), reload the weights from the epoch
            with the lowest validation loss before returning. Without this the
            model is left ``patience`` epochs past its best state.

    Returns:
        ``{'train': [...], 'val': [...]}`` — mean per-batch loss for each
        completed epoch, for inspecting over-fitting afterwards.

    Raises:
        ValueError: if either loader yields no batches (previously a bare
            ZeroDivisionError from the loss average).
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError('train_loader and val_loader must each yield at least one batch')

    optim     = torch.optim.Adam(model.parameters(), lr=lr)
    # NOTE(review): the scheduler patience (10) exceeds the default
    # early-stopping patience (8), so the LR is rarely if ever reduced before
    # training stops. Left as-is because the tuned hyper-parameters assume it.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                    optim, 'min', factor=.5, patience=10)
    loss_fn   = nn.MSELoss()
    history   = {'train': [], 'val': []}
    best, counter = float('inf'), 0
    best_state = None   # snapshot of the weights at the best validation loss

    for ep in range(1, epochs+1):
        # ---- training step ----
        model.train()
        running = 0
        for batch in train_loader:
            optim.zero_grad()
            pred, y = forward_pass(model, batch)
            loss = loss_fn(pred, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
            optim.step()
            running += loss.item()
        train_loss = running / len(train_loader)

        # ---- validation step ----
        model.eval()
        running = 0
        with torch.no_grad():
            for batch in val_loader:
                pred, y = forward_pass(model, batch)
                running += loss_fn(pred, y).item()
        val_loss = running / len(val_loader)
        history['train'].append(train_loss)
        history['val'  ].append(val_loss)

        logger.info(f'E{ep:02d}/{epochs} — train {train_loss:.5f} | '
                    f'val {val_loss:.5f}')
        scheduler.step(val_loss)

        # ----- early stop -----
        if val_loss < best:
            best, counter = val_loss, 0
            if restore_best:
                # Clone so later optimiser steps cannot modify the snapshot.
                best_state = {k: v.detach().clone()
                              for k, v in model.state_dict().items()}
        else:
            counter += 1
            if counter >= patience:
                logger.info('Early-stopping ✓')
                break

    if restore_best and best_state is not None:
        model.load_state_dict(best_state)

    torch.cuda.empty_cache()
    gc.collect()
    return history


def evaluate(model, loader) -> Dict[str,float]:
    """Predict over *loader* and score the predictions on the original scale.

    Predictions and targets are inverse-transformed with the module-level
    ``target_scaler`` before the metrics are computed.

    Returns:
        Dict with scalar ``'mae'``, ``'rmse'`` and ``'r2'``, plus the
        inverse-transformed arrays under ``'y'`` (targets) and ``'y_hat'``
        (predictions). The array entries are not floats despite the return
        annotation.

    Raises:
        ValueError: if *loader* yields no batches.
    """
    model.eval()
    pred_chunks, true_chunks = [], []
    with torch.no_grad():
        for batch in loader:
            p, y = forward_pass(model, batch)
            # atleast_1d: a single-sample batch can arrive as a 0-d array,
            # which cannot be iterated or concatenated.
            pred_chunks.append(np.atleast_1d(p.cpu().numpy()))
            true_chunks.append(np.atleast_1d(y.cpu().numpy()))

    if not pred_chunks:
        raise ValueError('evaluate() received a loader with no batches')

    preds = np.concatenate(pred_chunks).reshape(-1, 1)
    trues = np.concatenate(true_chunks).reshape(-1, 1)

    # Inverse-transform to original scale
    preds = target_scaler.inverse_transform(preds).flatten()
    trues = target_scaler.inverse_transform(trues).flatten()

    return {
        'mae':  mean_absolute_error(trues, preds),
        'rmse': np.sqrt(mean_squared_error(trues, preds)),
        'r2':   r2_score(trues, preds),
        'y': trues, 'y_hat': preds
    }



## 8  Experiment Loop

In [None]:
# Input widths per modality, taken from the configured column lists.
_N_MACRO = len(CFG.MACRO_COLS)
_N_SENT = len(CFG.SENTIMENT_COLS)
_N_OPT = len(CFG.OPTION_COLS)
_N_ALL = _N_MACRO + _N_SENT + _N_OPT
_MODAL_SIZES = [_N_MACRO, _N_SENT, _N_OPT]

# Registry: model key → class, constructor kwargs, learning rate, epoch budget.
model_cfg = {
    'OptionsOnlyFNN': {
        'cls': OptionsOnlyFNN,
        'args': {
            'in_dim': _N_OPT * CFG.WINDOW,
            'hidden': [448, 320],
            'drop': [0.2589736052138709, 0.2932864807416329],
        },
        'lr': 5.606599300905802e-05,
        'epochs': 200,  # slightly longer training, lower LR
    },

    'FNN_AllFeatures': {
        'cls': EarlyFusionFNN,
        'args': {
            'in_dim': _N_ALL * CFG.WINDOW,
            'hidden': [64, 192, 384, 512],
            'drop': [0.21690647686746356, 0.17333303298100486,
                     0.051245528433709775, 0.28461553354003577],
        },
        'lr': 3.619917167764799e-05,
        'epochs': 200,  # reduce overfit tendency
    },

    'LSTM_OptionsOnly': {
        'cls': LSTMOptionsOnly,
        'args': {'in_size': _N_OPT, 'h': 160, 'layers': 1,
                 'dropout': 0.2548163478893326},
        'lr': 0.0022592222144183815,
        'epochs': 100,
    },

    'LSTM_EarlyFusion': {
        'cls': LSTMEarlyFusion,
        'args': {'in_size': _N_ALL, 'h': 224, 'layers': 2,
                 'dropout': 0.22346200203706412},
        'lr': 0.0003977138694027377,
        'epochs': 60,
    },

    'Multimodal_LSTM': {
        'cls': LSTMMultimodal,
        'args': {'sizes': list(_MODAL_SIZES), 'h': 64, 'layers': 1,
                 'drop': 0.42764880293625906},
        'lr': 0.00012394645403086292,
        'epochs': 60,  # best-performing model: increase rep capacity
    },

    'LSTM_AttnFusion': {
        'cls': AttentionFusionLSTM,
        'args': {'sizes': list(_MODAL_SIZES), 'h': 96, 'layers': 1,
                 'drop': 0.22167129082131226},
        'lr': 0.00031653476644270535,
        'epochs': 60,
    },

    'CrossAttention_LSTM': {
        'cls': CrossAttentionLSTM,
        'args': {'sizes': list(_MODAL_SIZES), 'h': 128, 'layers': 3,
                 'drop': 0.4087067551985453},
        'lr': 0.000357889995198984,
        'epochs': 60,
    },
}

results = {}        # per-model metrics for the scoreboard
models_cache = {}   # trained models kept for later explainability / saving

for name, cfg in model_cfg.items():
    logger.info('\n▶ Training %s', name)
    model = cfg['cls'](**cfg['args']).to(CFG.DEVICE)
    train_loader, val_loader, test_loader = loaders[name]

    # ---- fit and keep the per-epoch loss curves -------------------------
    history = train_model(model, train_loader, val_loader,
                          cfg['epochs'], cfg['lr'])

    # ---- loss-curve figure (quick over-fitting check) -------------------
    plt.figure(figsize=(4, 3))
    plt.plot(history['train'], label='train')
    plt.plot(history['val'], label='val')
    plt.xlabel('epoch')
    plt.ylabel('MSE')
    plt.title(f'{name} loss curves')
    plt.legend()
    plt.tight_layout()

    # Warn when the final validation loss sits more than 5 % above its minimum.
    val_curve = history['val']
    if len(val_curve) > 3 and val_curve[-1] > min(val_curve) * 1.05:
        logger.warning('%s: validation loss is rising → possible over-fit', name)
    plt.savefig(CFG.OUTPUT_DIR / f'{name}_loss_curves.png')
    plt.close()

    # ---- test-set metrics, then persist the weights ---------------------
    res = evaluate(model, test_loader)
    results[name] = res
    models_cache[name] = model.cpu()  # moved to CPU before caching and saving
    save_path = Path(CFG.SAVED_MODEL_DIR) / f"{name}.pt"
    os.makedirs(CFG.SAVED_MODEL_DIR, exist_ok=True)
    torch.save(model.state_dict(), save_path)

    # ---- predicted-vs-actual scatter with the y = x reference line ------
    fig = plt.figure(figsize=(5, 5))
    plt.scatter(res['y'], res['y_hat'], alpha=.3)
    lim = max(res['y'].max(), res['y_hat'].max())
    plt.plot([0, lim], [0, lim], 'r--')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'{name} — Pred vs True')
    plt.grid(True)
    plt.tight_layout()
    fig.savefig(CFG.OUTPUT_DIR / f'{name}_pred_vs_actual.png')
    plt.close(fig)
    logger.info('Plots saved → %s', CFG.OUTPUT_DIR)

# ---- leaderboard ----------------------------------------------------------
print('\n=== Model Comparison ===')
for n, m in results.items():
    print(f"{n:<22} MAE {m['mae']:.4f}  RMSE {m['rmse']:.4f}  R² {m['r2']:.4f}")


## 9  Hyper‑Parameter Optimisation (Optuna)

In [None]:
def _build_trial_model(trial, model_name):
    """Sample architecture hyper-parameters for ``model_name`` and build it.

    Args:
        trial: Optuna trial used to draw the hyper-parameters.
        model_name: One of the architecture names handled by ``get_objective``.

    Returns:
        tuple: ``(model, lr)`` where ``model`` has already been moved to
        ``CFG.DEVICE`` and ``lr`` is the sampled learning rate.

    Raises:
        ValueError: If ``model_name`` is not a known architecture.
    """
    n_macro = len(CFG.MACRO_COLS)
    n_sentiment = len(CFG.SENTIMENT_COLS)
    n_options = len(CFG.OPTION_COLS)
    n_all = n_macro + n_sentiment + n_options

    # ---- feed-forward models: flattened window, per-layer width/dropout ----
    if model_name in ('OptionsOnlyFNN', 'FNN_AllFeatures'):
        uses_all_features = model_name == 'FNN_AllFeatures'
        input_dim = (n_all if uses_all_features else n_options) * CFG.WINDOW
        num_layers = trial.suggest_int('num_layers', 2, 4)
        hidden = [trial.suggest_int(f'h{i}', 64, 512, step=64) for i in range(num_layers)]
        drop = [trial.suggest_float(f'd{i}', 0.05, 0.3) for i in range(num_layers)]
        lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
        model_cls = EarlyFusionFNN if uses_all_features else OptionsOnlyFNN
        return model_cls(input_dim, hidden, drop).to(CFG.DEVICE), lr

    # ---- recurrent models: identical search space except the dropout range --
    # name -> (model class, first constructor argument, dropout bounds)
    lstm_specs = {
        'LSTM_OptionsOnly':    (LSTMOptionsOnly, n_options, (0.1, 0.4)),
        'LSTM_EarlyFusion':    (LSTMEarlyFusion, n_all, (0.2, 0.5)),
        'Multimodal_LSTM':     (LSTMMultimodal, [n_macro, n_sentiment, n_options], (0.2, 0.5)),
        'LSTM_AttnFusion':     (AttentionFusionLSTM, [n_macro, n_sentiment, n_options], (0.2, 0.5)),
        'CrossAttention_LSTM': (CrossAttentionLSTM, [n_macro, n_sentiment, n_options], (0.2, 0.5)),
    }
    if model_name not in lstm_specs:
        raise ValueError(f"Unknown model: {model_name}")

    model_cls, input_spec, (drop_low, drop_high) = lstm_specs[model_name]
    h = trial.suggest_int('hidden_size', 64, 256, step=32)
    layers = trial.suggest_int('layers', 1, 3)
    drop = trial.suggest_float('dropout', drop_low, drop_high)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    return model_cls(input_spec, h, layers, drop).to(CFG.DEVICE), lr


def get_objective(model_name):
    """Create the Optuna objective that tunes one model architecture.

    The returned callable samples a batch size and the model-specific
    hyper-parameters, trains the model, and returns its RMSE on the
    **validation** split. The test split is deliberately not touched here:
    choosing hyper-parameters on test data would leak it into model selection
    and make the final test metrics optimistically biased.

    Args:
        model_name: Architecture to tune (e.g. ``'OptionsOnlyFNN'``,
            ``'LSTM_EarlyFusion'``, ``'CrossAttention_LSTM'``).

    Returns:
        callable: ``objective(trial) -> float`` suitable for
        ``study.optimize`` with ``direction='minimize'``. A trial whose
        training or evaluation raises is logged and scored ``inf``.

    Raises:
        ValueError: If ``model_name`` is not a known architecture.
    """
    all_groups = ['macro', 'sentiment', 'options']
    # name -> (dataset mode, feature groups, training frame, validation frame)
    dataset_map = {
        'OptionsOnlyFNN':      ('flat', ['options'], train_full, val_full),
        'FNN_AllFeatures':     ('flat', all_groups, train_full, val_full),
        'LSTM_OptionsOnly':    ('early_fusion', ['options'], train_full, val_full),
        'LSTM_EarlyFusion':    ('early_fusion', all_groups, train_full, val_full),
        'Multimodal_LSTM':     ('multimodal', all_groups, train_red, val_red),
        'LSTM_AttnFusion':     ('multimodal', all_groups, train_red, val_red),
        'CrossAttention_LSTM': ('multimodal', all_groups, train_red, val_red),
    }
    # Validate once, up front, instead of failing with a KeyError inside a trial.
    if model_name not in dataset_map:
        raise ValueError(f"Unknown model: {model_name}")

    mode, groups, train_df_used, val_df_used = dataset_map[model_name]

    # The datasets do not depend on any sampled hyper-parameter, so they are
    # built once per model and shared by every trial; only the loaders
    # (which depend on the sampled batch size) are rebuilt per trial.
    train_ds = UnifiedSequenceDataset(train_df_used, CFG.WINDOW, mode, groups)
    val_ds = UnifiedSequenceDataset(val_df_used, CFG.WINDOW, mode, groups)

    def objective(trial):
        batch_size = trial.suggest_categorical('batch_size', [32, 64])
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

        model, lr = _build_trial_model(trial, model_name)

        # Best-effort: a failing configuration should not abort the whole
        # study, so it is logged with its traceback and given the worst score.
        try:
            train_model(model, train_loader, val_loader, epochs=100, lr=lr, patience=10)
            return evaluate(model, val_loader)['rmse']
        except Exception:
            logger.exception('Trial failed for %s', model_name)
            return float('inf')

    return objective


In [None]:
# Architectures to tune, in the order they are searched.
model_names = [
    'OptionsOnlyFNN', 'FNN_AllFeatures',
    'LSTM_OptionsOnly', 'LSTM_EarlyFusion',
    'Multimodal_LSTM', 'LSTM_AttnFusion', 'CrossAttention_LSTM',
]

# Finished Optuna studies, keyed by model name, kept for later inspection.
studies = {}

for model_name in model_names:
    print(f"\n🔍 Starting Optuna tuning for: {model_name}")

    # Minimise the value returned by the model-specific objective over 10 trials.
    study = optuna.create_study(direction='minimize')
    objective_fn = get_objective(model_name)
    study.optimize(objective_fn, n_trials=10)
    studies[model_name] = study

    # Report the best score and the configuration that produced it.
    print(f"\n✅ Best RMSE for {model_name}: {study.best_value:.4f}")
    print("Best hyperparameters:")
    for param_name, param_value in study.best_params.items():
        print(f"  {param_name}: {param_value}")


## 10  Black‑Scholes Benchmark

In [None]:
from scipy.stats import norm

# Rebuild the option-level frame used by the Black-Scholes benchmark directly
# from data_base: keep the raw option columns, then derive the pricing inputs.
#   strike_price : spot / moneyness (presumably moneyness is S/K — confirm upstream)
#   spx_price    : spot level, copied through unchanged
#   T            : days to expiry divided by 365
#   r            : 10y treasury divided by 100 (presumably percent -> decimal)
# Rows with any missing value are dropped at the end.
options_df = (
    data_base[['impl_volatility', 'moneyness', 'days_to_expiry', 'target_option_price']]
    .assign(
        strike_price=data_base['spx_price'] / data_base['moneyness'],
        spx_price=data_base['spx_price'],
        T=data_base['days_to_expiry'] / 365,
        r=data_base['10y_treasury'] / 100,
    )
    .dropna()
)

def black_scholes_call(S, K, T, r, sigma):
    """Price a European call with the Black-Scholes closed form.

    Accepts scalars or array-likes (broadcast against each other), so a whole
    column can be priced in one call as well as a single contract.

    Args:
        S: Spot price of the underlying.
        K: Strike price.
        T: Time to expiry (same time unit as ``r``'s compounding period).
        r: Continuously-compounded risk-free rate, as a decimal.
        sigma: Volatility, as a decimal.

    Returns:
        A NumPy scalar for scalar inputs, otherwise an ndarray of prices.
        Where ``T <= 0`` or ``sigma <= 0`` the closed form is undefined and
        the zero-volatility limit ``max(S - K * exp(-r * max(T, 0)), 0)`` is
        returned instead (plain intrinsic value ``max(S - K, 0)`` at expiry).
    """
    S, K, T, r, sigma = (np.asarray(x, dtype=float) for x in (S, K, T, r, sigma))

    degenerate = (T <= 0) | (sigma <= 0)

    # Substitute harmless placeholders where the formula is undefined so the
    # vectorised expression never divides by zero; those entries are replaced
    # by the limit value below.
    T_safe = np.where(degenerate, 1.0, T)
    sigma_safe = np.where(degenerate, 1.0, sigma)

    vol_sqrt_t = sigma_safe * np.sqrt(T_safe)
    d1 = (np.log(S / K) + (r + 0.5 * sigma_safe ** 2) * T_safe) / vol_sqrt_t
    d2 = d1 - vol_sqrt_t
    closed_form = S * norm.cdf(d1) - K * np.exp(-r * T_safe) * norm.cdf(d2)

    # sigma -> 0 limit of the formula: discounted intrinsic value. Clamping T
    # at zero makes expired contracts fall back to undiscounted intrinsic value.
    limit_value = np.maximum(S - K * np.exp(-r * np.maximum(T, 0.0)), 0.0)

    price = np.where(degenerate, limit_value, closed_form)
    # Indexing with () turns a 0-d result back into a scalar and is a no-op
    # for real arrays, keeping scalar-in / scalar-out behaviour.
    return price[()]

# options_df already carries spx_price, strike_price, T and r, so it is used
# as-is. The former re-join of market_df[['spx_price']] raised
# "columns overlap but no suffix specified" in DataFrame.join because that
# column already exists, and the re-derived columns were exact duplicates.
bs = options_df.dropna().copy()
if bs.empty:
    raise ValueError('No complete rows available for the Black‑Scholes benchmark')

# Price every contract with the closed-form formula, one row at a time.
bs['bs_pred_price'] = bs.apply(
    lambda row: black_scholes_call(row['spx_price'], row['strike_price'],
                                   row['T'], row['r'], row['impl_volatility']),
    axis=1,
)

# ---- error metrics against the observed option price ----------------------
actual = bs['target_option_price'].values
pred = bs['bs_pred_price'].values
print('Black‑Scholes — MAE %.4f  RMSE %.4f  R² %.4f' %
      (mean_absolute_error(actual, pred),
       np.sqrt(mean_squared_error(actual, pred)),
       r2_score(actual, pred)))

# ---- scatter plot: Black-Scholes price vs actual --------------------------
plt.figure(figsize=(6, 6))
plt.scatter(actual, pred, alpha=.4)
lim = max(actual.max(), pred.max())
plt.plot([0, lim], [0, lim], 'r--')  # y = x reference line
plt.xlabel('Actual')
plt.ylabel('BS Predicted')
plt.title('Black‑Scholes vs Actual')
plt.grid(True)
plt.show()
