In [None]:
# Small-data switch for all 6 notebook runs (01→06). Set once here; intended to apply to the full pipeline.
USE_SMALL_DATA = False  # True = small data (at most N_SAMPLES observations); False = full data
N_SAMPLES = 10       # Max observations when USE_SMALL_DATA is True (e.g. 10 for a quick test)
N_EPOCHS = 1       # Max training epochs when USE_SMALL_DATA is True (notebooks 02, 03, 04)
# Notebook 01: applied automatically below. Notebooks 02-04: epochs/n_epochs/num_epochs are set automatically.
# NOTE(review): this cell has no execution count (In [None]) — confirm it is run
# before any cell that reads these flags, otherwise the names are undefined.

# Multi-Head Attention Robustness: Final Models Demo

This notebook demonstrates how to train, evaluate, and compare the final models from the paper:
- **"Inherent Robustness of Multi-Head Attention in Cross-Sectional Asset Pricing: Theory and Empirical Evidence from Finance-Valid Adversarial Attacks"**

## Reproducibility

**Random seed is set to 42** for reproducible results. All models are evaluated in deterministic mode (dropout disabled). Results should be identical across runs.

## Data Splitting

This notebook uses the **same data splitting logic** as the evaluation script (`scripts/evaluate_adversarial_models.py`):
- **Training period**: 2005-01-01 to 2017-12-31
- **Validation period**: 2018-01-01 to 2019-12-31
- **Data preprocessing**: Matches the `CrossSectionalDataSplitter` class from the evaluation script

## Models Available

1. **Linear Baselines**: OLS, Ridge
2. **Tree-Based Baselines**: XGBoost
3. **Neural Baselines**: MLP
4. **Transformer Models**: Single-Head, Multi-Head, Multi-Head Diversity
5. **Adversarially Trained Models**: Models trained with A1, A2, A3, A4 attacks at various epsilons

## Workflow

1. Load data
2. **Train models from scratch** (OLS, Ridge, XGBoost, MLP, Single-Head, Multi-Head, Multi-Head Diversity)
3. **Adversarial training** for transformer models (A1-A4 attacks)
4. Compare standard vs adversarially trained models
5. Make predictions on validation set
6. Evaluate model performance (RMSE, R²)
7. Visualize predictions and training curves

In [1]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence library warnings so the cell outputs stay readable
warnings.filterwarnings('ignore')

# One shared seed for every random number generator used in this notebook
RANDOM_SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)
del _seed_fn  # keep the notebook namespace free of the loop helper

# On GPU, also seed all CUDA devices and force deterministic cuDNN kernels
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

print(f"Set random seed to {RANDOM_SEED} for reproducibility")

# Find repo root (multihead-attention-robustness); enables "from src.models..." 
# In Colab: mount Drive first so paths exist
# Best-effort Google Drive mount: a missing google.colab package simply means we
# are not running on Colab, so that case stays silent. Any other failure (e.g. the
# mount itself) is reported instead of being swallowed, but never aborts the cell,
# because the repo-root search below can still fall back to local paths.
try:
    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)
except ImportError:
    pass
except Exception as mount_error:
    print(
        f"Warning: Google Drive mount failed "
        f"({type(mount_error).__name__}: {mount_error}); continuing with local path search"
    )

def _find_repo_root():
    """Locate the repository root, identified by a ``src`` sub-directory.

    Probing order:
      1. Fixed Colab locations (two Drive spellings, then ``/content/repo_run``).
      2. ``multihead-attention-robustness`` under the Drive mount point, if mounted.
      3. The current working directory and up to nine of its ancestors.

    Returns:
        Path: the first location containing ``src``. If none is found, the parent
        of the working directory when that directory is named ``notebooks``,
        otherwise the working directory itself.
    """
    repo_name = "multihead-attention-robustness"
    drive_mount = Path("/content/drive")
    start_dir = Path.cwd().resolve()

    def _has_src(location):
        return (location / "src").exists()

    # 1. Well-known Colab locations
    fixed_locations = (
        drive_mount / "MyDrive" / repo_name,
        drive_mount / "My Drive" / repo_name,
        Path("/content/repo_run"),
    )
    found = next((loc for loc in fixed_locations if _has_src(loc)), None)
    if found is not None:
        return found

    # 2. Search under Drive root (MyDrive, My Drive, or root)
    if drive_mount.exists():
        for parent_dir in (drive_mount / "MyDrive", drive_mount / "My Drive", drive_mount):
            clone_dir = parent_dir / repo_name
            if clone_dir.exists() and _has_src(clone_dir):
                return clone_dir

    # 3. Walk upwards from the working directory (itself plus at most nine ancestors)
    for ancestor in [start_dir, *start_dir.parents][:10]:
        if _has_src(ancestor):
            return ancestor

    # Nothing found: fall back relative to the working directory
    if start_dir.name == "notebooks":
        return start_dir.parent
    return start_dir

repo_root = _find_repo_root()
# Put the repo root first on sys.path exactly once. Rebuilding the list in place
# (slice assignment keeps the same list object) makes this cell idempotent:
# re-running it no longer stacks a duplicate entry on every execution.
sys.path[:] = [str(repo_root)] + [entry for entry in sys.path if entry != str(repo_root)]

# Import model definitions (resolved through the repo root added above)
from src.models.feature_token_transformer import FeatureTokenTransformer, SingleHeadTransformer

print(f"Working directory: {Path.cwd()}")
print(f"Repository root: {repo_root}")


Set random seed to 42 for reproducibility
Working directory: /content/drive/MyDrive/multihead-attention-robustness
Repository root: /content/drive/MyDrive/multihead-attention-robustness


## 1. Load Data

In [2]:
# Read the master cross-sectional table from the repository's data folder
data_path = repo_root.joinpath('data', 'cross_sectional', 'master_table.csv')
print(f"Loading data from: {data_path}")

df = pd.read_csv(data_path)
print(f"Data shape: {df.shape}")
print(f"Columns: {list(df.columns[:10])}...")

# Parse the dates and use them as the index so the splitter can slice by period
# (matching evaluation script)
if 'date' in df:
    df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
    print(f"\nDate range: {df.index.min()} to {df.index.max()}")

df.head()

Loading data from: /content/drive/MyDrive/multihead-attention-robustness/data/cross_sectional/master_table.csv


Data shape: (31534, 27)
Columns: ['date', 'symbol', 'mom_1m', 'mom_6m', 'mom_12m', 'mom_12_1m', 'vol_3m', 'vol_12m', 'price', 'log_price']...

Date range: 2005-01-31 00:00:00 to 2025-12-31 00:00:00


Unnamed: 0_level_0,symbol,mom_1m,mom_6m,mom_12m,mom_12_1m,vol_3m,vol_12m,price,log_price,returns_1d,...,pb_ratio,dividend_yield,eps,roe,profit_margin,revenue_per_share,market_cap,covid_period,ret_fwd_1m,mktcap
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-01-31,AAP,-0.007141,0.161412,0.1074,0.114541,0.177014,0.272406,24.240944,3.188043,0.012926,...,1.126753,2.42,-10.19,-0.23868,-0.04369,144.046,2474117000.0,0,0.168909,2474117000.0
2005-01-31,ABCB,-0.054833,0.176429,0.130269,0.185102,0.303383,0.286354,12.617585,2.535091,0.02006,...,1.307446,1.04,5.77,0.10345,0.35227,16.466,5272271000.0,0,0.05703,5272271000.0
2005-01-31,AEO,0.073315,0.543977,1.736371,1.663056,0.253181,0.316677,8.883611,2.184208,0.026263,...,2.83375,1.84,1.13,0.12362,0.03903,30.098,4605641000.0,0,0.065551,4605641000.0
2005-01-31,ALGN,-0.201843,-0.489387,-0.582651,-0.380807,0.446573,0.566262,8.66,2.158715,-0.011416,...,3.036553,,5.14,0.09577,0.09501,54.387,12091450000.0,0,-0.125866,12091450000.0
2005-01-31,AMAT,-0.066902,-0.067995,-0.266944,-0.200042,0.288008,0.323094,11.51773,2.443888,-0.008729,...,11.350218,0.62,8.67,0.35508,0.24669,35.284,232778900000.0,0,0.097484,232778900000.0


In [3]:
# Use the same CrossSectionalDataSplitter as the evaluation script
class CrossSectionalDataSplitter:
    """Simple date-based data splitter for cross-sectional data (matching evaluation script).

    The splitter slices a date-indexed table into a training and a validation
    window and derives a feature matrix / target vector from the numeric columns.
    """

    # Numeric columns that are never used as features nor auto-detected as target.
    _EXCLUDE_COLS = ('mktcap', 'market_cap', 'date', 'year', 'month', 'ticker', 'permno', 'gvkey')
    # Name fragments tried in order; the first column *containing* a fragment wins.
    _TARGET_HINTS = ('return', 'returns', 'ret', 'target', 'y', 'next_return', 'forward_return',
                     'ret_1', 'ret_1m', 'ret_12m', 'future_return', 'returns_1d')

    def __init__(self, train_start='2005-01-01', train_end='2017-12-31',
                 val_start='2018-01-01', val_end='2019-12-31'):
        """Store the inclusive date bounds of the training and validation windows."""
        self.train_start = train_start
        self.train_end = train_end
        self.val_start = val_start
        self.val_end = val_end

    def split(self, master_table):
        """Split data into train and validation sets.

        Args:
            master_table: DataFrame whose index holds dates (or values that
                ``pd.to_datetime`` can parse).

        Returns:
            dict with keys ``'train'`` and ``'val'`` holding the rows that fall
            inside the respective date window.
        """
        # Shallow copy: the column data is shared, but replacing the index below no
        # longer overwrites the index of the caller's DataFrame.
        table = master_table.copy(deep=False)
        table.index = pd.to_datetime(table.index)

        # Label slicing on a DatetimeIndex is only reliable for a sorted index; a
        # stable sort keeps the original row order within each date.
        if not table.index.is_monotonic_increasing:
            table = table.sort_index(kind='stable')

        return {
            'train': table.loc[self.train_start:self.train_end],
            'val': table.loc[self.val_start:self.val_end],
        }

    def prepare_features_labels(self, data, target_col=None):
        """Prepare features and labels from data (matching evaluation script logic).

        Only numeric columns are considered. When ``target_col`` is omitted the
        target is auto-detected by case-insensitive substring matching against
        the name fragments in ``_TARGET_HINTS`` (in order), skipping excluded
        columns; if nothing matches, the second-to-last non-excluded numeric
        column is used (or the only one).

        NOTE(review): matching is by substring, so the first hint ('return')
        already selects a column such as 'returns_1d', and the hint 'y' matches
        any column name containing that letter. With the master table shown in
        this notebook the detected target is 'returns_1d' while 'ret_fwd_1m'
        stays among the features — confirm this is intended, or pass
        ``target_col`` explicitly.

        Args:
            data: DataFrame for one split.
            target_col: optional name of the numeric column to use as target,
                bypassing auto-detection.

        Returns:
            tuple ``(X, y)``: feature DataFrame and target Series. Both are empty
            when ``data`` is empty or has no numeric columns.

        Raises:
            ValueError: if ``target_col`` is given but is not a numeric column.
        """
        if data.empty:
            return pd.DataFrame(), pd.Series(dtype=float)

        numeric_data = data.select_dtypes(include=[np.number])

        if numeric_data.empty:
            print("Warning: No numeric columns found in data")
            return pd.DataFrame(), pd.Series(dtype=float)

        # Lower-cased once instead of on every loop iteration.
        excluded = {name.lower() for name in self._EXCLUDE_COLS}
        candidates = [col for col in numeric_data.columns if col.lower() not in excluded]

        if target_col is None:
            target_col = self._detect_target(numeric_data.columns, candidates)
        elif target_col not in numeric_data.columns:
            raise ValueError(f"target_col {target_col!r} is not a numeric column of the data")

        feature_cols = [col for col in candidates if col != target_col]

        # Fallback 1: everything was excluded -> allow excluded columns as features.
        if not feature_cols:
            feature_cols = [col for col in numeric_data.columns if col != target_col]

        # Fallback 2: still nothing -> last numeric column is the target, the rest features.
        if not feature_cols:
            feature_cols = numeric_data.columns[:-1].tolist()
            target_col = numeric_data.columns[-1]

        X = numeric_data[feature_cols]
        y = numeric_data[target_col]

        return X, y

    def _detect_target(self, columns, candidates):
        """Pick the target column: first hint match, else a positional fallback."""
        for hint in self._TARGET_HINTS:
            match = next((col for col in candidates if hint.lower() in col.lower()), None)
            if match is not None:
                return match

        if candidates:
            return candidates[-2] if len(candidates) > 1 else candidates[-1]
        return columns[-1]

# Build the date-based splitter and cut the table into train / validation frames
splitter = CrossSectionalDataSplitter()
data_splits = splitter.split(df)
train_df, val_df = data_splits['train'], data_splits['val']

print(f"Train period: {splitter.train_start} to {splitter.train_end}")
print(f"Validation period: {splitter.val_start} to {splitter.val_end}")
print(f"Train set: {train_df.shape[0]} samples")
print(f"Validation set: {val_df.shape[0]} samples")

# Feature / label extraction uses the same logic as the evaluation script
X_train_df, y_train = splitter.prepare_features_labels(train_df)
X_val_df, y_val = splitter.prepare_features_labels(val_df)

print(f"\nNumber of features: {X_train_df.shape[1]}")
print(f"Target column: {y_train.name}")

# Replace NaNs with 0 and turn every split into a float32 numpy array.
# The right-hand side is fully evaluated before the names are rebound, so the
# label Series are still available while the generator runs.
X_train, y_train, X_val, y_val = (
    part.fillna(0).values.astype(np.float32)
    for part in (X_train_df, y_train, X_val_df, y_val)
)

# Standardize features: statistics are fitted on the training split only and
# then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

print(f"\nTrain features shape: {X_train_scaled.shape}")
print(f"Validation features shape: {X_val_scaled.shape}")
print(f"Feature columns: {list(X_train_df.columns[:5])}... ({len(X_train_df.columns)} total)")

Train period: 2005-01-01 to 2017-12-31
Validation period: 2018-01-01 to 2019-12-31
Train set: 18826 samples
Validation set: 3408 samples

Number of features: 22
Target column: returns_1d

Train features shape: (18826, 22)
Validation features shape: (3408, 22)
Feature columns: ['mom_1m', 'mom_6m', 'mom_12m', 'mom_12_1m', 'vol_3m']... (22 total)


## 2. Train Models

Train all models from scratch on the training data.