In [None]:
# Small-data turn-on/off for all 6 notebook runs (01→06). Set once; applies to full pipeline.
USE_SMALL_DATA = False  # True = small data (N_SAMPLES); False = full data
N_SAMPLES = 10       # Max observations when USE_SMALL_DATA (e.g. 10 for quick test)
N_EPOCHS = 1       # Max training epochs when USE_SMALL_DATA (02, 03, 04)
# 01: applied automatically below. 02-04: epochs/n_epochs/num_epochs set automatically.

In [None]:
# Imports and setup (needed when 02-06 run in separate kernel)
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Repo root for src imports
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
except Exception:
    pass
def _find_repo_root():
    cwd = Path.cwd().resolve()
    for p in [Path('/content/drive/MyDrive/multihead-attention-robustness'),
              Path('/content/drive/My Drive/multihead-attention-robustness'),
              Path('/content/repo_run')]:
        if (p / 'src').exists():
            return p
    drive_root = Path('/content/drive')
    if drive_root.exists():
        for base in [drive_root / 'MyDrive', drive_root / 'My Drive', drive_root]:
            p = base / 'multihead-attention-robustness'
            if p.exists() and (p / 'src').exists():
                return p
    p = cwd
    for _ in range(10):
        if (p / 'src').exists():
            return p
        if p.parent == p:
            break
        p = p.parent
    return cwd.parent if cwd.name == 'notebooks' else cwd
repo_root = _find_repo_root()
sys.path.insert(0, str(repo_root))
from src.models.feature_token_transformer import FeatureTokenTransformer, SingleHeadTransformer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
models = {}
training_history = {}
TRAINING_CONFIG = {
    'ols': {}, 'ridge': {'alpha': 1.0},
    'mlp': {'hidden_dims': [128, 64], 'learning_rate': 0.001, 'batch_size': 64, 'epochs': 100, 'patience': 10},
    'transformer': {'d_model': 72, 'num_heads': 8, 'num_layers': 2, 'd_ff': 512, 'dropout': 0.1,
                   'learning_rate': 0.0001, 'batch_size': 32, 'epochs': 100, 'patience': 20}
}


In [None]:
# Load fresh data from master_table.csv (standalone: each notebook pulls its own data)
data_path = repo_root / 'data' / 'cross_sectional' / 'master_table.csv'
df = pd.read_csv(data_path)
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')
class CrossSectionalDataSplitter:
    def __init__(self, train_start='2005-01-01', train_end='2017-12-31', val_start='2018-01-01', val_end='2019-12-31'):
        self.train_start, self.train_end = train_start, train_end
        self.val_start, self.val_end = val_start, val_end
    def split(self, master_table):
        master_table = master_table.copy()
        master_table.index = pd.to_datetime(master_table.index)
        return {'train': master_table.loc[self.train_start:self.train_end], 'val': master_table.loc[self.val_start:self.val_end]}
    def prepare_features_labels(self, data):
        if data.empty:
            return pd.DataFrame(), pd.Series()
        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.empty:
            return pd.DataFrame(), pd.Series()
        exclude_cols = ['mktcap', 'market_cap', 'date', 'year', 'month', 'ticker', 'permno', 'gvkey']
        target_cols = ['return', 'returns', 'ret', 'target', 'y', 'next_return', 'forward_return', 'ret_1', 'ret_1m', 'ret_12m', 'future_return', 'returns_1d']
        target_col = None
        for tc in target_cols:
            for col in numeric_data.columns:
                if tc.lower() in col.lower() and col.lower() not in [ec.lower() for ec in exclude_cols]:
                    target_col = col
                    break
            if target_col:
                break
        if target_col is None:
            potential = [c for c in numeric_data.columns if c.lower() not in [ec.lower() for ec in exclude_cols]]
            target_col = potential[-2] if len(potential) > 1 else (potential[-1] if potential else numeric_data.columns[-1])
        feature_cols = [c for c in numeric_data.columns if c != target_col and c.lower() not in [ec.lower() for ec in exclude_cols]]
        if not feature_cols:
            feature_cols = [c for c in numeric_data.columns if c != target_col]
        if not feature_cols:
            feature_cols = numeric_data.columns[:-1].tolist()
            target_col = numeric_data.columns[-1]
        return numeric_data[feature_cols], numeric_data[target_col]
splitter = CrossSectionalDataSplitter()
data_splits = splitter.split(df)
train_df, val_df = data_splits['train'], data_splits['val']
X_train_df, y_train = splitter.prepare_features_labels(train_df)
X_val_df, y_val = splitter.prepare_features_labels(val_df)
X_train = X_train_df.fillna(0).values.astype(np.float32)
y_train = y_train.fillna(0).values.astype(np.float32)
X_val = X_val_df.fillna(0).values.astype(np.float32)
y_val = y_val.fillna(0).values.astype(np.float32)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
print(f'Loaded fresh data: train {X_train_scaled.shape[0]}, val {X_val_scaled.shape[0]}')


In [None]:
epochs = n_epochs = num_epochs = 100  # full training (standalone mode)


In [13]:
# Adversarial Attack Implementations (A1-A4)
def apply_a1_attack(X, epsilon=0.01):
    """A1: Measurement Error - bounded perturbations."""
    noise = np.random.normal(0, epsilon, X.shape)
    # Scale noise by feature standard deviation
    feature_std = np.std(X, axis=0, keepdims=True) + 1e-8
    noise = noise * feature_std
    return X + noise


def apply_a2_attack(X, missing_rate=0.1):
    """A2: Missingness/Staleness - set random features to zero."""
    X_adv = X.copy()
    n_samples, n_features = X.shape
    n_missing = int(n_features * missing_rate)
    
    for i in range(n_samples):
        missing_indices = np.random.choice(n_features, n_missing, replace=False)
        X_adv[i, missing_indices] = 0.0
    
    return X_adv


def apply_a3_attack(X, epsilon=0.01):
    """A3: Rank Manipulation - cross-sectional perturbation preserving ranks."""
    X_adv = X.copy()
    n_samples = X.shape[0]
    
    # Add small random perturbation that preserves relative ordering
    for i in range(n_samples):
        perturbation = np.random.normal(0, epsilon, X.shape[1])
        # Scale by feature std to maintain relative magnitudes
        feature_std = np.std(X[i], axis=0) + 1e-8
        perturbation = perturbation * feature_std
        X_adv[i] = X[i] + perturbation
    
    return X_adv


def apply_a4_attack(X, epsilon=1.0):
    """A4: Regime Shift - distribution shift attack."""
    # A4 simulates regime shift by scaling volatility
    # epsilon acts as volatility multiplier
    X_adv = X.copy()
    feature_std = np.std(X, axis=0, keepdims=True) + 1e-8
    # Generate noise with std = epsilon, then scale by feature std
    noise = np.random.normal(0, epsilon, X.shape) * feature_std
    X_adv = X + noise
    return X_adv





In [None]:
# Bootstrap Confidence Intervals for R² and Robustness Metrics
from scipy import stats

def bootstrap_confidence_interval(data, n_bootstrap=1000, confidence=0.95, method='percentile'):
    """
    Compute bootstrap confidence interval for a metric.
    
    Parameters:
    -----------
    data : array-like
        Sample data
    n_bootstrap : int
        Number of bootstrap samples
    confidence : float
        Confidence level (e.g., 0.95 for 95% CI)
    method : str
        'percentile' or 'bca' (bias-corrected and accelerated)
    
    Returns:
    --------
    mean : float
        Mean of the data
    std : float
        Standard error (standard deviation)
    ci_lower : float
        Lower bound of confidence interval
    ci_upper : float
        Upper bound of confidence interval
    """
    data = np.array(data)
    n = len(data)
    alpha = 1 - confidence
    
    # Bootstrap samples
    bootstrap_samples = []
    for _ in range(n_bootstrap):
        # Resample with replacement
        indices = np.random.choice(n, size=n, replace=True)
        bootstrap_samples.append(np.mean(data[indices]))
    
    bootstrap_samples = np.array(bootstrap_samples)
    
    # Compute statistics
    mean = np.mean(data)
    std = np.std(bootstrap_samples)  # Standard error
    
    if method == 'percentile':
        ci_lower = np.percentile(bootstrap_samples, 100 * alpha / 2)
        ci_upper = np.percentile(bootstrap_samples, 100 * (1 - alpha / 2))
    else:  # bca method
        # Bias-corrected and accelerated bootstrap
        z0 = stats.norm.ppf(np.mean(bootstrap_samples < mean))
        # Acceleration (simplified - using jackknife)
        jackknife_means = []
        for i in range(n):
            jackknife_data = np.delete(data, i)
            jackknife_means.append(np.mean(jackknife_data))
        jackknife_means = np.array(jackknife_means)
        a = np.sum((np.mean(jackknife_means) - jackknife_means)**3) / (6 * np.sum((np.mean(jackknife_means) - jackknife_means)**2)**1.5)
        
        # BCa adjustment
        z_alpha_lower = stats.norm.ppf(alpha / 2)
        z_alpha_upper = stats.norm.ppf(1 - alpha / 2)
        z_lower = z0 + (z0 + z_alpha_lower) / (1 - a * (z0 + z_alpha_lower))
        z_upper = z0 + (z0 + z_alpha_upper) / (1 - a * (z0 + z_alpha_upper))
        
        ci_lower = np.percentile(bootstrap_samples, 100 * stats.norm.cdf(z_lower))
        ci_upper = np.percentile(bootstrap_samples, 100 * stats.norm.cdf(z_upper))
    
    return {
        'mean': mean,
        'std': std,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'confidence': confidence
    }

def compute_r2_with_ci(y_true, y_pred, n_bootstrap=1000, confidence=0.95):
    """
    Compute R² with bootstrap confidence interval.
    
    Parameters:
    -----------
    y_true : array-like
        True target values
    y_pred : array-like
        Predicted values
    n_bootstrap : int
        Number of bootstrap samples
    confidence : float
        Confidence level
    
    Returns:
    --------
    dict with r2, std, ci_lower, ci_upper
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    n = len(y_true)
    
    # Original R²
    r2_original = r2_score(y_true, y_pred)
    
    # Bootstrap R² values
    r2_bootstrap = []
    for _ in range(n_bootstrap):
        indices = np.random.choice(n, size=n, replace=True)
        y_true_boot = y_true[indices]
        y_pred_boot = y_pred[indices]
        r2_boot = r2_score(y_true_boot, y_pred_boot)
        r2_bootstrap.append(r2_boot)
    
    r2_bootstrap = np.array(r2_bootstrap)
    
    # Compute statistics
    std = np.std(r2_bootstrap)
    alpha = 1 - confidence
    ci_lower = np.percentile(r2_bootstrap, 100 * alpha / 2)
    ci_upper = np.percentile(r2_bootstrap, 100 * (1 - alpha / 2))
    
    return {
        'r2': r2_original,
        'std': std,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'confidence': confidence
    }

print("✅ Bootstrap confidence interval functions loaded")

In [None]:
# Enhanced evaluation function with confidence intervals
def evaluate_model_under_attack_with_ci(model, model_name, X_val, y_val, attack_type, epsilon, 
                                        device='cpu', is_sklearn=False, num_runs=5, n_bootstrap=1000):
    """
    Evaluate a model under a specific attack with bootstrap confidence intervals.
    
    Returns:
    --------
    dict with metrics including confidence intervals for robustness
    """
    # Set model to eval mode
    if not is_sklearn:
        model.eval()
        for module in model.modules():
            if isinstance(module, nn.Dropout):
                module.eval()
    
    # Make clean predictions
    if is_sklearn:
        y_pred_clean = model.predict(X_val)
    else:
        with torch.no_grad():
            X_tensor = torch.FloatTensor(X_val).to(device)
            output = model(X_tensor)
            if isinstance(output, tuple):
                y_pred_tensor = output[0]
            else:
                y_pred_tensor = output
            y_pred_clean = y_pred_tensor.cpu().numpy().flatten()
    
    # Calculate clean RMSE and R² with CI
    clean_rmse = np.sqrt(mean_squared_error(y_val, y_pred_clean))
    r2_stats = compute_r2_with_ci(y_val, y_pred_clean, n_bootstrap=n_bootstrap)
    
    # Run attack multiple times and collect robustness values
    robustness_values = []
    adv_rmses = []
    
    for run in range(num_runs):
        # Apply attack
        if attack_type == 'a1':
            X_adv = apply_a1_attack(X_val, epsilon=epsilon)
        elif attack_type == 'a2':
            missing_rate = min(epsilon / 10.0, 0.8)
            X_adv = apply_a2_attack(X_val, missing_rate=missing_rate)
        elif attack_type == 'a3':
            X_adv = apply_a3_attack(X_val, epsilon=epsilon)
        elif attack_type == 'a4':
            X_adv = apply_a4_attack(X_val, epsilon=epsilon)
        else:
            X_adv = X_val.copy()
        
        # Make adversarial predictions
        if is_sklearn:
            y_pred_adv = model.predict(X_adv)
        else:
            with torch.no_grad():
                X_adv_tensor = torch.FloatTensor(X_adv).to(device)
                output_adv = model(X_adv_tensor)
                if isinstance(output_adv, tuple):
                    y_pred_adv_tensor = output_adv[0]
                else:
                    y_pred_adv_tensor = output_adv
                y_pred_adv = y_pred_adv_tensor.cpu().numpy().flatten()
        
        # Calculate adversarial RMSE
        adv_rmse = np.sqrt(mean_squared_error(y_val, y_pred_adv))
        adv_rmses.append(adv_rmse)
        
        # Calculate robustness for this run
        delta_rmse = adv_rmse - clean_rmse
        if clean_rmse > 0:
            robustness = min(1.0, 1.0 - (delta_rmse / clean_rmse))
        else:
            robustness = 1.0
        robustness_values.append(robustness)
    
    # Average across runs
    avg_adv_rmse = np.mean(adv_rmses)
    delta_rmse = avg_adv_rmse - clean_rmse
    avg_robustness = np.mean(robustness_values)
    
    # Compute robustness confidence interval
    robustness_ci = bootstrap_confidence_interval(
        robustness_values, n_bootstrap=n_bootstrap, confidence=0.95
    )
    
    return {
        'clean_rmse': clean_rmse,
        'adv_rmse': avg_adv_rmse,
        'delta_rmse': delta_rmse,
        'robustness': avg_robustness,
        'robustness_std': robustness_ci['std'],
        'robustness_ci_lower': robustness_ci['ci_lower'],
        'robustness_ci_upper': robustness_ci['ci_upper'],
        'r2': r2_stats['r2'],
        'r2_std': r2_stats['std'],
        'r2_ci_lower': r2_stats['ci_lower'],
        'r2_ci_upper': r2_stats['ci_upper']
    }

print("✅ Enhanced evaluation function with confidence intervals loaded")

In [None]:
# Compute Confidence Intervals for R² and Robustness Metrics
# R² CI is computed whenever models + validation data exist (end-to-end safe).
# Robustness CI is computed when robustness_df exists (from evaluation cell).

print("=" * 80)
print("COMPUTING CONFIDENCE INTERVALS FOR R² AND ROBUSTNESS")
print("=" * 80)

r2_ci_results = {}
robustness_ci_df = pd.DataFrame()
robustness_by_attack_ci_df = pd.DataFrame()

# --- Part 1: Always compute R² confidence intervals when models + val data exist ---
if 'models' in locals() and 'X_val_scaled' in locals() and 'y_val' in locals() and len(models) > 0:
    print("\nComputing R² confidence intervals for standard models...")
    _device = device if 'device' in locals() else ('cuda' if torch.cuda.is_available() else 'cpu')
    for model_name in ['OLS', 'Ridge', 'XGBoost', 'MLP', 'Single-Head', 'Multi-Head', 'Multi-Head Diversity']:
        if model_name in models:
            try:
                model = models[model_name]
                is_sklearn = model_name in ['OLS', 'Ridge', 'XGBoost']
                if is_sklearn:
                    y_pred = model.predict(X_val_scaled)
                else:
                    model.eval()
                    with torch.no_grad():
                        X_tensor = torch.FloatTensor(X_val_scaled).to(_device)
                        output = model(X_tensor)
                        y_pred = (output[0] if isinstance(output, tuple) else output).cpu().numpy().flatten()
                r2_stats = compute_r2_with_ci(y_val, y_pred, n_bootstrap=1000, confidence=0.95)
                r2_ci_results[model_name] = r2_stats
                print(f"  {model_name}: R² = {r2_stats['r2']:.6f} [95% CI: {r2_stats['ci_lower']:.6f}, {r2_stats['ci_upper']:.6f}], SE = {r2_stats['std']:.6f}")
            except Exception as e:
                print(f"  ⚠ Error computing R² CI for {model_name}: {e}")
    print(f"\n✓ R² CIs computed for {len(r2_ci_results)} models")
else:
    print("\n⚠ Skipping R² CI: need models, X_val_scaled, and y_val. Run data load + training cells first.")

# --- Part 2: Compute robustness CIs when robustness_df exists (from evaluation cell) ---
if 'robustness_df' in locals() and len(robustness_df) > 0:
    print("\nComputing robustness confidence intervals...")
    robustness_ci_summary = []
    for model_name in robustness_df['model_name'].unique():
        model_data = robustness_df[robustness_df['model_name'] == model_name]
        if len(model_data) > 0:
            robustness_values = model_data['robustness'].values
            robustness_ci = bootstrap_confidence_interval(robustness_values, n_bootstrap=1000, confidence=0.95)
            robustness_ci_summary.append({
                'model_name': model_name, 'mean_robustness': robustness_ci['mean'], 'robustness_std': robustness_ci['std'],
                'robustness_ci_lower': robustness_ci['ci_lower'], 'robustness_ci_upper': robustness_ci['ci_upper'], 'n_evaluations': len(model_data)
            })
            print(f"  {model_name}: Robustness = {robustness_ci['mean']:.4f} [95% CI: {robustness_ci['ci_lower']:.4f}, {robustness_ci['ci_upper']:.4f}], SE = {robustness_ci['std']:.4f} (n={len(model_data)})")
    robustness_ci_df = pd.DataFrame(robustness_ci_summary)
    print("\nComputing robustness CIs by attack type and epsilon...")
    robustness_by_attack_ci = []
    for model_name in robustness_df['model_name'].unique():
        for attack_type in robustness_df['attack_type'].unique():
            for epsilon in robustness_df['epsilon'].unique():
                subset = robustness_df[(robustness_df['model_name'] == model_name) & (robustness_df['attack_type'] == attack_type) & (robustness_df['epsilon'] == epsilon)]
                if len(subset) > 0:
                    rci = bootstrap_confidence_interval(subset['robustness'].values, n_bootstrap=1000, confidence=0.95)
                    robustness_by_attack_ci.append({'model_name': model_name, 'attack_type': attack_type, 'epsilon': epsilon, 'mean_robustness': rci['mean'], 'robustness_std': rci['std'], 'robustness_ci_lower': rci['ci_lower'], 'robustness_ci_upper': rci['ci_upper']})
    robustness_by_attack_ci_df = pd.DataFrame(robustness_by_attack_ci)
    print(f"✓ Robustness CIs: {len(robustness_ci_summary)} models, {len(robustness_by_attack_ci)} attack/epsilon combinations")
else:
    print("\n⚠ No robustness_df (or empty). Run 'Evaluate Existing Models Under Adversarial Attacks' for robustness CIs.")

print("\n" + "=" * 80)
print("CONFIDENCE INTERVALS COMPUTED")
print("=" * 80)
print("  - r2_ci_results: R² statistics with 95% CI")
print("  - robustness_ci_df: model-level robustness CIs (if evaluation was run)")
print("  - robustness_by_attack_ci_df: attack/epsilon-level robustness CIs (if evaluation was run)")

In [None]:
# Create Summary Table with Confidence Intervals
# Runs when r2_ci_results exists (from CI cell). Robustness columns N/A if evaluation was skipped.

if 'r2_ci_results' in locals() and len(r2_ci_results) > 0:
    print("=" * 80)
    print("SUMMARY TABLE: R² AND ROBUSTNESS WITH CONFIDENCE INTERVALS")
    print("=" * 80)
    
    summary_data = []
    _robustness_ci_df = robustness_ci_df if 'robustness_ci_df' in locals() and len(robustness_ci_df) > 0 else pd.DataFrame()
    
    for model_name in ['OLS', 'Ridge', 'XGBoost', 'MLP', 'Single-Head', 'Multi-Head', 'Multi-Head Diversity']:
        row = {'Model': model_name}
        if model_name in r2_ci_results:
            r2_stats = r2_ci_results[model_name]
            row['R²'] = f"{r2_stats['r2']:.6f}"
            row['R²_SE'] = f"{r2_stats['std']:.6f}"
            row['R²_CI_95'] = f"[{r2_stats['ci_lower']:.6f}, {r2_stats['ci_upper']:.6f}]"
        else:
            row['R²'] = row['R²_SE'] = row['R²_CI_95'] = "N/A"
        
        if len(_robustness_ci_df) > 0:
            model_robustness = _robustness_ci_df[_robustness_ci_df['model_name'] == model_name]
            if len(model_robustness) > 0:
                rob_stats = model_robustness.iloc[0]
                row['Robustness'] = f"{rob_stats['mean_robustness']:.4f}"
                row['Robustness_SE'] = f"{rob_stats['robustness_std']:.4f}"
                row['Robustness_CI_95'] = f"[{rob_stats['robustness_ci_lower']:.4f}, {rob_stats['robustness_ci_upper']:.4f}]"
                row['N_Evaluations'] = int(rob_stats['n_evaluations'])
            else:
                row['Robustness'] = row['Robustness_SE'] = row['Robustness_CI_95'] = "N/A"; row['N_Evaluations'] = 0
        else:
            row['Robustness'] = row['Robustness_SE'] = row['Robustness_CI_95'] = "N/A"; row['N_Evaluations'] = 0
        summary_data.append(row)
    
    summary_df = pd.DataFrame(summary_data)
    print("\n" + summary_df.to_string(index=False))
    
    if 'tables_dir' in locals():
        ci_summary_path = tables_dir / 'model_performance_with_ci.csv'
        summary_df.to_csv(ci_summary_path, index=False)
        print(f"\n✓ Summary table saved to: {ci_summary_path}")
    
    print("\n" + "=" * 80)
    print("LaTeX TABLE: MODEL PERFORMANCE WITH CONFIDENCE INTERVALS")
    print("=" * 80)
    latex_table = """\\begin{table}[h]
\\centering
\\caption{Model Performance with 95\\% Confidence Intervals}
\\label{tab:model_performance_ci}
\\footnotesize
\\begin{tabular}{lcccccc}
\\toprule
Model & R² & R² SE & R² 95\\% CI & Robustness & Robustness SE & Robustness 95\\% CI \\\\
\\midrule
"""
    for _, row in summary_df.iterrows():
        latex_table += f"{row['Model']} & {row['R²']} & {row['R²_SE']} & {row['R²_CI_95']} & {row['Robustness']} & {row['Robustness_SE']} & {row['Robustness_CI_95']} \\\\\\n"
    latex_table += """\\bottomrule
\\end{tabular}
\\vspace{0.1cm}
\\footnotesize
\\begin{minipage}{\\columnwidth}
\\textit{Note: R² and Robustness metrics with 95\\% bootstrap confidence intervals (1000 bootstrap samples). SE = Standard Error. Robustness is averaged across all attack types (A1-A4) and epsilon values (0.25, 0.5, 1.0).}
\\end{minipage}
\\end{table}
"""
    print(latex_table)
    if 'tables_dir' in locals():
        latex_path = tables_dir / 'model_performance_with_ci.tex'
        with open(latex_path, 'w') as f:
            f.write(latex_table)
        print(f"\n✓ LaTeX table saved to: {latex_path}")
else:
    print("⚠ Run the 'Compute Confidence Intervals for R² and Robustness' cell first.")

In [None]:
# Economically Interpretable Metrics: Spearman IC, Portfolio Returns, Turnover
from scipy.stats import spearmanr
import pandas as pd

def compute_spearman_ic(y_true, y_pred):
    """
    Compute Spearman rank Information Coefficient (IC).
    
    Parameters:
    -----------
    y_true : array-like
        True returns
    y_pred : array-like
        Predicted returns
    
    Returns:
    --------
    dict with ic, p_value, ic_ir (IC/Std(IC) if time series)
    """
    # Remove NaN values
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    y_true_clean = np.array(y_true)[mask]
    y_pred_clean = np.array(y_pred)[mask]
    
    if len(y_true_clean) < 2:
        return {'ic': np.nan, 'p_value': np.nan, 'ic_ir': np.nan}
    
    # Compute Spearman correlation
    ic, p_value = spearmanr(y_true_clean, y_pred_clean)
    
    return {
        'ic': ic,
        'p_value': p_value,
        'ic_ir': np.nan  # Will compute IC-IR for time series
    }

def compute_ic_ir(ic_series):
    """
    Compute IC Information Ratio (mean IC / std IC).
    
    Parameters:
    -----------
    ic_series : array-like
        Time series of IC values
    
    Returns:
    --------
    float: IC-IR
    """
    ic_series = np.array(ic_series)
    ic_series = ic_series[~np.isnan(ic_series)]
    
    if len(ic_series) == 0:
        return np.nan
    
    mean_ic = np.mean(ic_series)
    std_ic = np.std(ic_series)
    
    if std_ic == 0:
        return np.nan
    
    return mean_ic / std_ic

def construct_long_short_portfolio(predictions, returns, quantile=0.2, equal_weight=True):
    """
    Construct long-short portfolio based on predictions.
    
    Parameters:
    -----------
    predictions : array-like
        Model predictions
    returns : array-like
        Actual returns
    quantile : float
        Top/bottom quantile for long/short positions (default 0.2 = top/bottom 20%)
    equal_weight : bool
        If True, equal weight positions; if False, weight by prediction strength
    
    Returns:
    --------
    dict with portfolio metrics
    """
    predictions = np.array(predictions)
    returns = np.array(returns)
    
    # Remove NaN
    mask = ~(np.isnan(predictions) | np.isnan(returns))
    predictions = predictions[mask]
    returns = returns[mask]
    
    if len(predictions) == 0:
        return {
            'long_short_return': np.nan,
            'long_return': np.nan,
            'short_return': np.nan,
            'sharpe_ratio': np.nan,
            'n_long': 0,
            'n_short': 0
        }
    
    # Determine long and short positions
    n = len(predictions)
    n_positions = max(1, int(n * quantile))
    
    # Long: top predictions
    long_indices = np.argsort(predictions)[-n_positions:]
    # Short: bottom predictions
    short_indices = np.argsort(predictions)[:n_positions]
    
    if equal_weight:
        # Equal weights
        long_weights = np.ones(len(long_indices)) / len(long_indices)
        short_weights = np.ones(len(short_indices)) / len(short_indices)
    else:
        # Weight by prediction strength (normalized)
        long_preds = predictions[long_indices]
        short_preds = predictions[short_indices]
        long_weights = long_preds / np.sum(long_preds) if np.sum(long_preds) != 0 else np.ones(len(long_indices)) / len(long_indices)
        short_weights = -short_preds / np.sum(np.abs(short_preds)) if np.sum(np.abs(short_preds)) != 0 else -np.ones(len(short_indices)) / len(short_indices)
    
    # Compute returns
    long_return = np.sum(long_weights * returns[long_indices])
    short_return = np.sum(short_weights * returns[short_indices])
    long_short_return = long_return - short_return
    
    # Compute Sharpe ratio (annualized, assuming monthly returns)
    if np.std(returns) > 0:
        sharpe_ratio = (long_short_return / np.std(returns)) * np.sqrt(12)  # Annualized
    else:
        sharpe_ratio = np.nan
    
    return {
        'long_short_return': long_short_return,
        'long_return': long_return,
        'short_return': short_return,
        'sharpe_ratio': sharpe_ratio,
        'n_long': len(long_indices),
        'n_short': len(short_indices)
    }

def compute_turnover(current_weights, previous_weights):
    """
    Compute portfolio turnover.
    
    Parameters:
    -----------
    current_weights : array-like
        Current period portfolio weights
    previous_weights : array-like or None
        Previous period portfolio weights (None for first period)
    
    Returns:
    --------
    float: Turnover (sum of absolute weight changes)
    """
    if previous_weights is None:
        # First period: turnover = sum of absolute weights (full rebalancing)
        return np.sum(np.abs(current_weights))
    
    current_weights = np.array(current_weights)
    previous_weights = np.array(previous_weights)
    
    # Align lengths (pad with zeros if needed)
    max_len = max(len(current_weights), len(previous_weights))
    if len(current_weights) < max_len:
        current_weights = np.pad(current_weights, (0, max_len - len(current_weights)), 'constant')
    if len(previous_weights) < max_len:
        previous_weights = np.pad(previous_weights, (0, max_len - len(previous_weights)), 'constant')
    
    # Turnover = sum of absolute weight changes
    turnover = np.sum(np.abs(current_weights - previous_weights))
    
    return turnover

def compute_portfolio_metrics_time_series(predictions_df, returns_df, dates, quantile=0.2, 
                                         transaction_cost=0.001):
    """
    Compute portfolio metrics over time series with turnover adjustment.
    
    Parameters:
    -----------
    predictions_df : DataFrame
        Predictions with columns for each model, indexed by date
    returns_df : DataFrame
        Returns with same index as predictions_df
    dates : array-like
        Date index
    quantile : float
        Top/bottom quantile for long/short
    transaction_cost : float
        Transaction cost per unit of turnover (default 0.1% = 0.001)
    
    Returns:
    --------
    dict with time series metrics for each model
    """
    results = {}
    
    # Group by date if not already grouped
    if isinstance(predictions_df, pd.DataFrame):
        date_groups = predictions_df.groupby(dates)
    else:
        # If predictions_df is a Series, create groups manually
        date_groups = {}
        for date in np.unique(dates):
            mask = dates == date
            date_groups[date] = mask
    
    for model_name in predictions_df.columns if isinstance(predictions_df, pd.DataFrame) else ['Model']:
        if isinstance(predictions_df, pd.DataFrame):
            model_predictions = predictions_df[model_name]
        else:
            model_predictions = predictions_df
        
        # Time series metrics
        period_returns = []
        period_turnovers = []
        period_ics = []
        previous_weights = None
        
        for date in sorted(date_groups.keys()):
            mask = date_groups[date]
            if isinstance(mask, np.ndarray):
                date_predictions = model_predictions[mask] if hasattr(model_predictions, '__getitem__') else model_predictions
                date_returns = returns_df[mask] if hasattr(returns_df, '__getitem__') else returns_df
            else:
                date_predictions = model_predictions.loc[mask] if hasattr(model_predictions, 'loc') else model_predictions[mask]
                date_returns = returns_df.loc[mask] if hasattr(returns_df, 'loc') else returns_df[mask]
            
            # Compute IC for this period
            ic_result = compute_spearman_ic(date_returns, date_predictions)
            period_ics.append(ic_result['ic'])
            
            # Construct portfolio
            portfolio_result = construct_long_short_portfolio(
                date_predictions, date_returns, quantile=quantile
            )
            period_return = portfolio_result['long_short_return']
            
            # Compute weights for turnover calculation
            n = len(date_predictions)
            n_positions = max(1, int(n * quantile))
            long_indices = np.argsort(date_predictions)[-n_positions:]
            short_indices = np.argsort(date_predictions)[:n_positions]
            
            current_weights = np.zeros(n)
            current_weights[long_indices] = 1.0 / n_positions
            current_weights[short_indices] = -1.0 / n_positions
            
            # Compute turnover
            turnover = compute_turnover(current_weights, previous_weights)
            period_turnovers.append(turnover)
            
            # Adjust return for transaction costs
            net_return = period_return - (turnover * transaction_cost)
            period_returns.append(net_return)
            
            previous_weights = current_weights
        
        # Aggregate metrics
        period_returns = np.array(period_returns)
        period_turnovers = np.array(period_turnovers)
        period_ics = np.array(period_ics)
        
        # Total return (cumulative)
        cumulative_return = np.prod(1 + period_returns) - 1 if len(period_returns) > 0 else 0
        
        # Annualized return (assuming monthly periods)
        n_periods = len(period_returns)
        if n_periods > 0:
            annualized_return = (1 + cumulative_return) ** (12 / n_periods) - 1
        else:
            annualized_return = np.nan
        
        # Average turnover
        avg_turnover = np.mean(period_turnovers) if len(period_turnovers) > 0 else np.nan
        
        # IC statistics
        mean_ic = np.nanmean(period_ics)
        ic_ir = compute_ic_ir(period_ics)
        
        # Sharpe ratio
        if len(period_returns) > 0 and np.std(period_returns) > 0:
            sharpe_ratio = (np.mean(period_returns) / np.std(period_returns)) * np.sqrt(12)
        else:
            sharpe_ratio = np.nan
        
        results[model_name] = {
            'cumulative_return': cumulative_return,
            'annualized_return': annualized_return,
            'mean_ic': mean_ic,
            'ic_ir': ic_ir,
            'sharpe_ratio': sharpe_ratio,
            'avg_turnover': avg_turnover,
            'period_returns': period_returns,
            'period_ics': period_ics,
            'period_turnovers': period_turnovers
        }
    
    return results

print("✅ Economically interpretable metrics functions loaded")

In [None]:
# Compute and display business-related metrics for all baseline models (including XGBoost)
# Requires: predictions, y_val from "Make predictions on validation set" cell

print("=" * 80)
print("BUSINESS-RELATED METRICS: IC, Long-Short Return, Sharpe Ratio")
print("=" * 80)

BASELINE_MODELS = ['OLS', 'Ridge', 'XGBoost', 'MLP', 'Single-Head', 'Multi-Head', 'Multi-Head Diversity']
business_metrics_rows = []

if 'predictions' in locals() and 'y_val' in locals():
    y_val_arr = y_val.values if hasattr(y_val, 'values') else np.array(y_val)
    for model_name in BASELINE_MODELS:
        if model_name not in predictions:
            continue
        pred = np.array(predictions[model_name]).flatten()
        # Spearman IC
        ic_result = compute_spearman_ic(y_val_arr, pred)
        # Long-short portfolio (cross-sectional)
        port_result = construct_long_short_portfolio(pred, y_val_arr, quantile=0.2, equal_weight=True)
        business_metrics_rows.append({
            'Model': model_name,
            'Spearman IC': round(ic_result['ic'], 6),
            'IC p-value': round(ic_result['p_value'], 6),
            'Long-Short Return': round(port_result['long_short_return'], 6),
            'Sharpe Ratio': round(port_result['sharpe_ratio'], 4) if not np.isnan(port_result['sharpe_ratio']) else np.nan,
        })
    business_metrics_df = pd.DataFrame(business_metrics_rows)
    print(business_metrics_df.to_string(index=False))
    if 'tables_dir' in locals():
        path = tables_dir / 'business_metrics_baselines.csv'
        business_metrics_df.to_csv(path, index=False)
        print(f"\n✓ Saved to {path}")
else:
    print("⚠ Run 'Make predictions on validation set' first so predictions and y_val exist.")

In [14]:
# Adversarial Training Configuration
ADVERSARIAL_CONFIG = {
    'epsilons': [0.25, 0.5, 1.0],  # Attack strengths
    'attacks': ['a1', 'a2', 'a3', 'a4'],  # Attack types
    'robust_weight': 0.3,  # Weight for adversarial loss (0.3 = 30% adversarial, 70% clean)
    'learning_rate': 0.0001,
    'batch_size': 32,
    'epochs': 100,
    'patience': 20,
    'warmup_epochs': 5  # Gradually increase adversarial weight
}

# Store adversarially trained models
adversarial_models = {}
adversarial_training_history = {}

In [15]:
def adversarial_training_step(model, X_batch, y_batch, attack_type, epsilon, 
                             optimizer, device='cpu', robust_weight=0.3):
    """
    Perform one adversarial training step.
    
    Args:
        model: The model to train
        X_batch: Input batch (numpy array)
        y_batch: Target batch (numpy array)
        attack_type: 'a1', 'a2', 'a3', or 'a4'
        epsilon: Attack strength
        optimizer: Optimizer
        device: Device to use
        robust_weight: Weight for adversarial loss
    
    Returns:
        Dictionary with loss values or None if batch is invalid
    """
    model.train()
    optimizer.zero_grad()
    
    # Convert to tensors
    X_tensor = torch.FloatTensor(X_batch).to(device)
    y_tensor = torch.FloatTensor(y_batch).to(device)
    
    # Clean forward pass
    output_clean = model(X_tensor)
    if isinstance(output_clean, tuple):
        y_pred_clean = output_clean[0]
    else:
        y_pred_clean = output_clean
    
    # Check for NaN/Inf in predictions
    if torch.any(torch.isnan(y_pred_clean)) or torch.any(torch.isinf(y_pred_clean)):
        return None
    
    clean_loss = nn.MSELoss()(y_pred_clean.squeeze(), y_tensor)
    
    # Check if clean_loss is valid
    if torch.isnan(clean_loss) or torch.isinf(clean_loss):
        return None
    
    # Generate adversarial examples
    if attack_type == 'a1':
        X_adv = apply_a1_attack(X_batch, epsilon=epsilon)
    elif attack_type == 'a2':
        # For A2, epsilon controls missing rate
        missing_rate = min(epsilon / 10.0, 0.8)  # Convert epsilon to missing rate
        X_adv = apply_a2_attack(X_batch, missing_rate=missing_rate)
    elif attack_type == 'a3':
        X_adv = apply_a3_attack(X_batch, epsilon=epsilon)
    elif attack_type == 'a4':
        X_adv = apply_a4_attack(X_batch, epsilon=epsilon)
    else:
        raise ValueError(f"Unknown attack type: {attack_type}")
    
    # Adversarial forward pass
    X_adv_tensor = torch.FloatTensor(X_adv).to(device)
    output_adv = model(X_adv_tensor)
    if isinstance(output_adv, tuple):
        y_pred_adv = output_adv[0]
    else:
        y_pred_adv = output_adv
    
    # Check for NaN/Inf in adversarial predictions
    if torch.any(torch.isnan(y_pred_adv)) or torch.any(torch.isinf(y_pred_adv)):
        return None
    
    adv_loss = nn.MSELoss()(y_pred_adv.squeeze(), y_tensor)
    
    # Check if adv_loss is valid
    if torch.isnan(adv_loss) or torch.isinf(adv_loss):
        return None
    
    # Combined loss
    total_loss = (1 - robust_weight) * clean_loss + robust_weight * adv_loss
    
    # Check if total_loss is valid before backward pass
    if torch.isnan(total_loss) or torch.isinf(total_loss):
        return None
    
    # Ensure total_loss requires gradients
    if not total_loss.requires_grad:
        return None
    
    # Backward pass with error handling
    try:
        total_loss.backward()
    except RuntimeError as e:
        if "does not require grad" in str(e) or "does not have a grad_fn" in str(e):
            optimizer.zero_grad()
            return None
        else:
            raise
    
    # Gradient clipping to prevent exploding gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    
    optimizer.step()
    
    return {
        'clean_loss': clean_loss.item(),
        'adversarial_loss': adv_loss.item(),
        'total_loss': total_loss.item()
    }

In [16]:
def train_adversarial_model(model, model_name, X_train, y_train, X_val, y_val, 
                           attack_type, epsilon, config, device='cpu'):
    """
    Train model with adversarial training.
    
    Args:
        model: Model to train (will be copied)
        model_name: Name of the model
        X_train: Training features
        y_train: Training targets
        X_val: Validation features
        y_val: Validation targets
        attack_type: 'a1', 'a2', 'a3', or 'a4'
        epsilon: Attack strength
        config: Training configuration
        device: Device to use
    
    Returns:
        Trained model, predictions, and training history
    """
    # Create a fresh copy of the model for adversarial training
    import copy
    model = copy.deepcopy(model)
    model = model.to(device)
    model.train()
    
    optimizer = torch.optim.Adam(model.parameters(), lr=config['learning_rate'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6
    )
    
    # Convert to tensors
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    y_train_tensor = torch.FloatTensor(y_train).to(device)
    X_val_tensor = torch.FloatTensor(X_val).to(device)
    y_val_tensor = torch.FloatTensor(y_val).to(device)
    
    # Handle feature dimension mismatch
    num_features = model.num_features if hasattr(model, 'num_features') else model.model.num_features
    
    if X_train.shape[1] != num_features:
        if X_train.shape[1] < num_features:
            # Pad
            padding_train = np.zeros((X_train.shape[0], num_features - X_train.shape[1]))
            padding_val = np.zeros((X_val.shape[0], num_features - X_val.shape[1]))
            X_train_tensor = torch.FloatTensor(np.hstack([X_train, padding_train])).to(device)
            X_val_tensor = torch.FloatTensor(np.hstack([X_val, padding_val])).to(device)
        else:
            # Truncate
            X_train_tensor = torch.FloatTensor(X_train[:, :num_features]).to(device)
            X_val_tensor = torch.FloatTensor(X_val[:, :num_features]).to(device)
    
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_clean_loss': [],
        'train_adv_loss': []
    }
    
    best_val_loss = float('inf')
    patience_counter = 0
    warmup_epochs = config.get('warmup_epochs', 5)
    
    batch_size = config['batch_size']
    n_batches = (len(X_train_tensor) + batch_size - 1) // batch_size
    
    for epoch in range(config['epochs']):
        # Gradual warmup: increase robust_weight from 0.1 to target value
        if epoch < warmup_epochs:
            current_robust_weight = 0.1 + (config['robust_weight'] - 0.1) * (epoch / warmup_epochs)
        else:
            current_robust_weight = config['robust_weight']
        
        epoch_losses = {'clean': [], 'adv': [], 'total': []}
        
        # Training
        model.train()
        for i in range(0, len(X_train_tensor), batch_size):
            batch_X = X_train_tensor[i:i+batch_size].cpu().numpy()
            batch_y = y_train_tensor[i:i+batch_size].cpu().numpy()
            
            losses = adversarial_training_step(
                model, batch_X, batch_y, attack_type, epsilon,
                optimizer, device, current_robust_weight
            )
            
            # Skip batch if None (invalid batch)
            if losses is None:
                continue
            
            # Check for NaN/Inf in losses
            if (np.isnan(losses['total_loss']) or np.isinf(losses['total_loss']) or
                np.isnan(losses['clean_loss']) or np.isinf(losses['clean_loss']) or
                np.isnan(losses['adversarial_loss']) or np.isinf(losses['adversarial_loss'])):
                continue
            
            epoch_losses['clean'].append(losses['clean_loss'])
            epoch_losses['adv'].append(losses['adversarial_loss'])
            epoch_losses['total'].append(losses['total_loss'])
        
        # Skip epoch if all losses are invalid
        if len(epoch_losses['total']) == 0:
            continue
        
        # Validation
        model.eval()
        with torch.no_grad():
            output_val = model(X_val_tensor)
            if isinstance(output_val, tuple):
                y_pred_val = output_val[0]
            else:
                y_pred_val = output_val
            
            # Check for constant predictions (model collapse detection)
            y_pred_np = y_pred_val.squeeze().cpu().numpy()
            pred_std = np.std(y_pred_np)
            
            if pred_std < 1e-8:
                print(f"   MODEL COLLAPSE DETECTED at epoch {epoch+1}!")
                break
            
            val_loss = nn.MSELoss()(y_pred_val.squeeze(), y_val_tensor).item()
            
            # Check for NaN/Inf in validation loss
            if np.isnan(val_loss) or np.isinf(val_loss):
                val_loss = float('inf')
        
        # Record history
        avg_train_loss = np.mean(epoch_losses['total']) if epoch_losses['total'] else float('inf')
        avg_clean_loss = np.mean(epoch_losses['clean']) if epoch_losses['clean'] else 0.0
        avg_adv_loss = np.mean(epoch_losses['adv']) if epoch_losses['adv'] else 0.0
        
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(val_loss)
        history['train_clean_loss'].append(avg_clean_loss)
        history['train_adv_loss'].append(avg_adv_loss)
        
        # Learning rate scheduling
        scheduler.step(val_loss)
        
        # Early stopping
        if not (np.isnan(val_loss) or np.isinf(val_loss)):
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= config['patience']:
                    print(f"  {model_name} ({attack_type.upper()}, ε={epsilon}): Early stopping at epoch {epoch+1}")
                    break
        
        if (epoch + 1) % 10 == 0:
            print(f"  {model_name} ({attack_type.upper()}, ε={epsilon}) - Epoch {epoch+1}/{config['epochs']}: "
                  f"Train Loss={avg_train_loss:.6f}, Val Loss={val_loss:.6f}, "
                  f"Robust Weight={current_robust_weight:.3f}")
    
    # Final evaluation
    model.eval()
    with torch.no_grad():
        final_pred = model(X_val_tensor)
        if isinstance(final_pred, tuple):
            final_pred = final_pred[0]
        final_pred = final_pred.squeeze().cpu().numpy()
    
    return model, final_pred, history

In [17]:
# Train adversarially trained models
print("=" * 80)
print("ADVERSARIAL TRAINING FOR TRANSFORMER MODELS")
print("=" * 80)
print(f"Training on attacks: {ADVERSARIAL_CONFIG['attacks']}")
print(f"Epsilons: {ADVERSARIAL_CONFIG['epsilons']}")
print(f"Robust weight: {ADVERSARIAL_CONFIG['robust_weight']}")
print()

# Models to train adversarially
transformer_model_names = ['Single-Head', 'Multi-Head', 'Multi-Head Diversity']
base_models = {
    'Single-Head': models['Single-Head'],
    'Multi-Head': models['Multi-Head'],
    'Multi-Head Diversity': models['Multi-Head Diversity']
}

# Train each model with each attack at each epsilon
for model_name in transformer_model_names:
    print(f"\n{'='*80}")
    print(f"Training {model_name} with Adversarial Training")
    print(f"{'='*80}")
    
    base_model = base_models[model_name]
    
    for attack_type in ADVERSARIAL_CONFIG['attacks']:
        for epsilon in ADVERSARIAL_CONFIG['epsilons']:
            model_key = f"{model_name} ({attack_type.upper()}, ε={epsilon})"
            print(f"\nTraining {model_key}...")
            
            try:
                adv_model, adv_pred, adv_history = train_adversarial_model(
                    base_model, model_name, X_train_scaled, y_train, 
                    X_val_scaled, y_val, attack_type, epsilon, 
                    ADVERSARIAL_CONFIG, device
                )
                
                # Evaluate
                adv_rmse = np.sqrt(mean_squared_error(y_val, adv_pred))
                adv_r2 = r2_score(y_val, adv_pred)
                
                adversarial_models[model_key] = adv_model
                adversarial_training_history[model_key] = {
                    'rmse': adv_rmse,
                    'r2': adv_r2,
                    'history': adv_history
                }
                
                print(f"  {model_key} trained - RMSE: {adv_rmse:.6f}, R²: {adv_r2:.6f}")
                
            except Exception as e:
                print(f" Error training {model_key}: {e}")
                import traceback
                traceback.print_exc()

print("\n" + "=" * 80)
print("ADVERSARIAL TRAINING COMPLETE")
print("=" * 80)
print(f"Total adversarially trained models: {len(adversarial_models)}")

ADVERSARIAL TRAINING FOR TRANSFORMER MODELS
Training on attacks: ['a1', 'a2', 'a3', 'a4']
Epsilons: [0.25, 0.5, 1.0]
Robust weight: 0.3


Training Single-Head with Adversarial Training

Training Single-Head (A1, ε=0.25)...


  Single-Head (A1, ε=0.25) - Epoch 10/100: Train Loss=0.000482, Val Loss=0.000271, Robust Weight=0.300


  Single-Head (A1, ε=0.25) - Epoch 20/100: Train Loss=0.000468, Val Loss=0.000274, Robust Weight=0.300


  Single-Head (A1, ε=0.25) - Epoch 30/100: Train Loss=0.000435, Val Loss=0.000274, Robust Weight=0.300


  Single-Head (A1, ε=0.25): Early stopping at epoch 37
  Single-Head (A1, ε=0.25) trained - RMSE: 0.016419, R²: 0.114956

Training Single-Head (A1, ε=0.5)...


  Single-Head (A1, ε=0.5) - Epoch 10/100: Train Loss=0.000506, Val Loss=0.000279, Robust Weight=0.300


  Single-Head (A1, ε=0.5) - Epoch 20/100: Train Loss=0.000455, Val Loss=0.000271, Robust Weight=0.300


  Single-Head (A1, ε=0.5) - Epoch 30/100: Train Loss=0.000443, Val Loss=0.000269, Robust Weight=0.300


  Single-Head (A1, ε=0.5) - Epoch 40/100: Train Loss=0.000432, Val Loss=0.000271, Robust Weight=0.300


  Single-Head (A1, ε=0.5): Early stopping at epoch 49
  Single-Head (A1, ε=0.5) trained - RMSE: 0.016371, R²: 0.120137

Training Single-Head (A1, ε=1.0)...


  Single-Head (A1, ε=1.0) - Epoch 10/100: Train Loss=0.000514, Val Loss=0.000280, Robust Weight=0.300


  Single-Head (A1, ε=1.0) - Epoch 20/100: Train Loss=0.000481, Val Loss=0.000279, Robust Weight=0.300


  Single-Head (A1, ε=1.0) - Epoch 30/100: Train Loss=0.000454, Val Loss=0.000273, Robust Weight=0.300


  Single-Head (A1, ε=1.0) - Epoch 40/100: Train Loss=0.000448, Val Loss=0.000272, Robust Weight=0.300


  Single-Head (A1, ε=1.0) - Epoch 50/100: Train Loss=0.000443, Val Loss=0.000270, Robust Weight=0.300


  Single-Head (A1, ε=1.0): Early stopping at epoch 51
  Single-Head (A1, ε=1.0) trained - RMSE: 0.016425, R²: 0.114316

Training Single-Head (A2, ε=0.25)...


  Single-Head (A2, ε=0.25) - Epoch 10/100: Train Loss=0.000497, Val Loss=0.000277, Robust Weight=0.300


  Single-Head (A2, ε=0.25) - Epoch 20/100: Train Loss=0.000460, Val Loss=0.000271, Robust Weight=0.300


  Single-Head (A2, ε=0.25) - Epoch 30/100: Train Loss=0.000425, Val Loss=0.000275, Robust Weight=0.300


  Single-Head (A2, ε=0.25): Early stopping at epoch 34
  Single-Head (A2, ε=0.25) trained - RMSE: 0.016517, R²: 0.104367

Training Single-Head (A2, ε=0.5)...


  Single-Head (A2, ε=0.5) - Epoch 10/100: Train Loss=0.000523, Val Loss=0.000282, Robust Weight=0.300


  Single-Head (A2, ε=0.5) - Epoch 20/100: Train Loss=0.000454, Val Loss=0.000274, Robust Weight=0.300


  Single-Head (A2, ε=0.5) - Epoch 30/100: Train Loss=0.000437, Val Loss=0.000271, Robust Weight=0.300


  Single-Head (A2, ε=0.5) - Epoch 40/100: Train Loss=0.000421, Val Loss=0.000273, Robust Weight=0.300


  Single-Head (A2, ε=0.5) - Epoch 50/100: Train Loss=0.000408, Val Loss=0.000273, Robust Weight=0.300


  Single-Head (A2, ε=0.5): Early stopping at epoch 52
  Single-Head (A2, ε=0.5) trained - RMSE: 0.016529, R²: 0.103143

Training Single-Head (A2, ε=1.0)...


  Single-Head (A2, ε=1.0) - Epoch 10/100: Train Loss=0.000506, Val Loss=0.000278, Robust Weight=0.300


  Single-Head (A2, ε=1.0) - Epoch 20/100: Train Loss=0.000508, Val Loss=0.000270, Robust Weight=0.300


  Single-Head (A2, ε=1.0) - Epoch 30/100: Train Loss=0.000452, Val Loss=0.000271, Robust Weight=0.300


  Single-Head (A2, ε=1.0): Early stopping at epoch 38
  Single-Head (A2, ε=1.0) trained - RMSE: 0.016395, R²: 0.117571

Training Single-Head (A3, ε=0.25)...


  Single-Head (A3, ε=0.25) - Epoch 10/100: Train Loss=0.000527, Val Loss=0.000285, Robust Weight=0.300


  Single-Head (A3, ε=0.25) - Epoch 20/100: Train Loss=0.000482, Val Loss=0.000273, Robust Weight=0.300


  Single-Head (A3, ε=0.25) - Epoch 30/100: Train Loss=0.000452, Val Loss=0.000272, Robust Weight=0.300


  Single-Head (A3, ε=0.25): Early stopping at epoch 36
  Single-Head (A3, ε=0.25) trained - RMSE: 0.016467, R²: 0.109760

Training Single-Head (A3, ε=0.5)...


  Single-Head (A3, ε=0.5) - Epoch 10/100: Train Loss=0.000501, Val Loss=0.000280, Robust Weight=0.300


  Single-Head (A3, ε=0.5) - Epoch 20/100: Train Loss=0.000478, Val Loss=0.000273, Robust Weight=0.300


  Single-Head (A3, ε=0.5) - Epoch 30/100: Train Loss=0.000464, Val Loss=0.000270, Robust Weight=0.300


  Single-Head (A3, ε=0.5) - Epoch 40/100: Train Loss=0.000452, Val Loss=0.000268, Robust Weight=0.300


  Single-Head (A3, ε=0.5) - Epoch 50/100: Train Loss=0.000445, Val Loss=0.000268, Robust Weight=0.300


  Single-Head (A3, ε=0.5): Early stopping at epoch 59
  Single-Head (A3, ε=0.5) trained - RMSE: 0.016412, R²: 0.115756

Training Single-Head (A3, ε=1.0)...


  Single-Head (A3, ε=1.0) - Epoch 10/100: Train Loss=0.000539, Val Loss=0.000290, Robust Weight=0.300


  Single-Head (A3, ε=1.0) - Epoch 20/100: Train Loss=0.000494, Val Loss=0.000273, Robust Weight=0.300


  Single-Head (A3, ε=1.0) - Epoch 30/100: Train Loss=0.000481, Val Loss=0.000272, Robust Weight=0.300


  Single-Head (A3, ε=1.0) - Epoch 40/100: Train Loss=0.000476, Val Loss=0.000268, Robust Weight=0.300


  Single-Head (A3, ε=1.0) - Epoch 50/100: Train Loss=0.000462, Val Loss=0.000268, Robust Weight=0.300


  Single-Head (A3, ε=1.0) - Epoch 60/100: Train Loss=0.000452, Val Loss=0.000268, Robust Weight=0.300


  Single-Head (A3, ε=1.0) - Epoch 70/100: Train Loss=0.000447, Val Loss=0.000268, Robust Weight=0.300


  Single-Head (A3, ε=1.0): Early stopping at epoch 71
  Single-Head (A3, ε=1.0) trained - RMSE: 0.016399, R²: 0.117187

Training Single-Head (A4, ε=0.25)...


  Single-Head (A4, ε=0.25) - Epoch 10/100: Train Loss=0.000501, Val Loss=0.000271, Robust Weight=0.300


  Single-Head (A4, ε=0.25) - Epoch 20/100: Train Loss=0.000455, Val Loss=0.000276, Robust Weight=0.300


  Single-Head (A4, ε=0.25): Early stopping at epoch 30
  Single-Head (A4, ε=0.25) trained - RMSE: 0.016517, R²: 0.104377

Training Single-Head (A4, ε=0.5)...


  Single-Head (A4, ε=0.5) - Epoch 10/100: Train Loss=0.000481, Val Loss=0.000279, Robust Weight=0.300


  Single-Head (A4, ε=0.5) - Epoch 20/100: Train Loss=0.000503, Val Loss=0.000278, Robust Weight=0.300


  Single-Head (A4, ε=0.5) - Epoch 30/100: Train Loss=0.000456, Val Loss=0.000272, Robust Weight=0.300


  Single-Head (A4, ε=0.5) - Epoch 40/100: Train Loss=0.000438, Val Loss=0.000269, Robust Weight=0.300


  Single-Head (A4, ε=0.5) - Epoch 50/100: Train Loss=0.000424, Val Loss=0.000272, Robust Weight=0.300


  Single-Head (A4, ε=0.5): Early stopping at epoch 56
  Single-Head (A4, ε=0.5) trained - RMSE: 0.016497, R²: 0.106508

Training Single-Head (A4, ε=1.0)...


  Single-Head (A4, ε=1.0) - Epoch 10/100: Train Loss=0.000502, Val Loss=0.000283, Robust Weight=0.300


  Single-Head (A4, ε=1.0) - Epoch 20/100: Train Loss=0.000476, Val Loss=0.000269, Robust Weight=0.300


  Single-Head (A4, ε=1.0) - Epoch 30/100: Train Loss=0.000456, Val Loss=0.000267, Robust Weight=0.300


  Single-Head (A4, ε=1.0) - Epoch 40/100: Train Loss=0.000450, Val Loss=0.000268, Robust Weight=0.300


  Single-Head (A4, ε=1.0) - Epoch 50/100: Train Loss=0.000449, Val Loss=0.000268, Robust Weight=0.300


  Single-Head (A4, ε=1.0): Early stopping at epoch 57
  Single-Head (A4, ε=1.0) trained - RMSE: 0.016371, R²: 0.120107

Training Multi-Head with Adversarial Training

Training Multi-Head (A1, ε=0.25)...


  Multi-Head (A1, ε=0.25) - Epoch 10/100: Train Loss=0.000449, Val Loss=0.000273, Robust Weight=0.300


  Multi-Head (A1, ε=0.25) - Epoch 20/100: Train Loss=0.000435, Val Loss=0.000271, Robust Weight=0.300


  Multi-Head (A1, ε=0.25) - Epoch 30/100: Train Loss=0.000422, Val Loss=0.000271, Robust Weight=0.300


  Multi-Head (A1, ε=0.25) - Epoch 40/100: Train Loss=0.000377, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head (A1, ε=0.25) - Epoch 50/100: Train Loss=0.000358, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head (A1, ε=0.25) - Epoch 60/100: Train Loss=0.000351, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head (A1, ε=0.25): Early stopping at epoch 62
  Multi-Head (A1, ε=0.25) trained - RMSE: 0.016400, R²: 0.117010

Training Multi-Head (A1, ε=0.5)...


  Multi-Head (A1, ε=0.5) - Epoch 10/100: Train Loss=0.000458, Val Loss=0.000273, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 20/100: Train Loss=0.000431, Val Loss=0.000267, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 30/100: Train Loss=0.000416, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 40/100: Train Loss=0.000394, Val Loss=0.000261, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 50/100: Train Loss=0.000387, Val Loss=0.000261, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 60/100: Train Loss=0.000381, Val Loss=0.000259, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 70/100: Train Loss=0.000375, Val Loss=0.000259, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 80/100: Train Loss=0.000375, Val Loss=0.000259, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 90/100: Train Loss=0.000374, Val Loss=0.000259, Robust Weight=0.300


  Multi-Head (A1, ε=0.5) - Epoch 100/100: Train Loss=0.000370, Val Loss=0.000259, Robust Weight=0.300
  Multi-Head (A1, ε=0.5) trained - RMSE: 0.016090, R²: 0.150109

Training Multi-Head (A1, ε=1.0)...


  Multi-Head (A1, ε=1.0) - Epoch 10/100: Train Loss=0.000485, Val Loss=0.000277, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 20/100: Train Loss=0.000454, Val Loss=0.000272, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 30/100: Train Loss=0.000447, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 40/100: Train Loss=0.000444, Val Loss=0.000265, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 50/100: Train Loss=0.000425, Val Loss=0.000266, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 60/100: Train Loss=0.000411, Val Loss=0.000265, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 70/100: Train Loss=0.000401, Val Loss=0.000265, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 80/100: Train Loss=0.000393, Val Loss=0.000261, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 90/100: Train Loss=0.000389, Val Loss=0.000259, Robust Weight=0.300


  Multi-Head (A1, ε=1.0) - Epoch 100/100: Train Loss=0.000380, Val Loss=0.000259, Robust Weight=0.300
  Multi-Head (A1, ε=1.0) trained - RMSE: 0.016101, R²: 0.148894

Training Multi-Head (A2, ε=0.25)...


  Multi-Head (A2, ε=0.25) - Epoch 10/100: Train Loss=0.000448, Val Loss=0.000274, Robust Weight=0.300


  Multi-Head (A2, ε=0.25) - Epoch 20/100: Train Loss=0.000429, Val Loss=0.000274, Robust Weight=0.300


  Multi-Head (A2, ε=0.25) - Epoch 30/100: Train Loss=0.000400, Val Loss=0.000272, Robust Weight=0.300


  Multi-Head (A2, ε=0.25) - Epoch 40/100: Train Loss=0.000361, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head (A2, ε=0.25) - Epoch 50/100: Train Loss=0.000345, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head (A2, ε=0.25): Early stopping at epoch 59
  Multi-Head (A2, ε=0.25) trained - RMSE: 0.016418, R²: 0.115081

Training Multi-Head (A2, ε=0.5)...


  Multi-Head (A2, ε=0.5) - Epoch 10/100: Train Loss=0.000457, Val Loss=0.000277, Robust Weight=0.300


  Multi-Head (A2, ε=0.5) - Epoch 20/100: Train Loss=0.000416, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head (A2, ε=0.5) - Epoch 30/100: Train Loss=0.000403, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head (A2, ε=0.5) - Epoch 40/100: Train Loss=0.000379, Val Loss=0.000267, Robust Weight=0.300


  Multi-Head (A2, ε=0.5) - Epoch 50/100: Train Loss=0.000360, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head (A2, ε=0.5): Early stopping at epoch 60
  Multi-Head (A2, ε=0.5) trained - RMSE: 0.016466, R²: 0.109904

Training Multi-Head (A2, ε=1.0)...


  Multi-Head (A2, ε=1.0) - Epoch 10/100: Train Loss=0.000464, Val Loss=0.000278, Robust Weight=0.300


  Multi-Head (A2, ε=1.0) - Epoch 20/100: Train Loss=0.000428, Val Loss=0.000271, Robust Weight=0.300


  Multi-Head (A2, ε=1.0) - Epoch 30/100: Train Loss=0.000400, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head (A2, ε=1.0) - Epoch 40/100: Train Loss=0.000383, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head (A2, ε=1.0) - Epoch 50/100: Train Loss=0.000373, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head (A2, ε=1.0): Early stopping at epoch 52
  Multi-Head (A2, ε=1.0) trained - RMSE: 0.016434, R²: 0.113338

Training Multi-Head (A3, ε=0.25)...


  Multi-Head (A3, ε=0.25) - Epoch 10/100: Train Loss=0.000492, Val Loss=0.000281, Robust Weight=0.300


  Multi-Head (A3, ε=0.25) - Epoch 20/100: Train Loss=0.000441, Val Loss=0.000271, Robust Weight=0.300


  Multi-Head (A3, ε=0.25) - Epoch 30/100: Train Loss=0.000423, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head (A3, ε=0.25) - Epoch 40/100: Train Loss=0.000406, Val Loss=0.000264, Robust Weight=0.300


  Multi-Head (A3, ε=0.25) - Epoch 50/100: Train Loss=0.000389, Val Loss=0.000264, Robust Weight=0.300


  Multi-Head (A3, ε=0.25) - Epoch 60/100: Train Loss=0.000380, Val Loss=0.000264, Robust Weight=0.300


  Multi-Head (A3, ε=0.25) - Epoch 70/100: Train Loss=0.000366, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A3, ε=0.25) - Epoch 80/100: Train Loss=0.000351, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A3, ε=0.25) - Epoch 90/100: Train Loss=0.000354, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A3, ε=0.25): Early stopping at epoch 91
  Multi-Head (A3, ε=0.25) trained - RMSE: 0.016188, R²: 0.139762

Training Multi-Head (A3, ε=0.5)...


  Multi-Head (A3, ε=0.5) - Epoch 10/100: Train Loss=0.000497, Val Loss=0.000278, Robust Weight=0.300


  Multi-Head (A3, ε=0.5) - Epoch 20/100: Train Loss=0.000455, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head (A3, ε=0.5) - Epoch 30/100: Train Loss=0.000447, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head (A3, ε=0.5) - Epoch 40/100: Train Loss=0.000435, Val Loss=0.000261, Robust Weight=0.300


  Multi-Head (A3, ε=0.5) - Epoch 50/100: Train Loss=0.000411, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A3, ε=0.5) - Epoch 60/100: Train Loss=0.000402, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A3, ε=0.5) - Epoch 70/100: Train Loss=0.000398, Val Loss=0.000261, Robust Weight=0.300


  Multi-Head (A3, ε=0.5) - Epoch 80/100: Train Loss=0.000396, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A3, ε=0.5): Early stopping at epoch 88
  Multi-Head (A3, ε=0.5) trained - RMSE: 0.016177, R²: 0.140912

Training Multi-Head (A3, ε=1.0)...


  Multi-Head (A3, ε=1.0) - Epoch 10/100: Train Loss=0.000504, Val Loss=0.000279, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 20/100: Train Loss=0.000488, Val Loss=0.000266, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 30/100: Train Loss=0.000460, Val Loss=0.000264, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 40/100: Train Loss=0.000446, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 50/100: Train Loss=0.000431, Val Loss=0.000260, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 60/100: Train Loss=0.000429, Val Loss=0.000258, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 70/100: Train Loss=0.000420, Val Loss=0.000257, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 80/100: Train Loss=0.000418, Val Loss=0.000256, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 90/100: Train Loss=0.000416, Val Loss=0.000256, Robust Weight=0.300


  Multi-Head (A3, ε=1.0) - Epoch 100/100: Train Loss=0.000407, Val Loss=0.000255, Robust Weight=0.300
  Multi-Head (A3, ε=1.0) trained - RMSE: 0.015966, R²: 0.163185

Training Multi-Head (A4, ε=0.25)...


  Multi-Head (A4, ε=0.25) - Epoch 10/100: Train Loss=0.000452, Val Loss=0.000275, Robust Weight=0.300


  Multi-Head (A4, ε=0.25) - Epoch 20/100: Train Loss=0.000431, Val Loss=0.000271, Robust Weight=0.300


  Multi-Head (A4, ε=0.25) - Epoch 30/100: Train Loss=0.000403, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head (A4, ε=0.25) - Epoch 40/100: Train Loss=0.000378, Val Loss=0.000267, Robust Weight=0.300


  Multi-Head (A4, ε=0.25) - Epoch 50/100: Train Loss=0.000362, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head (A4, ε=0.25): Early stopping at epoch 57
  Multi-Head (A4, ε=0.25) trained - RMSE: 0.016384, R²: 0.118799

Training Multi-Head (A4, ε=0.5)...


  Multi-Head (A4, ε=0.5) - Epoch 10/100: Train Loss=0.000473, Val Loss=0.000278, Robust Weight=0.300


  Multi-Head (A4, ε=0.5) - Epoch 20/100: Train Loss=0.000436, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head (A4, ε=0.5) - Epoch 30/100: Train Loss=0.000423, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head (A4, ε=0.5) - Epoch 40/100: Train Loss=0.000397, Val Loss=0.000264, Robust Weight=0.300


  Multi-Head (A4, ε=0.5) - Epoch 50/100: Train Loss=0.000390, Val Loss=0.000264, Robust Weight=0.300


  Multi-Head (A4, ε=0.5) - Epoch 60/100: Train Loss=0.000379, Val Loss=0.000264, Robust Weight=0.300


  Multi-Head (A4, ε=0.5) - Epoch 70/100: Train Loss=0.000374, Val Loss=0.000263, Robust Weight=0.300


  Multi-Head (A4, ε=0.5) - Epoch 80/100: Train Loss=0.000366, Val Loss=0.000263, Robust Weight=0.300


  Multi-Head (A4, ε=0.5) - Epoch 90/100: Train Loss=0.000367, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A4, ε=0.5): Early stopping at epoch 92
  Multi-Head (A4, ε=0.5) trained - RMSE: 0.016197, R²: 0.138753

Training Multi-Head (A4, ε=1.0)...


  Multi-Head (A4, ε=1.0) - Epoch 10/100: Train Loss=0.000492, Val Loss=0.000277, Robust Weight=0.300


  Multi-Head (A4, ε=1.0) - Epoch 20/100: Train Loss=0.000471, Val Loss=0.000275, Robust Weight=0.300


  Multi-Head (A4, ε=1.0) - Epoch 30/100: Train Loss=0.000453, Val Loss=0.000272, Robust Weight=0.300


  Multi-Head (A4, ε=1.0) - Epoch 40/100: Train Loss=0.000439, Val Loss=0.000274, Robust Weight=0.300


  Multi-Head (A4, ε=1.0) - Epoch 50/100: Train Loss=0.000422, Val Loss=0.000265, Robust Weight=0.300


  Multi-Head (A4, ε=1.0) - Epoch 60/100: Train Loss=0.000411, Val Loss=0.000261, Robust Weight=0.300


  Multi-Head (A4, ε=1.0) - Epoch 70/100: Train Loss=0.000402, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A4, ε=1.0) - Epoch 80/100: Train Loss=0.000401, Val Loss=0.000262, Robust Weight=0.300


  Multi-Head (A4, ε=1.0): Early stopping at epoch 82
  Multi-Head (A4, ε=1.0) trained - RMSE: 0.016182, R²: 0.140344

Training Multi-Head Diversity with Adversarial Training

Training Multi-Head Diversity (A1, ε=0.25)...


  Multi-Head Diversity (A1, ε=0.25) - Epoch 10/100: Train Loss=0.000484, Val Loss=0.000276, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=0.25) - Epoch 20/100: Train Loss=0.000432, Val Loss=0.000273, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=0.25) - Epoch 30/100: Train Loss=0.000409, Val Loss=0.000272, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=0.25): Early stopping at epoch 35
  Multi-Head Diversity (A1, ε=0.25) trained - RMSE: 0.016488, R²: 0.107536

Training Multi-Head Diversity (A1, ε=0.5)...


  Multi-Head Diversity (A1, ε=0.5) - Epoch 10/100: Train Loss=0.000477, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=0.5) - Epoch 20/100: Train Loss=0.000439, Val Loss=0.000271, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=0.5) - Epoch 30/100: Train Loss=0.000423, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=0.5) - Epoch 40/100: Train Loss=0.000415, Val Loss=0.000267, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=0.5) - Epoch 50/100: Train Loss=0.000411, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=0.5): Early stopping at epoch 60
  Multi-Head Diversity (A1, ε=0.5) trained - RMSE: 0.016355, R²: 0.121840

Training Multi-Head Diversity (A1, ε=1.0)...


  Multi-Head Diversity (A1, ε=1.0) - Epoch 10/100: Train Loss=0.000496, Val Loss=0.000280, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=1.0) - Epoch 20/100: Train Loss=0.000472, Val Loss=0.000272, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=1.0) - Epoch 30/100: Train Loss=0.000456, Val Loss=0.000272, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=1.0) - Epoch 40/100: Train Loss=0.000443, Val Loss=0.000271, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=1.0) - Epoch 50/100: Train Loss=0.000442, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head Diversity (A1, ε=1.0): Early stopping at epoch 58
  Multi-Head Diversity (A1, ε=1.0) trained - RMSE: 0.016432, R²: 0.113603

Training Multi-Head Diversity (A2, ε=0.25)...


  Multi-Head Diversity (A2, ε=0.25) - Epoch 10/100: Train Loss=0.000448, Val Loss=0.000273, Robust Weight=0.300


  Multi-Head Diversity (A2, ε=0.25) - Epoch 20/100: Train Loss=0.000418, Val Loss=0.000274, Robust Weight=0.300


  Multi-Head Diversity (A2, ε=0.25): Early stopping at epoch 22
  Multi-Head Diversity (A2, ε=0.25) trained - RMSE: 0.016589, R²: 0.096617

Training Multi-Head Diversity (A2, ε=0.5)...


  Multi-Head Diversity (A2, ε=0.5) - Epoch 10/100: Train Loss=0.000466, Val Loss=0.000271, Robust Weight=0.300


  Multi-Head Diversity (A2, ε=0.5) - Epoch 20/100: Train Loss=0.000436, Val Loss=0.000275, Robust Weight=0.300


  Multi-Head Diversity (A2, ε=0.5): Early stopping at epoch 30
  Multi-Head Diversity (A2, ε=0.5) trained - RMSE: 0.016561, R²: 0.099664

Training Multi-Head Diversity (A2, ε=1.0)...


  Multi-Head Diversity (A2, ε=1.0) - Epoch 10/100: Train Loss=0.000455, Val Loss=0.000272, Robust Weight=0.300


  Multi-Head Diversity (A2, ε=1.0) - Epoch 20/100: Train Loss=0.000429, Val Loss=0.000275, Robust Weight=0.300


  Multi-Head Diversity (A2, ε=1.0): Early stopping at epoch 22
  Multi-Head Diversity (A2, ε=1.0) trained - RMSE: 0.016555, R²: 0.100240

Training Multi-Head Diversity (A3, ε=0.25)...


  Multi-Head Diversity (A3, ε=0.25) - Epoch 10/100: Train Loss=0.000463, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=0.25) - Epoch 20/100: Train Loss=0.000440, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=0.25) - Epoch 30/100: Train Loss=0.000425, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=0.25): Early stopping at epoch 40
  Multi-Head Diversity (A3, ε=0.25) trained - RMSE: 0.016447, R²: 0.111984

Training Multi-Head Diversity (A3, ε=0.5)...


  Multi-Head Diversity (A3, ε=0.5) - Epoch 10/100: Train Loss=0.000496, Val Loss=0.000275, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=0.5) - Epoch 20/100: Train Loss=0.000477, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=0.5) - Epoch 30/100: Train Loss=0.000478, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=0.5) - Epoch 40/100: Train Loss=0.000444, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=0.5): Early stopping at epoch 47
  Multi-Head Diversity (A3, ε=0.5) trained - RMSE: 0.016417, R²: 0.115171

Training Multi-Head Diversity (A3, ε=1.0)...


  Multi-Head Diversity (A3, ε=1.0) - Epoch 10/100: Train Loss=0.000521, Val Loss=0.000275, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=1.0) - Epoch 20/100: Train Loss=0.000495, Val Loss=0.000273, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=1.0) - Epoch 30/100: Train Loss=0.000469, Val Loss=0.000276, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=1.0) - Epoch 40/100: Train Loss=0.000458, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=1.0) - Epoch 50/100: Train Loss=0.000446, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head Diversity (A3, ε=1.0): Early stopping at epoch 53
  Multi-Head Diversity (A3, ε=1.0) trained - RMSE: 0.016380, R²: 0.119191

Training Multi-Head Diversity (A4, ε=0.25)...


  Multi-Head Diversity (A4, ε=0.25) - Epoch 10/100: Train Loss=0.000460, Val Loss=0.000274, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=0.25) - Epoch 20/100: Train Loss=0.000437, Val Loss=0.000269, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=0.25) - Epoch 30/100: Train Loss=0.000411, Val Loss=0.000272, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=0.25): Early stopping at epoch 34
  Multi-Head Diversity (A4, ε=0.25) trained - RMSE: 0.016448, R²: 0.111832

Training Multi-Head Diversity (A4, ε=0.5)...


  Multi-Head Diversity (A4, ε=0.5) - Epoch 10/100: Train Loss=0.000485, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=0.5) - Epoch 20/100: Train Loss=0.000447, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=0.5) - Epoch 30/100: Train Loss=0.000428, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=0.5) - Epoch 40/100: Train Loss=0.000417, Val Loss=0.000266, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=0.5) - Epoch 50/100: Train Loss=0.000407, Val Loss=0.000267, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=0.5): Early stopping at epoch 59
  Multi-Head Diversity (A4, ε=0.5) trained - RMSE: 0.016321, R²: 0.125523

Training Multi-Head Diversity (A4, ε=1.0)...


  Multi-Head Diversity (A4, ε=1.0) - Epoch 10/100: Train Loss=0.000504, Val Loss=0.000279, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=1.0) - Epoch 20/100: Train Loss=0.000463, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=1.0) - Epoch 30/100: Train Loss=0.000452, Val Loss=0.000270, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=1.0) - Epoch 40/100: Train Loss=0.000439, Val Loss=0.000268, Robust Weight=0.300


  Multi-Head Diversity (A4, ε=1.0): Early stopping at epoch 47
  Multi-Head Diversity (A4, ε=1.0) trained - RMSE: 0.016388, R²: 0.118332

ADVERSARIAL TRAINING COMPLETE
Total adversarially trained models: 36


## 4. Evaluate Existing Models Under Adversarial Attacks

Evaluate already-trained models (standard and adversarially trained) under A1-A4 attacks to generate robustness results.