# 08: Attention Model GPU Inference for Local Validation

Runs online inference for 8 new attention models (seeds 45-52) on GPU.
Outputs .npz prediction caches compatible with `validate_ensemble_local.py`.

**Key optimization**: All 1,444 sequences are processed as a single batch at each timestep,
so each model needs 1,000 forward steps instead of 1,444,000. ~1-2 min/model on GPU vs ~25 min on CPU.

**Requires**: Checkpoint zips from notebook 07 sessions uploaded as dataset(s).

In [None]:
import os, subprocess

# Fresh checkout of the competition repo inside the Kaggle working directory.
REPO = "/kaggle/working/competition_package"
os.chdir("/kaggle/working")

# check=True makes every step fail loudly. os.system only returned an exit
# status that nothing inspected, so a failed clone or symlink went unnoticed
# until a later, less informative error.
subprocess.run(["rm", "-rf", REPO], check=True)
subprocess.run(
    ["git", "clone", "https://github.com/vincentvdo6/competition_package.git", REPO],
    check=True,
)
os.chdir(REPO)

# Link the competition's validation parquet to datasets/valid.parquet inside
# the checkout (the relative path the notebook reads it from).
os.makedirs("datasets", exist_ok=True)
subprocess.run(
    ["ln", "-sf", "/kaggle/input/wunderfund-predictorium/valid.parquet", "datasets/valid.parquet"],
    check=True,
)

# Record which commit the notebook ran against.
print("Commit:", subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip())
print("Ready!")

In [None]:
# Configuration. CKPT_DIR is the input dataset directory (Kaggle auto-extracted
# the zip); VALID_PATH is relative to the repo checkout; OUTPUT_DIR is where
# the prediction caches are written.
CKPT_DIR = '/kaggle/input/inference-training'
VALID_PATH = 'datasets/valid.parquet'
OUTPUT_DIR = '/kaggle/working'

# train_val value for each seed, in the order the models are processed.
_TRAIN_VAL_BY_SEED = {
    45: 0.2599,
    46: 0.2659,
    47: 0.2598,
    48: 0.2706,
    49: 0.2560,
    50: 0.2752,
    51: 0.2600,
    52: 0.2641,
}

# Models to run inference for: name -> (ckpt_file, norm_file, train_val)
MODELS = {
    f'attn_nb07_s{seed}': (
        f'attn_clean_seed{seed}.pt',
        f'normalizer_attn_clean_seed{seed}.npz',
        train_val,
    )
    for seed, train_val in _TRAIN_VAL_BY_SEED.items()
}

print(f'Will process {len(MODELS)} attention models from {CKPT_DIR}')
for name, spec in MODELS.items():
    print(f'  {name}: train_val={spec[-1]:.4f}')

In [None]:
import sys, time
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import yaml

sys.path.insert(0, REPO)
from src.models.gru_attention import GRUAttentionModel
from src.data.preprocessing import DerivedFeatureBuilder, Normalizer
from utils import weighted_pearson_correlation

# Use the GPU when one is visible, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if has_cuda else torch.device('cpu')
print(f'Device: {device}')
if has_cuda:
    print(f'GPU: {torch.cuda.get_device_name()}')

# Parse the attention-model YAML config (path is relative to the current
# working directory) and echo the key architecture settings.
with open('configs/gru_attention_clean_v1.yaml') as cfg_file:
    ATTN_CONFIG = yaml.safe_load(cfg_file)

model_cfg = ATTN_CONFIG["model"]
print(
    'Config loaded: '
    f'hidden={model_cfg["hidden_size"]}, '
    f'heads={model_cfg["attention_heads"]}, '
    f'window={model_cfg["attention_window"]}'
)

In [None]:
# Load and reshape validation data
print(f'Loading {VALID_PATH}...')
df = pd.read_parquet(VALID_PATH)
n_rows = len(df)
n_seqs = df['seq_ix'].nunique()

# Every reshape below needs all sequences to have the same number of rows.
# Fail early with a clear message instead of a ZeroDivisionError (empty file)
# or an opaque reshape error (ragged sequences).
if n_seqs == 0 or n_rows % n_seqs != 0:
    raise ValueError(
        f'{VALID_PATH}: {n_rows} rows cannot be split into {n_seqs} equal-length sequences'
    )
seq_len = n_rows // n_seqs
print(f'  {n_rows} rows, {n_seqs} sequences, {seq_len} steps/seq')

# Columns are addressed by position: 0 is used as the sequence id, 2 as the
# need-prediction flag, 3..34 as the 32 raw features, 35.. as the 2 targets.
# NOTE(review): assumes column 0 is the same column as df['seq_ix'] - confirm.
values = df.values
seq_ix_flat = values[:, 0].astype(int)
need_pred_flat = values[:, 2].astype(bool)
raw_features = values[:, 3:35].astype(np.float32).reshape(n_seqs, seq_len, 32)
targets_all = values[:, 35:].astype(np.float64).reshape(n_seqs, seq_len, 2)
need_pred_2d = need_pred_flat.reshape(n_seqs, seq_len)

# The row-major reshape is only valid when each sequence occupies one
# contiguous run of seq_len rows. Interleaved rows would otherwise reshape
# without error and silently mis-align predictions with targets.
# NOTE(review): step order *within* a sequence is still taken on trust.
seq_ix_2d = seq_ix_flat.reshape(n_seqs, seq_len)
if not (seq_ix_2d == seq_ix_2d[:, :1]).all():
    raise ValueError(
        f'{VALID_PATH}: rows are not grouped into contiguous blocks of {seq_len} rows per sequence'
    )
unique_seq_ix = seq_ix_2d[:, 0]

# Pre-compute derived features for ALL steps (vectorized)
print('Computing derived features...')
raw_flat = raw_features.reshape(-1, 32)
derived_flat = DerivedFeatureBuilder.compute(raw_flat)
# 42 = 32 raw features plus the derived ones; the reshape fails if
# DerivedFeatureBuilder returns anything other than 10 columns.
features_42_all = np.concatenate([raw_flat, derived_flat], axis=-1).reshape(n_seqs, seq_len, 42)

# Build mask for filtering predictions: keep only rows flagged need-prediction,
# with the matching sequence ids and targets in the same flattened order.
mask = need_pred_2d.flatten()
seq_indices_filtered = np.repeat(unique_seq_ix, seq_len)[mask]
targets_filtered = targets_all.reshape(-1, 2)[mask]

print(f'  Features: {features_42_all.shape}, need_pred rows: {mask.sum()}/{len(mask)}')
print('Data ready.')

In [None]:
# Run batch-online inference for all 8 attention models.
# For each entry in MODELS: load checkpoint + normalizer, step through time with
# all sequences batched together, score against the validation targets, and
# write an .npz prediction cache. Per-model scores are collected in `results`.
os.makedirs(f'{OUTPUT_DIR}/predictions', exist_ok=True)
results = {}

for name, (ckpt_file, norm_file, val_score) in MODELS.items():
    print(f'\n{"="*60}')
    print(f'Processing {name} (train val={val_score:.4f})')
    print(f'{"="*60}')

    ckpt_path = f'{CKPT_DIR}/{ckpt_file}'
    norm_path = f'{CKPT_DIR}/{norm_file}'

    # Skip models whose artifacts were not uploaded rather than aborting the
    # whole run. Both files are checked up front so a missing normalizer is
    # caught before the model is built and moved to the device.
    if not os.path.exists(ckpt_path):
        print(f'  SKIP: checkpoint not found: {ckpt_path}')
        continue
    if not os.path.exists(norm_path):
        print(f'  SKIP: normalizer not found: {norm_path}')
        continue

    # Load model
    model = GRUAttentionModel(ATTN_CONFIG)
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects. Only point CKPT_DIR at checkpoints you produced yourself.
    ckpt = torch.load(ckpt_path, map_location='cpu', weights_only=False)
    # Accept either a training checkpoint dict or a bare state_dict.
    state_dict = ckpt.get('model_state_dict', ckpt)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    normalizer = Normalizer.load(norm_path)

    print(f'  Model loaded ({sum(p.numel() for p in model.parameters()):,} params)')

    # Batch-online inference: all sequences in parallel at each step.
    # `hidden` carries the recurrent state returned by forward_step from one
    # timestep to the next and starts as None for every model.
    all_preds = np.zeros((n_seqs, seq_len, 2), dtype=np.float32)
    hidden = None
    t0 = time.time()

    with torch.no_grad():
        for t in range(seq_len):
            step_features = features_42_all[:, t, :]
            normed = normalizer.transform(step_features)
            x = torch.from_numpy(normed).to(device)
            pred, hidden = model.forward_step(x, hidden)
            all_preds[:, t, :] = pred.cpu().numpy()
            if (t + 1) % 200 == 0:
                print(f'    Step {t+1}/{seq_len} in {time.time()-t0:.1f}s')

    elapsed = time.time() - t0
    # Keep only the rows flagged for prediction and clip them to [-6, 6].
    preds_flat = np.clip(all_preds.reshape(-1, 2)[mask], -6, 6)

    # Score each target column separately, then average the two.
    t0_score = weighted_pearson_correlation(targets_filtered[:, 0], preds_flat[:, 0])
    t1_score = weighted_pearson_correlation(targets_filtered[:, 1], preds_flat[:, 1])
    avg_score = (t0_score + t1_score) / 2.0

    print(f'  {elapsed:.1f}s | t0={t0_score:.4f}  t1={t1_score:.4f}  avg={avg_score:.4f} (train_val={val_score:.4f})')

    # Save .npz (compatible with validate_ensemble_local.py cache format)
    out_path = f'{OUTPUT_DIR}/predictions/{name}.npz'
    np.savez_compressed(out_path, preds=preds_flat, targets=targets_filtered, seq_indices=seq_indices_filtered)
    print(f'  Saved: {out_path}')

    results[name] = {'t0': t0_score, 't1': t1_score, 'avg': avg_score, 'train_val': val_score}
    # Drop the model and release cached GPU memory before loading the next one.
    del model
    torch.cuda.empty_cache()

print(f'\n{"="*60}')
print('ALL MODELS COMPLETE')
print(f'{"="*60}')

In [None]:
# Summary and zip for download
import shutil

# Score table, best average first.
header = f'{"Model":<20} {"t0":>7} {"t1":>7} {"avg":>7} {"train_val":>10}'
print(header)
print('-' * 55)
ranked = sorted(results.items(), key=lambda item: -item[1]['avg'])
for name, sc in ranked:
    row = ' '.join([
        f'{name:<20}',
        f'{sc["t0"]:>7.4f}',
        f'{sc["t1"]:>7.4f}',
        f'{sc["avg"]:>7.4f}',
        f'{sc["train_val"]:>10.4f}',
    ])
    print(row)

# Bundle every file under predictions/ into one archive next to it.
zip_path = f'{OUTPUT_DIR}/attn_predictions_cache'
shutil.make_archive(zip_path, 'zip', f'{OUTPUT_DIR}/predictions')
archive_file = zip_path + ".zip"
size_mb = os.path.getsize(archive_file) / 1024 / 1024
print(f'\nZip: {archive_file} ({size_mb:.1f} MB)')
print('Download from Output tab on the right sidebar.')

## Post-Inference Checklist

1. Download `attn_predictions_cache.zip` from the Output tab
2. Extract all `.npz` files into `cache/predictions/` on your local machine
3. Run: `python scripts/validate_ensemble_local.py list` to verify that the predictions are cached
4. Run: `python scripts/validate_ensemble_local.py compare` to rank all presets
5. Run: `python scripts/validate_ensemble_local.py greedy --pool all --max-models 8 --weighted-attn`