# RevIN Kill Test (Causal + Streaming Eval)

**Phase 1**: Streaming eval of full-seq RevIN models on causal features (honest LB estimate)
**Phase 2**: Train fresh with causal RevIN (running stats, matches inference exactly)

**Why causal**: Full-sequence RevIN gave 0.339 val but uses future data for normalization.
Online inference can only use past steps. Causal RevIN eliminates this mismatch.

**Kill test**: 3 seeds, pass if mean val >= 0.2670 (pass threshold; the tightwd_v2 baseline mean val is 0.2660)

In [None]:
# Cell 0: Mount Drive, download data from Kaggle
# Colab-only cell: it imports google.colab and uses IPython `!` shell escapes,
# so it cannot be run as a plain Python script.
import os, json

from google.colab import drive
# Mount Google Drive and make sure the wunderfund output folder exists on it.
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

# Install a pinned Kaggle CLI version (reinstalled even if already present).
!pip install -q kaggle==1.6.14 --force-reinstall
# Write the Kaggle API credentials file. "FILL_IN" is a placeholder: replace it
# with a real API key before running, and do not commit the filled-in value.
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "FILL_IN"}, f)
# Restrict the credentials file to owner read/write only.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

# Download the dataset archive into /content/data, unpack it in place
# (overwriting existing files), and list the parquet files as a sanity check.
os.makedirs('/content/data', exist_ok=True)
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
!ls /content/data/*.parquet

In [None]:
# Cell 1: Setup — clone repo, link data
import os, subprocess
REPO = "/content/competition_package"

# Always start from a fresh checkout: drop any previous clone, then re-clone.
os.chdir("/content")
subprocess.run(["rm", "-rf", REPO], check=False)
subprocess.run(
    ["git", "clone", "https://github.com/vincentvdo6/competition_package.git", REPO],
    check=True,
)
os.chdir(REPO)
for folder in ("datasets", "logs"):
    os.makedirs(folder, exist_ok=True)

# Expose the downloaded parquet files under datasets/ via symlinks, then
# verify that every link resolves to an existing file.
PARQUET_FILES = ("train.parquet", "valid.parquet")
for fname in PARQUET_FILES:
    subprocess.run(
        ["ln", "-sf", f"/content/data/{fname}", f"datasets/{fname}"],
        check=True,
    )
for fname in PARQUET_FILES:
    assert os.path.exists(f"datasets/{fname}"), f"{fname} not found!"

# Report the checked-out commit and the GPU name so the run is traceable.
commit = subprocess.check_output(
    ["git", "rev-parse", "--short", "HEAD"], text=True
).strip()
print(f"Commit: {commit}")
gpu_name = subprocess.check_output(
    ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], text=True
).strip()
print(f"GPU: {gpu_name}")
print("Ready!")

In [None]:
# Cell 2: Streaming eval — test full-seq RevIN models with CAUSAL features
# This gives the honest LB estimate for the already-trained full-seq models
import os
import sys
import glob
import torch
import numpy as np
os.chdir("/content/competition_package")
sys.path.insert(0, ".")  # make the repo's `src` package importable

from src.data.dataset import LOBSequenceDataset
from src.models.gru_baseline import GRUBaseline

# Both views of the validation split share the same file and the same
# normalize / derived-feature switches; only the RevIN mode differs.
VALID_PARQUET = 'datasets/valid.parquet'
SHARED_DS_KWARGS = dict(normalize=True, derived_features=True)

# Causal RevIN view (matches online inference).
print("Loading val data with CAUSAL RevIN...")
ds_causal = LOBSequenceDataset(VALID_PARQUET, revin='causal', **SHARED_DS_KWARGS)

# Full-sequence RevIN view (what training used). It reuses the causal
# dataset's normalizer so both views share the same global normalizer.
print("Loading val data with FULL RevIN...")
ds_full = LOBSequenceDataset(
    VALID_PARQUET,
    normalizer=ds_causal.normalizer,
    revin='full',
    **SHARED_DS_KWARGS,
)
print(f"Shapes: causal={ds_causal.features.shape}, full={ds_full.features.shape}")

def weighted_pearson(preds, targets, masks):
    """Competition metric: weighted Pearson correlation, averaged over 2 targets.

    For each of the first two columns of the last axis, the masked
    predictions and targets are flattened and correlated using weights
    ``|target|`` clamped to at most 6.0 and normalised to sum to one.

    Args:
        preds: Tensor indexed as ``preds[:, :, t]`` for ``t`` in ``{0, 1}``.
        targets: Tensor indexed the same way as ``preds``.
        masks: Index applied to each ``[:, :, t]`` slice to select the
            elements that are scored (presumably a boolean mask of valid
            steps; confirm against the dataset).

    Returns:
        Python float: the mean of the two per-target correlations. A target
        column whose weights sum to zero (no selected elements, or every
        selected target equal to zero) contributes 0.0 instead of NaN.
    """
    # NOTE(review): the weight clamp at 6.0 is taken as-is from this notebook;
    # confirm it matches the official competition scorer.
    scores = []
    for t_idx in range(2):
        p = preds[:, :, t_idx][masks]  # flat predictions
        y = targets[:, :, t_idx][masks]  # flat targets
        w = torch.abs(y).clamp(max=6.0)
        w_total = w.sum()
        if w_total.item() == 0:
            # All weights are zero: normalising would divide 0 by 0 and turn
            # the whole score into NaN. An empty selection already evaluates
            # to 0.0 below, so use the same value here.
            scores.append(0.0)
            continue
        w = w / w_total
        # Weighted means, then weighted covariance and standard deviations.
        pm = (p * w).sum()
        ym = (y * w).sum()
        p_centered = p - pm
        y_centered = y - ym
        cov = (w * p_centered * y_centered).sum()
        # Clamp the variances so a constant input cannot cause division by zero.
        std_p = torch.sqrt((w * p_centered ** 2).sum().clamp(min=1e-12))
        std_y = torch.sqrt((w * y_centered ** 2).sum().clamp(min=1e-12))
        scores.append((cov / (std_p * std_y)).item())
    return sum(scores) / len(scores)

# Evaluate each full-seq model on BOTH feature sets
print("\n" + "="*80)
print(f"{'Model':<45} {'Full-seq Val':>12} {'Causal Val':>12} {'Gap':>8}")
print("-" * 80)

# Only score the final checkpoint of each seed; per-epoch snapshots are skipped.
ckpt_paths = [
    path for path in sorted(glob.glob("logs/gru_revin_v1_seed*.pt"))
    if '_epoch' not in os.path.basename(path)
]
if not ckpt_paths:
    # Without this the cell would print an empty table and then the
    # interpretation hints, which reads as if an evaluation had happened.
    print("WARNING: no checkpoints matched logs/gru_revin_v1_seed*.pt — nothing to evaluate.")

for pt in ckpt_paths:
    basename = os.path.basename(pt)

    # weights_only=False unpickles arbitrary Python objects, so only load
    # checkpoints produced by this project's own training runs.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    config = ckpt.get('config', {})
    model = GRUBaseline(config)
    model.load_state_dict(ckpt['model_state_dict'])
    model.eval()

    with torch.no_grad():
        # Eval on full-seq features (should match reported val)
        preds_full, _ = model(ds_full.features)
        score_full = weighted_pearson(preds_full, ds_full.targets, ds_full.masks)

        # Eval on causal features (honest LB estimate)
        preds_causal, _ = model(ds_causal.features)
        score_causal = weighted_pearson(preds_causal, ds_causal.targets, ds_causal.masks)

    # Negative gap = the score drops once future information is removed.
    gap = score_causal - score_full
    print(f"{basename:<45} {score_full:>12.4f} {score_causal:>12.4f} {gap:>+8.4f}")

if ckpt_paths:
    print("\nIf causal val >> 0.266, RevIN genuinely helps even without lookahead.")
    print("If causal val ~= 0.266, the full-seq improvement was ALL lookahead inflation.")

In [None]:
# Cell 3: Train 3 seeds with CAUSAL RevIN config
import os, subprocess, sys
os.chdir("/content/competition_package")

CONFIG = "configs/gru_revin_causal_v1.yaml"
SEEDS = [42, 43, 44]

print("Causal RevIN Kill Test")
print(f"Config: {CONFIG}")
print(f"Seeds: {SEEDS}")
print("=" * 60, flush=True)

# Seeds whose training process exited non-zero; reported once at the end so a
# failure is not buried in the streamed training output.
failed_seeds = []

for seed in SEEDS:
    print(f"\n{'='*60}")
    print(f"Training gru_revin_causal_v1 seed {seed}")
    print(f"{'='*60}", flush=True)
    cmd = [
        sys.executable, "-u", "scripts/train.py",
        "--config", CONFIG,
        "--seed", str(seed), "--device", "cuda",
    ]
    # The context manager closes the stdout pipe and waits for the child on
    # exit, even if streaming is interrupted part-way through.
    with subprocess.Popen(
        cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        # Stream the child's merged stdout/stderr line by line as it trains.
        for line in proc.stdout:
            print(line, end="", flush=True)
    if proc.returncode != 0:
        # Keep going so one bad seed does not block the remaining ones.
        print(f"ERROR: seed {seed} failed with return code {proc.returncode}")
        failed_seeds.append(seed)

if failed_seeds:
    print(f"\nWARNING: training failed for seeds {failed_seeds}; "
          "kill-test results will be incomplete.")
else:
    print("\nAll seeds done!")

In [None]:
# Cell 4: Kill test evaluation for causal RevIN
import os, glob, torch
os.chdir("/content/competition_package")

BASELINE_VAL = 0.2660  # tightwd_v2 mean val
PASS_THRESHOLD = 0.2670

# Collect (file name, best validation score, best epoch) for the final
# checkpoint of every seed; per-epoch snapshots are skipped.
results = []
for pt in sorted(glob.glob("logs/gru_revin_causal_v1_seed*.pt")):
    basename = os.path.basename(pt)
    if '_epoch' in basename:
        continue
    # weights_only=False unpickles arbitrary Python objects, so only load
    # checkpoints produced by this project's own training runs.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    # A checkpoint without "best_score" is counted as 0 and lowers the mean.
    score = float(ckpt.get("best_score", 0))
    epoch = ckpt.get("best_epoch", "N/A")
    results.append((basename, score, epoch))

print(f"{'Model':<50} {'Val Score':>10} {'Epoch':>6} {'Delta':>8}")
print("-" * 80)
for name, score, epoch in results:
    delta = score - BASELINE_VAL
    # The "+" format flag already prints the sign; the 8-wide right-aligned
    # field lines the value up under the 'Delta' header.
    print(f"{name:<50} {score:>10.4f} {str(epoch):>6} {delta:>+8.4f}")

scores = [s for _, s, _ in results]
mean_val = sum(scores) / len(scores) if scores else 0
positive = sum(1 for s in scores if s > BASELINE_VAL)

# With no checkpoints there is nothing to judge, so PASS stays False and the
# verdict is reported as inconclusive rather than as a failed experiment.
PASS = bool(scores) and mean_val >= PASS_THRESHOLD

print(f"\n{'='*60}")
if not scores:
    print("KILL TEST: INCONCLUSIVE — no checkpoints matched "
          "logs/gru_revin_causal_v1_seed*.pt (did training succeed?)")
else:
    print(f"Mean val: {mean_val:.4f} (threshold: {PASS_THRESHOLD})")
    print(f"Positive seeds: {positive}/{len(scores)} (vs baseline {BASELINE_VAL})")
    print(f"Delta vs baseline: {mean_val - BASELINE_VAL:+.4f}")

    print(f"\nKILL TEST: {'PASS — proceed to seed expansion + LB test' if PASS else 'FAIL'}")
    if not PASS:
        print("Fallback: try lag features")

In [None]:
# Cell 5: Strip checkpoints + zip + save to Drive
import os, torch, glob, shutil
os.chdir("/content/competition_package")

SLIM_DIR = "logs/slim"
ARCHIVE_STEM = "/content/revin_causal_kill_test"
ARCHIVE_PATH = ARCHIVE_STEM + ".zip"
os.makedirs(SLIM_DIR, exist_ok=True)


def _size_mb(path):
    """Return the on-disk size of *path* in megabytes (1 MB = 1e6 bytes)."""
    return os.path.getsize(path) / 1e6


# Re-save each final checkpoint keeping only the weights, the config and the
# best score/epoch; per-epoch snapshots are left out.
for ckpt_path in sorted(glob.glob("logs/gru_revin_causal_v1*.pt")):
    ckpt_name = os.path.basename(ckpt_path)
    if '_epoch' not in ckpt_name:
        full_ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
        slim_path = os.path.join(SLIM_DIR, ckpt_name)
        torch.save(
            {
                "model_state_dict": full_ckpt["model_state_dict"],
                "config": full_ckpt.get("config", {}),
                "best_score": full_ckpt.get("best_score", None),
                "best_epoch": full_ckpt.get("best_epoch", None),
            },
            slim_path,
        )
        print(f"{ckpt_name}: {_size_mb(ckpt_path):.1f}MB -> {_size_mb(slim_path):.1f}MB")

# Ship the matching normalizer statistics alongside the slim checkpoints.
for npz_path in sorted(glob.glob("logs/normalizer_gru_revin_causal_v1*.npz")):
    npz_name = os.path.basename(npz_path)
    shutil.copy(npz_path, os.path.join(SLIM_DIR, npz_name))
    print(f"Copied {npz_name}")

print("\n--- logs/slim/ contents ---")
for entry in sorted(os.listdir(SLIM_DIR)):
    print(f"  {entry}: {_size_mb(os.path.join(SLIM_DIR, entry)):.1f}MB")

# Zip the slim folder and copy the archive to Google Drive.
shutil.make_archive(ARCHIVE_STEM, "zip",
                    "/content/competition_package/logs/slim")
print(f"\nrevin_causal_kill_test.zip: {_size_mb(ARCHIVE_PATH):.1f}MB")

shutil.copy(ARCHIVE_PATH,
            "/content/drive/MyDrive/wunderfund/revin_causal_kill_test.zip")
print("Saved to Drive: MyDrive/wunderfund/revin_causal_kill_test.zip")