# Full-Data Training Gate Test (Exp A)
**Goal**: Train on the train and validation sets combined, unlocking the extra data that the #1 player uses.

**Protocol**:
1. Phase 1: Train 1 seed on standard train/val split to find `best_epoch`
2. Phase 2: Train 3 seeds on train+val combined for `round(1.1 * best_epoch)` epochs
3. Export checkpoints for LB submission

**Kill test**: Compare the full-data single-model LB score against the standard single-model LB score (0.2580).

In [None]:
# Cell 0: Mount Drive, download data from Kaggle
import os, json

from google.colab import drive
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

!pip install -q kaggle==1.6.14 --force-reinstall
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "FILL_IN"}, f)
os.chmod('/root/.kaggle/kaggle.json', 0o600)

os.makedirs('/content/data', exist_ok=True)
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
!ls /content/data/*.parquet

In [None]:
# Cell 1: Setup — clone repo, link data
#
# Creates a fresh checkout of the competition repo under /content, symlinks the
# downloaded parquet files into its datasets/ folder, then prints the checked-out
# commit and the GPU name.
#
# The clone/link commands run through subprocess.run(..., check=True) so that a
# failing command raises CalledProcessError right here, instead of being ignored
# and surfacing later as an unrelated error (e.g. chdir into a missing repo).
import os, subprocess
REPO = "/content/competition_package"
REPO_URL = "https://github.com/vincentvdo6/competition_package.git"

os.chdir("/content")
# Remove any previous checkout; `rm -rf` succeeds even if REPO does not exist.
subprocess.run(["rm", "-rf", REPO], check=True)
subprocess.run(["git", "clone", REPO_URL, REPO], check=True)
os.chdir(REPO)
os.makedirs("datasets", exist_ok=True)
os.makedirs("logs", exist_ok=True)

for split in ("train", "valid"):
    link_path = f"datasets/{split}.parquet"
    subprocess.run(
        ["ln", "-sf", f"/content/data/{split}.parquet", link_path],
        check=True,
    )
    # `ln -s` will happily create a dangling link; os.path.exists() follows the
    # symlink and returns False when the target is missing, so this still
    # catches a dataset that was never downloaded.
    assert os.path.exists(link_path), f"{split}.parquet not found!"

print("Commit:", subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip())
# GPU query is informational only, so it stays best-effort (empty output if
# nvidia-smi is unavailable).
print(f"GPU: {os.popen('nvidia-smi --query-gpu=name --format=csv,noheader').read().strip()}")
print("Ready!")

In [None]:
# Cell 2: Phase 1 — Train 1 seed with standard val to find best_epoch
# Using tightwd_v2 config (our best generalizing config)
#
# Outputs (notebook globals used by the Phase 2 cell):
#   best_epoch   - value stored under 'best_epoch' in the Phase 1 checkpoint
#   FIXED_EPOCHS - number of epochs to train the full-data models for
import os
os.chdir("/content/competition_package")

CALIBRATION_SEED = 42
print(f"Phase 1: Calibrating best_epoch with seed {CALIBRATION_SEED}")
print("="*60)
train_status = os.system(
    f"python -u scripts/train.py "
    f"--config configs/gru_derived_tightwd_v2.yaml "
    f"--seed {CALIBRATION_SEED} --device cuda"
)
# os.system returns a non-zero status when the command fails. Stop here rather
# than calibrating from a missing or partially written checkpoint.
if train_status != 0:
    raise RuntimeError(
        f"Phase 1 training failed (os.system status {train_status}); "
        f"cannot calibrate best_epoch"
    )

# Extract best_epoch from checkpoint
import torch
ckpt_path = f"logs/gru_derived_tightwd_v2_seed{CALIBRATION_SEED}.pt"
try:
    ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
except TypeError:
    # Same fallback the export/summary cells use: retry without the
    # `weights_only` keyword if this torch.load does not accept it.
    ckpt = torch.load(ckpt_path, map_location="cpu")

if 'best_epoch' not in ckpt:
    # Make the fallback visible: FIXED_EPOCHS would otherwise be derived from a
    # hard-coded guess without any indication.
    print("WARNING: checkpoint has no 'best_epoch'; falling back to 20")
best_epoch = ckpt.get('best_epoch', 20)
best_score = ckpt.get('best_score', 0)
print(f"\nPhase 1 result: best_epoch={best_epoch}, best_score={best_score:.4f}")

# Calculate fixed epochs for full-data training: 1.1x the best_epoch of the
# single calibration seed (1.1x rule per Codex). Clamp to at least 1 so a
# best_epoch of 0 can never request a zero-epoch training run.
import math
FIXED_EPOCHS = max(1, round(1.1 * best_epoch))
print(f"Fixed epochs for full-data: {FIXED_EPOCHS} (1.1 x {best_epoch})")

In [None]:
# Cell 3: Phase 2 — Train 3 seeds on full data (train+val combined)
#
# Runs scripts/train.py once per seed with --fulldata and a fixed epoch count.
# A failed run is recorded and the remaining seeds still train; the cell then
# raises so that the export cell is not run on an incomplete set of checkpoints.
import os
os.chdir("/content/competition_package")

# Use FIXED_EPOCHS from Phase 1 (or override here if needed)
# FIXED_EPOCHS = 22  # Uncomment to override

SEEDS = [42, 43, 44]
print(f"Phase 2: Full-data training with {FIXED_EPOCHS} epochs")
print(f"Seeds: {SEEDS}")
print("="*60)

failed_seeds = []
for seed in SEEDS:
    print(f"\n{'='*60}")
    print(f"Training gru_fulldata_tw2_v1 seed {seed} (fulldata, {FIXED_EPOCHS} epochs)")
    print(f"{'='*60}")
    status = os.system(
        f"python -u scripts/train.py "
        f"--config configs/gru_fulldata_tw2_v1.yaml "
        f"--seed {seed} --device cuda "
        f"--fulldata --fixed-epochs {FIXED_EPOCHS}"
    )
    # os.system returns a non-zero status when the command fails.
    if status != 0:
        print(f"WARNING: training failed for seed {seed} (os.system status {status})")
        failed_seeds.append(seed)

if failed_seeds:
    raise RuntimeError(
        f"Phase 2 incomplete: training failed for seeds {failed_seeds} "
        f"(out of {SEEDS})"
    )

print(f"\nPhase 2 done: fulldata seeds {SEEDS}")

In [None]:
# Cell 4: Strip checkpoints + copy normalizers + zip + save to Drive
import glob
import os
import shutil

import torch

os.chdir("/content/competition_package")
os.makedirs("logs/slim", exist_ok=True)

# Checkpoints to export: every logs/*.pt whose file name does not contain
# '_epoch' (presumably intermediate per-epoch snapshots — confirm with train.py).
export_paths = [
    path for path in sorted(glob.glob("logs/*.pt"))
    if '_epoch' not in os.path.basename(path)
]

for ckpt_path in export_paths:
    ckpt_name = os.path.basename(ckpt_path)
    try:
        full_ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
    except TypeError:
        # Retry without `weights_only` if this torch.load rejects the keyword.
        full_ckpt = torch.load(ckpt_path, map_location="cpu")

    # Keep only the weights, the config and the best score/epoch; everything
    # else stored in the checkpoint is dropped.
    slim_ckpt = {"model_state_dict": full_ckpt["model_state_dict"]}
    slim_ckpt["config"] = full_ckpt.get("config", {})
    for meta_key in ("best_score", "best_epoch"):
        slim_ckpt[meta_key] = full_ckpt.get(meta_key, None)

    slim_path = f"logs/slim/{ckpt_name}"
    torch.save(slim_ckpt, slim_path)
    size_before = os.path.getsize(ckpt_path) / 1e6
    size_after = os.path.getsize(slim_path) / 1e6
    print(f"{ckpt_name}: {size_before:.1f}MB -> {size_after:.1f}MB")

# Normalizer files are copied next to the slim checkpoints unchanged.
for norm_path in sorted(glob.glob("logs/normalizer_*.npz")):
    norm_name = os.path.basename(norm_path)
    shutil.copy(norm_path, f"logs/slim/{norm_name}")
    print(f"Copied {norm_name}")

# List what will go into the archive.
slim_files = sorted(os.listdir("logs/slim"))
print(f"\n--- logs/slim/ contents ({len(slim_files)} files) ---")
for slim_name in slim_files:
    file_mb = os.path.getsize(f"logs/slim/{slim_name}") / 1e6
    print(f"  {slim_name}: {file_mb:.1f}MB")

# Zip the slim folder and copy the archive to Google Drive.
shutil.make_archive(
    "/content/fulldata_gate",
    "zip",
    "/content/competition_package/logs/slim",
)
archive_mb = os.path.getsize("/content/fulldata_gate.zip") / 1e6
print(f"\nfulldata_gate.zip: {archive_mb:.1f}MB")

shutil.copy(
    "/content/fulldata_gate.zip",
    "/content/drive/MyDrive/wunderfund/fulldata_gate.zip",
)
print("Saved to Drive: MyDrive/wunderfund/fulldata_gate.zip")

In [None]:
# Cell 5: Print results summary
import glob
import os

import torch

os.chdir("/content/competition_package")

# Collect (file name, best_score, best_epoch) for every logs/*.pt whose name
# does not contain '_epoch'.
results = []
for ckpt_path in sorted(glob.glob("logs/*.pt")):
    ckpt_name = os.path.basename(ckpt_path)
    if '_epoch' not in ckpt_name:
        try:
            ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
        except TypeError:
            # Retry without `weights_only` if this torch.load rejects the keyword.
            ckpt = torch.load(ckpt_path, map_location="cpu")
        raw_score = ckpt.get("best_score", 0)
        # A missing or falsy score is reported as 0.0.
        val_score = float(raw_score) if raw_score else 0.0
        results.append((ckpt_name, val_score, ckpt.get("best_epoch", "N/A")))

# Highest score first.
results = sorted(results, key=lambda row: row[1], reverse=True)

print(f"{'Model':<55} {'Val Score':>10} {'Epoch':>6}")
print("-" * 75)
for name, score, epoch in results:
    if 'fulldata' in name:
        tag = " [FULLDATA]"
    else:
        tag = " [STANDARD]"
    print(f"{name:<55} {score:>10.4f} {str(epoch):>6}{tag}")

# Summary: split the positive scores by whether the file name marks a
# full-data model.
std_scores = []
fd_scores = []
for name, score, _ in results:
    if score > 0:
        bucket = fd_scores if 'fulldata' in name else std_scores
        bucket.append(score)

print(f"\nStandard val scores: {std_scores}")
print(f"Full-data (no val): {fd_scores} (these have no val score - must submit to LB)")
print("\nNOTE: Full-data models have no validation score.")
print("Must build submission and test on LB to evaluate.")