# Fold2 Seed Expansion Training

Train fold2 with seeds 44, 45, 46 (3 models).

**Why**: Fold2 is the key diversifier (corr=0.83 on t1 vs. 0.94+ for the other folds). Adding fold2_seed42 to best-per-fold caused a +0.0009 LB jump (0.2898 -> 0.2907 PB). More fold2 seeds = better fold2 selection + multi-fold2 ensembles.

**Current fold2 inventory**: seed42 (val=0.3094), seed43 (val=0.3016). Need seed44, 45, 46.

**Time**: ~1.5 hours (3 x ~30 min on T4)

In [None]:
# Cell 1: Mount Drive, download data, clone repo
#
# Prepares a fresh Colab runtime: mounts Google Drive, installs the Kaggle CLI,
# downloads and unpacks the competition dataset into /content/data, makes a
# clean clone of the training repo, and links the parquet files into the
# repo's datasets/ directory. The steps depend on each other and must run in
# this order.
import os, json, subprocess

# Mount Drive and make sure the output folder exists.
from google.colab import drive
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

# Install a pinned Kaggle CLI and write its credentials file.
# NOTE: "FILL_IN" is a placeholder -- replace it with a real API key before
# running; never commit the real key.
!pip install -q kaggle==1.6.14 --force-reinstall
os.makedirs('/root/.kaggle', exist_ok=True)
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "FILL_IN"}, f)
# Restrict the credentials file to owner read/write.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

# Download the dataset archive, unpack it (overwriting existing files), and
# list the extracted parquet files as a quick visual check.
os.makedirs('/content/data', exist_ok=True)
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
!ls /content/data/*.parquet

# Start from a clean checkout: leave the repo directory before deleting it,
# then clone again. The removal uses check=False, so a failure there does not
# raise; a failed clone does (check=True).
REPO = "/content/competition_package"
os.chdir("/content")
subprocess.run(["rm", "-rf", REPO], check=False)
subprocess.run(["git", "clone", "https://github.com/vincentvdo6/competition_package.git", REPO], check=True)
os.chdir(REPO)
os.makedirs("datasets", exist_ok=True)
os.makedirs("logs", exist_ok=True)

# Symlink the downloaded parquet files into the repo's datasets/ directory
# (-f replaces an existing link).
subprocess.run(["ln", "-sf", "/content/data/train.parquet", "datasets/train.parquet"], check=True)
subprocess.run(["ln", "-sf", "/content/data/valid.parquet", "datasets/valid.parquet"], check=True)

# os.path.exists follows symlinks, so these also fail when the link target is
# missing (e.g. the download or unzip step did not produce the file).
assert os.path.exists("datasets/train.parquet")
assert os.path.exists("datasets/valid.parquet")

# Report the checked-out commit and the GPU name for the run log.
commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip()
print(f"Commit: {commit}")
print(f"GPU: {subprocess.check_output(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'], text=True).strip()}")
print("Ready!")

In [None]:
# Cell 2: Train 3 fold2 models (seeds 44, 45, 46)
#
# Runs scripts/train_kfold.py once per seed as a child process, streaming its
# output into the notebook, then prints a per-seed summary read back from the
# checkpoints in logs/. A seed whose checkpoint file already exists is
# skipped, so the cell can be re-run after an interruption. A failed run does
# not stop the batch; failures are recapped after the summary table.
import subprocess, sys, os, time, torch
os.chdir("/content/competition_package")

CONFIG = "configs/gru_kfold_v1.yaml"
N_FOLDS = 5
FOLD = 2
SEEDS = [44, 45, 46]

print(f"FOLD2 SEED EXPANSION: {len(SEEDS)} models (fold {FOLD}, seeds {SEEDS})")
print(f"{'='*70}\n")

failed = []  # (seed, return code) for every training run that exited non-zero
batch_start = time.time()
for i, seed in enumerate(SEEDS):
    ckpt = f"logs/gru_kfold_v1_fold{FOLD}_seed{seed}.pt"
    if os.path.exists(ckpt):
        print(f"  [{i+1}/{len(SEEDS)}] fold {FOLD} seed {seed}: exists -- skip")
        continue
    print(f"\n{'#'*70}")
    print(f"# [{i+1}/{len(SEEDS)}] fold {FOLD}, seed {seed}")
    print(f"{'#'*70}\n", flush=True)
    t0 = time.time()
    # The context manager closes the stdout pipe and reaps the child even if
    # streaming is interrupted (e.g. the cell is stopped mid-run).
    # "-u" keeps the child's output unbuffered so lines appear as they are
    # produced; stderr is merged into stdout to keep the log in order.
    with subprocess.Popen(
        [sys.executable, "-u", "scripts/train_kfold.py",
         "--config", CONFIG,
         "--fold", str(FOLD),
         "--n-folds", str(N_FOLDS),
         "--seed", str(seed),
         "--device", "cuda"],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        for line in proc.stdout:
            print(line, end="", flush=True)
        rc = proc.wait()
    elapsed = time.time() - t0
    if rc != 0:
        # Best effort: record the failure and continue with the next seed.
        failed.append((seed, rc))
        print(f"  ERROR: fold {FOLD} seed {seed} failed (rc={rc})")
    else:
        print(f"  Done in {elapsed:.0f}s ({elapsed/60:.1f}min)")

# Summary
print(f"\n{'='*70}")
print("FOLD2 SEED EXPANSION RESULTS")
print(f"{'='*70}")
print(f"  {'Seed':<6} {'Val':>10} {'Epoch':>8}")
print(f"  {'-'*30}")

for seed in SEEDS:
    pt = f"logs/gru_kfold_v1_fold{FOLD}_seed{seed}.pt"
    if not os.path.exists(pt):
        print(f"  s{seed:<5} MISSING")
        continue
    # NOTE(security): weights_only=False unpickles arbitrary objects. Only
    # load checkpoint files you trust.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    best_score = ckpt.get("best_score")
    best_epoch = ckpt.get("best_epoch")
    # A key that is absent or explicitly None falls back to the placeholder
    # instead of raising in float() / format().
    score = float(best_score) if best_score is not None else 0.0
    epoch = best_epoch if best_epoch is not None else "?"
    print(f"  s{seed:<5} {score:>10.4f} {epoch:>8}")

if failed:
    # Error lines above are easily buried in the training logs; repeat them.
    recap = ", ".join(f"seed {bad_seed} (rc={bad_rc})" for bad_seed, bad_rc in failed)
    print(f"\nFailed runs: {recap}")

batch_elapsed = time.time() - batch_start
print(f"\nTotal time: {batch_elapsed/60:.0f} min ({batch_elapsed/3600:.1f} hours)")

In [None]:
# Cell 3: Save fold2 checkpoints to Drive + zip for download
#
# For each checkpoint of FOLD/SEEDS found in logs/: writes a slimmed copy
# (weights, config, best score/epoch only) to Google Drive, then packs the
# original checkpoint files into a zip under /content for manual download.
# Finally prints the validation score of every checkpoint of this fold found
# in logs/.
import os, torch, glob, zipfile
os.chdir("/content/competition_package")

DRIVE_DIR = "/content/drive/MyDrive/wunderfund"
os.makedirs(DRIVE_DIR, exist_ok=True)

FOLD = 2
SEEDS = [44, 45, 46]

# Only checkpoints that actually exist are processed; missing seeds are
# silently skipped here.
pts = [f"logs/gru_kfold_v1_fold{FOLD}_seed{s}.pt" for s in SEEDS]
pts = [p for p in pts if os.path.exists(p)]

print(f"Found {len(pts)} fold{FOLD} checkpoints")
print(f"{'='*70}")

# Save slim checkpoints to Drive
saved = 0
for pt in pts:
    # NOTE(security): weights_only=False unpickles arbitrary objects. Only
    # load checkpoint files you trust.
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    best_score = ckpt.get("best_score")
    # An absent or None score is shown as 0 instead of raising in float().
    score = float(best_score) if best_score is not None else 0.0
    epoch = ckpt.get("best_epoch", "?")
    # Keep only these four entries; anything else in the checkpoint is dropped
    # from the Drive copy.
    slim = {
        "model_state_dict": ckpt["model_state_dict"],
        "config": ckpt.get("config", {}),
        "best_score": ckpt.get("best_score", None),
        "best_epoch": ckpt.get("best_epoch", None),
    }
    dst = f"{DRIVE_DIR}/{os.path.basename(pt)}"
    torch.save(slim, dst)
    saved += 1
    print(f"  {os.path.basename(pt)}: val={score:.4f} ep={epoch}")

print(f"\nSaved {saved} checkpoints to Drive")

# Every checkpoint of this fold present in logs/ (not only SEEDS); used for
# the inventory printed at the end. Files with '_epoch' in the name are
# excluded (presumably per-epoch snapshots -- confirm against the trainer).
all_fold2 = sorted(glob.glob(f"logs/gru_kfold_v1_fold{FOLD}_seed*.pt"))
all_fold2 = [p for p in all_fold2 if '_epoch' not in p]

# The zip holds the original (not slimmed) files of the checkpoints in pts
# only, stored flat under their base names.
zip_name = f"/content/fold{FOLD}_seed_expansion.zip"
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zf:
    for pt in pts:
        zf.write(pt, os.path.basename(pt))
sz = os.path.getsize(zip_name) / 1e6
print(f"\nDownload zip: {zip_name} ({sz:.1f}MB)")

# Print full fold2 inventory
print(f"\n{'='*70}")
print(f"FULL FOLD{FOLD} INVENTORY (all seeds)")
print(f"{'='*70}")
for pt in all_fold2:
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    best_score = ckpt.get("best_score")
    score = float(best_score) if best_score is not None else 0.0
    print(f"  {os.path.basename(pt)}: val={score:.4f}")

print("\nDone! Download the zip and copy checkpoints to logs/vanilla_all/ on local machine.")