# Alternate Fold Training (7-fold + 3-fold)

Train k-fold models with different `n_folds` values to create new data partitions.

**Why**: In the current 5-fold setup, `fold2_seed42` is a diversity outlier (corr=0.83 vs. 0.94+ for the others). The fold split is a deterministic round-robin (`seq_ix_rank % n_folds`), so changing `n_folds` creates entirely different partitions, which may contain another source of diversity.

**Plan**:
- 7-fold: 7 models (seed42, all folds) -- ~3.5 hours
- 3-fold: 3 models (seed42, all folds) -- ~1.5 hours

**Time**: ~5 hours total on T4

In [None]:
# Cell 1: Mount Drive, download data, clone repo
#
# One-time environment setup for a Colab session. Steps run strictly in order:
#   1. mount Google Drive and create the output folder used by the save cell
#   2. install the Kaggle CLI and write its credentials file
#   3. download + unzip the competition dataset into /content/data
#   4. clone a fresh copy of the repo and symlink the parquet files into it
#   5. print the commit hash and GPU name as a sanity check
# Lines starting with `!` are IPython shell escapes, not Python.
import os, json, subprocess

from google.colab import drive
drive.mount('/content/drive')
os.makedirs('/content/drive/MyDrive/wunderfund', exist_ok=True)

# Pin the Kaggle CLI version; --force-reinstall replaces whatever is preinstalled.
!pip install -q kaggle==1.6.14 --force-reinstall
os.makedirs('/root/.kaggle', exist_ok=True)
# NOTE(review): "FILL_IN" looks like a placeholder for the real API key --
# replace it before running, and do not commit the real value to the repo.
with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump({"username": "vincentvdo6", "key": "FILL_IN"}, f)
# Owner read/write only, since the file holds a secret.
os.chmod('/root/.kaggle/kaggle.json', 0o600)

os.makedirs('/content/data', exist_ok=True)
# --force re-downloads even if the zip is already present; unzip -o overwrites.
!kaggle datasets download -d vincentvdo6/wunderfund-predictorium -p /content/data/ --force
!unzip -o -q /content/data/wunderfund-predictorium.zip -d /content/data/
!ls /content/data/*.parquet

REPO = "/content/competition_package"
os.chdir("/content")
# Drop any previous clone so `git clone` starts from an empty target;
# check=False because a failure of this cleanup step is not treated as fatal.
subprocess.run(["rm", "-rf", REPO], check=False)
subprocess.run(["git", "clone", "https://github.com/vincentvdo6/competition_package.git", REPO], check=True)
os.chdir(REPO)
os.makedirs("datasets", exist_ok=True)
os.makedirs("logs", exist_ok=True)

# Expose the downloaded parquet files under the repo's datasets/ directory
# via symlinks (-f replaces an existing link of the same name).
subprocess.run(["ln", "-sf", "/content/data/train.parquet", "datasets/train.parquet"], check=True)
subprocess.run(["ln", "-sf", "/content/data/valid.parquet", "datasets/valid.parquet"], check=True)

# os.path.exists follows symlinks, so these fail if the link targets are
# missing (e.g. the download or unzip above did not produce the files).
assert os.path.exists("datasets/train.parquet")
assert os.path.exists("datasets/valid.parquet")

# Record which code revision and which GPU this session is using.
commit = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip()
print(f"Commit: {commit}")
print(f"GPU: {subprocess.check_output(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'], text=True).strip()}")
print("Ready!")

In [None]:
# Cell 2: Train 7-fold models (all folds, seed 42)
#
# Runs scripts/train_kfold.py once per fold as a child process, streaming its
# output into the notebook, then prints a per-fold table read back from the
# saved checkpoints. A fold whose checkpoint file already exists is skipped,
# so re-running the cell resumes where it left off.
import subprocess, sys, os, time, torch
os.chdir("/content/competition_package")

CONFIG = "configs/gru_kfold_v1.yaml"
N_FOLDS = 7
SEED = 42
FOLDS = list(range(N_FOLDS))
# Single source of truth for the run tag: it is passed to the trainer via
# --suffix AND used to build checkpoint names and banners, so these can no
# longer drift apart if N_FOLDS is edited.
SUFFIX = f"_nf{N_FOLDS}"

print(f"{N_FOLDS}-FOLD TRAINING: {len(FOLDS)} models (seed {SEED})")
print(f"{'='*70}\n")

failed_folds = []  # (fold, return code) of every training run that exited non-zero
batch_start = time.time()
for i, fold in enumerate(FOLDS):
    ckpt_path = f"logs/gru_kfold_v1_fold{fold}{SUFFIX}_seed{SEED}.pt"
    if os.path.exists(ckpt_path):
        print(f"  [{i+1}/{len(FOLDS)}] fold {fold}: exists -- skip")
        continue
    print(f"\n{'#'*70}")
    print(f"# [{i+1}/{len(FOLDS)}] {N_FOLDS}-fold, fold {fold}, seed {SEED}")
    print(f"{'#'*70}\n", flush=True)
    t0 = time.time()
    # The context manager closes the stdout pipe and waits for the child on
    # exit, even if the notebook is interrupted while streaming.
    with subprocess.Popen(
        [sys.executable, "-u", "scripts/train_kfold.py",
         "--config", CONFIG,
         "--fold", str(fold),
         "--n-folds", str(N_FOLDS),
         "--seed", str(SEED),
         "--device", "cuda",
         "--suffix", SUFFIX],
        # Merge stderr into stdout so tracebacks show up in order with the logs.
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        for line in proc.stdout:
            print(line, end="", flush=True)
    rc = proc.returncode
    elapsed = time.time() - t0
    if rc != 0:
        # Best effort: keep going with the remaining folds, but remember the
        # failure so it is not lost in hours of training output.
        failed_folds.append((fold, rc))
        print(f"  ERROR: fold {fold} failed (rc={rc})")
    else:
        print(f"  Done in {elapsed:.0f}s ({elapsed/60:.1f}min)")

# Summary
print(f"\n{'='*70}")
print(f"{N_FOLDS}-FOLD RESULTS")
print(f"{'='*70}")
print(f"  {'Fold':<6} {'Val':>10} {'Epoch':>8}")
print(f"  {'-'*30}")

for fold in FOLDS:
    pt = f"logs/gru_kfold_v1_fold{fold}{SUFFIX}_seed{SEED}.pt"
    if not os.path.exists(pt):
        print(f"  f{fold:<5} MISSING")
        continue
    # weights_only=False performs full unpickling; only do this for checkpoint
    # files you trust (here: files under this repo's own logs/ directory).
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    best_score = ckpt.get("best_score")
    best_epoch = ckpt.get("best_epoch")
    # Format via strings so a missing or None value prints a placeholder
    # instead of raising TypeError inside float() / the format spec.
    score_txt = "n/a" if best_score is None else f"{float(best_score):.4f}"
    epoch_txt = "?" if best_epoch is None else str(best_epoch)
    print(f"  f{fold:<5} {score_txt:>10} {epoch_txt:>8}")

if failed_folds:
    print(f"\nFailed runs (fold, rc): {failed_folds}")

batch_elapsed = time.time() - batch_start
print(f"\nTotal time: {batch_elapsed/60:.0f} min ({batch_elapsed/3600:.1f} hours)")

In [None]:
# Cell 3: Train 3-fold models (all folds, seed 42)
#
# Runs scripts/train_kfold.py once per fold as a child process, streaming its
# output into the notebook, then prints a per-fold table read back from the
# saved checkpoints. A fold whose checkpoint file already exists is skipped,
# so re-running the cell resumes where it left off.
import subprocess, sys, os, time, torch
os.chdir("/content/competition_package")

CONFIG = "configs/gru_kfold_v1.yaml"
N_FOLDS = 3
SEED = 42
FOLDS = list(range(N_FOLDS))
# Single source of truth for the run tag: it is passed to the trainer via
# --suffix AND used to build checkpoint names and banners, so these can no
# longer drift apart if N_FOLDS is edited.
SUFFIX = f"_nf{N_FOLDS}"

print(f"{N_FOLDS}-FOLD TRAINING: {len(FOLDS)} models (seed {SEED})")
print(f"{'='*70}\n")

failed_folds = []  # (fold, return code) of every training run that exited non-zero
batch_start = time.time()
for i, fold in enumerate(FOLDS):
    ckpt_path = f"logs/gru_kfold_v1_fold{fold}{SUFFIX}_seed{SEED}.pt"
    if os.path.exists(ckpt_path):
        print(f"  [{i+1}/{len(FOLDS)}] fold {fold}: exists -- skip")
        continue
    print(f"\n{'#'*70}")
    print(f"# [{i+1}/{len(FOLDS)}] {N_FOLDS}-fold, fold {fold}, seed {SEED}")
    print(f"{'#'*70}\n", flush=True)
    t0 = time.time()
    # The context manager closes the stdout pipe and waits for the child on
    # exit, even if the notebook is interrupted while streaming.
    with subprocess.Popen(
        [sys.executable, "-u", "scripts/train_kfold.py",
         "--config", CONFIG,
         "--fold", str(fold),
         "--n-folds", str(N_FOLDS),
         "--seed", str(SEED),
         "--device", "cuda",
         "--suffix", SUFFIX],
        # Merge stderr into stdout so tracebacks show up in order with the logs.
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        for line in proc.stdout:
            print(line, end="", flush=True)
    rc = proc.returncode
    elapsed = time.time() - t0
    if rc != 0:
        # Best effort: keep going with the remaining folds, but remember the
        # failure so it is not lost in hours of training output.
        failed_folds.append((fold, rc))
        print(f"  ERROR: fold {fold} failed (rc={rc})")
    else:
        print(f"  Done in {elapsed:.0f}s ({elapsed/60:.1f}min)")

# Summary
print(f"\n{'='*70}")
print(f"{N_FOLDS}-FOLD RESULTS")
print(f"{'='*70}")
print(f"  {'Fold':<6} {'Val':>10} {'Epoch':>8}")
print(f"  {'-'*30}")

for fold in FOLDS:
    pt = f"logs/gru_kfold_v1_fold{fold}{SUFFIX}_seed{SEED}.pt"
    if not os.path.exists(pt):
        print(f"  f{fold:<5} MISSING")
        continue
    # weights_only=False performs full unpickling; only do this for checkpoint
    # files you trust (here: files under this repo's own logs/ directory).
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    best_score = ckpt.get("best_score")
    best_epoch = ckpt.get("best_epoch")
    # Format via strings so a missing or None value prints a placeholder
    # instead of raising TypeError inside float() / the format spec.
    score_txt = "n/a" if best_score is None else f"{float(best_score):.4f}"
    epoch_txt = "?" if best_epoch is None else str(best_epoch)
    print(f"  f{fold:<5} {score_txt:>10} {epoch_txt:>8}")

if failed_folds:
    print(f"\nFailed runs (fold, rc): {failed_folds}")

batch_elapsed = time.time() - batch_start
print(f"\nTotal time: {batch_elapsed/60:.0f} min ({batch_elapsed/3600:.1f} hours)")

In [None]:
# Cell 4: Save all checkpoints to Drive + zip for download
#
# For every final 7-fold / 3-fold checkpoint in logs/:
#   - write a slimmed copy (weights, config, best score/epoch) to Google Drive
#   - add the original file to a zip under /content for manual download
# then print an inventory grouped by fold scheme.
import os, torch, glob, zipfile
os.chdir("/content/competition_package")

DRIVE_DIR = "/content/drive/MyDrive/wunderfund"
os.makedirs(DRIVE_DIR, exist_ok=True)

# Collect all alt-fold checkpoints.
# The `seed*` wildcard also matches names containing "_epoch", so filter
# those out of each list up front. That way the "Found" count, the Drive
# copies, the zip and the inventory all describe the same set of files.
pts_7f = sorted(p for p in glob.glob("logs/gru_kfold_v1_fold*_nf7_seed*.pt") if '_epoch' not in p)
pts_3f = sorted(p for p in glob.glob("logs/gru_kfold_v1_fold*_nf3_seed*.pt") if '_epoch' not in p)
all_pts = pts_7f + pts_3f

print(f"Found {len(pts_7f)} 7-fold + {len(pts_3f)} 3-fold checkpoints")
print(f"{'='*70}")
if not all_pts:
    print("  WARNING: no checkpoints matched -- the zip below will be empty.")

# Save slim checkpoints to Drive
saved = 0
score_text = {}  # path -> formatted val score, reused by the inventory below
for pt in all_pts:
    # weights_only=False performs full unpickling; only do this for checkpoint
    # files you trust (here: files under this repo's own logs/ directory).
    ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    best_score = ckpt.get("best_score")
    # A missing or None score prints a placeholder instead of crashing float().
    score_text[pt] = "n/a" if best_score is None else f"{float(best_score):.4f}"
    epoch = ckpt.get("best_epoch", "?")
    # Keep only these four entries; everything else in the checkpoint is dropped.
    slim = {
        "model_state_dict": ckpt["model_state_dict"],
        "config": ckpt.get("config", {}),
        "best_score": best_score,
        "best_epoch": ckpt.get("best_epoch", None),
    }
    dst = f"{DRIVE_DIR}/{os.path.basename(pt)}"
    torch.save(slim, dst)
    saved += 1
    print(f"  {os.path.basename(pt)}: val={score_text[pt]} ep={epoch}")

print(f"\nSaved {saved} checkpoints to Drive")

# Create download zip
# Note: the zip holds the original files from logs/, not the slim Drive copies.
zip_name = "/content/alt_folds_checkpoints.zip"
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zf:
    for pt in all_pts:
        zf.write(pt, os.path.basename(pt))
sz = os.path.getsize(zip_name) / 1e6
print(f"\nDownload zip: {zip_name} ({sz:.1f}MB)")

# Summary table
# Scores were captured during the save loop, so no checkpoint is loaded twice.
print(f"\n{'='*70}")
print("COMPLETE INVENTORY")
print(f"{'='*70}")
for label, pts in [("7-fold", pts_7f), ("3-fold", pts_3f)]:
    print(f"\n  {label}:")
    for pt in pts:
        print(f"    {os.path.basename(pt)}: val={score_text[pt]}")

print("\nDone! Download the zip and copy checkpoints to logs/vanilla_all/ on local machine.")