In [None]:
# Cell 0: Setup — clone repo, link data
#
# Re-clones the competition package into /kaggle/working, creates the
# datasets/ and logs/ directories inside it, and symlinks the competition
# parquet files into datasets/. Every external command is run with check=True
# so a failure stops the cell instead of being silently ignored.
import os, subprocess

REPO = "/kaggle/working/competition_package"
REPO_URL = "https://github.com/vincentvdo6/competition_package.git"
INPUT_DIR = "/kaggle/input/wunderfund-predictorium"

os.chdir("/kaggle/working")
# Always start from a fresh clone; a failed clone raises CalledProcessError
# here rather than surfacing later as an unrelated chdir error.
subprocess.run(["rm", "-rf", REPO], check=True)
subprocess.run(["git", "clone", REPO_URL, REPO], check=True)
os.chdir(REPO)
os.makedirs("datasets", exist_ok=True)
os.makedirs("logs", exist_ok=True)

for split in ("train.parquet", "valid.parquet"):
    src = os.path.join(INPUT_DIR, split)
    # `ln -sf` happily creates a dangling link, so verify the source first.
    if not os.path.exists(src):
        raise FileNotFoundError(f"Competition data not found: {src}")
    subprocess.run(["ln", "-sf", src, os.path.join("datasets", split)], check=True)

print("Commit:", subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], text=True).strip())
print("Ready!")

In [None]:
# Cell 1: tightwd_v2 seeds 54-63 (10 seeds, ~16 min)
# Existing tw2: 42-53 (12 seeds). This adds 10 more.
#
# Runs scripts/train.py once per seed. A failing run does not stop the batch,
# but its seed is recorded and reported at the end so a crash is not mistaken
# for a completed run.
import os
os.chdir("/kaggle/working/competition_package")
failed_seeds = []
for seed in range(54, 64):
    print(f"\n{'='*60}")
    print(f"Training gru_derived_tightwd_v2 seed {seed}")
    print(f"{'='*60}")
    status = os.system(
        f"python -u scripts/train.py "
        f"--config configs/gru_derived_tightwd_v2.yaml "
        f"--seed {seed} --device cuda"
    )
    # os.system returns a non-zero status when the command did not succeed.
    if status != 0:
        print(f"WARNING: training failed for seed {seed} (status {status})")
        failed_seeds.append(seed)
print("\nBatch 1 done: tw2 seeds 54-63")
if failed_seeds:
    print(f"WARNING: {len(failed_seeds)} run(s) failed, seeds: {failed_seeds}")

In [None]:
# Cell 2: tightwd_v2 seeds 64-73 (10 seeds, ~16 min)
#
# Runs scripts/train.py once per seed. A failing run does not stop the batch,
# but its seed is recorded and reported at the end so a crash is not mistaken
# for a completed run.
import os
os.chdir("/kaggle/working/competition_package")
failed_seeds = []
for seed in range(64, 74):
    print(f"\n{'='*60}")
    print(f"Training gru_derived_tightwd_v2 seed {seed}")
    print(f"{'='*60}")
    status = os.system(
        f"python -u scripts/train.py "
        f"--config configs/gru_derived_tightwd_v2.yaml "
        f"--seed {seed} --device cuda"
    )
    # os.system returns a non-zero status when the command did not succeed.
    if status != 0:
        print(f"WARNING: training failed for seed {seed} (status {status})")
        failed_seeds.append(seed)
print("\nBatch 2 done: tw2 seeds 64-73")
if failed_seeds:
    print(f"WARNING: {len(failed_seeds)} run(s) failed, seeds: {failed_seeds}")

In [None]:
# Cell 3: pearson_v1 seeds 51-60 (10 seeds, ~16 min)
# Existing p1: 42-50 (9 seeds). This adds 10 more.
#
# Runs scripts/train.py once per seed. A failing run does not stop the batch,
# but its seed is recorded and reported at the end so a crash is not mistaken
# for a completed run.
import os
os.chdir("/kaggle/working/competition_package")
failed_seeds = []
for seed in range(51, 61):
    print(f"\n{'='*60}")
    print(f"Training gru_pearson_v1 seed {seed}")
    print(f"{'='*60}")
    status = os.system(
        f"python -u scripts/train.py "
        f"--config configs/gru_pearson_v1.yaml "
        f"--seed {seed} --device cuda"
    )
    # os.system returns a non-zero status when the command did not succeed.
    if status != 0:
        print(f"WARNING: training failed for seed {seed} (status {status})")
        failed_seeds.append(seed)
print("\nBatch 3 done: p1 seeds 51-60")
if failed_seeds:
    print(f"WARNING: {len(failed_seeds)} run(s) failed, seeds: {failed_seeds}")

In [None]:
# Cell 4: pearson_v1 seeds 61-70 (10 seeds, ~16 min)
#
# Runs scripts/train.py once per seed. A failing run does not stop the batch,
# but its seed is recorded and reported at the end so a crash is not mistaken
# for a completed run.
import os
os.chdir("/kaggle/working/competition_package")
failed_seeds = []
for seed in range(61, 71):
    print(f"\n{'='*60}")
    print(f"Training gru_pearson_v1 seed {seed}")
    print(f"{'='*60}")
    status = os.system(
        f"python -u scripts/train.py "
        f"--config configs/gru_pearson_v1.yaml "
        f"--seed {seed} --device cuda"
    )
    # os.system returns a non-zero status when the command did not succeed.
    if status != 0:
        print(f"WARNING: training failed for seed {seed} (status {status})")
        failed_seeds.append(seed)
print("\nBatch 4 done: p1 seeds 61-70")
if failed_seeds:
    print(f"WARNING: {len(failed_seeds)} run(s) failed, seeds: {failed_seeds}")

In [None]:
# Cell 5: Strip checkpoints (remove optimizer state) + copy normalizers + zip
#
# For every logs/*.pt checkpoint, writes a copy to logs/slim/ that keeps only
# "model_state_dict", "config" and "best_score"; every other entry in the
# checkpoint is dropped. Normalizer .npz files are copied alongside, and the
# logs/slim/ directory is zipped to /kaggle/working/gru_expansion_v2.zip.
import os, torch, glob, shutil
os.chdir("/kaggle/working/competition_package")

SLIM_DIR = "logs/slim"
os.makedirs(SLIM_DIR, exist_ok=True)

checkpoint_paths = sorted(glob.glob("logs/*.pt"))
if not checkpoint_paths:
    print("WARNING: no checkpoints found in logs/ - the archive will contain no models")

for pt in checkpoint_paths:
    name = os.path.basename(pt)
    # NOTE(review): weights_only=False unpickles arbitrary objects. That is
    # only acceptable for checkpoints you produced yourself - confirm that
    # nothing untrusted ends up in logs/.
    try:
        ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    except TypeError:
        # Presumably a torch version whose load() has no weights_only argument.
        ckpt = torch.load(pt, map_location="cpu")
    # One malformed file must not abort the export of all the others.
    if not isinstance(ckpt, dict) or "model_state_dict" not in ckpt:
        print(f"WARNING: {name} has no 'model_state_dict' - skipped")
        continue
    slim = {
        "model_state_dict": ckpt["model_state_dict"],
        "config": ckpt.get("config", {}),
        "best_score": ckpt.get("best_score", None),
    }
    out = f"{SLIM_DIR}/{name}"
    torch.save(slim, out)
    orig = os.path.getsize(pt) / 1e6
    new = os.path.getsize(out) / 1e6
    print(f"{name}: {orig:.1f}MB -> {new:.1f}MB")

for npz in sorted(glob.glob("logs/normalizer_*.npz")):
    shutil.copy(npz, f"{SLIM_DIR}/{os.path.basename(npz)}")
    print(f"Copied {os.path.basename(npz)}")

slim_files = sorted(os.listdir(SLIM_DIR))
print(f"\n--- logs/slim/ contents ({len(slim_files)} files) ---")
for f in slim_files:
    sz = os.path.getsize(f"{SLIM_DIR}/{f}") / 1e6
    print(f"  {f}: {sz:.1f}MB")

# Zip for download
shutil.make_archive("/kaggle/working/gru_expansion_v2", "zip",
                     "/kaggle/working/competition_package/logs/slim")
sz = os.path.getsize("/kaggle/working/gru_expansion_v2.zip") / 1e6
print(f"\ngru_expansion_v2.zip: {sz:.1f}MB")
print("Download from: /kaggle/working/gru_expansion_v2.zip")

In [None]:
# Cell 6: Print validation scores (sorted best to worst)
#
# Reads the score stored in every logs/*.pt checkpoint ("best_score", falling
# back to "val_score"), prints a ranked table, then per-config summary stats.
# Checkpoints without a usable score are listed as "n/a" and excluded from the
# statistics instead of being counted as 0.0.
import os, glob, math, statistics, torch
os.chdir("/kaggle/working/competition_package")

STAR_THRESHOLD = 0.264  # scores at or above this get a " *" marker in the table


def _to_score(value):
    """Return `value` as a float, or None if it is missing, non-numeric or NaN.

    Uses float() rather than an isinstance check so that scalar types which
    are not Python int/float but support float() are still accepted.
    """
    if value is None:
        return None
    try:
        score = float(value)
    except (TypeError, ValueError, RuntimeError):
        return None
    return None if math.isnan(score) else score


def _print_summary(label, keyword, all_results, ranked_results):
    """Print count, best five, mean and population std for one config family.

    A checkpoint belongs to the family when `keyword` occurs in its file name.
    `ranked_results` must already be sorted by score, best first.
    """
    total = sum(1 for name, _ in all_results if keyword in name)
    scores = [score for name, score in ranked_results if keyword in name]
    print(f"\n--- {label}: {total} seeds ---")
    if len(scores) < total:
        print(f"  No score recorded: {total - len(scores)} checkpoint(s), excluded from stats")
    if scores:
        # Already in descending order, so the first five are the best five.
        print(f"  Best 5: {scores[:5]}")
        print(f"  Mean: {statistics.mean(scores):.4f}, Std: {statistics.pstdev(scores):.4f}")


results = []
for pt in sorted(glob.glob("logs/*.pt")):
    # NOTE(review): weights_only=False unpickles arbitrary objects; only load
    # checkpoints you produced yourself.
    try:
        ckpt = torch.load(pt, map_location="cpu", weights_only=False)
    except TypeError:
        # Presumably a torch version whose load() has no weights_only argument.
        ckpt = torch.load(pt, map_location="cpu")
    score = _to_score(ckpt.get("best_score"))
    if score is None:
        # Also covers a "best_score" key that is present but None/unusable.
        score = _to_score(ckpt.get("val_score"))
    results.append((os.path.basename(pt), score))

# Sort by score descending; checkpoints without a score are listed last.
ranked = sorted(
    [item for item in results if item[1] is not None],
    key=lambda item: item[1],
    reverse=True,
)
unscored = [name for name, score in results if score is None]

print(f"{'Rank':<5} {'Model':<55} {'Val Score':>10}")
print("-" * 72)
for i, (name, score) in enumerate(ranked, 1):
    marker = " *" if score >= STAR_THRESHOLD else ""
    print(f"{i:<5} {name:<55} {score:>10.4f}{marker}")
for name in unscored:
    print(f"{'-':<5} {name:<55} {'n/a':>10}")

# Summary stats per config
_print_summary("tightwd_v2", "tightwd", results, ranked)
_print_summary("pearson_v1", "pearson", results, ranked)