In [1]:
import os

from google.colab import drive

# Mount Google Drive; SAVE_DIR below lives under this mount point.
drive.mount('/content/drive')

# Directory on Drive where experiment outputs are written.
SAVE_DIR = "/content/drive/MyDrive/nanoGPT_results"
# Pure-Python equivalent of `mkdir -p`: no dependence on shell variable expansion.
os.makedirs(SAVE_DIR, exist_ok=True)

print("Results will be saved to:", SAVE_DIR)

Mounted at /content/drive
Results will be saved to: /content/drive/MyDrive/nanoGPT_results


In [2]:
%cd /content/
!git clone https://github.com/karpathy/nanoGPT.git
%cd nanoGPT

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install tqdm numpy requests matplotlib ninja

/content
Cloning into 'nanoGPT'...
remote: Enumerating objects: 686, done.[K
remote: Total 686 (delta 0), reused 0 (delta 0), pack-reused 686 (from 1)[K
Receiving objects: 100% (686/686), 974.06 KiB | 42.35 MiB/s, done.
Resolving deltas: 100% (380/380), done.
/content/nanoGPT
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting ninja
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.13.0


In [3]:
%cd data/shakespeare_char
!python prepare.py
%cd ../..

/content/nanoGPT/data/shakespeare_char
length of dataset in characters: 1,115,394
all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65
train has 1,003,854 tokens
val has 111,540 tokens
/content/nanoGPT


In [4]:
%%writefile run_experiments.py
import os, sys, itertools, subprocess, re, csv, time
from pathlib import Path
import torch

# Root directory for checkpoints and results.csv; the SAVE_DIR environment
# variable takes precedence over the default Drive location.
SAVE_DIR = os.environ.get("SAVE_DIR", "/content/drive/MyDrive/nanoGPT_results")

# Full hyperparameter grid.
BLOCK_SIZES = [64, 128]
N_LAYERS = [4, 6]
N_HEADS = [4, 8]
N_EMBDS = [128, 256]
BATCH_SIZES = [8, 16]
MAX_ITERS = [1000, 2000]
DROPOUTS = [0.1, 0.2]

# Member id -> fixed (block_size, n_layer) pair. Members are numbered from 1 in
# the order itertools.product walks BLOCK_SIZES x N_LAYERS:
#   1 -> (64, 4), 2 -> (64, 6), 3 -> (128, 4), 4 -> (128, 6)
MEMBER_MAP = dict(enumerate(itertools.product(BLOCK_SIZES, N_LAYERS), start=1))

# Text of the per-experiment config file handed to train.py.
# run_training() fills the {placeholders} with str.format, using the experiment
# dict plus `save_dir` and `device`, and writes the result to "<out_name>.py".
# lr_decay_iters deliberately reuses the max_iters value.
# NOTE(review): the template sets neither warmup_iters nor
#   gradient_accumulation_steps, so whatever train.py defaults to applies —
#   confirm those defaults suit runs of only 1000-2000 iterations.
# NOTE(review): verify that train.py actually consumes `seed` and
#   `num_workers`; if it does not, those two lines have no effect.
CONFIG_TEMPLATE = r"""
out_dir = "{save_dir}/{out_name}"
dataset = "shakespeare_char"
eval_interval = 200
log_interval = 10
always_save_checkpoint = True

batch_size = {batch_size}
block_size = {block_size}
n_layer = {n_layer}
n_head = {n_head}
n_embd = {n_embd}
dropout = {dropout}

learning_rate = 3e-4
max_iters = {max_iters}
lr_decay_iters = {max_iters}

seed = {seed}
device = "{device}"

num_workers = 0
compile = False
"""

def list_experiments(member_id):
    """Build the list of experiment configs assigned to one member.

    The member id selects a fixed (block_size, n_layer) pair from MEMBER_MAP;
    every combination of N_HEADS x N_EMBDS x BATCH_SIZES x MAX_ITERS x DROPOUTS
    is then paired with it.

    Args:
        member_id: key into MEMBER_MAP.

    Returns:
        A list of dicts with the keys block_size, n_layer, n_head, n_embd,
        batch_size, max_iters, dropout, seed and out_name. `seed` is the
        1-based position of the combination in the grid.

    Raises:
        KeyError: if member_id is not a key of MEMBER_MAP.
    """
    block_size, n_layer = MEMBER_MAP[member_id]
    grid = itertools.product(N_HEADS, N_EMBDS, BATCH_SIZES, MAX_ITERS, DROPOUTS)
    exps = []
    for seed, (nh, ne, bs, mi, do) in enumerate(grid, 1):
        # round(), not int(): a float product such as 0.29 * 100 evaluates to
        # 28.999999999999996 and int() would truncate it to 28, mislabelling
        # the run. For 0.1 and 0.2 the result is unchanged (10 and 20).
        dropout_pct = round(do * 100)
        out_name = f"b{block_size}_L{n_layer}_H{nh}_E{ne}_BS{bs}_MI{mi}_D{dropout_pct}_s{seed}"
        exps.append({
            "block_size": block_size, "n_layer": n_layer,
            "n_head": nh, "n_embd": ne,
            "batch_size": bs, "max_iters": mi,
            "dropout": do, "seed": seed,
            "out_name": out_name
        })
    return exps

def parse_losses(stdout_line):
    """Pull the train/val loss pair out of one line of training output.

    Args:
        stdout_line: a single line of text emitted by the training process.

    Returns:
        (train_loss, val_loss) as floats when the line contains both a
        "train loss <number>" and a later "val loss <number>"; otherwise
        (None, None).
    """
    found = re.search(r"train loss ([0-9.]+).*val loss ([0-9.]+)", stdout_line)
    if found is None:
        return None, None
    train_txt, val_txt = found.groups()
    return float(train_txt), float(val_txt)

def extract_model_params(logtext):
    """Read the model size from training log text.

    Args:
        logtext: log text that may contain "number of parameters: <x>M".

    Returns:
        The parameter count as a float (<x> multiplied by 1e6), or None when
        no such entry is present.
    """
    hit = re.search(r"number of parameters:\s*([0-9.]+)M", logtext)
    return float(hit.group(1)) * 1e6 if hit else None

def run_training(cfg, device):
    """Run one training job in a child process and collect its headline numbers.

    A config file named "<out_name>.py" is rendered from CONFIG_TEMPLATE into
    the current working directory and passed to train.py. The child's combined
    stdout/stderr is echoed line by line while being scanned for loss reports
    and the parameter count.

    Args:
        cfg: experiment dict as produced by list_experiments().
        device: device string substituted into the config (e.g. "cuda", "cpu").

    Returns:
        (train_loss, val_loss, loss_gap, param_count, cfg_file) where the
        losses are the last pair reported by the child, loss_gap is
        val_loss - train_loss, and cfg_file is the Path of the written config.
        Any value that never appeared in the output is None.
    """
    cfg_file = Path(f"{cfg['out_name']}.py")
    cfg_file.write_text(CONFIG_TEMPLATE.format(**cfg, save_dir=SAVE_DIR, device=device))

    train_loss = None
    val_loss = None
    param_count = None

    # sys.executable keeps the child on the same interpreter (and therefore the
    # same installed packages) as this script. The context manager closes the
    # pipe and waits for the child even if scanning raises.
    with subprocess.Popen(
        [sys.executable, "train.py", str(cfg_file)],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
    ) as proc:
        for line in proc.stdout:
            print(line, end="")
            tl, vl = parse_losses(line)
            if tl is not None:
                # Keep overwriting: the final report is the one that is returned.
                train_loss, val_loss = tl, vl

            if param_count is None:
                # Scan only the new line instead of re-scanning an ever-growing
                # log buffer, which is quadratic if the line never shows up.
                param_count = extract_model_params(line)

    if proc.returncode != 0:
        # Best effort: report the failure but still return what was parsed so
        # the remaining experiments of the sweep can proceed.
        print(f"WARNING: train.py exited with code {proc.returncode} for {cfg['out_name']}")

    # Compare against None explicitly; a loss of exactly 0.0 is a valid value.
    loss_gap = None
    if train_loss is not None and val_loss is not None:
        loss_gap = val_loss - train_loss
    return train_loss, val_loss, loss_gap, param_count, cfg_file

def main():
    """Command-line entry point: run every experiment assigned to one member.

    Usage: python run_experiments.py <member_id>

    Results are appended to SAVE_DIR/results.csv, one row per experiment; the
    header row is written only when the file does not exist yet.
    """
    valid_ids = sorted(MEMBER_MAP)
    if len(sys.argv) < 2:
        sys.exit(f"usage: python {sys.argv[0]} <member_id>  (one of {valid_ids})")
    try:
        member_id = int(sys.argv[1])
    except ValueError:
        sys.exit(f"member_id must be an integer, got {sys.argv[1]!r}")
    if member_id not in MEMBER_MAP:
        sys.exit(f"unknown member_id {member_id}; expected one of {valid_ids}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    os.makedirs(SAVE_DIR, exist_ok=True)

    result_csv = Path(SAVE_DIR) / "results.csv"
    if not result_csv.exists():
        # newline="" is what the csv module expects of files it writes to.
        with open(result_csv, "w", newline="") as f:
            csv.writer(f).writerow([
                "Experiment",
                "Train Loss",
                "Val Loss",
                "Loss Gap",
                "Total Params",
                "Config Path"
            ])

    exps = list_experiments(member_id)
    print(f"Running {len(exps)} experiments for Member {member_id}")

    for i, exp in enumerate(exps, 1):
        print(f"\n=== Experiment {i}/{len(exps)}: {exp['out_name']} ===")
        tr, vl, gap, params, cfg_path = run_training(exp, device)

        # Append right after each run so finished results survive an
        # interruption later in the sweep.
        with open(result_csv, "a", newline="") as f:
            csv.writer(f).writerow([
                exp["out_name"], tr, vl, gap, params, str(cfg_path.resolve())
            ])

if __name__ == "__main__":
    main()

Writing run_experiments.py


In [5]:
import os

# Export the notebook's SAVE_DIR so that run_experiments.py, which reads the
# SAVE_DIR environment variable, writes to the same results directory.
os.environ.update({"SAVE_DIR": SAVE_DIR})

In [6]:
# Launch the sweep for member 3, which MEMBER_MAP pins to block_size=128, n_layer=4.
# Must be run from the nanoGPT checkout: the script starts train.py by relative path.
!python run_experiments.py 3

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
iter 1440: loss 2.0510, time 349.51ms, mfu 0.40%
iter 1450: loss 2.0901, time 334.57ms, mfu 0.40%
iter 1460: loss 1.9920, time 332.21ms, mfu 0.41%
iter 1470: loss 2.0207, time 335.17ms, mfu 0.41%
iter 1480: loss 2.0694, time 350.99ms, mfu 0.41%
iter 1490: loss 2.0422, time 345.79ms, mfu 0.41%
iter 1500: loss 2.0475, time 342.06ms, mfu 0.41%
iter 1510: loss 2.0417, time 331.85ms, mfu 0.42%
iter 1520: loss 2.0608, time 355.17ms, mfu 0.42%
iter 1530: loss 2.0024, time 345.87ms, mfu 0.42%
iter 1540: loss 1.9986, time 354.31ms, mfu 0.42%
iter 1550: loss 1.9486, time 354.91ms, mfu 0.42%
iter 1560: loss 1.9545, time 342.63ms, mfu 0.42%
iter 1570: loss 2.0598, time 347.45ms, mfu 0.42%
iter 1580: loss 2.0179, time 340.40ms, mfu 0.42%
iter 1590: loss 1.9750, time 349.28ms, mfu 0.42%
step 1600: train loss 1.8534, val loss 1.9519
saving checkpoint to /content/drive/MyDrive/nanoGPT_results/b128_L4_H4_E128_BS16_MI2000_D20_s8
iter 1600:

In [7]:
import os
import subprocess
import sys

NANOGPT_DIR = "/content/nanoGPT"
base_dir = "/content/drive/MyDrive/nanoGPT_results"
samples_dir = os.path.join(base_dir, "samples")

os.makedirs(samples_dir, exist_ok=True)

# Experiment folders are the sub-directories of base_dir whose name starts with "b".
exp_folders = sorted(
    d for d in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, d)) and d.startswith("b")
)

print(f"Found {len(exp_folders)} experiment folders.")

failed = []
for i, exp in enumerate(exp_folders, 1):
    exp_path = os.path.join(base_dir, exp)
    ckpt_path = os.path.join(exp_path, "ckpt.pt")

    if not os.path.isfile(ckpt_path):
        print(f"Skipping {exp} (no ckpt.pt found)")
        continue

    out_sample = os.path.join(samples_dir, f"{exp}_sample.txt")

    print(f"[{i}/{len(exp_folders)}] Generating sample for {exp}")

    # Argument list instead of a shell string: paths containing spaces or shell
    # metacharacters are passed through verbatim, with no quoting to get wrong.
    cmd = [
        sys.executable, os.path.join(NANOGPT_DIR, "sample.py"),
        f"--out_dir={exp_path}",
        "--start= ",
        "--num_samples=3",
        "--max_new_tokens=200",
    ]
    # Only stdout goes to the sample file (as with the former `>` redirect);
    # stderr stays visible in the notebook.
    # NOTE(review): cwd is pinned to the checkout because sample.py appears to
    # resolve dataset metadata relative to the working directory — confirm.
    with open(out_sample, "wb") as sample_file:
        completed = subprocess.run(cmd, stdout=sample_file, cwd=NANOGPT_DIR)

    if completed.returncode != 0:
        failed.append(exp)
        print(f"  sample.py failed for {exp} (exit code {completed.returncode})")

if failed:
    print(f"Sampling failed for {len(failed)} experiment(s): {', '.join(failed)}")
    print("Samples for the remaining experiments are stored in:", samples_dir)
else:
    print("✅ All samples generated and stored in:", samples_dir)

Found 32 experiment folders.
[1/32] Generating sample for b128_L4_H4_E128_BS16_MI1000_D10_s5
[2/32] Generating sample for b128_L4_H4_E128_BS16_MI1000_D20_s6
[3/32] Generating sample for b128_L4_H4_E128_BS16_MI2000_D10_s7
[4/32] Generating sample for b128_L4_H4_E128_BS16_MI2000_D20_s8
[5/32] Generating sample for b128_L4_H4_E128_BS8_MI1000_D10_s1
[6/32] Generating sample for b128_L4_H4_E128_BS8_MI1000_D20_s2
[7/32] Generating sample for b128_L4_H4_E128_BS8_MI2000_D10_s3
[8/32] Generating sample for b128_L4_H4_E128_BS8_MI2000_D20_s4
[9/32] Generating sample for b128_L4_H4_E256_BS16_MI1000_D10_s13
[10/32] Generating sample for b128_L4_H4_E256_BS16_MI1000_D20_s14
[11/32] Generating sample for b128_L4_H4_E256_BS16_MI2000_D10_s15
[12/32] Generating sample for b128_L4_H4_E256_BS16_MI2000_D20_s16
[13/32] Generating sample for b128_L4_H4_E256_BS8_MI1000_D10_s9
[14/32] Generating sample for b128_L4_H4_E256_BS8_MI1000_D20_s10
[15/32] Generating sample for b128_L4_H4_E256_BS8_MI2000_D10_s11
[16/32