# yoctoGPT on Google Colab (T4, ~12GB VRAM)

This notebook mounts Google Drive for persistent checkpoints, clones the
yoctoGPT repository, prepares a token dataset from all `.txt` files in
`data/`, trains a speed-focused model sized for a Colab T4 (~12GB),
provides two sampling examples, and includes a resume cell. It also
shrinks the context length (`block_size`) and batch size when the corpus
is tiny, so that randomly indexing into the token data does not raise
index errors.

In [None]:
#@title Mount Google Drive and set project directory (persist checkpoints)
import os
from pathlib import Path

from google.colab import drive

# Drive has to be mounted before anything is created below /content/drive.
drive.mount("/content/drive")

# Root folder on Drive for this project; later cells read `project_dir`.
project_dir = "/content/drive/MyDrive/yocto"  # adjust to your setup
os.makedirs(project_dir, exist_ok=True)

# Checkpoint folder on Drive, shared by the training, sampling and resume cells.
CKPT_DIR = Path(project_dir, "checkpoints/colab_fast")
CKPT_DIR.mkdir(parents=True, exist_ok=True)

# Work from the project folder so relative paths in later cells land on Drive.
os.chdir(project_dir)
print("Working dir:", os.getcwd())
print("Checkpoints dir:", CKPT_DIR)

In [None]:
#@title Setup: install deps and clone/update the repo
!nvidia-smi || true
!python -V
!pip -q install tokenizers tqdm

import os, pathlib, subprocess, textwrap

repo_root = pathlib.Path("yoctoGPT")
if repo_root.exists():
    print("Repo exists, pulling latest...")
    subprocess.run(["git", "pull"], cwd=repo_root, check=False)
else:
    !git clone https://github.com/yhilpisch/yoctoGPT.git
os.chdir(repo_root)

if os.path.exists("requirements.txt"):
    !pip -q install -r requirements.txt || true

data_dir = pathlib.Path("data")
data_dir.mkdir(exist_ok=True)
txts = list(data_dir.glob("*.txt"))
if not txts:
    sample = textwrap.dedent('''
    Philosophy is the study of general and fundamental questions,
    such as those about existence, reason, knowledge, values, mind,
    and language. It often poses questions rather than providing
    answers, inviting us to think.
    ''').strip()
    (data_dir / "philosophy.txt").write_text(sample, encoding="utf-8")
    print("Created sample data/philosophy.txt")
else:
    names = [p.name for p in txts][:5]
    print(f"Found {len(txts)} text files in data/: {names}")

In [None]:
#@title Tokenization: prepare token-level dataset from all .txt files
!python -m scripts.prepare_tokenizer       --all_txt_in_dir --text_dir data --out_dir data/token       --vocab_size 8000 --random_split --split_seed 1337

In [None]:
#@title Pick safe block_size/batch_size for this corpus
import numpy as np
from pathlib import Path

train_path = Path("data/token/train.bin")
val_path = Path("data/token/val.bin")

# Fail early with an actionable message when the splits are missing.
missing = [str(p) for p in (train_path, val_path) if not p.exists()]
if missing:
    raise SystemExit(
        f"Missing token files: {missing}. Run the tokenization cell first."
    )

# NOTE(review): the files are read as int32 token ids -- confirm this matches
# the dtype written by scripts.prepare_tokenizer.
train_tokens = int(np.fromfile(train_path, dtype=np.int32).shape[0])
val_tokens = int(np.fromfile(val_path, dtype=np.int32).shape[0])

# The smaller split limits how long a context window can be.
min_tokens = min(train_tokens, val_tokens)

if min_tokens <= 4:
    raise SystemExit(
        "Dataset too small. Add more text to data/ and rerun tokenization."
    )

# Take the largest candidate that satisfies min_tokens > block + 2. When no
# candidate fits, fall back to the largest block that still satisfies the same
# rule (min_tokens - 3). The guard above ensures min_tokens >= 5, so the
# fallback is at least 2 and never exceeds the number of available tokens.
block_candidates = [512, 384, 256, 192, 128, 96, 64, 48, 32, 24, 16]
block_size = next(
    (b for b in block_candidates if min_tokens > b + 2),
    min_tokens - 3,
)

# Aim for roughly target_tokens tokens per batch, with batch_size kept in 1..24.
target_tokens = min(6000, max(512, min_tokens))
batch_size = max(1, min(24, target_tokens // block_size))

print(
    f"Train tokens: {train_tokens}, Val tokens: {val_tokens}, "
    f"min_tokens: {min_tokens}"
)
print(f"Using block_size={block_size}, batch_size={batch_size}")

In [None]:
#@title (Optional) Get an auto-recommended command for this GPU (T4 ~12GB)
!python -m scripts.recommend_training       --mode token       --data_dir data/token       --tokenizer_path data/token/tokenizer.json       --ckpt_dir {CKPT_DIR}       --priority speed       --device cuda       --device_mem_gb 12

## Training

We use the speed-focused variant `gpt_fast` (Flash/SDPA attention) with
a configuration chosen to fit comfortably on a Colab T4 (~12GB). If you
hit OOM, lower `batch_size` or `block_size`; if you have headroom, you
can nudge them upward. The auto-picked block/batch above avoids
tiny-corpus indexing errors.

In [None]:
#@title Train (gpt_fast) on Colab T4
from pathlib import Path

CKPT_DIR = Path(project_dir) / "checkpoints/colab_fast"
CKPT_DIR.mkdir(parents=True, exist_ok=True)

!python -m yoctoGPT.train       --mode token       --data_dir data/token       --tokenizer_path data/token/tokenizer.json       --ckpt_dir {CKPT_DIR}       --model_type gpt_fast       --device cuda       --n_layer 6 --n_head 6 --n_embd 384       --block_size {block_size} --batch_size {batch_size}       --dropout 0.15 --weight_decay 0.08       --tie_weights --label_smoothing 0.05       --eval_interval 400 --eval_iters 200       --cosine_lr --warmup_iters 400       --min_lr 1e-5 --lr 2e-4       --max_iters 1500       --ema --ema_decay 0.999

## Sampling examples

Generate text from the latest checkpoint. Adjust temperature/top-k/top-p
for style.

In [None]:
    #@title Sample 1: Q/A style prompt
    import subprocess
    import sys
    from pathlib import Path

    CKPT_DIR = Path(project_dir) / "checkpoints/colab_fast"

    # The prompt has a line break between the question and "A:". A raw line
    # break inside a `!` command splits it into a shell line with an
    # unterminated quote plus a line of invalid Python, so the prompt is built
    # as a Python string and passed as a single argv element instead.
    prompt = "Q: What is the meaning of life?\nA:"

    sample_cmd = [
        "python", "-m", "yoctoGPT.sampler",
        "--mode", "token",
        "--ckpt", str(CKPT_DIR / "latest.pt"),
        "--tokenizer_path", "data/token/tokenizer.json",
        "--prompt", prompt,
        "--max_new_tokens", "200",
        "--temperature", "0.8", "--top_k", "50", "--top_p", "0.95",
    ]

    # Capture the child's output and print it, so it shows up in the cell.
    result = subprocess.run(sample_cmd, capture_output=True, text=True)
    print(result.stdout)
    if result.stderr:
        print(result.stderr, file=sys.stderr)
    if result.returncode != 0:
        print(f"Sampler exited with status {result.returncode}", file=sys.stderr)

In [None]:
#@title Sample 2: Continuation
from pathlib import Path

CKPT_DIR = Path(project_dir) / "checkpoints/colab_fast"

!python -m yoctoGPT.sampler       --mode token       --ckpt {CKPT_DIR}/latest.pt       --tokenizer_path data/token/tokenizer.json       --prompt "In the beginning, philosophy sought to"       --max_new_tokens 200       --temperature 0.9 --top_k 40 --top_p 0.95

## Resume training

Resume from the latest checkpoint to continue training for additional
steps. `--max_iters` is interpreted as extra steps beyond the
checkpointed iteration count. Checkpoints are stored under
`/content/drive/.../checkpoints/colab_fast`.

In [None]:
#@title Resume training from latest.pt (additional 800 steps)
from pathlib import Path

CKPT_DIR = Path(project_dir) / "checkpoints/colab_fast"
latest = CKPT_DIR / "latest.pt"
if not latest.exists():
    raise SystemExit("No latest.pt found in CKPT_DIR; run training first.")

!python -m yoctoGPT.train       --mode token       --data_dir data/token       --tokenizer_path data/token/tokenizer.json       --ckpt_dir {CKPT_DIR}       --resume {latest}       --model_type gpt_fast       --device cuda       --n_layer 6 --n_head 6 --n_embd 384       --block_size {block_size} --batch_size {batch_size}       --dropout 0.15 --weight_decay 0.08       --tie_weights --label_smoothing 0.05       --eval_interval 400 --eval_iters 200       --cosine_lr --warmup_iters 400       --min_lr 1e-5 --lr 2e-4       --max_iters 800       --ema --ema_decay 0.999

Optionally, inspect the last lines of the training metrics CSV to
monitor progress.

In [None]:
#@title Inspect metrics
from pathlib import Path

CKPT_DIR = Path(project_dir) / "checkpoints/colab_fast"

!tail -n 20 {CKPT_DIR}/metrics.csv || true