In [None]:
import os

from google.colab import drive

# Mount Google Drive so the project tree is reachable under /content/drive.
drive.mount("/content/drive")

PROJECT_DIR = "/content/drive/MyDrive/MusicScalingProject"
print("PROJECT_DIR exists?", os.path.exists(PROJECT_DIR))
print("PROJECT_DIR contents:", os.listdir(PROJECT_DIR))

# List the sub-folders the later cells read from (dataset and model source).
for label, parts in (
    ("data folder contents:", ("data",)),
    ("data/v3 contents:", ("data", "v3")),
    ("src contents:", ("src",)),
):
    print(label, os.listdir(os.path.join(PROJECT_DIR, *parts)))


Mounted at /content/drive
PROJECT_DIR exists? True
PROJECT_DIR contents: ['src', 'data', 'results', 'checkpoints_rnn']
data folder contents: ['processed', 'checkpoints', 'processed_v2', 'checkpoints_best', 'v3']
data/v3 contents: ['tokenizer_bpe_4096.json', 'train.bin', 'val.bin', 'test.bin']
src contents: ['model.py', '__pycache__', 'train_scaling.py', 'finish_xl.py', 'model_rnn.py', 'config_rnn.py', 'train_rnn.py', 'finish_rnn_xl.py', 'generate.py', 'evaluate_best.py', 'config.py']


In [None]:
!pip -q install tokenizers

In [None]:
import os
import shutil

PROJECT_DIR = "/content/drive/MyDrive/MusicScalingProject"

# Copy the model definition from Drive into /content, the directory the
# training scripts are written to, so their `from model import GPT` resolves.
dst_model = "/content/model.py"
src_model = os.path.join(PROJECT_DIR, "src", "model.py")
shutil.copyfile(src_model, dst_model)
print("Copied:", src_model, "->", dst_model)


Copied: /content/drive/MyDrive/MusicScalingProject/src/model.py -> /content/model.py


In [None]:
%%writefile /content/train_v3_drive.py
from __future__ import annotations

import os, time, math, json, random
from dataclasses import dataclass, asdict
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW
from tokenizers import Tokenizer

from model import GPT  




# All inputs and outputs of this script live under the mounted Google Drive
# project folder; the dataset is read directly from Drive.
PROJECT_DIR = Path("/content/drive/MyDrive/MusicScalingProject")
DATA_DIR = PROJECT_DIR / "data" / "v3"   # tokenizer JSON plus the .bin token files
TOK_JSON = DATA_DIR / "tokenizer_bpe_4096.json"
TRAIN_BIN = DATA_DIR / "train.bin"
VAL_BIN   = DATA_DIR / "val.bin"

# RUN_NAME only selects the output folder. NOTE(review): the captured training
# log reports 206.64M parameters for this configuration, so the "150M" in the
# name does not describe the actual model size.
RUN_NAME = "V3_best_150M"
RUN_DIR = PROJECT_DIR / "runs" / RUN_NAME
CKPT_DIR = RUN_DIR / "checkpoints"
SAMPLE_DIR = RUN_DIR / "samples"
# The output folders are created as a side effect of loading this module.
RUN_DIR.mkdir(parents=True, exist_ok=True)
CKPT_DIR.mkdir(parents=True, exist_ok=True)
SAMPLE_DIR.mkdir(parents=True, exist_ok=True)

# metrics.jsonl is appended to (one JSON object per line) by save_jsonl;
# run_config.json is rewritten by main() each time the script starts.
METRICS_JSONL = RUN_DIR / "metrics.jsonl"
CONFIG_JSON   = RUN_DIR / "run_config.json"




@dataclass
class ModelConfig:
    """Hyper-parameters handed to ``GPT`` (imported from ``model.py``) when the model is built.

    ``model.py`` is not part of this script, so the per-field notes below follow
    the usual GPT naming and should be confirmed against that file.
    """
    n_layer: int = 16       # presumably the number of transformer blocks — TODO confirm in model.py
    n_head: int = 16        # presumably attention heads per block — TODO confirm in model.py
    n_embd: int = 1024      # presumably the embedding / hidden width — TODO confirm in model.py
    block_size: int = 1024  # also used by this script as the training sequence length (BLOCK_SIZE)
    vocab_size: int = 4096  # presumably matches tokenizer_bpe_4096.json — verify against the tokenizer
    dropout: float = 0.1

# Single shared instance: used to construct the model and serialized (via asdict)
# into every checkpoint and into run_config.json.
CFG = ModelConfig()




DEVICE = "cuda"  # hard-coded: this script has no CPU fallback
DTYPE = torch.bfloat16  # dtype handed to torch.autocast around every forward pass

# One optimizer step accumulates GRAD_ACCUM micro-batches of MICRO_BSZ sequences,
# i.e. MICRO_BSZ * GRAD_ACCUM * BLOCK_SIZE = 16 * 4 * 1024 = 65,536 tokens.
MICRO_BSZ = 16
GRAD_ACCUM = 4
BLOCK_SIZE = CFG.block_size

MAX_STEPS = 20000  # number of optimizer steps; also the end point of the cosine decay
LR = 3e-4  # peak learning rate, reached at the end of warmup
MIN_LR = 3e-5  # floor of the cosine schedule
WARMUP_STEPS = 200  # linear warmup length, in optimizer steps
WEIGHT_DECAY = 0.1  # passed to AdamW
BETAS = (0.9, 0.95)  # passed to AdamW
GRAD_CLIP = 1.0  # max gradient norm for clip_grad_norm_; None disables clipping

# All intervals are counted in optimizer steps.
LOG_INTERVAL = 20
EVAL_INTERVAL = 200
EVAL_ITERS = 200  # random validation batches averaged per evaluation
CKPT_INTERVAL = 500
SAMPLE_INTERVAL = 500


# Sampling settings used by generate_sample.
GEN_PROMPT_TOKENS = 256  # prompt length cut from the validation data
GEN_NEW_TOKENS = 1024  # upper bound on newly sampled tokens
TEMPERATURE = 1.0  # logits are divided by this before sampling
TOP_K = 200  # keep only the k highest logits; None disables the filter
STOP_AT_EOT = True  # stop sampling once the DELIM token is produced

SEED = 1337

# Token string looked up in the tokenizer to obtain the end-of-text id.
DELIM = "<|endoftext|>"

def set_seed(seed: int):
    """Seed every random number generator this script draws from.

    Covers Python's ``random`` module, NumPy's global generator (used for batch
    sampling) and PyTorch on the CPU and on all CUDA devices.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def cosine_lr(step: int) -> float:
    """Return the learning rate for a zero-based optimizer step.

    The rate rises linearly to LR over the first WARMUP_STEPS steps, then
    follows half a cosine wave from LR down to MIN_LR, reaching MIN_LR at
    MAX_STEPS and staying there afterwards.
    """
    in_warmup = step < WARMUP_STEPS
    if in_warmup:
        return LR * (step + 1) / WARMUP_STEPS

    # Guard against a zero-length decay phase when WARMUP_STEPS >= MAX_STEPS.
    decay_span = max(1, (MAX_STEPS - WARMUP_STEPS))
    progress = min(1.0, (step - WARMUP_STEPS) / decay_span)
    coeff = 0.5 * (1.0 + math.cos(math.pi * progress))
    return MIN_LR + coeff * (LR - MIN_LR)

class BinDataset:
    """Random-window sampler over a flat binary file of uint16 token ids.

    Attributes:
        data: read-only ``np.memmap`` view of the file, one uint16 per token.
        n: number of tokens in the file.
    """

    def __init__(self, path: Path):
        # Memory-map the file so it is paged in on demand instead of loaded whole.
        self.data = np.memmap(path, dtype=np.uint16, mode="r")
        self.n = len(self.data)

    def get_batch(self, batch_size: int, block_size: int, device: str):
        """Draw ``batch_size`` random windows for next-token prediction.

        Args:
            batch_size: number of windows to sample.
            block_size: number of tokens per window.
            device: torch device the returned tensors are moved to.

        Returns:
            ``(x, y)``: two int64 tensors of shape ``(batch_size, block_size)``
            where ``y`` is ``x`` shifted one token to the right in the file.

        Raises:
            ValueError: if the file holds fewer than ``block_size + 1`` tokens.
        """
        # A window needs block_size inputs plus one more token for the shifted
        # targets, so valid start offsets are 0 .. n - block_size - 1 inclusive.
        n_starts = self.n - block_size
        if n_starts < 1:
            raise ValueError(
                f"dataset has {self.n} tokens but block_size={block_size} "
                f"needs at least {block_size + 1}"
            )
        # randint's upper bound is exclusive, so the last valid window is reachable.
        ix = np.random.randint(0, n_starts, size=(batch_size,))
        x = np.stack([self.data[i:i+block_size].astype(np.int64) for i in ix])
        y = np.stack([self.data[i+1:i+1+block_size].astype(np.int64) for i in ix])
        x = torch.from_numpy(x).to(device, non_blocking=True)
        y = torch.from_numpy(y).to(device, non_blocking=True)
        return x, y

@torch.no_grad()
def estimate_loss(model: nn.Module, val_ds: BinDataset) -> float:
    """Average the model's loss over EVAL_ITERS random validation batches.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning. Gradients are disabled by the decorator.
    """
    model.eval()

    def _one_batch_loss() -> float:
        # One forward pass under autocast; the model returns (logits, loss).
        inputs, targets = val_ds.get_batch(MICRO_BSZ, BLOCK_SIZE, DEVICE)
        with torch.autocast(device_type="cuda", dtype=DTYPE):
            _, batch_loss = model(inputs, targets)
        return batch_loss.item()

    losses = [_one_batch_loss() for _ in range(EVAL_ITERS)]
    model.train()
    return float(np.mean(losses))

def save_jsonl(path: Path, record: dict):
    """Append ``record`` to ``path`` as a single JSON line (file is created if missing)."""
    with path.open("a", encoding="utf-8") as sink:
        sink.write(f"{json.dumps(record)}\n")

def save_checkpoint(model, optim, step: int, best_val):
    """Write a checkpoint for ``step`` into CKPT_DIR.

    The file ``ckpt_step_<step>.pt`` holds the step number, the model and
    optimizer state dicts, the model configuration (CFG as a dict) and the best
    validation loss seen so far.

    The data is first written to a temporary sibling file and then renamed onto
    the final name, so an interrupted save never leaves a truncated file under
    the ``ckpt_step_*.pt`` name.
    """
    path = CKPT_DIR / f"ckpt_step_{step:07d}.pt"
    # ".pt.tmp" does not match the "*.pt" patterns used to discover checkpoints.
    tmp_path = path.with_name(path.name + ".tmp")
    torch.save({
        "step": step,
        "model": model.state_dict(),
        "optim": optim.state_dict(),
        "cfg": asdict(CFG),
        "best_val": best_val,
    }, tmp_path)
    tmp_path.replace(path)
    print(f"[CKPT] saved {path}")

@torch.no_grad()
def generate_sample(model: nn.Module, tok: Tokenizer, val_ds: BinDataset, step: int):
    """Sample one continuation from the model and write it to SAMPLE_DIR.

    A prompt of GEN_PROMPT_TOKENS tokens is cut from a random offset of the
    validation data. Up to GEN_NEW_TOKENS tokens are then sampled one at a time
    using temperature scaling and, when TOP_K is set, top-k filtering. The
    decoded prompt plus continuation is written to ``sample_step_<step>.abc``.

    The model is put in eval mode while sampling and switched back to train
    mode before returning.

    Args:
        model: network returning ``(logits, loss)`` when called on token ids.
        tok: tokenizer used to look up the DELIM id and decode the result.
        val_ds: dataset the prompt is drawn from.
        step: optimizer step, used only for the output file name.
    """
    model.eval()
    eot_id = tok.token_to_id(DELIM)
    assert eot_id is not None

    start = np.random.randint(0, val_ds.n - (GEN_PROMPT_TOKENS + 1))
    prompt_ids = val_ds.data[start:start+GEN_PROMPT_TOKENS].astype(np.int64).tolist()
    out = torch.tensor([prompt_ids], dtype=torch.long, device=DEVICE)

    for _ in range(GEN_NEW_TOKENS):
        # Condition on at most the last BLOCK_SIZE tokens.
        out_cond = out[:, -BLOCK_SIZE:]
        with torch.autocast(device_type="cuda", dtype=DTYPE):
            logits, _ = model(out_cond)
        logits = logits[:, -1, :] / TEMPERATURE

        if TOP_K is not None:
            # torch.topk raises if k exceeds the vocabulary dimension, so clamp it.
            k = min(TOP_K, logits.size(-1))
            v, _ = torch.topk(logits, k)
            # Mask everything below the k-th largest logit.
            logits[logits < v[:, [-1]]] = -float("inf")

        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        out = torch.cat([out, next_id], dim=1)

        if STOP_AT_EOT and next_id.item() == eot_id:
            break

    text = tok.decode(out[0].tolist())
    sample_path = SAMPLE_DIR / f"sample_step_{step:07d}.abc"
    sample_path.write_text(text, encoding="utf-8", newline="\n")
    print(f"[SAMPLE] wrote {sample_path}")
    model.train()

def main():
    """Train the GPT from scratch for MAX_STEPS optimizer steps.

    Each step sets the scheduled learning rate, accumulates gradients over
    GRAD_ACCUM micro-batches, clips them and applies one AdamW update. At the
    configured intervals it logs the running training loss and throughput,
    evaluates on the validation set, writes checkpoints and generates a sample.
    Metrics are appended to METRICS_JSONL and the run configuration is written
    to CONFIG_JSON.

    Raises:
        FileNotFoundError: if the tokenizer or a dataset file is missing.
    """
    set_seed(SEED)
    torch.set_float32_matmul_precision("high")

    # Fail fast when Drive is not mounted or the dataset is incomplete.
    for p in [TOK_JSON, TRAIN_BIN, VAL_BIN]:
        if not p.exists():
            raise FileNotFoundError(f"Missing: {p}")

    tok = Tokenizer.from_file(str(TOK_JSON))
    print("EOT id:", tok.token_to_id(DELIM))

    train_ds = BinDataset(TRAIN_BIN)
    val_ds = BinDataset(VAL_BIN)
    print("train tokens:", train_ds.n)
    print("val tokens:", val_ds.n)

    model = GPT(CFG).to(DEVICE)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"Model params: {n_params/1e6:.2f}M")

    optim = AdamW(model.parameters(), lr=LR, betas=BETAS, weight_decay=WEIGHT_DECAY)

    CONFIG_JSON.write_text(json.dumps({
        "run_name": RUN_NAME,
        "device": DEVICE,
        "dtype": str(DTYPE),
        "micro_bsz": MICRO_BSZ,
        "grad_accum": GRAD_ACCUM,
        "block_size": BLOCK_SIZE,
        "max_steps": MAX_STEPS,
        "lr": LR,
        "min_lr": MIN_LR,
        "warmup_steps": WARMUP_STEPS,
        "weight_decay": WEIGHT_DECAY,
        "betas": BETAS,
        "grad_clip": GRAD_CLIP,
        "eval_interval": EVAL_INTERVAL,
        "eval_iters": EVAL_ITERS,
        "ckpt_interval": CKPT_INTERVAL,
        "sample_interval": SAMPLE_INTERVAL,
        "dataset": {"train_bin": str(TRAIN_BIN), "val_bin": str(VAL_BIN)},
        "model_cfg": asdict(CFG),
    }, indent=2), encoding="utf-8")

    model.train()
    t0 = time.time()
    running_loss = 0.0
    running_count = 0
    best_val = None
    # Step of the most recent checkpoint, so the same file is not written twice
    # when the eval-best, periodic and final saves land on the same step.
    last_saved_step = None

    for step in range(MAX_STEPS):
        lr = cosine_lr(step)
        for pg in optim.param_groups:
            pg["lr"] = lr

        optim.zero_grad(set_to_none=True)

        for _ in range(GRAD_ACCUM):
            x, y = train_ds.get_batch(MICRO_BSZ, BLOCK_SIZE, DEVICE)
            with torch.autocast(device_type="cuda", dtype=DTYPE):
                _, loss = model(x, y)
                # Scale so the accumulated gradient is the mean over micro-batches.
                loss = loss / GRAD_ACCUM
            loss.backward()

            # Undo the scaling so the logged value is the per-batch loss.
            running_loss += loss.item() * GRAD_ACCUM
            running_count += 1

        if GRAD_CLIP is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

        optim.step()

        if (step + 1) % LOG_INTERVAL == 0:
            dt = time.time() - t0
            avg_loss = running_loss / max(1, running_count)
            tokens_per_step = MICRO_BSZ * GRAD_ACCUM * BLOCK_SIZE
            tok_per_sec = (tokens_per_step * LOG_INTERVAL) / max(1e-9, dt)

            rec = {
                "step": step + 1,
                "train_loss": avg_loss,
                "lr": lr,
                "tokens_per_step": tokens_per_step,
                "tokens_seen": (step + 1) * tokens_per_step,
                "tok_per_sec": tok_per_sec,
            }
            save_jsonl(METRICS_JSONL, rec)
            print(f"[LOG] step {step+1} | train_loss {avg_loss:.4f} | lr {lr:.2e} | tok/s {tok_per_sec:,.0f}")

            running_loss = 0.0
            running_count = 0
            t0 = time.time()

        # Time spent on evaluation, checkpointing and sampling is measured and
        # removed from the throughput window so tok/s reflects training only.
        side_t0 = time.time()

        if (step + 1) % EVAL_INTERVAL == 0:
            val_loss = estimate_loss(model, val_ds)
            save_jsonl(METRICS_JSONL, {"step": step + 1, "val_loss": val_loss})
            print(f"[EVAL] step {step+1} | val_loss {val_loss:.4f}")
            if best_val is None or val_loss < best_val:
                best_val = val_loss
                save_checkpoint(model, optim, step + 1, best_val)
                last_saved_step = step + 1

        if (step + 1) % CKPT_INTERVAL == 0 and last_saved_step != step + 1:
            save_checkpoint(model, optim, step + 1, best_val)
            last_saved_step = step + 1

        if (step + 1) % SAMPLE_INTERVAL == 0:
            generate_sample(model, tok, val_ds, step + 1)

        t0 += time.time() - side_t0

    if last_saved_step != MAX_STEPS:
        save_checkpoint(model, optim, MAX_STEPS, best_val)
    print("[DONE]")

if __name__ == "__main__":
    main()


Writing /content/train_v3_drive.py


In [None]:
!python /content/train_v3_drive.py

  _C._set_float32_matmul_precision(precision)
EOT id: 1
train tokens: 136393191
val tokens: 1277450
Model initialized: 206.64M parameters
Model params: 206.64M
[LOG] step 20 | train_loss 6.2284 | lr 3.00e-05 | tok/s 23,392
[LOG] step 40 | train_loss 4.1622 | lr 6.00e-05 | tok/s 35,159
[LOG] step 60 | train_loss 3.0290 | lr 9.00e-05 | tok/s 35,653
[LOG] step 80 | train_loss 2.5450 | lr 1.20e-04 | tok/s 35,653
[LOG] step 100 | train_loss 2.3597 | lr 1.50e-04 | tok/s 35,650
[LOG] step 120 | train_loss 2.2602 | lr 1.80e-04 | tok/s 35,639
[LOG] step 140 | train_loss 2.1835 | lr 2.10e-04 | tok/s 35,640
[LOG] step 160 | train_loss 2.1404 | lr 2.40e-04 | tok/s 35,636
[LOG] step 180 | train_loss 2.1079 | lr 2.70e-04 | tok/s 35,640
[LOG] step 200 | train_loss 2.0693 | lr 3.00e-04 | tok/s 35,644
[EVAL] step 200 | val_loss 1.9957
[CKPT] saved /content/drive/MyDrive/MusicScalingProject/runs/V3_best_150M/checkpoints/ckpt_step_0000200.pt
[LOG] step 220 | train_loss 1.9979 | lr 3.00e-04 | tok/s 17,088

In [None]:
import os
import shutil

SRC_DATA = "/content/drive/MyDrive/MusicScalingProject/data/v3"
LOCAL_DATA = "/content/dataV3"

# Copy the dataset and tokenizer from Drive to the VM's local disk; the resume
# script reads its data from LOCAL_DATA.
os.makedirs(LOCAL_DATA, exist_ok=True)
for fn in ("train.bin", "val.bin", "test.bin", "tokenizer_bpe_4096.json"):
    shutil.copyfile(os.path.join(SRC_DATA, fn), os.path.join(LOCAL_DATA, fn))

print("Local data files:", os.listdir(LOCAL_DATA))


Local data files: ['tokenizer_bpe_4096.json', 'val.bin', 'train.bin', 'test.bin']


In [None]:
%%writefile /content/train_v3_resume.py
from __future__ import annotations

import time, math, json, random
from dataclasses import dataclass, asdict
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.optim import AdamW
from tokenizers import Tokenizer

from model import GPT




# Run artifacts (checkpoints, samples, metrics) stay on Google Drive; the
# dataset is read from a local directory instead of the Drive mount.
PROJECT_DIR = Path("/content/drive/MyDrive/MusicScalingProject")
RUN_NAME = "V3_best_150M"  # selects the run folder that is resumed
DATA_DIR = Path("/content/dataV3")  # local copy of the tokenizer and .bin files

MAX_STEPS = 11000  # last optimizer step; also the end point of the cosine decay


EVAL_ITERS = 50  # random validation batches averaged per evaluation
LOG_INTERVAL = 20  # optimizer steps between [LOG] lines
EVAL_INTERVAL = 300  # optimizer steps between validation runs


SAVE_FULL_EVERY = 1500  # optimizer steps between periodic full checkpoints
KEEP_LAST_FULL = 2  # how many full_step_*.pt files cleanup_full_checkpoints keeps


SAMPLE_INTERVAL = 1500  # optimizer steps between generated samples


# One optimizer step accumulates GRAD_ACCUM micro-batches of MICRO_BSZ sequences.
MICRO_BSZ = 16
GRAD_ACCUM = 4
LR = 3e-4  # peak learning rate, reached at the end of warmup
MIN_LR = 3e-5  # floor of the cosine schedule
WARMUP_STEPS = 200  # linear warmup length, in optimizer steps
WEIGHT_DECAY = 0.1  # passed to AdamW
BETAS = (0.9, 0.95)  # passed to AdamW
GRAD_CLIP = 1.0  # max gradient norm for clip_grad_norm_; None disables clipping

SEED = 1337
DELIM = "<|endoftext|>"  # token string looked up in the tokenizer as the end-of-text id

# Sampling settings used by generate_sample.
GEN_PROMPT_TOKENS = 256  # prompt length cut from the validation data
GEN_NEW_TOKENS = 1024  # upper bound on newly sampled tokens
TEMPERATURE = 1.0  # logits are divided by this before sampling
TOP_K = 200  # keep only the k highest logits; None disables the filter
STOP_AT_EOT = True  # stop sampling once the DELIM token is produced


RUN_DIR = PROJECT_DIR / "runs" / RUN_NAME
CKPT_DIR = RUN_DIR / "checkpoints"
SAMPLE_DIR = RUN_DIR / "samples"
# The output folders are created as a side effect of loading this module.
RUN_DIR.mkdir(parents=True, exist_ok=True)
CKPT_DIR.mkdir(parents=True, exist_ok=True)
SAMPLE_DIR.mkdir(parents=True, exist_ok=True)

# metrics.jsonl is appended to across runs; run_config.json is only written by
# main() when it does not exist yet.
METRICS_JSONL = RUN_DIR / "metrics.jsonl"
CONFIG_JSON   = RUN_DIR / "run_config.json"

TOK_JSON = DATA_DIR / "tokenizer_bpe_4096.json"
TRAIN_BIN = DATA_DIR / "train.bin"
VAL_BIN   = DATA_DIR / "val.bin"


@dataclass
class ModelConfig:
    """Hyper-parameters handed to ``GPT`` (imported from ``model.py``) when the model is built.

    The defaults must describe the same architecture as the checkpoint being
    resumed, because the saved state dict is loaded into a model built from them.
    ``model.py`` is not part of this script, so the per-field notes below follow
    the usual GPT naming and should be confirmed against that file.
    """
    n_layer: int = 16       # presumably the number of transformer blocks — TODO confirm in model.py
    n_head: int = 16        # presumably attention heads per block — TODO confirm in model.py
    n_embd: int = 1024      # presumably the embedding / hidden width — TODO confirm in model.py
    block_size: int = 1024  # also used by this script as the training sequence length (BLOCK_SIZE)
    vocab_size: int = 4096  # presumably matches tokenizer_bpe_4096.json — verify against the tokenizer
    dropout: float = 0.1

# Single shared instance: used to construct the model and serialized (via asdict)
# into every checkpoint and into run_config.json.
CFG = ModelConfig()
BLOCK_SIZE = CFG.block_size

# NOTE(review): DEVICE can fall back to "cpu", but the torch.autocast calls in
# this script hard-code device_type="cuda" — confirm CPU runs are really intended.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16  # dtype handed to torch.autocast around every forward pass

def set_seed(seed: int):
    """Seed every random number generator this script draws from.

    Covers Python's ``random`` module, NumPy's global generator (used for batch
    sampling) and PyTorch on the CPU and on all CUDA devices.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def cosine_lr(step: int) -> float:
    """Return the learning rate for a zero-based optimizer step.

    The rate rises linearly to LR over the first WARMUP_STEPS steps, then
    follows half a cosine wave from LR down to MIN_LR, reaching MIN_LR at
    MAX_STEPS and staying there afterwards.
    """
    in_warmup = step < WARMUP_STEPS
    if in_warmup:
        return LR * (step + 1) / WARMUP_STEPS

    # Guard against a zero-length decay phase when WARMUP_STEPS >= MAX_STEPS.
    decay_span = max(1, (MAX_STEPS - WARMUP_STEPS))
    progress = min(1.0, (step - WARMUP_STEPS) / decay_span)
    coeff = 0.5 * (1.0 + math.cos(math.pi * progress))
    return MIN_LR + coeff * (LR - MIN_LR)

class BinDataset:
    """Random-window sampler over a flat binary file of uint16 token ids.

    Attributes:
        data: read-only ``np.memmap`` view of the file, one uint16 per token.
        n: number of tokens in the file.
    """

    def __init__(self, path: Path):
        # Memory-map the file so it is paged in on demand instead of loaded whole.
        self.data = np.memmap(path, dtype=np.uint16, mode="r")
        self.n = len(self.data)

    def get_batch(self, batch_size: int, block_size: int, device: str):
        """Draw ``batch_size`` random windows for next-token prediction.

        Args:
            batch_size: number of windows to sample.
            block_size: number of tokens per window.
            device: torch device the returned tensors are moved to.

        Returns:
            ``(x, y)``: two int64 tensors of shape ``(batch_size, block_size)``
            where ``y`` is ``x`` shifted one token to the right in the file.

        Raises:
            ValueError: if the file holds fewer than ``block_size + 1`` tokens.
        """
        # A window needs block_size inputs plus one more token for the shifted
        # targets, so valid start offsets are 0 .. n - block_size - 1 inclusive.
        n_starts = self.n - block_size
        if n_starts < 1:
            raise ValueError(
                f"dataset has {self.n} tokens but block_size={block_size} "
                f"needs at least {block_size + 1}"
            )
        # randint's upper bound is exclusive, so the last valid window is reachable.
        ix = np.random.randint(0, n_starts, size=(batch_size,))
        x = np.stack([self.data[i:i+block_size].astype(np.int64) for i in ix])
        y = np.stack([self.data[i+1:i+1+block_size].astype(np.int64) for i in ix])
        x = torch.from_numpy(x).to(device, non_blocking=True)
        y = torch.from_numpy(y).to(device, non_blocking=True)
        return x, y

@torch.no_grad()
def estimate_loss(model: nn.Module, val_ds: BinDataset) -> float:
    """Average the model's loss over EVAL_ITERS random validation batches.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning. Gradients are disabled by the decorator.
    """
    model.eval()

    def _one_batch_loss() -> float:
        # One forward pass under autocast; the model returns (logits, loss).
        inputs, targets = val_ds.get_batch(MICRO_BSZ, BLOCK_SIZE, DEVICE)
        with torch.autocast(device_type="cuda", dtype=DTYPE):
            _, batch_loss = model(inputs, targets)
        return batch_loss.item()

    losses = [_one_batch_loss() for _ in range(EVAL_ITERS)]
    model.train()
    return float(np.mean(losses))

def save_jsonl(path: Path, record: dict):
    """Append ``record`` to ``path`` as a single JSON line (file is created if missing)."""
    with path.open("a", encoding="utf-8") as sink:
        sink.write(f"{json.dumps(record)}\n")

def cleanup_full_checkpoints():
    """Delete all but the newest KEEP_LAST_FULL ``full_step_*.pt`` files in CKPT_DIR.

    Files are ordered by name; the zero-padded step number in the name makes
    that the same as ordering by step. Checkpoints written after a validation
    improvement use the same naming pattern, so they are pruned like any other.
    """
    by_step = sorted(CKPT_DIR.glob("full_step_*.pt"))
    if len(by_step) <= KEEP_LAST_FULL:
        return
    stale = by_step[:-KEEP_LAST_FULL]
    for old_ckpt in stale:
        old_ckpt.unlink(missing_ok=True)

def save_full_checkpoint(model, optim, step: int, best_val):
    """Write a resumable checkpoint for ``step`` and prune older ones.

    The file ``full_step_<step>.pt`` holds the step number, the model and
    optimizer state dicts, the model configuration (CFG as a dict) and the best
    validation loss seen so far.

    The data is first written to a temporary sibling file and then renamed onto
    the final name. An interrupted save therefore never leaves a truncated
    ``full_step_*.pt`` file that load_latest_checkpoint would pick as newest.
    """
    path = CKPT_DIR / f"full_step_{step:07d}.pt"

    # Remove temp files left behind by earlier interrupted saves.
    for leftover in CKPT_DIR.glob("full_step_*.pt.tmp"):
        leftover.unlink(missing_ok=True)

    # ".pt.tmp" does not match the "full_step_*.pt" pattern used for discovery.
    tmp_path = path.with_name(path.name + ".tmp")
    torch.save({
        "step": step,
        "model": model.state_dict(),
        "optim": optim.state_dict(),
        "cfg": asdict(CFG),
        "best_val": best_val,
    }, tmp_path)
    tmp_path.replace(path)
    print(f"[CKPT] saved FULL {path}")
    cleanup_full_checkpoints()

def load_latest_checkpoint(model, optim):
    """Restore model and optimizer from the newest checkpoint in CKPT_DIR.

    ``full_step_*.pt`` files are preferred. Only when none exist does the
    function fall back to the older ``ckpt_step_*.pt`` naming, where a missing
    optimizer state or step entry is tolerated.

    Returns:
        ``(step, best_val)`` from the loaded checkpoint, or ``(0, None)`` when
        no checkpoint is found.
    """
    # (glob pattern, whether missing "optim"/"step" entries are tolerated)
    sources = (
        ("full_step_*.pt", False),
        ("ckpt_step_*.pt", True),
    )
    for pattern, lenient in sources:
        candidates = sorted(CKPT_DIR.glob(pattern))
        if not candidates:
            continue

        path = candidates[-1]
        ckpt = torch.load(path, map_location="cpu")
        model.load_state_dict(ckpt["model"])

        if lenient:
            if "optim" in ckpt:
                optim.load_state_dict(ckpt["optim"])
            step0 = int(ckpt.get("step", 0))
            best_val = ckpt.get("best_val", None)
            print(f"[RESUME] loaded LEGACY {path} (step={step0})")
        else:
            optim.load_state_dict(ckpt["optim"])
            step0 = int(ckpt["step"])
            best_val = ckpt.get("best_val", None)
            print(f"[RESUME] loaded FULL {path} (step={step0})")
        return step0, best_val

    return 0, None

@torch.no_grad()
def generate_sample(model: nn.Module, tok: Tokenizer, val_ds: BinDataset, step: int):
    """Sample one continuation from the model and write it to SAMPLE_DIR.

    A prompt of GEN_PROMPT_TOKENS tokens is cut from a random offset of the
    validation data. Up to GEN_NEW_TOKENS tokens are then sampled one at a time
    using temperature scaling and, when TOP_K is set, top-k filtering. The
    decoded prompt plus continuation is written to ``sample_step_<step>.abc``.

    The model is put in eval mode while sampling and switched back to train
    mode before returning.

    Args:
        model: network returning ``(logits, loss)`` when called on token ids.
        tok: tokenizer used to look up the DELIM id and decode the result.
        val_ds: dataset the prompt is drawn from.
        step: optimizer step, used only for the output file name.
    """
    model.eval()
    eot_id = tok.token_to_id(DELIM)
    assert eot_id is not None

    start = np.random.randint(0, val_ds.n - (GEN_PROMPT_TOKENS + 1))
    prompt_ids = val_ds.data[start:start+GEN_PROMPT_TOKENS].astype(np.int64).tolist()
    out = torch.tensor([prompt_ids], dtype=torch.long, device=DEVICE)

    for _ in range(GEN_NEW_TOKENS):
        # Condition on at most the last BLOCK_SIZE tokens.
        out_cond = out[:, -BLOCK_SIZE:]
        with torch.autocast(device_type="cuda", dtype=DTYPE):
            logits, _ = model(out_cond)
        logits = logits[:, -1, :] / TEMPERATURE

        if TOP_K is not None:
            # torch.topk raises if k exceeds the vocabulary dimension, so clamp it.
            k = min(TOP_K, logits.size(-1))
            v, _ = torch.topk(logits, k)
            # Mask everything below the k-th largest logit.
            logits[logits < v[:, [-1]]] = -float("inf")

        probs = torch.softmax(logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)
        out = torch.cat([out, next_id], dim=1)

        if STOP_AT_EOT and next_id.item() == eot_id:
            break

    text = tok.decode(out[0].tolist())
    sample_path = SAMPLE_DIR / f"sample_step_{step:07d}.abc"
    sample_path.write_text(text, encoding="utf-8", newline="\n")
    print(f"[SAMPLE] wrote {sample_path}")
    model.train()

def main():
    """Resume (or start) training and run until MAX_STEPS optimizer steps.

    The newest checkpoint in CKPT_DIR, if any, restores the model, optimizer,
    step counter and best validation loss. Each step then sets the scheduled
    learning rate, accumulates gradients over GRAD_ACCUM micro-batches, clips
    them and applies one AdamW update. At the configured intervals it logs the
    running training loss and throughput, evaluates on the validation set,
    writes full checkpoints and generates a sample.

    Raises:
        FileNotFoundError: if the tokenizer or a dataset file is missing.
    """
    set_seed(SEED)
    torch.set_float32_matmul_precision("high")

    # Fail fast when the local dataset copy is missing.
    for p in [TOK_JSON, TRAIN_BIN, VAL_BIN]:
        if not p.exists():
            raise FileNotFoundError(f"Missing: {p}")

    tok = Tokenizer.from_file(str(TOK_JSON))
    print("EOT id:", tok.token_to_id(DELIM))

    train_ds = BinDataset(TRAIN_BIN)
    val_ds = BinDataset(VAL_BIN)
    print("train tokens:", train_ds.n)
    print("val tokens:", val_ds.n)

    model = GPT(CFG).to(DEVICE)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"Model params: {n_params/1e6:.2f}M")

    optim = AdamW(model.parameters(), lr=LR, betas=BETAS, weight_decay=WEIGHT_DECAY)

    # Keep the configuration written by the run that created the folder.
    if not CONFIG_JSON.exists():
        CONFIG_JSON.write_text(json.dumps({
            "run_name": RUN_NAME,
            "device": DEVICE,
            "dtype": str(DTYPE),
            "micro_bsz": MICRO_BSZ,
            "grad_accum": GRAD_ACCUM,
            "block_size": BLOCK_SIZE,
            "max_steps": MAX_STEPS,
            "lr": LR,
            "min_lr": MIN_LR,
            "warmup_steps": WARMUP_STEPS,
            "weight_decay": WEIGHT_DECAY,
            "betas": BETAS,
            "grad_clip": GRAD_CLIP,
            "eval_interval": EVAL_INTERVAL,
            "eval_iters": EVAL_ITERS,
            "save_full_every": SAVE_FULL_EVERY,
            "keep_last_full": KEEP_LAST_FULL,
            "sample_interval": SAMPLE_INTERVAL,
            "dataset": {"train_bin": str(TRAIN_BIN), "val_bin": str(VAL_BIN)},
            "model_cfg": asdict(CFG),
        }, indent=2), encoding="utf-8")

    step0, best_val = load_latest_checkpoint(model, optim)

    if step0 >= MAX_STEPS:
        # Nothing left to train. Saving here would store the loaded weights
        # under the wrong step number, so stop without writing anything.
        print(f"[DONE] checkpoint step {step0} already reaches MAX_STEPS={MAX_STEPS}; nothing to train")
        return

    if step0 > 0:
        # Re-seeding with the bare SEED on every resume would make the batch
        # sampler replay the index sequence of earlier invocations. Offsetting
        # by the resume step gives each resumed segment its own stream.
        set_seed(SEED + step0)

    model.train()
    t0 = time.time()
    running_loss = 0.0
    running_count = 0
    # Step of the most recent checkpoint, so the same file is not written twice
    # when the eval-best, periodic and final saves land on the same step.
    last_saved_step = None

    for step in range(step0, MAX_STEPS):
        lr = cosine_lr(step)
        for pg in optim.param_groups:
            pg["lr"] = lr

        optim.zero_grad(set_to_none=True)

        for _ in range(GRAD_ACCUM):
            x, y = train_ds.get_batch(MICRO_BSZ, BLOCK_SIZE, DEVICE)
            with torch.autocast(device_type="cuda", dtype=DTYPE):
                _, loss = model(x, y)
                # Scale so the accumulated gradient is the mean over micro-batches.
                loss = loss / GRAD_ACCUM
            loss.backward()
            # Undo the scaling so the logged value is the per-batch loss.
            running_loss += loss.item() * GRAD_ACCUM
            running_count += 1

        if GRAD_CLIP is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

        optim.step()

        if (step + 1) % LOG_INTERVAL == 0:
            dt = time.time() - t0
            avg_loss = running_loss / max(1, running_count)
            tokens_per_step = MICRO_BSZ * GRAD_ACCUM * BLOCK_SIZE
            # running_count is the number of micro-batches since the last log
            # line, so this stays correct even when the resume step is not a
            # multiple of LOG_INTERVAL.
            tok_per_sec = (running_count * MICRO_BSZ * BLOCK_SIZE) / max(1e-9, dt)

            save_jsonl(METRICS_JSONL, {
                "step": step + 1,
                "train_loss": avg_loss,
                "lr": lr,
                "tokens_per_step": tokens_per_step,
                "tokens_seen": (step + 1) * tokens_per_step,
                "tok_per_sec": tok_per_sec,
            })
            print(f"[LOG] step {step+1} | train_loss {avg_loss:.4f} | lr {lr:.2e} | tok/s {tok_per_sec:,.0f}")

            running_loss = 0.0
            running_count = 0
            t0 = time.time()

        # Time spent on evaluation, checkpointing and sampling is measured and
        # removed from the throughput window so tok/s reflects training only.
        side_t0 = time.time()

        if (step + 1) % EVAL_INTERVAL == 0:
            val_loss = estimate_loss(model, val_ds)
            save_jsonl(METRICS_JSONL, {"step": step + 1, "val_loss": val_loss})
            print(f"[EVAL] step {step+1} | val_loss {val_loss:.4f}")

            if best_val is None or val_loss < best_val:
                best_val = val_loss
                save_full_checkpoint(model, optim, step + 1, best_val)
                last_saved_step = step + 1

        if (step + 1) % SAVE_FULL_EVERY == 0 and last_saved_step != step + 1:
            save_full_checkpoint(model, optim, step + 1, best_val)
            last_saved_step = step + 1

        if (step + 1) % SAMPLE_INTERVAL == 0:
            generate_sample(model, tok, val_ds, step + 1)

        t0 += time.time() - side_t0

    if last_saved_step != MAX_STEPS:
        save_full_checkpoint(model, optim, MAX_STEPS, best_val)
    print("[DONE]")

if __name__ == "__main__":
    main()


Overwriting /content/train_v3_resume.py


In [None]:
!python /content/train_v3_resume.py

  _C._set_float32_matmul_precision(precision)
EOT id: 1
train tokens: 136393191
val tokens: 1277450
Model initialized: 206.64M parameters
Model params: 206.64M
[RESUME] loaded FULL /content/drive/MyDrive/MusicScalingProject/runs/V3_best_150M/checkpoints/full_step_0001200.pt (step=1200)
[LOG] step 1220 | train_loss 0.9596 | lr 2.94e-04 | tok/s 35,165
[LOG] step 1240 | train_loss 0.9663 | lr 2.94e-04 | tok/s 35,681
[LOG] step 1260 | train_loss 0.9632 | lr 2.94e-04 | tok/s 35,663
[LOG] step 1280 | train_loss 0.9601 | lr 2.93e-04 | tok/s 35,662
[LOG] step 1300 | train_loss 0.9545 | lr 2.93e-04 | tok/s 35,658
[LOG] step 1320 | train_loss 0.9536 | lr 2.93e-04 | tok/s 35,659
[LOG] step 1340 | train_loss 0.9372 | lr 2.93e-04 | tok/s 35,656
[LOG] step 1360 | train_loss 0.9343 | lr 2.92e-04 | tok/s 35,641
[LOG] step 1380 | train_loss 0.9483 | lr 2.92e-04 | tok/s 35,642
[LOG] step 1400 | train_loss 0.9283 | lr 2.92e-04 | tok/s 35,639
[LOG] step 1420 | train_loss 0.9401 | lr 2.92e-04 | tok/s 35,64