In [1]:
from IPython.display import display
from pathlib import Path
import sys
import timeit
import os
from contextlib import nullcontext

project_dir = Path(os.path.abspath('')).parent
basics_path = (project_dir / "cs336-basics").as_posix()
if sys.path[0] != basics_path:
    sys.path.insert(0, basics_path)

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

from cs336_basics.model import BasicsTransformerLM

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

%matplotlib inline

In [2]:
model_configs = {
    "small": {"d_model": 768, "d_ff": 3072, "num_layers": 12, "num_heads": 12}, # fmt:skip
    "medium": {"d_model": 1024, "d_ff": 4096, "num_layers": 24, "num_heads": 16}, # fmt:skip
    "large": {"d_model": 1280, "d_ff": 5120, "num_layers": 36, "num_heads": 20}, # fmt:skip
    "xl": {"d_model": 1600, "d_ff": 6400, "num_layers": 48, "num_heads": 25}, # fmt:skip
    "2.7B": {"d_model": 2560, "d_ff": 10240, "num_layers": 32, "num_heads": 32}, # fmt:skip
}

def benchmark_model(
    model_name,
    context_length: int,
    warmup_steps: int,
    exe_steps,
    only_forward: bool,
    dtype=None,
):
    batch_size = 4
    vocab_size = 10000
    rope_theta = 10000
    x = torch.randint(0, vocab_size, (batch_size, context_length)).to(device)
    model = BasicsTransformerLM(
        vocab_size=vocab_size,
        context_length=context_length,
        rope_theta=rope_theta,
        **model_configs[model_name],
    )
    param_count = 0
    for p in model.parameters():
        param_count += p.numel()
    model.to(device)

    if dtype is None:
        context = nullcontext()
    elif dtype=='float16':
        context = torch.autocast(device_type="cuda", dtype=torch.float16)
    elif dtype=='bfloat16':
        context = torch.autocast(device_type="cuda", dtype=torch.bfloat16)
    else:
        raise
    with context:
        for _ in range(warmup_steps):
            logits = model(x)
            if not only_forward:
                loss = logits.sum()
                loss.backward()
            if torch.cuda.is_available():
                torch.cuda.synchronize()

        torch.cuda.memory._record_memory_history(max_entries=1000000)
        start = timeit.default_timer()
        ts = []
        logits = model(x)
        if not only_forward:
            loss = logits.sum()
            loss.backward()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        ts.append(timeit.default_timer() - start)
        path = project_dir/"data"/"profile_memory"/f"modelname_{model_name}-context_length_{context_length}-only_forward_{int(only_forward)}.pickle"
        path.parent.mkdir(parents=True, exist_ok=True)
        torch.cuda.memory._dump_snapshot(path.as_posix())
        torch.cuda.memory._record_memory_history(enabled=None)
    ts = np.diff(ts, prepend=0)
    del x, model
    return ts, param_count

### Standard benchmarking

In [None]:
def benchmark(warmup_steps=5, exe_steps=10, only_forward=True, dtype=None):
    rows = []
    ts = {}
    for model_name in ["small"]:
        ts[model_name], param_count = benchmark_model(
            model_name=model_name,
            context_length=64,
            warmup_steps=warmup_steps,
            exe_steps=exe_steps,
            only_forward=only_forward,
            dtype=dtype
        )
        rows.append(
            [
                model_name,
                format(param_count, ","),
                np.mean(ts[model_name]),
                np.std(ts[model_name]),
                warmup_steps,
                exe_steps,
                only_forward,
            ]
        )
        torch.cuda.empty_cache()

    df = pd.DataFrame(
        rows,
        columns=[
            "model_name",
            "param_count",
            "t_mean",
            "t_std",
            "warmup_steps",
            "exe_steps",
            "only_forward",
        ],
    )
    df_ts = pd.DataFrame.from_dict(ts)
    display(df)
    display(df_ts)

In [5]:
benchmark(warmup_steps=5, exe_steps=10, only_forward=True)
benchmark(warmup_steps=5, exe_steps=10, only_forward=False)

Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.019906,0.0,5,10,True


Unnamed: 0,small
0,0.019906


Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.03994,0.0,5,10,False


Unnamed: 0,small
0,0.03994
