In [1]:
from IPython.display import display
from pathlib import Path
import sys
import timeit
import os
from contextlib import nullcontext

# The working directory is expected to sit one level below the project root;
# the `cs336-basics` directory next to it is pushed to the front of the import
# path (unless it is already there) so `cs336_basics` can be imported below.
project_dir = Path(os.path.abspath('')).parent
basics_path = project_dir.joinpath("cs336-basics").as_posix()
if not (sys.path[0] == basics_path):
    sys.path[:0] = [basics_path]

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

from cs336_basics.model import BasicsTransformerLM

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

%matplotlib inline

In [2]:
# Size presets keyed by name. Each value is the set of keyword arguments that
# benchmark_model forwards to BasicsTransformerLM, listed below in the order
# (d_model, d_ff, num_layers, num_heads).
model_configs = {
    preset: dict(zip(("d_model", "d_ff", "num_layers", "num_heads"), dims))
    for preset, dims in (
        ("small", (768, 3072, 12, 12)),
        ("medium", (1024, 4096, 24, 16)),
        ("large", (1280, 5120, 36, 20)),
        ("xl", (1600, 6400, 48, 25)),
        ("2.7B", (2560, 10240, 32, 32)),
    )
}

def benchmark_model(
    model_name,
    context_length: int,
    warmup_steps: int,
    exe_steps,
    only_forward: bool,
    dtype=None,
):
    """Time forward (and optionally backward) steps of one model preset.

    A ``BasicsTransformerLM`` is built from ``model_configs[model_name]`` and
    fed a fixed random batch of 4 token sequences. ``warmup_steps`` untimed
    steps are run first, then ``exe_steps`` timed steps. Each step ends with
    ``torch.cuda.synchronize()`` when CUDA is available, so the wall-clock
    measurement includes the GPU work launched by that step.

    Args:
        model_name: Key into ``model_configs``.
        context_length: Sequence length of the random input batch; also passed
            to the model constructor.
        warmup_steps: Number of untimed steps executed before measuring.
        exe_steps: Number of timed steps.
        only_forward: If True, time only ``model(x)``; otherwise each step also
            computes ``logits.sum()`` and calls ``backward()`` on it.
        dtype: ``None`` for no autocast, or ``'float16'`` / ``'bfloat16'`` to
            run the forward pass (and loss) under ``torch.autocast``.

    Returns:
        ``(ts, param_count)`` where ``ts`` is a 1-D float array holding the
        duration of each timed step in seconds and ``param_count`` is the
        total number of elements across the model's parameters.

    Raises:
        ValueError: If ``dtype`` is not one of the supported values.
    """
    # Validate first so a typo fails immediately instead of after a large
    # model has been built and moved to the device.
    if dtype is None:
        autocast_dtype = None
    elif dtype == 'float16':
        autocast_dtype = torch.float16
    elif dtype == 'bfloat16':
        autocast_dtype = torch.bfloat16
    else:
        raise ValueError(
            f"Unsupported dtype {dtype!r}; expected None, 'float16' or 'bfloat16'."
        )

    batch_size = 4
    vocab_size = 10000
    rope_theta = 10000

    # Negative counts behave like zero, matching an empty range().
    n_warmup = max(warmup_steps, 0)
    n_timed = max(exe_steps, 0)

    model = x = logits = loss = None
    step_times = []
    try:
        x = torch.randint(0, vocab_size, (batch_size, context_length)).to(device)
        model = BasicsTransformerLM(
            vocab_size=vocab_size,
            context_length=context_length,
            rope_theta=rope_theta,
            **model_configs[model_name],
        )
        param_count = sum(p.numel() for p in model.parameters())
        model.to(device)

        for step in range(n_warmup + n_timed):
            # A fresh autocast context per step, wrapping only forward + loss:
            # PyTorch advises against running backward() under autocast.
            if autocast_dtype is None:
                amp = nullcontext()
            else:
                amp = torch.autocast(device_type=device.type, dtype=autocast_dtype)

            step_start = timeit.default_timer()
            with amp:
                logits = model(x)
                if not only_forward:
                    loss = logits.sum()
            if not only_forward:
                loss.backward()
            if torch.cuda.is_available():
                # Kernels launch asynchronously; wait for them before reading the clock.
                torch.cuda.synchronize()
            elapsed = timeit.default_timer() - step_start

            if step >= n_warmup:
                step_times.append(elapsed)
    finally:
        # Drop our references on every exit path. When a step raises (e.g. an
        # out-of-memory error), the stored traceback keeps this frame alive,
        # and any tensors still bound here would stay allocated.
        model = x = logits = loss = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return np.asarray(step_times, dtype=float), param_count

### Standard benchmarking

In [3]:
def benchmark(warmup_steps=5, exe_steps=10, only_forward=True, dtype=None):
    """Benchmark the small, medium and large presets and display the results.

    Each preset is run through ``benchmark_model`` with a context length of
    64. Two tables are then displayed: a summary with one row per preset
    (formatted parameter count, mean and standard deviation of the step
    times, and the settings used), and the raw per-step timings with one
    column per preset.
    """
    summary_columns = [
        "model_name",
        "param_count",
        "t_mean",
        "t_std",
        "warmup_steps",
        "exe_steps",
        "only_forward",
    ]
    summary_rows = []
    step_times = {}

    for model_name in ("small", "medium", "large"):
        step_times[model_name], param_count = benchmark_model(
            model_name=model_name,
            context_length=64,
            warmup_steps=warmup_steps,
            exe_steps=exe_steps,
            only_forward=only_forward,
            dtype=dtype,
        )
        timings = step_times[model_name]
        summary_rows.append(
            [
                model_name,
                f"{param_count:,}",
                np.mean(timings),
                np.std(timings),
                warmup_steps,
                exe_steps,
                only_forward,
            ]
        )
        # Return cached GPU blocks to the driver before the next preset is built.
        torch.cuda.empty_cache()

    summary_table = pd.DataFrame(summary_rows, columns=summary_columns)
    step_table = pd.DataFrame.from_dict(step_times)
    display(summary_table)
    display(step_table)

In [4]:
# Full-precision sweep: for each warm-up setting, time forward-only first and
# then forward + backward.
for warmup_steps in (5, 0, 1):
    for only_forward in (True, False):
        benchmark(warmup_steps=warmup_steps, exe_steps=10, only_forward=only_forward)

Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.017236,0.001097,5,10,True
1,medium,423183360,0.047553,0.002354,5,10,True
2,large,969411840,0.109274,0.002365,5,10,True


Unnamed: 0,small,medium,large
0,0.01757,0.04531,0.105904
1,0.016165,0.050834,0.111613
2,0.015983,0.045551,0.108106
3,0.017824,0.050295,0.111589
4,0.017431,0.04534,0.107513
5,0.018218,0.049771,0.111594
6,0.017603,0.046603,0.106616
7,0.016093,0.045586,0.112043
8,0.016017,0.050657,0.106777
9,0.019456,0.045586,0.110986


Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.05398,0.002627,5,10,False
1,medium,423183360,0.154339,0.002187,5,10,False
2,large,969411840,0.357708,0.002046,5,10,False


Unnamed: 0,small,medium,large
0,0.057018,0.152815,0.356307
1,0.051686,0.15504,0.358699
2,0.05549,0.159287,0.355934
3,0.053052,0.151429,0.35975
4,0.05142,0.156862,0.356346
5,0.058871,0.154656,0.355916
6,0.052897,0.154147,0.357503
7,0.05136,0.152658,0.362732
8,0.056537,0.15367,0.356751
9,0.051464,0.152825,0.357143


Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.018197,0.002373,0,10,True
1,medium,423183360,0.048556,0.002472,0,10,True
2,large,969411840,0.109451,0.002257,0,10,True


Unnamed: 0,small,medium,large
0,0.022651,0.051243,0.111048
1,0.021753,0.04587,0.107782
2,0.016453,0.050496,0.107841
3,0.015946,0.051536,0.108691
4,0.016127,0.046025,0.109922
5,0.019896,0.047267,0.111452
6,0.017677,0.051185,0.106531
7,0.016138,0.045778,0.114554
8,0.016349,0.05041,0.109111
9,0.018978,0.04575,0.107577


Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.054426,0.003223,0,10,False
1,medium,423183360,0.152946,0.002279,0,10,False
2,large,969411840,0.356832,0.003856,0,10,False


Unnamed: 0,small,medium,large
0,0.060921,0.15111,0.345317
1,0.051647,0.152061,0.357351
2,0.0518,0.150462,0.357837
3,0.057188,0.158484,0.358111
4,0.052011,0.150965,0.357963
5,0.057814,0.151993,0.358402
6,0.052701,0.155099,0.358313
7,0.051565,0.153628,0.358226
8,0.056645,0.153519,0.358846
9,0.051964,0.152136,0.357957


Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.017764,0.001754,1,10,True
1,medium,423183360,0.058862,0.032337,1,10,True
2,large,969411840,0.110124,0.003601,1,10,True


Unnamed: 0,small,medium,large
0,0.021799,0.155552,0.107002
1,0.016971,0.045508,0.110686
2,0.016499,0.051061,0.106386
3,0.019467,0.045501,0.114475
4,0.017713,0.051395,0.108863
5,0.016066,0.045963,0.116643
6,0.016084,0.051071,0.10607
7,0.019119,0.045395,0.112255
8,0.017465,0.051316,0.106217
9,0.016454,0.045857,0.112645


Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.054516,0.002171,1,10,False
1,medium,423183360,0.153355,0.00221,1,10,False
2,large,969411840,0.358409,0.001448,1,10,False


Unnamed: 0,small,medium,large
0,0.056561,0.152841,0.355244
1,0.052324,0.151673,0.36125
2,0.055069,0.155238,0.358981
3,0.053385,0.150974,0.35733
4,0.051764,0.154938,0.358233
5,0.057304,0.151706,0.358071
6,0.051999,0.158358,0.359098
7,0.057619,0.151039,0.359238
8,0.05295,0.152714,0.358142
9,0.056184,0.154066,0.358502


In [None]:
# bfloat16 autocast sweep: for each warm-up setting, time forward-only first
# and then forward + backward.
for warmup_steps in (5, 0, 1):
    for only_forward in (True, False):
        benchmark(warmup_steps=warmup_steps, exe_steps=10, only_forward=only_forward, dtype='bfloat16')

Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.023921,0.001851,5,10,True
1,medium,423183360,0.066699,0.002666,5,10,True
2,large,969411840,0.150513,0.001876,5,10,True


Unnamed: 0,small,medium,large
0,0.027499,0.064592,0.151622
1,0.022922,0.06856,0.147038
2,0.022441,0.064387,0.152126
3,0.026493,0.068805,0.151324
4,0.022801,0.064325,0.149745
5,0.022838,0.068918,0.152354
6,0.026076,0.064772,0.150617
7,0.022637,0.072467,0.151863
8,0.022439,0.065644,0.147046
9,0.023067,0.064523,0.151398


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 11.53 GiB of which 17.12 MiB is free. Process 14835 has 257.06 MiB memory in use. Process 349757 has 144.00 MiB memory in use. Including non-PyTorch memory, this process has 10.59 GiB memory in use. Of the allocated memory 10.34 GiB is allocated by PyTorch, and 137.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
# float16 autocast sweep: for each warm-up setting, time forward-only first
# and then forward + backward.
for warmup_steps in (5, 0, 1):
    for only_forward in (True, False):
        benchmark(warmup_steps=warmup_steps, exe_steps=10, only_forward=only_forward, dtype='float16')

Unnamed: 0,model_name,param_count,t_mean,t_std,warmup_steps,exe_steps,only_forward
0,small,128625408,0.015496,0.000308,5,10,True
1,medium,423183360,0.033073,0.003043,5,10,True
2,large,969411840,0.048348,0.002105,5,10,True


Unnamed: 0,small,medium,large
0,0.015774,0.033309,0.051147
1,0.015279,0.035062,0.048912
2,0.015694,0.03182,0.052684
3,0.015267,0.041419,0.048152
4,0.014971,0.030346,0.046484
5,0.015943,0.031067,0.046729
6,0.015619,0.03227,0.049247
7,0.015257,0.032207,0.045571
8,0.015285,0.031256,0.046818
9,0.015874,0.031976,0.047738


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 11.53 GiB of which 24.44 MiB is free. Process 14835 has 257.06 MiB memory in use. Process 349757 has 144.00 MiB memory in use. Including non-PyTorch memory, this process has 10.60 GiB memory in use. Of the allocated memory 10.34 GiB is allocated by PyTorch, and 137.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)