# Benchmark

> Deep Learning GPU benchmark



Run a standard PyTorch training loop on an image classifier model of your choice with a specified batch size and FP16/FP32 precision. The result is a measure of throughput: the number of training samples processed per second. It can be synced to [Weights & Biases](https://wandb.ai/xl0/ready-steady-go). See more in [CLI](cli.html).

> Note: The data never leaves the GPU, and the throughput should be mostly independent of the rest of the system, at least for larger batch sizes.

In [None]:
#| default_exp benchmark

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
# |export
# |hide
import os
import time
from itertools import count
import torch
from torch import nn
from torch.nn import functional as F
from torch.cuda.amp.autocast_mode import autocast

import timm
from tqdm.auto import tqdm

In [None]:
# |exports

def benchmark(model: nn.Module, # Model to run
                bs: int =32,    # Batch size
                n_batches: int =None,  # Number of batches to run. `n_seconds` must be None
                n_seconds: int =None,  # Number of seconds to run. `n_batches` must be None
                fp16: bool =False,     # Use Automatic Mixed Precision
                size: int=224,         # Mock-train on this size "images"
                dev: torch.device=torch.device("cuda:0"),): # Device to run on
    """Mock-train the model on random noise input.

    Runs forward + backward + SGD step on one fixed random batch, either for
    `n_batches` timed batches or until `n_seconds` of timed run have passed.
    The very first iteration is treated as warm-up and is not timed or counted.

    Returns a tuple `(elapsed_seconds, n_samples)`; throughput is
    `n_samples / elapsed_seconds`.

    Raises `AssertionError` unless exactly one of `n_batches` / `n_seconds`
    is given, or if cuDNN is not available.
    """

    # There can be only one
    assert not n_batches or not n_seconds, "Specify only one of n_batches / n_seconds"
    assert n_batches or n_seconds, "Specify one of n_batches / n_seconds"

    # Let cuDNN auto-tune its algorithms; the tuning cost lands in the
    # discarded warm-up iteration.
    torch.backends.cudnn.benchmark=True
    assert torch.backends.cudnn.is_available(), "cuDNN is not available"

    model.to(dev)
    optim = torch.optim.SGD(model.parameters(), lr=0.00001, weight_decay=0.00005, momentum=0.9)

    # Snapshot of the initial weights, restored before every step so that each
    # iteration starts from the same parameters.
    # NOTE(review): the snapshot lives on the CPU, so every iteration includes a
    # host -> device copy of the whole state dict inside the timed region.
    # Kept as-is so results stay comparable with earlier runs.
    state = { k : v.cpu() for k,v in model.state_dict().items() }

    # One fixed batch of noise "images" and random labels, created on the device.
    X = torch.randn((bs, 3, size, size), device=dev)
    # Labels are drawn from [0, 999) - assumes the model has at least 999 outputs.
    y = torch.randint(0, 999, (bs,), device=dev)

    if n_batches:
        pbar = tqdm(total=n_batches, unit="Batch")
    else:
        pbar = tqdm(total=n_seconds,
            bar_format="{l_bar}{bar}| {n:.1f}/{total} s [{elapsed}<{remaining} {postfix}]")

    # CUDA kernels are launched asynchronously, so the host clock must only be
    # read after the device has finished the work queued so far.
    sync_dev = torch.device(dev)
    needs_sync = sync_dev.type == "cuda"

    start_time = last_time = 0
    for c in count():

        model.load_state_dict(state)

        with autocast(enabled=fp16):
            yhat = model(X)
            loss = F.cross_entropy(yhat, y)

        loss.backward()
        optim.step()

        if needs_sync:
            torch.cuda.synchronize(sync_dev)
        tt=time.time()
        optim.zero_grad(set_to_none=True)

        if c == 0:
            # Warm-up iteration: start the clock at its end and do not count it.
            last_time = start_time = tt
        else:
            if n_batches:
                pbar.update()
                # Note: c starts with 0, but we discard the first iteration
                if c == n_batches:
                    break
            else:
                iter_time =  tt - last_time
                run_time = tt - start_time
                pbar.update(iter_time)
                if run_time >= n_seconds:
                    break
                last_time = tt

    pbar.close()

    # `tt` is the synchronized timestamp taken right after the last timed step,
    # so the elapsed time covers exactly the `c` counted iterations and excludes
    # progress-bar teardown.
    return ((tt - start_time), c*bs)


In [None]:
# |hide
# |eval: false

def hammer_transfer(n_iter = None,
        n_seconds=100,
        buffer_sz=(1024, 1024, 1024),
        dev = torch.device("cuda:0")):
    """Stress-test host <-> device transfers by round-tripping a random buffer.

    Each iteration copies a CPU buffer of shape `buffer_sz` to `dev`, multiplies
    and divides it by 2 there, copies it back and compares it element-wise with
    the CPU reference. A mismatch is reported on the progress bar.

    Runs exactly `n_iter` iterations when `n_iter` is given (truthy); otherwise
    runs until more than `n_seconds` seconds have elapsed. Returns nothing.
    """

    X = torch.randn(buffer_sz)

    # Apply the same arithmetic to the CPU reference that the device copy gets.
    X *= 2
    X /= 2


    if n_iter:
        pbar = tqdm(total=n_iter, unit="Iter")
    else:
        pbar = tqdm(total=n_seconds,
            bar_format="{l_bar}{bar}| {n:.1f}/{total} s [{elapsed}<{remaining} {postfix}]")

    last_time = start_time = time.time()
    for c in count():
        X1 = X.to(dev)

        X1 *= 2
        X1 /= 2

        # Presumably a manual fault-injection switch for testing the check below:
        #X1[0,0,0] = 1

        X1 = X1.cpu()

        if not X1.eq(X).all():
            pbar.write("Your GPU might be faulty")


        curr_time = time.time()

        if n_iter:
            pbar.update()
            # `c` is zero-based, so `c + 1` iterations have completed here.
            # Stopping at `c > n_iter` ran two iterations too many and pushed
            # the progress bar past its total.
            if c + 1 >= n_iter: break
        else:
            pbar.update(curr_time - last_time)
            if curr_time - start_time > n_seconds: break

        last_time = curr_time

    pbar.close()


In [None]:
# |hide
# |eval: false

def compare_params(p1, p2):
    """Compare two sequences of tensors pairwise.

    Returns a 1-D boolean tensor with one entry per pair; an entry is True
    when every element of the two tensors in that pair is equal.
    """
    matches = []
    for left, right in zip(p1, p2):
        # Reduce each pair to a single flag, shaped (1,) so the flags can be concatenated.
        all_equal = left.eq(right).all()
        matches.append(all_equal.unsqueeze(-1))
    return torch.cat(matches)


def hammer_gpu(model: nn.Module,           # Model to run
                bs: int =32,           # Batch size
                n_batches: int =None,  # Number of batches to run. `n_seconds` must be None
                n_seconds: int =None,  # Number of seconds to run. `n_batches` must be None
                fp16: bool =False,     # Use Automatic Mixed Precision
                size: int=224,         # Mock-train on this size "images"
                dev: torch.device=torch.device("cuda:0"),): # Device to run on
    """Stress-test the GPU by repeating one identical training step.

    With deterministic algorithms enabled, the same step (same weights, same
    random batch) is run over and over. The parameters produced by the first
    step are kept as a reference, and every later step is compared against
    them; a mismatch is reported on the progress bar.

    Side effects: sets the `CUBLAS_WORKSPACE_CONFIG` environment variable and
    calls `torch.use_deterministic_algorithms(True)` for the whole process.

    Returns a tuple `(elapsed_seconds, n_samples)`, not counting the first
    (reference) iteration. Raises `AssertionError` unless exactly one of
    `n_batches` / `n_seconds` is given, or if cuDNN is not available.
    """

    os.environ["CUBLAS_WORKSPACE_CONFIG"]=":4096:8"
    torch.use_deterministic_algorithms(True)

    # There can be only one
    assert not n_batches or not n_seconds
    assert n_batches or n_seconds

    torch.backends.cudnn.benchmark=True
    assert torch.backends.cudnn.is_available()

    model.to(dev)
    optim = torch.optim.SGD(model.parameters(), lr=0.1, weight_decay=0.005, momentum=0.)

    # Snapshot of the initial weights, restored before every step so that each
    # iteration performs exactly the same computation.
    state = { k : v.clone() for k,v in model.state_dict().items() }

    X = torch.randn((bs, 3, size, size), device=dev)
    y = torch.randint(0, 999, (bs,), device=dev)

    if n_batches:
        pbar = tqdm(total=n_batches, unit="Batch")
    else:
        pbar = tqdm(total=n_seconds,
            bar_format="{l_bar}{bar}| {n:.1f}/{total} s [{elapsed}<{remaining} {postfix}]")

    start_time = last_time = 0
    for c in count():

        model.load_state_dict(state)

        with autocast(enabled=fp16):
            yhat = model(X)
            loss = F.cross_entropy(yhat, y)

        loss.backward()
        optim.step()

        # Take the timestamp before it is used below; reading `tt` in the
        # first-iteration branch before assigning it raised UnboundLocalError.
        tt=time.time()

        if not c:
            # Reference parameters produced by the first step.
            p1 = [ p.detach().clone() for p in model.parameters() ]
            # Note: we ignore the first batch.
            last_time = start_time = tt
        else:
            pn = [ p.detach() for p in model.parameters() ]
            if not compare_params(p1, pn).all():
                pbar.write("Your GPU might be faulty!")

        optim.zero_grad(set_to_none=True)

        if n_batches:
            if c != 0:
                pbar.update()
                # Note: c starts with 0, but we discard the first iteration
                if c == n_batches:
                    break
        else:
            if last_time:
                iter_time =  tt - last_time
                run_time = tt - start_time
                pbar.update(iter_time)
                if run_time >= n_seconds:
                    break
                last_time = tt
    pbar.close()

    return ((time.time() - start_time), c*bs)


In [None]:
# |eval: false
# Time-boxed run: mock-train a VGG-11 without pretrained weights for about 10 seconds.
# The result is (elapsed seconds, number of samples processed).
model = timm.create_model("vgg11", pretrained=False)
benchmark(model, n_seconds=10)

  0%|          | 0.0/10 s [00:00<? ]

(10.038218975067139, 1888)

In [None]:
# |eval: false
# Fixed-length run: 10 timed batches (the warm-up iteration is not counted).
benchmark(model, n_batches=10)

  0%|          | 0/10 [00:00<?, ?Batch/s]

(1.6976494789123535, 320)

In [None]:
#| hide
# Run nbdev's export step for this project's notebooks.
import nbdev; nbdev.nbdev_export()