# PyTorch GPU Workload Efficiency Benchmarks (Colab)

This notebook runs reproducible GPU performance benchmarks, captures `torch.profiler` traces, and plots a batch-size sweep.

**Important:** Run cells in order. The install cell ends by restarting the Colab runtime automatically; once it has restarted, continue from section 2.


In [None]:
!nvidia-smi
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))


## 1) Install pinned dependencies (Colab-safe)

This avoids conflicts with Colab's preinstalled packages (e.g., pandas 2.2.2). The runtime will restart at the end of this cell.


In [None]:
# Remove the existing copies of the packages that get pinned below.
# `2>/dev/null` discards pip's stderr output for this step.
!pip -q uninstall -y pandas tensorboard torch torchvision torchaudio 2>/dev/null

# Pinned versions; they match the requirements.txt written later in this notebook.
!pip -q install pandas==2.2.2
# The PyTorch packages are pulled from the cu121 wheel index.
!pip -q install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
!pip -q install tensorboard==2.19.0 matplotlib==3.8.4

import os
# Send signal 9 to the kernel's own process so the runtime restarts;
# nothing placed after this line in the cell will run.
os.kill(os.getpid(), 9)  # restart runtime


## 2) Verify environment (run after restart)


In [None]:
import torch, pandas as pd, tensorboard

# Report the versions that were actually imported after the restart.
print("torch:", torch.__version__)
gpu_visible = torch.cuda.is_available()
print("cuda:", gpu_visible)
if gpu_visible:
    print("gpu:", torch.cuda.get_device_name(0))
print("pandas:", pd.__version__)
print("tensorboard:", tensorboard.__version__)


## 3) Write repo files

Creates a mini repo under `/content/pytorch-gpu-workload-efficiency` with:
- CLI benchmark runner
- torch.profiler trace support
- batch-size sweep
- plotting scripts

**Note:** `src/` is a proper Python package and is run via `python -m ...` to avoid import errors.


In [None]:

import os, textwrap, pathlib

# Root directory of the generated mini repo; every file below is written relative to it.
ROOT = "/content/pytorch-gpu-workload-efficiency"
os.makedirs(ROOT, exist_ok=True)

def write(rel_path, content):
    """Write `content` (dedented) to `rel_path` under ROOT and return the file path as a string.

    Missing parent directories are created. An existing file is overwritten.
    """
    target = pathlib.Path(ROOT).joinpath(rel_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    dedented = textwrap.dedent(content)
    target.write_text(dedented)
    return str(target)

# Git ignore rules for the generated repo. Each literal starts with a newline,
# so every file written this way begins with an empty first line.
# NOTE(review): `assets/` is ignored here, yet the sweep plots and the profiler
# screenshot are saved under `assets/` — confirm they are not meant to be committed.
write(".gitignore", """
.venv/
__pycache__/
*.pyc
.ipynb_checkpoints/
outputs/
assets/
""")

# Same version pins as the install cell at the top of this notebook.
write("requirements.txt", """
torch==2.5.1
torchvision==0.20.1
torchaudio==2.5.1
pandas==2.2.2
matplotlib==3.8.4
tensorboard==2.19.0
""")

# Make src a package (empty __init__.py) so the benchmarks can be run with `python -m src.<module>`.
write("src/__init__.py", """""")

write("src/metrics.py", """
import torch
import statistics

def synchronize_if_cuda():
    '''Call torch.cuda.synchronize() when CUDA is available; otherwise do nothing.'''
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

def latency_stats(latencies_ms):
    '''Summarize per-step latencies (milliseconds).

    Args:
        latencies_ms: any iterable of numbers; it is copied and sorted, not mutated.

    Returns:
        dict with keys p50_ms, p95_ms and mean_ms. Percentiles pick the sample at
        index round(p/100 * (n - 1)) of the sorted data. All three values are
        None when the input is empty.
    '''
    ordered = sorted(latencies_ms)
    if not ordered:
        return {"p50_ms": None, "p95_ms": None, "mean_ms": None}

    def sample_at(p):
        # Nearest-index percentile over the sorted samples.
        idx = int(round((p/100) * (len(ordered)-1)))
        return ordered[idx]

    return {
        "p50_ms": sample_at(50),
        "p95_ms": sample_at(95),
        "mean_ms": statistics.mean(ordered),
    }
""")

write("src/optimizations.py", """
import torch
from contextlib import nullcontext

def apply_channels_last(model):
    '''Convert the model to torch.channels_last memory format and return the result of model.to().'''
    target_format = torch.channels_last
    return model.to(memory_format=target_format)

def maybe_compile(model, enabled: bool):
    '''Return torch.compile(model) when requested and available, else the model unchanged.

    Args:
        model: the module (or callable) to compile.
        enabled: whether compilation was requested.

    Returns:
        The compiled model, or the original model when compilation is disabled,
        torch.compile does not exist, or the torch.compile call raises.

    If the torch.compile call raises, a RuntimeWarning is emitted before falling
    back, so a run labelled as compiled is not silently measured in eager mode.
    '''
    # torch.compile is available in PyTorch 2.x.
    if not (enabled and hasattr(torch, "compile")):
        return model
    try:
        return torch.compile(model)
    except Exception as exc:
        # Best-effort fallback is kept, but it is no longer silent.
        import warnings
        warnings.warn(
            f"torch.compile failed ({exc!r}); falling back to the uncompiled model",
            RuntimeWarning,
        )
        return model

def autocast_ctx(enabled: bool):
    '''Return a context manager for the forward pass.

    With CUDA available this is torch.autocast for float16 on the cuda device,
    switched on or off by `enabled`. Without CUDA it is a no-op nullcontext,
    regardless of `enabled`.
    '''
    if not torch.cuda.is_available():
        return nullcontext()
    return torch.autocast(device_type="cuda", dtype=torch.float16, enabled=enabled)
""")

write("src/profilers.py", """
import os
import torch

def maybe_profile(enabled: bool, profile_dir: str):
    '''Build a torch.profiler.profile object, or return None when profiling is off.

    Args:
        enabled: when False nothing is created and None is returned.
        profile_dir: directory handed to tensorboard_trace_handler; created if missing.

    Returns:
        An un-entered torch.profiler.profile that tracks CPU activity (plus CUDA
        when available) using schedule(wait=1, warmup=1, active=3, repeat=1).
        The caller must enter it and call step() once per iteration.
    '''
    if not enabled:
        return None

    os.makedirs(profile_dir, exist_ok=True)

    tracked = [torch.profiler.ProfilerActivity.CPU]
    if torch.cuda.is_available():
        tracked += [torch.profiler.ProfilerActivity.CUDA]

    step_schedule = torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1)
    trace_writer = torch.profiler.tensorboard_trace_handler(profile_dir)
    return torch.profiler.profile(
        activities=tracked,
        schedule=step_schedule,
        on_trace_ready=trace_writer,
        record_shapes=True,
        profile_memory=True,
        with_stack=False,
    )
""")

write("src/benchmark_infer.py", r"""
import argparse, os, json, csv
import torch
import torchvision.models as models

from .metrics import synchronize_if_cuda, latency_stats
from .optimizations import apply_channels_last, maybe_compile, autocast_ctx
from .profilers import maybe_profile

def get_model(name: str):
    '''Build a torchvision model with its DEFAULT pretrained weights.

    `name` is matched case-insensitively against resnet50 and efficientnet_b0;
    any other name raises ValueError.
    '''
    key = name.lower()
    # Lambdas defer construction (and weight loading) until the name is known to be valid.
    builders = {
        "resnet50": lambda: models.resnet50(weights=models.ResNet50_Weights.DEFAULT),
        "efficientnet_b0": lambda: models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT),
    }
    if key not in builders:
        raise ValueError(f"Unsupported model: {key}")
    return builders[key]()

def _ensure_parent_dir(path):
    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
    # so only create the directory when the path actually has one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def main():
    '''Benchmark forward-pass latency and throughput for one model configuration.

    Parses the CLI flags, runs `--warmup` untimed forward passes and then
    `--steps` timed ones on a fixed random batch (CUDA events on GPU,
    time.perf_counter on CPU). Optionally records a torch.profiler trace.
    The result is appended as a row to `--out-csv`, written to `--out-json`
    and printed as JSON.
    '''
    import time
    from contextlib import nullcontext

    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="resnet50")
    ap.add_argument("--batch-size", type=int, default=32)
    ap.add_argument("--steps", type=int, default=120)
    ap.add_argument("--warmup", type=int, default=20)

    ap.add_argument("--amp", action="store_true")
    ap.add_argument("--channels-last", action="store_true")
    ap.add_argument("--compile", action="store_true")

    ap.add_argument("--profile", action="store_true")
    ap.add_argument("--profile-dir", default="outputs/profiler_traces")

    ap.add_argument("--out-csv", default="outputs/results.csv")
    ap.add_argument("--out-json", default="outputs/result.json")
    args = ap.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = get_model(args.model).eval().to(device)
    if args.channels_last:
        model = apply_channels_last(model)
    model = maybe_compile(model, args.compile)

    # One fixed random batch is reused for every step.
    x = torch.randn(args.batch_size, 3, 224, 224, device=device)
    if args.channels_last:
        x = x.to(memory_format=torch.channels_last)

    # Warmup (untimed)
    with torch.no_grad():
        for _ in range(args.warmup):
            with autocast_ctx(args.amp):
                _ = model(x)
        synchronize_if_cuda()

    prof = maybe_profile(args.profile, args.profile_dir)
    # Entering the profiler through `with` guarantees it is closed even if a step raises.
    profiler_ctx = prof if prof is not None else nullcontext()
    latencies = []
    total_images = 0

    with torch.no_grad(), profiler_ctx:
        for _ in range(args.steps):
            if device == "cuda":
                start = torch.cuda.Event(enable_timing=True)
                end = torch.cuda.Event(enable_timing=True)
                start.record()

                with autocast_ctx(args.amp):
                    _ = model(x)

                end.record()
                # Wait for the GPU so elapsed_time() reads completed events.
                torch.cuda.synchronize()
                ms = start.elapsed_time(end)
            else:
                t0 = time.perf_counter()
                with autocast_ctx(args.amp):
                    _ = model(x)
                t1 = time.perf_counter()
                ms = (t1 - t0) * 1000.0

            latencies.append(ms)
            total_images += args.batch_size

            if prof is not None:
                prof.step()

    stats = latency_stats(latencies)
    total_time_s = sum(latencies) / 1000.0
    throughput = total_images / total_time_s if total_time_s > 0 else None

    result = {
        "task": "inference",
        "model": args.model,
        "device": device,
        "batch_size": args.batch_size,
        "steps": args.steps,
        "amp": args.amp,
        "channels_last": args.channels_last,
        "compile": args.compile,
        "throughput_img_s": throughput,
        **stats,
    }

    _ensure_parent_dir(args.out_csv)
    # Only a brand-new CSV gets a header; later runs append rows to the same file.
    write_header = not os.path.exists(args.out_csv)
    with open(args.out_csv, "a", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(result.keys()))
        if write_header:
            w.writeheader()
        w.writerow(result)

    _ensure_parent_dir(args.out_json)
    with open(args.out_json, "w") as f:
        json.dump(result, f, indent=2)

    print(json.dumps(result, indent=2))

if __name__ == "__main__":
    main()
""")

write("src/benchmark_train_micro.py", r"""
import argparse, os, json, csv
import torch
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim

from .metrics import synchronize_if_cuda, latency_stats
from .optimizations import apply_channels_last, maybe_compile, autocast_ctx
from .profilers import maybe_profile

def get_model(name: str, num_classes=1000):
    '''Build an untrained (weights=None) torchvision model with `num_classes` outputs.

    `name` is matched case-insensitively against resnet50 and efficientnet_b0;
    any other name raises ValueError.
    '''
    key = name.lower()
    supported = ("resnet50", "efficientnet_b0")
    if key not in supported:
        raise ValueError(f"Unsupported model: {key}")
    # Each supported name is also the name of its torchvision.models constructor.
    build = getattr(models, key)
    return build(weights=None, num_classes=num_classes)

def _ensure_parent_dir(path):
    # os.path.dirname() is "" for a bare filename and os.makedirs("") raises,
    # so only create the directory when the path actually has one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def main():
    '''Benchmark full training iterations (forward, backward, optimizer step).

    Parses the CLI flags, runs `--warmup` untimed iterations and then `--steps`
    timed ones on a fixed random batch with SGD and cross-entropy loss. Each
    timed interval covers the whole iteration on both devices (CUDA events on
    GPU, time.perf_counter on CPU). Optionally records a torch.profiler trace.
    The result is appended as a row to `--out-csv`, written to `--out-json`
    and printed as JSON.
    '''
    import time
    from contextlib import nullcontext

    ap = argparse.ArgumentParser()
    ap.add_argument("--model", default="resnet50")
    ap.add_argument("--batch-size", type=int, default=32)
    ap.add_argument("--steps", type=int, default=80)
    ap.add_argument("--warmup", type=int, default=10)

    ap.add_argument("--amp", action="store_true")
    ap.add_argument("--channels-last", action="store_true")
    ap.add_argument("--compile", action="store_true")

    ap.add_argument("--profile", action="store_true")
    ap.add_argument("--profile-dir", default="outputs/profiler_traces")

    ap.add_argument("--out-csv", default="outputs/results.csv")
    ap.add_argument("--out-json", default="outputs/train_result.json")
    args = ap.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    use_cuda = device == "cuda"
    torch.manual_seed(0)

    # Keep the classifier width and the random label range tied together.
    num_classes = 1000

    model = get_model(args.model, num_classes=num_classes).train().to(device)
    if args.channels_last:
        model = apply_channels_last(model)
    model = maybe_compile(model, args.compile)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    # One fixed random batch (inputs and labels) is reused for every step.
    x = torch.randn(args.batch_size, 3, 224, 224, device=device)
    y = torch.randint(0, num_classes, (args.batch_size,), device=device)
    if args.channels_last:
        x = x.to(memory_format=torch.channels_last)

    # torch.amp.GradScaler replaces the deprecated torch.cuda.amp.GradScaler.
    # When disabled, scale()/step()/update() pass through to the plain optimizer.
    scaler = torch.amp.GradScaler("cuda", enabled=(args.amp and use_cuda))

    def train_step():
        # One complete iteration: zero grads, forward, loss, backward, optimizer step.
        optimizer.zero_grad(set_to_none=True)
        with autocast_ctx(args.amp):
            out = model(x)
            loss = criterion(out, y)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # Warmup (untimed)
    for _ in range(args.warmup):
        train_step()
    synchronize_if_cuda()

    prof = maybe_profile(args.profile, args.profile_dir)
    # Entering the profiler through `with` guarantees it is closed even if a step raises.
    profiler_ctx = prof if prof is not None else nullcontext()
    it_ms = []
    total_images = 0

    with profiler_ctx:
        for _ in range(args.steps):
            # Start the clock BEFORE the iteration on both devices. The CPU path
            # used to time only an extra optimizer.step() after the work was done.
            if use_cuda:
                start = torch.cuda.Event(enable_timing=True)
                end = torch.cuda.Event(enable_timing=True)
                start.record()
            else:
                t0 = time.perf_counter()

            train_step()

            if use_cuda:
                end.record()
                # Wait for the GPU so elapsed_time() reads completed events.
                torch.cuda.synchronize()
                ms = start.elapsed_time(end)
            else:
                ms = (time.perf_counter() - t0) * 1000.0

            it_ms.append(ms)
            total_images += args.batch_size

            if prof is not None:
                prof.step()

    stats = latency_stats(it_ms)
    total_time_s = sum(it_ms) / 1000.0
    throughput = total_images / total_time_s if total_time_s > 0 else None

    result = {
        "task": "train_micro",
        "model": args.model,
        "device": device,
        "batch_size": args.batch_size,
        "steps": args.steps,
        "amp": args.amp,
        "channels_last": args.channels_last,
        "compile": args.compile,
        "throughput_img_s": throughput,
        **stats,
    }

    _ensure_parent_dir(args.out_csv)
    # Only a brand-new CSV gets a header; later runs append rows to the same file.
    write_header = not os.path.exists(args.out_csv)
    with open(args.out_csv, "a", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(result.keys()))
        if write_header:
            w.writeheader()
        w.writerow(result)

    _ensure_parent_dir(args.out_json)
    with open(args.out_json, "w") as f:
        json.dump(result, f, indent=2)

    print(json.dumps(result, indent=2))

if __name__ == "__main__":
    main()
""")

write("scripts/sweep_batch.py", r"""
import os, subprocess
import pandas as pd

# Sweep configuration: powers of two from 1 to 128, one model.
MODEL = "resnet50"
BATCH_SIZES = [2 ** exponent for exponent in range(8)]

# All runs append to this CSV, so any file left by a previous sweep is removed first.
OUT = "outputs/sweep_results.csv"
os.makedirs("outputs", exist_ok=True)

stale_results_present = os.path.exists(OUT)
if stale_results_present:
    os.remove(OUT)

def run_one(bs, amp=False, channels_last=False, compile_=False, tag=""):
    '''Run one inference benchmark in a child process.

    Args:
        bs: batch size passed as --batch-size.
        amp, channels_last, compile_: when true, add the matching CLI flag.
        tag: label used in the per-run JSON filename (outputs/sweep_<tag>_bs<bs>.json).

    The run appends its row to OUT. subprocess.run(check=True) raises
    CalledProcessError if the benchmark exits with a non-zero status.
    '''
    import sys

    # Launch the interpreter that is running this script instead of whatever
    # "python" resolves to on PATH, so the child uses the same installed packages.
    python_exe = sys.executable or "python"

    cmd = [
        python_exe, "-m", "src.benchmark_infer",
        "--model", MODEL,
        "--batch-size", str(bs),
        "--steps", "120",
        "--warmup", "20",
        "--out-csv", OUT,
        "--out-json", f"outputs/sweep_{tag}_bs{bs}.json",
    ]
    optional_flags = (
        ("--amp", amp),
        ("--channels-last", channels_last),
        ("--compile", compile_),
    )
    cmd.extend(flag for flag, wanted in optional_flags if wanted)

    print("RUN:", " ".join(cmd))
    subprocess.run(cmd, check=True)

# The whole baseline sweep runs first, then the whole optimized sweep
# (AMP + channels-last + compile all switched on together).
SWEEPS = [
    ("baseline", False),
    ("optimized", True),
]
for tag, optimized in SWEEPS:
    for bs in BATCH_SIZES:
        run_one(bs, amp=optimized, channels_last=optimized, compile_=optimized, tag=tag)

# Show every recorded row grouped by configuration, then by batch size.
sort_keys = ["amp", "channels_last", "compile", "batch_size"]
df = pd.read_csv(OUT)
df = df.sort_values(sort_keys)
print(df)
print("\nSaved:", OUT)
""")

write("scripts/plot_sweep.py", r"""
import os
import pandas as pd
import matplotlib.pyplot as plt

IN_CSV = "outputs/sweep_results.csv"
os.makedirs("assets", exist_ok=True)

df = pd.read_csv(IN_CSV)

# Columns that together identify one benchmark configuration (one line per plot).
CONFIG_COLUMNS = ["amp", "channels_last", "compile"]


def _plot_vs_batch_size(frame, y_column, y_label, title, out_path):
    '''Plot `y_column` against batch size, one line per configuration, and save a PNG.'''
    plt.figure()
    for (amp, ch, comp), group in frame.groupby(CONFIG_COLUMNS):
        ordered = group.sort_values("batch_size")
        line_label = f"amp={amp}, cl={ch}, compile={comp}"
        plt.plot(ordered["batch_size"], ordered[y_column], marker="o", label=line_label)

    plt.xscale("log", base=2)
    plt.xlabel("Batch size")
    plt.ylabel(y_label)
    plt.title(title)
    plt.grid(True)
    plt.legend()
    plt.savefig(out_path, dpi=200, bbox_inches="tight")
    plt.close()


# Throughput plot
_plot_vs_batch_size(
    df,
    "throughput_img_s",
    "Throughput (img/s)",
    "ResNet-50 Inference Throughput vs Batch Size",
    "assets/batch_sweep_throughput.png",
)

# p50 latency plot
_plot_vs_batch_size(
    df,
    "p50_ms",
    "p50 latency (ms)",
    "ResNet-50 Inference p50 Latency vs Batch Size",
    "assets/batch_sweep_latency.png",
)

print("Saved plots:")
print(" - assets/batch_sweep_throughput.png")
print(" - assets/batch_sweep_latency.png")
""")

# Project README for the generated repo.
# NOTE(review): it points readers at `notebooks/GWE_Benchmarks_Colab.ipynb`, but nothing
# in this cell writes a notebooks/ directory — confirm the notebook is added separately.
write("README.md", """
# PyTorch GPU Workload Efficiency Benchmark Suite

Reproducible benchmarking + profiling toolkit for PyTorch training and inference on GPU.

## What it measures
- **Throughput:** images/sec
- **Latency:** p50 / p95 (ms)

## Quickstart (Colab)
Run the notebook `notebooks/GWE_Benchmarks_Colab.ipynb`.

## Profiling
Profiler traces are written to `outputs/profiler_traces/` and can be viewed in TensorBoard.

## Batch sweep plots
Plots are saved in `assets/`.
""")

# Report where the generated repo was written.
print("Wrote repo to:", ROOT)


## 4) Run baseline + optimized benchmarks


In [None]:
%cd /content/pytorch-gpu-workload-efficiency
!rm -f outputs/results.csv

# Inference baseline
!python -m src.benchmark_infer --model resnet50 --batch-size 32 --steps 150 --warmup 20 --out-csv outputs/results.csv --out-json outputs/infer_baseline.json

# Inference optimized (AMP + channels-last + compile)
!python -m src.benchmark_infer --model resnet50 --batch-size 32 --steps 150 --warmup 20 --amp --channels-last --compile --out-csv outputs/results.csv --out-json outputs/infer_optimized.json

# Micro-train baseline
!python -m src.benchmark_train_micro --model resnet50 --batch-size 32 --steps 80 --warmup 10 --out-csv outputs/results.csv --out-json outputs/train_baseline.json

# Micro-train optimized (AMP + channels-last)
!python -m src.benchmark_train_micro --model resnet50 --batch-size 32 --steps 80 --warmup 10 --amp --channels-last --out-csv outputs/results.csv --out-json outputs/train_optimized.json

!python - <<'PY'
import pandas as pd
df = pd.read_csv("outputs/results.csv")
display(df.sort_values(["task","amp","channels_last","compile"]))
PY


## 5) Generate a profiler trace + open TensorBoard

1. Run the trace cell
2. Run the TensorBoard cell
3. Screenshot the **Profile → Trace viewer** page and upload to `assets/profiler_trace.png`


In [None]:
%cd /content/pytorch-gpu-workload-efficiency
# Delete traces from earlier runs and recreate an empty trace directory.
!rm -rf outputs/profiler_traces
!mkdir -p outputs/profiler_traces

# Short AMP inference run with --profile; traces go to outputs/profiler_traces.
# This run also appends its own row to outputs/results.csv.
!python -m src.benchmark_infer --model resnet50 --batch-size 32 --steps 30 --warmup 10 --amp --profile --profile-dir outputs/profiler_traces --out-csv outputs/results.csv --out-json outputs/profile_run.json


In [None]:
# Load the TensorBoard notebook extension and point it at the profiler trace directory.
# NOTE(review): viewing torch.profiler traces in TensorBoard normally needs the
# torch-tb-profiler plugin, which the install cell does not install — confirm the
# traces actually show up before relying on this cell.
%load_ext tensorboard
%tensorboard --logdir /content/pytorch-gpu-workload-efficiency/outputs/profiler_traces


## 6) Batch-size sweep + plots


In [None]:
# Run from the repo root: the scripts use relative outputs/ and assets/ paths,
# and the sweep launches `-m src.benchmark_infer`.
%cd /content/pytorch-gpu-workload-efficiency
# Baseline sweep, then optimized sweep, over batch sizes 1..128; writes outputs/sweep_results.csv.
!python scripts/sweep_batch.py
# Reads outputs/sweep_results.csv and saves the throughput and p50-latency plots under assets/.
!python scripts/plot_sweep.py
!ls -lah assets


## 7) Package for GitHub upload


In [None]:
%cd /content
# Recursively zip the whole repo directory, including whatever outputs/ and assets/
# currently contain (.gitignore does not affect zip).
!zip -r pytorch-gpu-workload-efficiency.zip pytorch-gpu-workload-efficiency
print("Zip created: /content/pytorch-gpu-workload-efficiency.zip")
