In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
import torch.quantization as tq
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
MODEL_NAME      = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
PRUNE_AMOUNT    = 0.30      # 30% magnitude pruning
BATCH_SIZE      = 5
MAX_NEW_TOKENS  = 50
PROMPT          = (
    "In a world increasingly driven by artificial intelligence, the ability to interpret "
    "large language models efficiently is crucial for both research and deployment."
)
LOGDIR_BASELINE = "./profiler_logs/baseline"


In [3]:
def load_model(device: torch.device):
    """
      - Load the tokenizer: AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
      - Load the model: AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
      - Move model to `device` and call .eval()
      - Return (tokenizer, model)
    """
    # 1) Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

    # 2) Load FP32 model
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float32
    )

    # 3) Move to device and set to eval mode
    model.to(device)
    model.eval()

    # 4) Return both
    return tokenizer, model

In [4]:
def make_batch(tokenizer, prompt: str, device: torch.device):
    """
      - Duplicate `prompt` BATCH_SIZE times into a list of strings
      - Tokenize with padding and truncation: tokenizer(..., return_tensors="pt")
      - Move inputs to `device`
      - Return the tokenized inputs
    """
    # 1) Replicate prompt
    texts = [prompt] * BATCH_SIZE

    # 2) Tokenize with padding & truncation
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True
    )

    # 3) Move to device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    return inputs


In [5]:
def profile_inference(model, inputs, logdir: str, label: str):
    """
      - Create `logdir` if it doesn't exist
      - Use `torch.profiler.profile` (CPU & CUDA, record_shapes, profile_memory, with_stack)
      - Inside the profiler, wrap the call to `model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)` in `record_function(label)`
      - After profiling, print:
          1) Top-3 ops by "self_cpu_time_total"
          2) Top-3 ops by "self_cuda_time_total"
          3) Total CPU vs CUDA self-time in milliseconds
      - Note: use `prof.key_averages().table(...)` and sum over `evt.self_cpu_time_total`, `evt.self_cuda_time_total`
      - Traces should be saved automatically by `tensorboard_trace_handler`
    """
    # 1) Ensure log directory exists
    os.makedirs(logdir, exist_ok=True)

    # 2) Run profiler
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(logdir)
    ) as prof:
        with record_function(label):
            _ = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # 3) Print top‐3 operators by CPU self time
    print(f"\n=== {label} Top-3 ops by CPU self time ===")
    print(prof.key_averages().table(
        sort_by="self_cpu_time_total", row_limit=3
    ))

    # 4) Print top‐3 operators by CUDA self time
    print(f"\n=== {label} Top-3 ops by CUDA self time ===")
    print(prof.key_averages().table(
        sort_by="self_cuda_time_total", row_limit=3
    ))

    # 5) Summarize total CPU vs CUDA self times
    events     = prof.key_averages()
    total_cpu  = sum(evt.self_cpu_time_total for evt in events)
    total_cuda = sum(getattr(evt, "self_cuda_time_total", 0) for evt in events)
    print(f"\n=== {label} Total self-time ===")
    print(f"CPU  : {total_cpu/1e3:.2f} ms")
    print(f"CUDA : {total_cuda/1e3:.2f} ms")

    print(f"\nTrace files for '{label}' written to: {logdir}")

In [6]:
def apply_pruning_and_quant(model: nn.Module):
    """
      - On CPU, apply `prune.l1_unstructured(..., amount=PRUNE_AMOUNT)` to every nn.Linear weight
      - Call `prune.remove(...)` to make masks permanent
      - Then apply `torch.quantization.quantize_dynamic` on {nn.Linear} with dtype=torch.qint8
      - Return the quantized model
    """
    # 1) Prune on CPU
    model.cpu()
    for module in model.modules():
        if isinstance(module, nn.Linear):
            # zero out PRUNE_AMOUNT fraction of the smallest‐magnitude weights
            prune.l1_unstructured(module, name="weight", amount=PRUNE_AMOUNT)
            # make the pruning permanent
            prune.remove(module, "weight")

    # 2) Dynamic 8-bit quantization
    quantized = tq.quantize_dynamic(
        model,
        {nn.Linear},
        dtype=torch.qint8
    )

    return quantized

In [7]:
def start():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Load & benchmark baseline
    tokenizer, model = load_model(device)
    batch_inputs     = make_batch(tokenizer, PROMPT, device)
    print("Profiling inference…")
    profile_inference(model, batch_inputs, LOGDIR_BASELINE, label="Baseline")


In [8]:
start()

Profiling inference…


STAGE:2025-09-06 02:57:50 10150:10150 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2025-09-06 02:57:57 10150:10150 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2025-09-06 02:57:57 10150:10150 ActivityProfilerController.cpp:324] Completed Stage: Post Processing



=== Baseline Top-3 ops by CPU self time ===
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               Baseline        36.95%        1.494s       100.00%        4.043s        4.043s       0.000us         0.00%        1.458s        1.458s          

# Exercise 1: Measure Model Size & Nonzeros

## Concept: After pruning and quantization, the student should measure how the model structure has changed.

In [15]:
import os, shutil, torch
import numpy as np

def _safe_state_dict(model):
    """
    Return a CPU state_dict with ONLY tensor values.
    Filters out dtype/None/custom objects that can break saving.
    """
    sd = model.state_dict()
    safe = {}
    for k, v in sd.items():
        if torch.is_tensor(v):
            safe[k] = v.detach().cpu()
        # If libraries stash buffers as numpy arrays, you can keep them too:
        elif isinstance(v, np.ndarray):
            safe[k] = torch.from_numpy(v)
        # else: drop non-tensor entries (e.g., dtype objects)
    return safe

def _dir_size_mb(path: str) -> float:
    total = 0
    for root, _, files in os.walk(path):
        for f in files:
            total += os.path.getsize(os.path.join(root, f))
    return total / (1024 ** 2)

def measure_model_sparsity_and_size(model, name: str = "model"):
    """
    Robust version:
      - Counts sparsity for ANY submodule that exposes a tensor 'weight'
      - Saves either with HF .save_pretrained() (try) or a filtered state_dict (fallback)
      - Prints % sparsity + on-disk size (MB)
    """
    total, nonzero = 0, 0

    for mod in model.modules():
        # count ONLY if there is a real tensor weight
        W = getattr(mod, "weight", None)
        if torch.is_tensor(W):
            w = W.detach()
            total += w.numel()
            nonzero += (w != 0).sum().item()

    if total > 0:
        sparsity_pct = 100.0 * (1.0 - (nonzero / total))
        print(f"[{name}] Nonzero params: {nonzero}/{total} ({sparsity_pct:.2f}% sparse)")
    else:
        print(f"[{name}] Could not find tensor weights to count (quantized/packed?).")

    # save into a temporary folder
    tmp_dir = f"./tmp_{name}"
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir, exist_ok=True)

    size_mb = None
    # Try the native HF save first, but guard against bad state entries
    try:
        if hasattr(model, "save_pretrained"):
            model.save_pretrained(tmp_dir, safe_serialization=True)
        else:
            raise AttributeError("No save_pretrained; using filtered state_dict fallback.")
    except Exception as e:
        # Fallback: filtered, CPU-only tensors
        safe_sd = _safe_state_dict(model)
        torch.save(safe_sd, os.path.join(tmp_dir, "model.pt"))

    size_mb = _dir_size_mb(tmp_dir)
    print(f"[{name}] Disk size: {size_mb:.2f} MB")


# Exercise 2 — Batch size & prompt-length sensitivity (latency/throughput)

## Concept: Show how decoding cost scales with batch size and max_new_tokens on your baseline vs optimized models.

In [19]:
import time, torch
import pandas as pd

def _model_device(model: torch.nn.Module) -> torch.device:
    for p in model.parameters():
        return p.device
    for b in model.buffers():
        return b.device
    return torch.device("cpu")

def sweep_batch_and_length(tokenizer, model_dict, _unused_device,
                           prompt, batch_sizes=(1, 2, 4),
                           gen_lengths=(16, 64, 128),
                           runs=3):
    """
    Device-safe sweep:
      - Tokenize once on CPU for a SINGLE sample.
      - For each model, move to model's device and tile to the requested batch size.
      - Time .generate() for each (B, L), with CUDA sync around timing.
      - Return tidy DataFrame.
    """
    # 1) Tokenize ONCE as a single sample
    tok_cpu = tokenizer(prompt, return_tensors="pt")
    input_ids_1 = tok_cpu["input_ids"][:1]            # ensure shape [1, T]
    attn_mask_1 = tok_cpu.get("attention_mask", None)
    rows = []

    for name, mdl in model_dict.items():
        mdev = _model_device(mdl)

        for B in batch_sizes:
            # 2) Tile to B on CPU, then move to model device
            input_ids = input_ids_1.expand(B, -1).to(mdev)
            if attn_mask_1 is not None:
                attn_mask = attn_mask_1.expand(B, -1).to(mdev)
                enc = {"input_ids": input_ids, "attention_mask": attn_mask}
            else:
                enc = {"input_ids": input_ids}

            for L in gen_lengths:
                latencies, tputs = [], []
                for _ in range(runs):
                    if mdev.type == "cuda":
                        torch.cuda.synchronize()
                    t0 = time.perf_counter()
                    with torch.inference_mode():
                        out = mdl.generate(**enc, max_new_tokens=L, use_cache=True)
                    if mdev.type == "cuda":
                        torch.cuda.synchronize()
                    t1 = time.perf_counter()

                    gen_len = out.shape[1] - enc["input_ids"].shape[1]
                    lat = t1 - t0
                    latencies.append(lat)
                    # tokens/sec across the WHOLE batch
                    tputs.append((gen_len * B) / max(lat, 1e-9))

                rows.append({
                    "model": name,
                    "batch_size": B,
                    "max_new_tokens": L,
                    "latency_s": sum(latencies)/len(latencies),
                    "tokens_per_sec": sum(tputs)/len(tputs),
                    "device": str(mdev),
                })

    return pd.DataFrame(rows)


# Exercise 3 — Operator hot-spots before vs after optimization

## Concept: Use torch.profiler to summarize top operators for baseline vs optimized. Students see where time is spent (e.g., attention matmuls vs layernorm) and how it shifts after pruning/quant.

In [23]:
# --- Robust operator hot-spot summary (event-based, device-safe) ---
from torch.profiler import profile, ProfilerActivity
import torch

def _model_device(model: torch.nn.Module) -> torch.device:
    for p in model.parameters():
        return p.device
    for b in model.buffers():
        return b.device
    return torch.device("cpu")

def _to_device_batch(batch: dict, device: torch.device) -> dict:
    moved = {}
    for k, v in batch.items():
        moved[k] = v.to(device) if isinstance(v, torch.Tensor) else v
    return moved

def top_ops_summary(model, batch_inputs, device_unused, with_cuda=True, top_k=10, label="run"):
    """
    Uses profiler event API for reliable CPU/CUDA self times.
    Returns: {"label", "device", "cpu": [(op, ms), ...], "cuda": [(op, ms), ...], "counts": {...}}
    """
    mdev = _model_device(model)
    enc  = _to_device_batch(batch_inputs, mdev)

    acts = [ProfilerActivity.CPU]
    if with_cuda and mdev.type == "cuda":
        acts.append(ProfilerActivity.CUDA)

    # Collect a single short run
    with profile(activities=acts, record_shapes=False, profile_memory=False) as prof:
        with torch.inference_mode():
            if mdev.type == "cuda":
                torch.cuda.synchronize()
            _ = model.generate(**enc, max_new_tokens=24, use_cache=True)  # bump tokens a bit to get more ops
            if mdev.type == "cuda":
                torch.cuda.synchronize()

    evts = prof.key_averages(group_by_input_shape=False)

    # Build CPU top list
    cpu_items = []
    for e in evts:
        cpu_ms = getattr(e, "self_cpu_time_total", 0.0) / 1000.0  # us -> ms
        if cpu_ms > 0:
            cpu_items.append((e.key, cpu_ms))
    cpu_items.sort(key=lambda x: x[1], reverse=True)
    cpu_items = cpu_items[:top_k]

    # Build CUDA top list (if applicable)
    cuda_items = []
    if mdev.type == "cuda":
        for e in evts:
            cuda_ms = getattr(e, "self_cuda_time_total", 0.0) / 1000.0
            if cuda_ms > 0:
                cuda_items.append((e.key, cuda_ms))
        cuda_items.sort(key=lambda x: x[1], reverse=True)
        cuda_items = cuda_items[:top_k]

    return {
        "label": label,
        "device": str(mdev),
        "cpu": cpu_items,
        "cuda": cuda_items,
        "counts": {
            "evt_total": len(evts),
            "cpu_nonzero": sum(1 for _, ms in cpu_items if ms > 0),
            "cuda_nonzero": sum(1 for _, ms in cuda_items if ms > 0),
        },
    }


In [25]:
# === Run All Exercises ===
def run_all_exercises():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Load baseline model & tokenizer
    tokenizer, baseline_model = load_model(device)
    batch_inputs = make_batch(tokenizer, PROMPT, device)

    # 2) Create optimized model (prune + quantize)
    optimized_model = apply_pruning_and_quant(baseline_model)

    print("\n================ Exercise 1: Sparsity & Size ================")
    measure_model_sparsity_and_size(baseline_model, "baseline")
    measure_model_sparsity_and_size(optimized_model, "optimized")

    print("\n================ Exercise 2: Batch Size & Length Sweep ================")
    df_sweep = sweep_batch_and_length(
        tokenizer,
        {"baseline": baseline_model, "optimized": optimized_model},
        device,
        PROMPT,
        batch_sizes=(1, 2),
        gen_lengths=(16, 64),
        runs=2
    )
    display(df_sweep)

    print("\n================ Exercise 3: Operator Hot-Spots ================")
    base_ops = top_ops_summary(baseline_model, batch_inputs, device, with_cuda=True, top_k=8, label="baseline")
    opt_ops  = top_ops_summary(optimized_model, batch_inputs, device, with_cuda=True, top_k=8, label="optimized")
    
    print("Baseline device:", base_ops["device"], "| events:", base_ops["counts"])
    print("Optimized device:", opt_ops["device"], "| events:", opt_ops["counts"])
    print("\nTop CPU ops (baseline):", base_ops["cpu"])
    print("Top CPU ops (optimized):", opt_ops["cpu"])
    
# Call it
run_all_exercises()



[baseline] Nonzero params: 789722330/1100048384 (28.21% sparse)
[baseline] Disk size: 4196.37 MB
[optimized] Nonzero params: 65628160/65628160 (0.00% sparse)
[optimized] Disk size: 250.45 MB



Unnamed: 0,model,batch_size,max_new_tokens,latency_s,tokens_per_sec,device
0,baseline,1,16,1.56453,10.228149,cpu
1,baseline,1,64,5.054521,12.667585,cpu
2,baseline,2,16,1.840176,17.395002,cpu
3,baseline,2,64,6.051661,21.179314,cpu
4,optimized,1,16,0.679012,23.564206,cpu
5,optimized,1,64,2.692659,23.768656,cpu
6,optimized,2,16,0.767073,41.722865,cpu
7,optimized,2,64,2.942899,43.494537,cpu





STAGE:2025-09-06 03:25:30 10150:10150 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2025-09-06 03:25:35 10150:10150 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2025-09-06 03:25:35 10150:10150 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
STAGE:2025-09-06 03:25:45 10150:10150 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2025-09-06 03:25:47 10150:10150 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2025-09-06 03:25:47 10150:10150 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


Baseline device: cpu | events: {'evt_total': 74, 'cpu_nonzero': 8, 'cuda_nonzero': 0}
Optimized device: cpu | events: {'evt_total': 72, 'cpu_nonzero': 8, 'cuda_nonzero': 0}

Top CPU ops (baseline): [('aten::mm', 3373.148), ('aten::mul', 69.753), ('aten::cat', 56.771), ('aten::_scaled_dot_product_flash_attention_for_cpu', 52.607), ('aten::copy_', 49.95), ('aten::add', 37.566), ('aten::matmul', 37.352), ('aten::silu', 30.704)]
Top CPU ops (optimized): [('quantized::linear_dynamic', 601.199), ('aten::cat', 49.253), ('aten::mul', 45.053), ('aten::_scaled_dot_product_flash_attention_for_cpu', 45.007), ('aten::copy_', 34.219), ('aten::add', 25.002), ('aten::empty', 21.191), ('aten::silu', 20.13)]
