# 🚀 Import Required Libraries
This cell:
- Imports essential libraries for PyTorch operations, pruning, and quantization.
- Imports Hugging Face Transformers for model and tokenizer handling.
- Imports PyTorch Profiler for performance analysis.

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
import torch.quantization as tq
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.profiler import profile, record_function, ProfilerActivity

# 🔧 Define Model and Profiling Settings
This cell:
- Specifies the model name to be used for pruning and profiling.
- Defines the pruning amount (`PRUNE_AMOUNT`) as 30%.
- Sets the batch size and maximum number of tokens to generate during inference.
- Provides a sample prompt for benchmarking.
- Specifies the directory to save profiler logs for the baseline model.

In [2]:
# Hugging Face Hub checkpoint that gets loaded and profiled.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Fraction of weights removed by magnitude pruning (0.30 == 30%).
PRUNE_AMOUNT = 0.30

# Number of copies of PROMPT placed in one inference batch.
BATCH_SIZE = 5

# Upper bound on tokens generated per sequence during profiling.
MAX_NEW_TOKENS = 50

# Sample text used as the benchmark input.
PROMPT = (
    "In a world increasingly driven by artificial intelligence, "
    "the ability to interpret large language models efficiently "
    "is crucial for both research and deployment."
)

# Directory that receives the profiler traces of the baseline model.
LOGDIR_BASELINE = "./profiler_logs/baseline"


# 📦 Load Model and Tokenizer
This function:
- Loads the tokenizer and model using Hugging Face Transformers.
- Configures the model to use FP32 precision for accurate profiling.
- Moves the model to the specified device (CPU or GPU).
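- Sets the model to evaluation mode (`model.eval()`), which switches layers such as dropout to their inference behavior; note that this does not by itself disable gradient tracking.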
- Returns the loaded tokenizer and model.

In [3]:
def load_model(device: torch.device):
    """Load the tokenizer and FP32 causal-LM identified by ``MODEL_NAME``.

    Args:
        device: Device the model is moved to after loading.

    Returns:
        A ``(tokenizer, model)`` tuple. The model has been moved to
        ``device`` and switched to evaluation mode.
    """
    # Tokenizer first, requesting the fast implementation.
    tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

    # Weights are requested explicitly in float32.
    net = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)

    # Both .to() and .eval() hand back the module itself, so they can be chained.
    net = net.to(device).eval()

    return tok, net

# 📝 Create a Batch of Inputs
This function:
- Duplicates the provided prompt `BATCH_SIZE` times to simulate a batch of inputs.
- Tokenizes the batch with padding and truncation to ensure uniform input size.
- Moves the tokenized inputs to the specified device (CPU or GPU).
- Returns the prepared batch of inputs.

In [4]:
def make_batch(tokenizer, prompt: str, device: torch.device, batch_size: "int | None" = None):
    """Tokenize repeated copies of ``prompt`` and move the tensors to ``device``.

    Args:
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer). It is
            called with a list of strings plus ``return_tensors="pt"``,
            ``padding=True`` and ``truncation=True`` and must return a mapping
            of tensors. If it exposes ``pad_token``/``eos_token`` and has no
            pad token, its ``pad_token`` is set to the EOS token.
        prompt: Text that is duplicated to form the batch.
        device: Device every returned tensor is moved to.
        batch_size: Number of copies of ``prompt`` in the batch. ``None``
            (the default) uses the module-level ``BATCH_SIZE``.

    Returns:
        A plain ``dict`` mapping each tokenizer output name to its tensor on
        ``device``.

    Raises:
        ValueError: If the effective batch size is smaller than 1.
    """
    size = BATCH_SIZE if batch_size is None else batch_size
    if size < 1:
        raise ValueError(f"batch_size must be >= 1, got {size}")

    # padding=True needs a pad token; some causal-LM tokenizers ship without
    # one, so fall back to the EOS token instead of failing inside the call.
    if getattr(tokenizer, "pad_token", None) is None:
        eos_token = getattr(tokenizer, "eos_token", None)
        if eos_token is not None:
            tokenizer.pad_token = eos_token

    encoded = tokenizer(
        [prompt] * size,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    # Return a plain dict so callers can splat it with ** into the model call.
    return {name: tensor.to(device) for name, tensor in encoded.items()}


# 📊 Profile Inference
This function:
- Profiles the model's inference performance using PyTorch Profiler.
- Captures:
  - CPU and CUDA activity.
  - Memory usage.
  - Operator-level performance metrics.
- Saves the profiling traces to the specified log directory for visualization in TensorBoard.
- Prints:
  - Top-3 operators by CPU self-time.
  - Top-3 operators by CUDA self-time.
  - Total CPU and CUDA self-times in milliseconds.

In [5]:
def profile_inference(model, inputs, logdir: str, label: str):
    """Profile one ``model.generate`` call and print operator self-time summaries.

    The generate call runs under ``torch.profiler.profile`` with shape, memory
    and stack recording enabled, wrapped in ``record_function(label)``. Traces
    are written to ``logdir`` by ``tensorboard_trace_handler``.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        inputs: Mapping of keyword arguments forwarded to ``model.generate``.
        logdir: Directory for the trace files; created if it does not exist.
        label: Name of the ``record_function`` range and of the printed
            section headers.

    Returns:
        None. Results are printed to stdout.
    """
    # 1) Ensure log directory exists
    os.makedirs(logdir, exist_ok=True)

    # 2) Only request CUDA activity when a GPU is actually usable, so CPU-only
    #    hosts do not ask the profiler for a backend that is not there.
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)

    with profile(
        activities=activities,
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(logdir),
    ) as prof:
        with record_function(label):
            model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    # 3) Aggregate the recorded events once and reuse the result for both
    #    tables and the totals instead of re-aggregating for every report.
    events = prof.key_averages()

    print(f"\n=== {label} Top-3 ops by CPU self time ===")
    print(events.table(sort_by="self_cpu_time_total", row_limit=3))

    print(f"\n=== {label} Top-3 ops by CUDA self time ===")
    print(events.table(sort_by="self_cuda_time_total", row_limit=3))

    # 4) Summarize total CPU vs CUDA self times (profiler reports microseconds).
    total_cpu = sum(evt.self_cpu_time_total for evt in events)
    total_cuda = 0
    for evt in events:
        # Newer PyTorch releases expose the value as `self_device_time_total`
        # and deprecate the CUDA-named attribute; prefer the new name when
        # present and fall back to the old one (or 0) otherwise.
        device_time = getattr(evt, "self_device_time_total", None)
        if device_time is None:
            device_time = getattr(evt, "self_cuda_time_total", 0)
        total_cuda += device_time

    print(f"\n=== {label} Total self-time ===")
    print(f"CPU  : {total_cpu/1e3:.2f} ms")
    print(f"CUDA : {total_cuda/1e3:.2f} ms")

    print(f"\nTrace files for '{label}' written to: {logdir}")

# ✂️ Apply Pruning and Quantization
This function:
- Applies unstructured magnitude-based pruning to all `nn.Linear` layers in the model.
  - Prunes 30% of the smallest-magnitude weights.
  - Makes the pruning masks permanent.
- Dynamically quantizes the pruned model to 8-bit integers (`torch.qint8`) for efficient inference.
- Returns the pruned and quantized model.

In [6]:
def apply_pruning_and_quant(model: nn.Module):
    """Magnitude-prune every ``nn.Linear`` layer, then dynamically quantize to int8.

    Note that ``model`` itself is modified: it is moved to CPU and its Linear
    weights are pruned in place before quantization is applied.

    Args:
        model: Module whose ``nn.Linear`` sub-modules are pruned by
            ``PRUNE_AMOUNT`` using L1 unstructured pruning.

    Returns:
        The model produced by ``quantize_dynamic`` for ``nn.Linear`` layers
        with ``dtype=torch.qint8``.
    """
    # Pruning and the subsequent dynamic quantization are carried out on CPU.
    model.cpu()

    # Gather the target layers up front, then prune each one.
    linear_layers = [layer for layer in model.modules() if isinstance(layer, nn.Linear)]
    for layer in linear_layers:
        # Zero the PRUNE_AMOUNT fraction of weights with the smallest magnitude...
        prune.l1_unstructured(layer, name="weight", amount=PRUNE_AMOUNT)
        # ...and fold the mask into the weight so the pruning becomes permanent.
        prune.remove(layer, "weight")

    # 8-bit dynamic quantization restricted to Linear layers.
    return tq.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

# 🚦 Start the Profiling Process
This function:
- Loads the model and tokenizer.
- Creates a batch of inputs using the provided prompt.
- Profiles the baseline model's inference performance.
- Saves the profiling results to the specified log directory.

In [7]:
def start():
    """Load the model, build a prompt batch and profile baseline inference.

    Uses CUDA when it is available and the CPU otherwise. Traces go to
    ``LOGDIR_BASELINE`` under the label ``"Baseline"``.
    """
    if torch.cuda.is_available():
        target = torch.device("cuda")
    else:
        target = torch.device("cpu")

    # Load the model, prepare inputs on the same device, then profile.
    tok, net = load_model(target)
    batch = make_batch(tok, PROMPT, target)
    print("Profiling inference…")
    profile_inference(net, batch, LOGDIR_BASELINE, label="Baseline")


# ▶️ Run the Profiling Pipeline
This cell:
- Calls the `start` function to execute the profiling pipeline.
- Outputs the profiling results, including operator-level performance metrics and total CPU/CUDA self-times.

In [8]:
# Run the baseline profiling pipeline (load model, build batch, profile generate).
start()

Profiling inference…


STAGE:2025-09-04 02:48:46 12034:12034 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2025-09-04 02:48:53 12034:12034 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2025-09-04 02:48:53 12034:12034 ActivityProfilerController.cpp:324] Completed Stage: Post Processing



=== Baseline Top-3 ops by CPU self time ===
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               Baseline        35.35%        1.413s       100.00%        3.997s        3.997s       0.000us         0.00%        1.459s        1.459s          