In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
import torch.quantization as tq
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
# Checkpoint identifier handed to the `from_pretrained` loaders.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Fraction of each Linear weight removed by magnitude pruning (0.3 == 30%).
PRUNE_AMOUNT = 0.3

# Number of copies of PROMPT per batch, and the generation length cap.
BATCH_SIZE, MAX_NEW_TOKENS = 5, 50

# Single prompt text that is replicated to form the inference batch.
PROMPT = " ".join(
    [
        "In a world increasingly driven by artificial intelligence, the ability to interpret",
        "large language models efficiently is crucial for both research and deployment.",
    ]
)

# Directory that receives the profiler traces of the baseline run.
LOGDIR_BASELINE = "./profiler_logs/baseline"


In [3]:
def load_model(device: torch.device):
    """
    Load the tokenizer and causal-LM checkpoint named by `MODEL_NAME`.

    Args:
        device: Device the model weights are moved to.

    Returns:
        A `(tokenizer, model)` tuple. The model is in float32, placed on
        `device`, and switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

    # Batched tokenization with padding needs a pad token; fall back to EOS
    # when the checkpoint does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Decoder-only models continue from the last position, so pad on the left
    # to keep real tokens adjacent to the generated ones.
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float32)
    model.to(device)
    model.eval()
    return tokenizer, model

In [4]:
def make_batch(tokenizer, prompt: str, device: torch.device):
    """
    Build a batch made of `BATCH_SIZE` copies of `prompt`.

    Args:
        tokenizer: Callable tokenizer; invoked with a list of strings and
            `padding=True, truncation=True, return_tensors="pt"`.
        prompt: Text to replicate across the batch.
        device: Device the tokenized tensors are moved to.

    Returns:
        The tokenizer output after `.to(device)`; it is later unpacked with
        `**` into `model.generate`.
    """
    prompts = [prompt] * BATCH_SIZE
    encoded = tokenizer(
        prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    return encoded.to(device)


In [5]:
def profile_inference(model, inputs, logdir: str, label: str):
    """
    Profile a single `model.generate` call and print a short summary.

    The profiler records shapes, memory and stacks, and hands the trace to
    `tensorboard_trace_handler(logdir)` when profiling stops.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=...)`.
        inputs: Mapping of tokenized inputs, unpacked into `generate`.
        logdir: Directory for the trace files; created if missing.
        label: Name of the `record_function` range and prefix of the printed
            summary lines.

    Prints:
        1) Top-3 ops by self CPU time.
        2) Top-3 ops by self CUDA time (only when CUDA is available).
        3) Total CPU vs CUDA self-time in milliseconds.
    """
    os.makedirs(logdir, exist_ok=True)

    # Only request CUDA activity when a GPU is actually present.
    use_cuda = torch.cuda.is_available()
    activities = [ProfilerActivity.CPU]
    if use_cuda:
        activities.append(ProfilerActivity.CUDA)

    with profile(
        activities=activities,
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
        on_trace_ready=torch.profiler.tensorboard_trace_handler(logdir),
    ) as prof:
        with record_function(label):
            model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)

    averages = prof.key_averages()

    def _self_device_time(evt):
        # Newer PyTorch names this `self_device_time_total`; older releases
        # only provide `self_cuda_time_total`.
        value = getattr(evt, "self_device_time_total", None)
        if value is None:
            value = getattr(evt, "self_cuda_time_total", 0)
        return value

    print(f"[{label}] Top-3 ops by self CPU time:")
    print(averages.table(sort_by="self_cpu_time_total", row_limit=3))

    if use_cuda:
        print(f"[{label}] Top-3 ops by self CUDA time:")
        print(averages.table(sort_by="self_cuda_time_total", row_limit=3))
    else:
        print(f"[{label}] CUDA not available; skipping the CUDA table.")

    # Profiler self-times are reported in microseconds.
    total_cpu_ms = sum(evt.self_cpu_time_total for evt in averages) / 1000.0
    total_cuda_ms = sum(_self_device_time(evt) for evt in averages) / 1000.0
    print(
        f"[{label}] Total self time: "
        f"CPU {total_cpu_ms:.2f} ms | CUDA {total_cuda_ms:.2f} ms"
    )

In [6]:
def apply_pruning_and_quant(model: nn.Module):
    """
    Magnitude-prune every `nn.Linear` weight, then dynamically quantize.

    Steps:
      1. Move `model` to CPU.
      2. For each `nn.Linear`, apply `prune.l1_unstructured` on `weight`
         with `amount=PRUNE_AMOUNT`, then `prune.remove` so the mask is
         folded into the weight tensor.
      3. Run `quantize_dynamic` over `{nn.Linear}` with `dtype=torch.qint8`.

    Note:
        `model` itself is modified: it is moved to CPU and its Linear
        weights are pruned in place. Reload the model if an untouched
        baseline is needed afterwards.

    Args:
        model: Module whose Linear layers are pruned and quantized.

    Returns:
        The module returned by `quantize_dynamic`.
    """
    model.to("cpu")

    linear_layers = [m for m in model.modules() if isinstance(m, nn.Linear)]
    for layer in linear_layers:
        prune.l1_unstructured(layer, name="weight", amount=PRUNE_AMOUNT)
        # Drop the mask/orig reparametrization so `weight` is a plain
        # parameter holding the pruned values.
        prune.remove(layer, "weight")

    return tq.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

In [8]:
def start():
    """
    Run the baseline profiling pass.

    Selects CUDA when available (otherwise CPU), loads the tokenizer and
    model, builds a batch from `PROMPT`, and profiles generation under the
    label "Baseline", writing traces to `LOGDIR_BASELINE`.

    NOTE(review): `apply_pruning_and_quant` is not invoked here; presumably
    a follow-up step is meant to profile the compressed model. Confirm.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Load & benchmark baseline
    tokenizer, model = load_model(device)
    batch_inputs     = make_batch(tokenizer, PROMPT, device)
    print("Profiling inference…")
    profile_inference(model, batch_inputs, LOGDIR_BASELINE, label="Baseline")
