# Exercise: Measure Latency & Throughput
**Objective:** Get hands-on experience loading Llama 3.2 1B (the text-only `meta-llama/Llama-3.2-1B` checkpoint) and timing a simple generation, so you understand raw latency and throughput.

**Tasks:**
1. Load the model & tokenizer  
2. Prepare a prompt: pick or write a prompt of ~30–50 words, then tokenize it with `tokenizer(...)`
3. Time your generation
4. Record the results:
   - Latency (s)
   - Throughput (tokens/s)

In [1]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# 1. SETTINGS
# Model identifier handed to `from_pretrained` when loading the tokenizer and model.
MODEL_NAME     = "meta-llama/Llama-3.2-1B"
# Value passed as `max_new_tokens` to the timed `generate` call.
MAX_NEW_TOKENS = 50

In [3]:
def load_model_and_tokenizer(model_name: str):
    """Load a causal language model and its tokenizer for inference.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model, device)`` tuple. The model is loaded with
        ``torch.float16`` weights, moved to ``device`` and put in eval mode.
        ``device`` is "cuda" when CUDA is available, otherwise "cpu".
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Load from the `model_name` argument rather than the MODEL_NAME global,
    # so callers that pass a different identifier get the model they asked for.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
    )
    model.to(device).eval()
    return tokenizer, model, device

In [4]:
def prepare_prompt(tokenizer):
    """Tokenize the fixed benchmark prompt.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result exposes an ``"input_ids"`` entry.

    Returns:
        ``(inputs, input_length)``: the tokenizer output and the size of
        dimension 1 of ``inputs["input_ids"]`` (the prompt's token count).
    """
    text = (
        "The first two courses on Udacity started on 20 February 2012,[29] "
        "entitled 'CS 101: Building a Search Engine', taught by David Evans "
        "from the University of Virginia, and 'CS 373: Programming a Robotic "
        "Car' taught by Thrun. Both courses use Python."
    )
    encoded = tokenizer(text, return_tensors="pt")
    return encoded, encoded["input_ids"].size(1)

In [5]:
def warmup_model(model, inputs, device):
    """Run a short throw-away generation before the timed run.

    The result is discarded; the point is that first-call overhead is paid
    here rather than inside a later measurement.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        inputs: Mapping of tokenizer outputs; every value must support
            ``.to(device)``.
        device: ``torch.device`` the model lives on.
    """
    # Move the tensors ourselves, as measure_generation does, so the warm-up
    # does not depend on the caller having done it. This is a no-op when the
    # tensors are already on `device`.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    _ = model.generate(**inputs, max_new_tokens=5)
    if device.type == "cuda":
        # Wait for queued CUDA work so the warm-up has fully finished on return.
        torch.cuda.synchronize()

In [11]:
def measure_generation(model, inputs, max_new_tokens, device):
    """Time one ``model.generate`` call and count the tokens it produced.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)`` that
            returns a tensor whose dimension 1 is the sequence length.
        inputs: Mapping of tokenizer outputs containing ``"input_ids"``; every
            value must support ``.to(device)``.
        max_new_tokens: Forwarded to ``generate`` as ``max_new_tokens``.
        device: ``torch.device`` the model lives on.

    Returns:
        ``(latency_s, gen_tokens)``: elapsed seconds for the generate call,
        and the output sequence length minus the prompt length.
    """
    # The device transfer happens before the timer starts, so it is not
    # counted as generation latency.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    input_len = inputs["input_ids"].size(1)

    on_cuda = device.type == "cuda"
    if on_cuda:
        # Drain any CUDA work queued before this point so it is not billed
        # to the timed region.
        torch.cuda.synchronize()

    # perf_counter is the clock intended for measuring short intervals;
    # time.time() follows the system wall clock, which can be adjusted.
    t0 = time.perf_counter()
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    if on_cuda:
        # CUDA kernels run asynchronously; wait for them before stopping
        # the clock.
        torch.cuda.synchronize()
    latency_s = time.perf_counter() - t0

    gen_tokens = outputs.size(1) - input_len
    return latency_s, gen_tokens

In [12]:
def start():
    """Run the benchmark end to end and print the results.

    Loads the model for MODEL_NAME, tokenizes the prompt, performs a warm-up
    generation, times one generation with MAX_NEW_TOKENS as the new-token
    limit, then prints the generated-token count, latency and throughput.
    """
    # Model, tokenizer and the device the model was placed on.
    tokenizer, model, device = load_model_and_tokenizer(MODEL_NAME)
    print(f"Using device: {device}")

    # Tokenize, then put every tensor on the model's device.
    encoded, input_len = prepare_prompt(tokenizer)
    inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
    print(f"Prompt token length: {input_len}")

    # Untimed warm-up run.
    warmup_model(model, inputs, device)
    print("Warm-up complete.")

    # Timed run; throughput is generated tokens per second of latency.
    latency, gen_tokens = measure_generation(
        model, inputs, MAX_NEW_TOKENS, device
    )
    throughput = gen_tokens / latency

    report = (
        f"Generated tokens: {gen_tokens}",
        f"Latency        : {latency:.3f} s",
        f"Throughput     : {throughput:.1f} tokens/s",
    )
    for line in report:
        print(line)

In [13]:
# Run the full benchmark; start() prints the device, prompt length and timing results.
start()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Using device: cuda
Prompt token length: 62
Warm-up complete.
Generated tokens: 50
Latency        : 0.874 s
Throughput     : 57.2 tokens/s
