# Exercise: Measure Latency & Throughput
**Objective:** Get hands-on experience loading Llama 3.2 1B (`meta-llama/Llama-3.2-1B`) and timing a simple generation so you understand raw latency and throughput.

**Tasks:**
1. Load the model & tokenizer  
2. Prepare a prompt: Pick or write ~30–50 words; tokenize with tokenizer(...)
3. Warm up the model, then time your generation
4. Record
   - Latency (s)
   - Throughput (tokens/s)

In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Benchmark settings.
# Model identifier handed to load_model_and_tokenizer().
MODEL_NAME = "meta-llama/Llama-3.2-1B"
# Passed to measure_generation() as max_new_tokens for the timed run.
MAX_NEW_TOKENS = 50

In [None]:
def load_model_and_tokenizer(model_name: str):
    """
    Load the tokenizer and model identified by `model_name`.

    Exercise stub: raises NotImplementedError until you fill it in.

    TODO:
      - pick the device (GPU if available, otherwise CPU)
      - load the tokenizer and the model
      - move model to the device and set eval mode
      - return (tokenizer, model, device) -- start() unpacks exactly
        these three values, in this order
    """
    # device    = ...
    # tokenizer = ...
    # model     = ...
    # model.to(device).eval()
    # return tokenizer, model, device
    raise NotImplementedError

In [None]:
def prepare_prompt(tokenizer):
    """
    Build the benchmark prompt and tokenize it.

    Exercise stub: raises NotImplementedError until you fill it in.

    TODO:
      - choose or write ~30–50 word prompt
      - tokenize: return inputs (dict) and input_length (int)

    start() unpacks the result as `inputs, input_len`, then calls
    `inputs.items()` and `.to(device)` on every value, so `inputs` must be
    a dict-like mapping of tensors.
    """
    # prompt = "… your 30–50 word text …"
    # inputs = tokenizer(prompt, return_tensors="pt")
    # input_len = inputs.input_ids.size(1)
    # return inputs, input_len
    raise NotImplementedError

In [None]:
def warmup_model(model, inputs, device):
    """
    Run a short throwaway generation before the timed measurement.

    Exercise stub: raises NotImplementedError until you fill it in.

    `inputs` is the tokenized prompt that start() has already moved to
    `device`. start() ignores the return value.

    TODO:
      - run a small generate (e.g. max_new_tokens=5)
      - torch.cuda.synchronize() if on CUDA
    """
    raise NotImplementedError

In [None]:
def measure_generation(model, inputs, max_new_tokens, device):
    """
    Time a single generate() call and count the tokens it produced.

    Exercise stub: raises NotImplementedError until you fill it in.

    TODO:
      - record start time
      - call model.generate(**inputs, max_new_tokens=max_new_tokens)
      - synchronize if CUDA
      - record end time
      - compute latency (s) and generated_tokens (int)
      - return latency, generated_tokens

    start() divides generated_tokens by latency to get tokens/s, so
    latency must be a non-zero number of seconds.
    """
    # NOTE(review): the sequence returned by generate() presumably includes
    # the prompt tokens, so generated_tokens should be the output length
    # minus the prompt length -- confirm against the transformers docs.
    # Hint: a monotonic timer such as time.perf_counter() suits interval
    # timing; `time` is already imported at the top of the notebook.
    raise NotImplementedError

In [None]:
def start():
    """
    Run the exercise end to end and print the measurements.

    Loads the model, tokenizes the prompt, performs a warm-up pass, times
    one generation capped at MAX_NEW_TOKENS, and prints the generated token
    count, the latency, and the throughput in tokens per second.
    """
    # Step 0: load everything; the loader also reports which device is used.
    tok, lm, device = load_model_and_tokenizer(MODEL_NAME)
    print(f"Using device: {device}")

    # Step 1: tokenize the prompt and move every input tensor to the device.
    encoded, input_len = prepare_prompt(tok)
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    print(f"Prompt token length: {input_len}")

    # Step 2: throwaway pass before the timed run.
    warmup_model(lm, on_device, device)
    print("Warm-up complete.")

    # Step 3: the timed generation.
    latency, gen_tokens = measure_generation(
        lm, on_device, MAX_NEW_TOKENS, device
    )

    # Step 4: derive throughput first, then report all three figures.
    throughput = gen_tokens / latency
    print(f"Generated tokens: {gen_tokens}")
    print(f"Latency        : {latency:.3f} s")
    print(f"Throughput     : {throughput:.1f} tokens/s")

In [None]:
# Run the full load -> warm-up -> measure -> report sequence.
start()