# Demo: Measure Latency & Throughput
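**Objective:** Get hands-on experience loading a ~1B-parameter Llama-style model and timing a simple generation so you understand raw latency and throughput. (The objective was written for Llama 3.2-Text-1B; the code below loads `TinyLlama/TinyLlama-1.1B-Chat-v1.0`.)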

**Tasks:**
1. Load the model & tokenizer  
2. Prepare a prompt: Pick or write ~30–50 words; tokenize with tokenizer(...)
3. Time your generation
4. Record
   - Latency (s): wall-clock time of the timed `generate` call
   - Throughput (tokens/s): number of generated tokens divided by latency

In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Checkpoint identifier handed to `from_pretrained` for the tokenizer and model.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Number of new tokens requested from `generate` during the timed run.
MAX_NEW_TOKENS = 50

In [None]:
def load_model_and_tokenizer(model_name: str):
    """Load a causal language model and its tokenizer, ready for inference.

    Args:
        model_name: Checkpoint identifier (hub id or local path) passed to
            ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        A ``(tokenizer, model, device)`` tuple. The model has been moved to
        ``device`` (CUDA when available, otherwise CPU) and switched to
        eval mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Half precision on the GPU; full precision on the CPU fallback, where
    # float16 kernels are slow or unavailable for some ops depending on the
    # installed PyTorch version.
    dtype = torch.float16 if device.type == "cuda" else torch.float32

    # Use the argument, not the module-level constant, so callers can load
    # any checkpoint they ask for.
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,
    )
    model.to(device).eval()
    return tokenizer, model, device

In [None]:
def prepare_prompt(tokenizer, prompt=None):
    """Tokenize a prompt and report how many tokens it contains.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``
            whose result supports ``["input_ids"]``.
        prompt: Text to tokenize. When ``None`` (the default) the built-in
            demo prompt is used, which keeps existing call sites unchanged.

    Returns:
        ``(inputs, input_len)``: the tokenizer output and the size of
        dimension 1 of ``inputs["input_ids"]`` (the prompt length in tokens).
    """
    if prompt is None:
        prompt = (
            "The first two courses on Udacity started on 20 February 2012,[29] entitled 'CS 101: Building a Search Engine', taught by David Evans from the University of Virginia, and 'CS 373: Programming a Robotic Car' taught by Thrun. Both courses use Python."
        )
    inputs = tokenizer(prompt, return_tensors="pt")
    input_len = inputs["input_ids"].size(1)
    return inputs, input_len

In [None]:
def warmup_model(model, inputs, device, max_new_tokens=5):
    """Run one short, untimed generation before the measured run.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        inputs: Mapping of tokenizer outputs whose values support ``.to()``.
        device: ``torch.device`` the inputs are moved to and the model runs on.
        max_new_tokens: Number of tokens requested in the warm-up call;
            defaults to 5.

    Returns:
        None. The generated output is discarded.
    """
    # Move the inputs here as well, mirroring measure_generation, so the
    # function also works when handed tensors that are still on the CPU.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.generate(**inputs, max_new_tokens=max_new_tokens)
    if device.type == "cuda":
        # Block until the queued GPU work has finished before returning.
        torch.cuda.synchronize()

In [None]:
def measure_generation(model, inputs, max_new_tokens, device):
    """Time a single ``model.generate`` call and count the tokens it added.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        inputs: Mapping of tokenizer outputs; must contain ``"input_ids"``.
        max_new_tokens: Forwarded to ``generate`` as the new-token limit.
        device: ``torch.device`` the inputs are moved to before timing.

    Returns:
        ``(latency_s, gen_tokens)``: elapsed seconds for the generate call,
        and the output length along dimension 1 minus the prompt length.
    """
    inputs = {k: v.to(device) for k, v in inputs.items()}
    input_len = inputs["input_ids"].size(1)

    on_cuda = device.type == "cuda"
    if on_cuda:
        # Drain GPU work queued before this point (e.g. the transfers above)
        # so it is not counted as generation time.
        torch.cuda.synchronize()

    # perf_counter is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    t0 = time.perf_counter()
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    if on_cuda:
        # CUDA kernels run asynchronously; wait for them before stopping
        # the clock.
        torch.cuda.synchronize()
    latency_s = time.perf_counter() - t0

    gen_tokens = outputs.size(1) - input_len
    return latency_s, gen_tokens

In [None]:
def start():
    """Run the demo end to end: load, tokenize, warm up, time, and report.

    Prints the device, the prompt length in tokens, and then the generated
    token count, latency in seconds and throughput in tokens per second for
    one timed generation of up to MAX_NEW_TOKENS tokens.
    """
    # Step 0: model, tokenizer and the device they run on.
    tok, lm, device = load_model_and_tokenizer(MODEL_NAME)
    print(f"Using device: {device}")

    # Step 1: tokenize the prompt and place its tensors on the device.
    encoded, input_len = prepare_prompt(tok)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    print(f"Prompt token length: {input_len}")

    # Step 2: one untimed pass before measuring.
    warmup_model(lm, encoded, device)
    print("Warm-up complete.")

    # Step 3: the timed run.
    latency, gen_tokens = measure_generation(
        lm, encoded, MAX_NEW_TOKENS, device
    )

    # Step 4: derive throughput and print the report.
    throughput = gen_tokens / latency
    report = (
        f"Generated tokens: {gen_tokens}",
        f"Latency        : {latency:.3f} s",
        f"Throughput     : {throughput:.1f} tokens/s",
    )
    for line in report:
        print(line)

In [None]:
# Run the whole demo: load the model, warm up, time one generation and print
# the latency/throughput report.
start()

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda
Prompt token length: 72
Warm-up complete.
Generated tokens: 50
Latency        : 1.471 s
Throughput     : 34.0 tokens/s
