In [1]:
import math
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# ─── SETTINGS ────────────────────────────────────────────────────────────────
# Model identifier that start() passes to load_model_and_tokenizer(), which
# forwards it to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "meta-llama/Llama-3.2-1B"

In [3]:
def load_model_and_tokenizer(model_name: str):
    """
    Load a causal language model together with its tokenizer for inference.

    When CUDA is available the model is loaded in float16 and moved to the
    GPU; otherwise it is loaded in float32 and kept on the CPU. The model is
    put into eval mode before it is returned.

    Args:
        model_name: Identifier forwarded to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model, device)`` tuple, where ``device`` is the
        ``torch.device`` the model was moved to.
    """
    # Pick the device and the matching weight precision in one place.
    if torch.cuda.is_available():
        device, dtype = torch.device("cuda"), torch.float16
    else:
        device, dtype = torch.device("cpu"), torch.float32

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=dtype)
    model.to(device)
    model.eval()

    return tokenizer, model, device


In [4]:
def select_text():
    """
    Return the fixed English passage used as evaluation text.

    The passage is assembled from its three sentences, separated by single
    spaces, and returned as one ``str``.
    """
    sentences = [
        "Artificial intelligence (AI) is intelligence demonstrated by machines, "
        "in contrast to the natural intelligence displayed by humans and animals.",
        "Leading AI textbooks define the field as the study of intelligent agents: "
        "any system that perceives its environment and takes actions that maximize "
        "its chance of achieving its goals.",
        'Colloquially, the term "artificial intelligence" is often used to describe '
        "machines that mimic cognitive functions that humans associate with the "
        "human mind, such as learning and problem-solving.",
    ]
    return " ".join(sentences)

In [5]:
def tokenize_with_labels(tokenizer, text: str):
    """
    Tokenize ``text`` and attach labels that are a copy of the input ids.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt")``;
            its result must support item access and assignment and contain
            an ``"input_ids"`` tensor.
        text: The passage to tokenize.

    Returns:
        ``(inputs, input_len)``: the tokenizer output with an added
        ``"labels"`` entry (a clone of ``"input_ids"``), and the size of
        dimension 1 of ``"input_ids"``.
    """
    encoded = tokenizer(text, return_tensors="pt")
    token_ids = encoded["input_ids"]

    # Clone so that labels do not share storage with the input ids.
    encoded["labels"] = token_ids.clone()

    return encoded, token_ids.shape[1]

In [6]:
def compute_peak_memory_and_loss(model, inputs, device):
    """
    Run one gradient-free forward pass and report peak CUDA memory and loss.

    Args:
        model: Callable invoked as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            result exposes a scalar ``loss`` tensor.
        inputs: Mapping holding ``"input_ids"`` and ``"labels"`` tensors and,
            optionally, an ``"attention_mask"`` tensor. A missing or ``None``
            mask is forwarded to the model as ``None``.
        device: ``torch.device`` the tensors are moved to and, for CUDA, the
            device whose memory statistics are reset and read.

    Returns:
        ``(peak_mib, loss_value)``: the peak allocated memory in MiB as
        reported by ``torch.cuda.max_memory_allocated`` (``nan`` when
        ``device`` is not CUDA) and the loss as a Python float.
    """
    on_cuda = device.type == "cuda"
    if on_cuda:
        # NOTE(review): the peak reported below presumably also counts memory
        # that was already allocated before this call (e.g. model weights) --
        # confirm against the PyTorch memory-stats documentation.
        torch.cuda.reset_peak_memory_stats(device)

    with torch.no_grad():
        # Look the optional mask up once instead of twice.
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        outputs = model(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=attention_mask,
            labels=inputs["labels"].to(device),
        )

    if not on_cuda:
        return float("nan"), outputs.loss.item()

    # Synchronize the same device whose statistics are read, not whichever
    # device happens to be current, so multi-GPU setups measure correctly.
    torch.cuda.synchronize(device)
    peak_mib = torch.cuda.max_memory_allocated(device) / 1024**2
    return peak_mib, outputs.loss.item()

In [7]:
def compute_perplexity(loss: float) -> float:
    """
    Convert a loss value into perplexity, i.e. ``exp(loss)``.

    Args:
        loss: The loss value to exponentiate.

    Returns:
        ``math.exp(loss)``. When the result is too large to represent as a
        float, ``math.inf`` is returned instead of raising ``OverflowError``,
        so a diverged loss is reported as infinite perplexity rather than
        aborting the run.
    """
    try:
        return math.exp(loss)
    except OverflowError:
        # math.exp raises once the result exceeds the float range.
        return math.inf

In [8]:
def start():
    """
    Run the whole measurement pipeline and print its results.

    Loads the model named by ``MODEL_NAME``, tokenizes the passage from
    ``select_text()``, runs one forward pass to obtain peak memory and loss,
    and prints the device, token count, peak memory and perplexity.
    """
    # Step 1 -- model, tokenizer and target device
    tok, lm, device = load_model_and_tokenizer(MODEL_NAME)
    print(f"Using device: {device}")

    # Step 2 -- passage -> tensors placed on the target device
    passage = select_text()
    batch, input_len = tokenize_with_labels(tok, passage)
    batch = {key: tensor.to(device) for key, tensor in batch.items()}
    print(f"Tokenized length: {input_len}")

    # Step 3 -- single forward pass: peak memory and loss
    peak_mem, nll = compute_peak_memory_and_loss(lm, batch, device)
    print(f"Peak GPU memory: {peak_mem:.1f} MiB")

    # Step 4 -- loss -> perplexity
    ppl = compute_perplexity(nll)
    print(f"Next-token perplexity: {ppl:.3f}")


In [9]:
# Run the pipeline defined in start(): load the model, tokenize the passage,
# then print the device, token count, peak memory and perplexity.
start()

2025-08-03 01:08:25.013294: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-03 01:08:25.215918: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-03 01:08:25.249061: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-03 01:08:25.258579: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-03 01:08:25.492654: I tensorflow/core/platform/cpu_feature_guar

Using device: cuda
Tokenized length: 94
Peak GPU memory: 2484.6 MiB
Next-token perplexity: 3.425
