In [1]:
import os
import time
import math
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

In [2]:
# ─── SETTINGS ────────────────────────────────────────────────────────────────
MODEL_NAME     = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"   # or your chosen variant

# Generation prompt (38 words; the exercise asks for 30–50). Replace freely,
# but keep it fixed across runs so latency numbers stay comparable.
PROMPT         = (
    "Explain in simple terms how quantizing a large language model from "
    "32-bit floating point weights down to 8-bit integers affects its memory "
    "footprint, inference speed, and output quality, and describe one "
    "situation where the trade-off is clearly worthwhile."
)

# Passage scored for perplexity (67 words; presumably close to the ~100 tokens
# the exercise asks for — the exact count depends on the tokenizer).
PERP_TEXT      = (
    "The printing press, introduced in Europe in the fifteenth century, "
    "transformed the way knowledge was shared. Before its invention, books "
    "were copied by hand, which made them rare and expensive. Once pages "
    "could be reproduced quickly and cheaply, ideas spread across borders "
    "within months instead of decades. Literacy rose, scholars could compare "
    "texts more easily, and new scientific discoveries reached a far wider "
    "audience than ever before."
)

MAX_NEW_TOKENS = 50              # tokens generated in the timed run
FP32_DIR       = "./model_fp32"  # where the FP32 checkpoint is saved
EIGHTBIT_DIR   = "./model_8bit"  # where the 8-bit checkpoint is saved

In [3]:
# ─── HELPERS ─────────────────────────────────────────────────────────────────
def get_dir_size_mib(path: str) -> float:
    """
    Return the total size of all regular files under `path`, in MiB.

    The directory tree is walked recursively and the byte sizes of the files
    found are summed. Symbolic links are skipped so that a link's target is
    not counted in place of the link (and a dangling link cannot raise).

    Args:
        path: Directory to measure.

    Returns:
        Size in MiB (bytes / 1024**2). A missing or empty directory yields
        0.0, because `os.walk` simply produces no entries for it.
    """
    total_bytes = 0
    for root, _dirs, files in os.walk(path):
        for name in files:
            file_path = os.path.join(root, name)
            if os.path.islink(file_path):
                continue
            total_bytes += os.path.getsize(file_path)
    return total_bytes / (1024 ** 2)

In [4]:
# ─── LOADING & SAVING ─────────────────────────────────────────────────────────
def load_and_save_fp32(model_name: str, save_dir: str):
    """
    Load the tokenizer and a full-precision (FP32) causal LM on CPU, then
    save both to `save_dir`.

    Args:
        model_name: Hub id or local path passed to `from_pretrained`.
        save_dir: Directory that receives the saved model and tokenizer.

    Returns:
        (tokenizer, model) with the model in eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        # The empty-string key maps the whole model to one device, so nothing
        # is placed on a GPU even when one is present.
        device_map={"": "cpu"},
    )
    model.eval()

    # The tokenizer is saved next to the weights so the directory can be
    # reloaded on its own.
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    return tokenizer, model

In [5]:
def load_and_save_8bit(model_name: str, save_dir: str):
    """
    Load an 8-bit quantized causal LM, then save it (and its tokenizer) to
    `save_dir`.

    Quantization is requested through `BitsAndBytesConfig(load_in_8bit=True)`
    and `device_map="auto"` lets the loader decide layer placement.
    NOTE(review): 8-bit loading presumably requires the `bitsandbytes` and
    `accelerate` packages and a CUDA device — confirm for your environment.

    Args:
        model_name: Hub id or local path passed to `from_pretrained`.
        save_dir: Directory that receives the saved model and tokenizer.

    Returns:
        The quantized model in eval mode (the tokenizer is not returned).
    """
    quant_config = BitsAndBytesConfig(load_in_8bit=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
    )
    model.eval()

    # The tokenizer is saved as well so the directory is self-contained.
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    return model

In [6]:
# ─── METRICS ──────────────────────────────────────────────────────────────────
def measure_latency_and_throughput(model, tokenizer, prompt: str, device,
                                   max_new_tokens=None):
    """
    Time one greedy `generate` call and report latency and throughput.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=..., ...)`.
        tokenizer: Callable returning a mapping with "input_ids" that also
            supports `.to(device)`.
        prompt: Text to generate from.
        device: Device the inputs are moved to (a `torch.device` or string).
        max_new_tokens: Tokens to generate in the timed run. Defaults to the
            notebook-level `MAX_NEW_TOKENS` when left as None.

    Returns:
        (latency_s, tokens_per_second). Throughput counts only the newly
        generated tokens, i.e. output length minus prompt length; this
        assumes `generate` returns prompt + continuation — TODO confirm for
        non-decoder-only models.
    """
    if max_new_tokens is None:
        max_new_tokens = MAX_NEW_TOKENS

    on_cuda = torch.device(device).type == "cuda" and torch.cuda.is_available()

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        # Warm-up: a short run so one-time setup cost is not part of the
        # timed measurement.
        warmup_tokens = max(1, min(4, max_new_tokens))
        model.generate(**inputs, max_new_tokens=warmup_tokens, do_sample=False)

        # CUDA work is asynchronous; synchronize on both sides of the timer
        # so the measured interval covers the whole generation.
        if on_cuda:
            torch.cuda.synchronize()
        started = time.perf_counter()
        output_ids = model.generate(
            **inputs, max_new_tokens=max_new_tokens, do_sample=False
        )
        if on_cuda:
            torch.cuda.synchronize()
        latency_s = time.perf_counter() - started

    # Generation may stop early (e.g. at EOS), so count what was produced
    # rather than assuming max_new_tokens.
    new_tokens = output_ids.shape[-1] - prompt_len
    tokens_per_second = new_tokens / latency_s if latency_s > 0 else float("inf")
    return latency_s, tokens_per_second

In [7]:
def measure_peak_mem_and_perplexity(model, tokenizer, text: str, device):
    """
    Run one forward pass over `text` and report peak GPU memory and
    perplexity.

    Args:
        model: Callable as `model(**inputs, labels=...)` returning an object
            with a scalar `.loss` tensor.
        tokenizer: Callable returning a mapping with "input_ids" that also
            supports `.to(device)`.
        text: Passage to score.
        device: Device the inputs are moved to (a `torch.device` or string).

    Returns:
        (peak_mem_mib, perplexity). `peak_mem_mib` is
        `torch.cuda.max_memory_allocated` in MiB after resetting the peak
        counter just before the forward pass; it is 0.0 when `device` is not
        a usable CUDA device. `perplexity` is exp(loss).
        NOTE(review): the peak presumably includes memory already held when
        the counter is reset (e.g. model weights) — confirm against the
        PyTorch docs before interpreting it as activation memory only.
    """
    on_cuda = torch.device(device).type == "cuda" and torch.cuda.is_available()

    inputs = tokenizer(text, return_tensors="pt").to(device)

    if on_cuda:
        torch.cuda.reset_peak_memory_stats(device)

    with torch.no_grad():
        # Passing the input ids as labels makes the model return its
        # language-modelling loss for this passage.
        outputs = model(**inputs, labels=inputs["input_ids"])

    if on_cuda:
        # Wait for queued kernels to finish before reading the counter.
        torch.cuda.synchronize(device)
        peak_mem_mib = torch.cuda.max_memory_allocated(device) / (1024 ** 2)
    else:
        peak_mem_mib = 0.0

    perplexity = math.exp(outputs.loss.item())
    return peak_mem_mib, perplexity

In [None]:
def start():
    """
    Run the FP32-vs-8-bit comparison and print the results.

    Steps:
      1. Load the FP32 model on CPU, save it, print its on-disk size.
      2. If CUDA is available, load the 8-bit model, save it, print its size.
      3. Print latency/throughput for FP32 on CPU (and 8-bit on GPU).
      4. Print peak GPU memory and perplexity for the 8-bit model.

    Every 8-bit step, including the load itself, is skipped when no CUDA
    device is available, so the FP32 part still runs on a CPU-only machine.
    """
    cpu = torch.device("cpu")
    # Decide up front whether the GPU-only steps can run at all.
    gpu = torch.device("cuda") if torch.cuda.is_available() else None

    # 1) FP32 on CPU
    tokenizer, model_fp32 = load_and_save_fp32(MODEL_NAME, FP32_DIR)
    size_fp32 = get_dir_size_mib(FP32_DIR)
    print(f"FP32 size on disk: {size_fp32:.1f} MiB")

    # 2) 8-bit quantized on GPU
    model_8bit = None
    if gpu is not None:
        model_8bit = load_and_save_8bit(MODEL_NAME, EIGHTBIT_DIR)
        size_8bit = get_dir_size_mib(EIGHTBIT_DIR)
        print(f"8-bit size on disk: {size_8bit:.1f} MiB\n")
    else:
        print("No CUDA device available; skipping all 8-bit steps.\n")

    # 3) Latency & Throughput
    lat32, thr32 = measure_latency_and_throughput(model_fp32, tokenizer, PROMPT, cpu)
    print(f"FP32 (CPU)    → Latency: {lat32:.3f}s, Throughput: {thr32:.1f} tok/s")

    if model_8bit is None:
        return

    lat8, thr8 = measure_latency_and_throughput(model_8bit, tokenizer, PROMPT, gpu)
    print(f"8-bit (GPU)   → Latency: {lat8:.3f}s, Throughput: {thr8:.1f} tok/s\n")

    # 4) Memory & Perplexity (8-bit on GPU)
    peak8, ppl8 = measure_peak_mem_and_perplexity(model_8bit, tokenizer, PERP_TEXT, gpu)
    print(f"8-bit Peak GPU mem: {peak8:.1f} MiB")
    print(f"8-bit Perplexity  : {ppl8:.3f}")

In [None]:
# Entry point of the notebook: runs start(), which prints its results.
start()