# Part 1

In [1]:
import os, shutil, time, math, gc, tempfile, json, contextlib
from dataclasses import dataclass

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# Optimum (Quanto) for post-training quantization (static, weight-only int8)
from optimum.quanto import quantize, freeze, qint8

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"
SEED     = 42

torch.manual_seed(SEED)
if DEVICE == "cuda":
    torch.cuda.manual_seed_all(SEED)


2025-08-31 19:49:33.114781: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-31 19:49:33.128764: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-31 19:49:33.146588: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-31 19:49:33.151886: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-31 19:49:33.164321: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# --- 1) Utilities -------------------------------------------------------------
def dir_size_mb(path: str) -> float:
    total = 0
    for root, _, files in os.walk(path):
        for f in files:
            total += os.path.getsize(os.path.join(root, f))
    return total / (1024**2)

In [3]:
@contextlib.contextmanager
def torch_cuda_monitor():
    """Context manager to measure peak GPU memory in MB."""
    if DEVICE == "cuda":
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        start_alloc = torch.cuda.memory_allocated()
        try:
            yield
        finally:
            torch.cuda.synchronize()
            peak = torch.cuda.max_memory_allocated()
            torch.cuda.empty_cache()
            # return values indirectly by storing on the function object
            torch_cuda_monitor.peak_mb = peak / (1024**2)
            torch_cuda_monitor.start_mb = start_alloc / (1024**2)
    else:
        try:
            yield
        finally:
            torch_cuda_monitor.peak_mb = 0.0
            torch_cuda_monitor.start_mb = 0.0


In [4]:
@dataclass
class GenMetrics:
    latency_s: float
    tokens_per_sec: float
    peak_gpu_mem_mb: float

In [5]:
def measure_generate(model, tokenizer, prompt: str, max_new_tokens=64, runs=3) -> GenMetrics:
    """Measure latency, throughput, and peak GPU memory for text generation."""
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    input_len = inputs["input_ids"].shape[1]

    # Warmup
    with torch.inference_mode():
        _ = model.generate(**inputs, max_new_tokens=8, do_sample=False, use_cache=True)

    latencies, tps = [], []
    with torch_cuda_monitor():
        for _ in range(runs):
            if DEVICE == "cuda":
                torch.cuda.synchronize()
            t0 = time.perf_counter()
            with torch.inference_mode():
                out = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, use_cache=True)
            if DEVICE == "cuda":
                torch.cuda.synchronize()
            t1 = time.perf_counter()

            gen_len = out.shape[1] - input_len
            lat = t1 - t0
            latencies.append(lat)
            tps.append(gen_len / lat if lat > 0 else float("nan"))

    return GenMetrics(
        latency_s=sum(latencies)/len(latencies),
        tokens_per_sec=sum(tps)/len(tps),
        peak_gpu_mem_mb=getattr(torch_cuda_monitor, "peak_mb", 0.0),
    )

In [6]:
@torch.no_grad()
def compute_perplexity(model, tokenizer, seq_len=128) -> float:
    """
    Self-contained perplexity estimate using a small built-in eval text.
    Keeps evaluation light but still allows FP32 vs INT8 comparison.
    """
    eval_text = (
        "Quantization reduces the precision of neural network weights and activations. "
        "This process shrinks model size, lowers memory use, and can speed up inference. "
        "The tradeoff is a small drop in accuracy. "
        "Perplexity measures how well a language model predicts text: "
        "a lower perplexity means the model is more confident in its predictions. "
        "Large language models like LLaMA or TinyLlama are evaluated on benchmarks such as WikiText, "
        "where perplexity is calculated over thousands of tokens. "
        "In practice, we only need a small text sample to compare relative changes. "
        "By quantizing a model to 8-bit, we can observe whether perplexity increases significantly. "
        "If the rise is modest while speed and memory improve, quantization is usually a good trade-off. "
        "This evaluation text is deliberately extended to ensure enough tokens for testing."
    )

    enc = tokenizer(eval_text, return_tensors="pt")
    input_ids = enc["input_ids"][0]

    usable = (len(input_ids) // seq_len) * seq_len
    input_ids = input_ids[:usable + 1]
    if len(input_ids) <= seq_len:
        raise ValueError("Not enough tokens for perplexity calculation. Try reducing seq_len.")

    nll_sum, tok_count = 0.0, 0
    model.eval()

    for start in range(0, len(input_ids) - 1 - seq_len, seq_len):
        chunk = input_ids[start:start+seq_len+1]
        inp = chunk[:-1].unsqueeze(0).to(DEVICE)
        labels = chunk[1:].unsqueeze(0).to(DEVICE)

        out = model(input_ids=inp, labels=labels)
        nll_sum += float(out.loss) * labels.numel()
        tok_count += labels.numel()

    return math.exp(nll_sum / max(1, tok_count))

In [7]:
def save_and_size(model, tokenizer, out_dir: str) -> float:
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir, exist_ok=True)
    tokenizer.save_pretrained(out_dir)
    model.save_pretrained(out_dir, safe_serialization=True)
    return dir_size_mb(out_dir)

In [8]:
def print_row(title, size_mb, lat_s, tps, gpu_mb, ppl):
    print(
        f"{title:18s} | Size: {size_mb:8.1f} MB | Latency: {lat_s:7.3f} s | "
        f"Throughput: {tps:7.2f} tok/s | Peak VRAM: {gpu_mb:7.1f} MB | PPL: {ppl:7.2f}"
    )

In [9]:
# --- 2) Load tokenizer --------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# Some chat models have no pad token; make generation/perplexity robust:
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [10]:
# --- 3) Baseline: FP32 --------------------------------------------------------
print("\n== Baseline FP32 ==")
baseline_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)
baseline_model.to(DEVICE)

baseline_size_mb = save_and_size(baseline_model, tokenizer, out_dir="tinyllama_fp32")
baseline_gen = measure_generate(
    baseline_model,
    tokenizer,
    prompt="Explain quantization in one paragraph for ML engineers.",
    max_new_tokens=128,
    runs=3,
)
baseline_ppl = compute_perplexity(baseline_model, tokenizer, seq_len=128)

print_row("FP32 (baseline)", baseline_size_mb, baseline_gen.latency_s, baseline_gen.tokens_per_sec,
          baseline_gen.peak_gpu_mem_mb, baseline_ppl)


== Baseline FP32 ==
FP32 (baseline)    | Size:   4200.3 MB | Latency:   2.644 s | Throughput:   48.42 tok/s | Peak VRAM:  4212.9 MB | PPL:    1.00


In [11]:
# --- 4) Post-Training Quantization: 8-bit (static, weight-only) --------------
# Fresh load in FP32, then quantize weights to int8 via Optimum-Quanto.
print("\n== PTQ INT8 (Optimum-Quanto, weight-only) ==")
q_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)

# Static PTQ (weight-only): no calibration set required.
# This converts Linear weights to int8-packed format and wires quant/dequant where needed.
quantize(q_model, weights=qint8)
freeze(q_model)               # finalize quantization graphs / params
q_model.to(DEVICE)

q_size_mb = save_and_size(q_model, tokenizer, out_dir="tinyllama_int8_quanto")

q_gen = measure_generate(
    q_model,
    tokenizer,
    prompt="Explain quantization in one paragraph for ML engineers.",
    max_new_tokens=128,
    runs=3,
)
q_ppl = compute_perplexity(q_model, tokenizer, seq_len=128)

print_row("INT8 (Quanto)", q_size_mb, q_gen.latency_s, q_gen.tokens_per_sec, q_gen.peak_gpu_mem_mb, q_ppl)



== PTQ INT8 (Optimum-Quanto, weight-only) ==
INT8 (Quanto)      | Size:   1242.5 MB | Latency:   6.658 s | Throughput:   19.23 tok/s | Peak VRAM:  5765.4 MB | PPL:    1.00


In [12]:
# --- 5) Summary JSON (optional) ----------------------------------------------
summary = {
    "device": DEVICE,
    "model": MODEL_ID,
    "seed": SEED,
    "baseline_fp32": {
        "size_mb": baseline_size_mb,
        "latency_s": baseline_gen.latency_s,
        "tokens_per_sec": baseline_gen.tokens_per_sec,
        "peak_gpu_mem_mb": baseline_gen.peak_gpu_mem_mb,
        "perplexity": baseline_ppl,
    },
    "int8_quanto": {
        "size_mb": q_size_mb,
        "latency_s": q_gen.latency_s,
        "tokens_per_sec": q_gen.tokens_per_sec,
        "peak_gpu_mem_mb": q_gen.peak_gpu_mem_mb,
        "perplexity": q_ppl,
    },
}
with open("ptq_tinyllama_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print("\nSaved summary -> ptq_tinyllama_summary.json")



Saved summary -> ptq_tinyllama_summary.json


In [None]:
summary

# Part 2

In [13]:
import os, time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.ao.quantization import quantize_dynamic

In [14]:
# -----------------------------
# Config (CPU-only demo)
# -----------------------------
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # swap to "sshleifer/tiny-gpt2" if CPU is tight
DEVICE = "cpu"
MAX_NEW_TOKENS = 128
RUNS = 3
PROMPT = "Quantization test: explain why int8 dynamic quantization can be faster on CPU."

torch.set_grad_enabled(False)
torch.set_num_threads(max(1, os.cpu_count() or 1))  # let PyTorch use available cores

In [15]:
def measure_generate(model, tokenizer, prompt=PROMPT, max_new_tokens=MAX_NEW_TOKENS, runs=RUNS):
    model.eval()
    tokenizer.padding_side = "left"
    enc = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    # Warmup
    with torch.inference_mode():
        _ = model.generate(**enc, max_new_tokens=8, use_cache=True)

    latencies, throughputs = [], []
    for _ in range(runs):
        t0 = time.perf_counter()
        with torch.inference_mode():
            out = model.generate(**enc, max_new_tokens=max_new_tokens, use_cache=True)
        t1 = time.perf_counter()
        gen_len = out.shape[1] - enc["input_ids"].shape[1]
        lat = t1 - t0
        latencies.append(lat)
        throughputs.append(gen_len / lat)

    return sum(latencies)/len(latencies), sum(throughputs)/len(throughputs)

In [16]:
# -----------------------------
# Tokenizer (shared)
# -----------------------------
tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tok.pad_token_id is None:
    tok.pad_token_id = tok.eos_token_id


In [17]:
# -----------------------------
# Baseline: FP32 on CPU
# -----------------------------
print("== CPU FP32 Baseline ==")
model_fp32 = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32).to(DEVICE).eval()
model_fp32.config.use_cache = True

lat_fp32, tps_fp32 = measure_generate(model_fp32, tok)
print(f"FP32 (CPU) | Latency {lat_fp32:.3f}s | Throughput {tps_fp32:.2f} tok/s")

== CPU FP32 Baseline ==
FP32 (CPU) | Latency 0.611s | Throughput 1.66 tok/s


In [18]:
# -----------------------------
# Quantized: INT8 dynamic (CPU)
# -----------------------------
# Quantize only Linear layers to int8 (native PyTorch). This is weight-only int8 + dynamic activation quant.
model_int8 = quantize_dynamic(
    model_fp32.cpu(),
    {torch.nn.Linear},
    dtype=torch.qint8
).eval()
# (Optional) free original object to reduce memory
del model_fp32


In [19]:
# -----------------------------
# Summary
# -----------------------------
lat_int8, tps_int8 = measure_generate(model_int8, tok)
speedup = lat_fp32 / lat_int8 if lat_int8 > 0 else float("inf")


print("\n== Summary ==")
print(f"CPU FP32 latency: {lat_fp32:.3f}s | CPU INT8 latency: {lat_int8:.3f}s | Speedup: {speedup:.2f}x")
print(f"CPU FP32 tput:   {tps_fp32:.2f} tok/s | CPU INT8 tput:   {tps_int8:.2f} tok/s")



== Summary ==
CPU FP32 latency: 0.611s | CPU INT8 latency: 0.220s | Speedup: 2.78x
CPU FP32 tput:   1.66 tok/s | CPU INT8 tput:   4.55 tok/s
