In [1]:
!pip install --upgrade bitsandbytes accelerate



In [2]:
import os
import time
import math
import torch
import shutil
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)

In [3]:
# ─── SETTINGS ────────────────────────────────────────────────────────────────
# Hub identifier of the checkpoint that is loaded in both precisions.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Prompt passed to `generate` for the latency / throughput measurement.
PROMPT = (
    "Over the next decade, "
    "sustainable energy solutions will revolutionize global power grids, "
    "reducing carbon footprints and fostering resilient communities "
    "through innovative storage and distribution technologies."
)

# Reference passage scored by the model to compute perplexity.
PERP_TEXT = (
    "Artificial intelligence (AI) is intelligence demonstrated by machines, "
    "in contrast to the natural intelligence displayed by humans and animals. "
    "Leading AI textbooks define the field as the study of intelligent agents: "
    "any system that perceives its environment and takes actions "
    "that maximize its chance of achieving its goals."
)

# Number of tokens requested from the timed `generate` call.
MAX_NEW_TOKENS = 50

# Output directories for the saved FP32 and 8-bit checkpoints.
FP32_DIR = "./model_fp32"
EIGHTBIT_DIR = "./model_8bit"

In [4]:
# ─── HELPERS ─────────────────────────────────────────────────────────────────
def get_dir_size_mib(path: str) -> float:
    """Return the combined size, in MiB, of every file beneath `path`.

    The tree is traversed recursively with `os.walk`; each file's size is read
    with `os.path.getsize` and the byte total is divided by 1024**2.
    """
    bytes_per_mib = 1024**2
    size_in_bytes = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(path)
        for filename in filenames
    )
    return size_in_bytes / bytes_per_mib

In [5]:
# ─── LOADING & SAVING ─────────────────────────────────────────────────────────
def load_and_save_fp32(model_name: str, save_dir: str):
    """Load the float32 baseline on CPU and write its weights to `save_dir`.

    Steps:
      - Fetch the fast tokenizer for `model_name`.
      - Load the causal-LM as float32 with the whole model mapped to CPU.
      - Switch the model to eval mode.
      - Delete any existing `save_dir`, then save the model there
        (only the model is saved, not the tokenizer).

    Returns:
        (tokenizer, model)
    """
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # The empty-string key maps the entire model to a single device: the CPU.
    cpu_only = {"": "cpu"}
    baseline = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=cpu_only,
        torch_dtype=torch.float32,
    )
    baseline.eval()

    # Start from a clean directory so it holds only the files of this save.
    shutil.rmtree(save_dir, ignore_errors=True)
    baseline.save_pretrained(save_dir)

    return tok, baseline

In [6]:
def load_and_save_8bit(model_name: str, save_dir: str):
    """Load `model_name` with 8-bit quantization and write it to `save_dir`.

    Steps:
      - Build a `BitsAndBytesConfig` with `load_in_8bit=True`.
      - Load the causal-LM through `quantization_config` with
        `device_map="auto"` (intended to place the 8-bit layers on the GPU).
      - Switch the model to eval mode.
      - Delete any existing `save_dir`, then save the model there.

    No tokenizer is loaded here; the one returned by the FP32 load can be
    reused.

    Returns:
        The quantized model.
    """
    quantized = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,
        ),
    )
    quantized.eval()

    # Start from a clean directory so it holds only the files of this save.
    shutil.rmtree(save_dir, ignore_errors=True)
    quantized.save_pretrained(save_dir)

    return quantized

In [7]:
def measure_latency_and_throughput(model, tokenizer, prompt: str, device, max_new_tokens=None):
    """Time one `generate` call and report its latency and throughput.

    Args:
        model: Object exposing `generate(**inputs, max_new_tokens=...)`.
        tokenizer: Callable that turns `prompt` into a batch which supports
            `.to(device)` and has an "input_ids" entry.
        prompt: Text to generate from.
        device: Device the inputs are moved to. When `device.type` is "cuda",
            CUDA is synchronized so that queued GPU work is included in the
            measured interval.
        max_new_tokens: Token budget for the timed call. When None (the
            default) the module-level `MAX_NEW_TOKENS` is used.

    Returns:
        (latency_s, tokens_per_second), where the token count is the number
        of tokens actually produced (output length minus prompt length), so
        an early stop is reflected in the throughput.
    """
    if max_new_tokens is None:
        max_new_tokens = MAX_NEW_TOKENS

    on_cuda = device.type == "cuda"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_len = inputs["input_ids"].size(1)

    # Warm-up: a short generate so one-time setup cost is not timed.
    _ = model.generate(**inputs, max_new_tokens=5)
    if on_cuda:
        torch.cuda.synchronize()

    # Timed generation. perf_counter is monotonic and high resolution, unlike
    # time.time(), which follows the wall clock and can jump during a run.
    start = time.perf_counter()
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    if on_cuda:
        torch.cuda.synchronize()
    latency = time.perf_counter() - start

    gen_tokens = outputs.size(1) - input_len
    # Guard against a zero interval instead of raising ZeroDivisionError.
    throughput = gen_tokens / latency if latency > 0 else float("inf")
    return latency, throughput

In [8]:
def measure_peak_mem_and_perplexity(model, tokenizer, text: str, device):
    """Run one forward pass over `text`; report peak GPU memory and perplexity.

    Args:
        model: Callable invoked as `model(**inputs, labels=input_ids)` whose
            result exposes a scalar `.loss`.
        tokenizer: Callable that turns `text` into a batch which supports
            `.to(device)` and has an "input_ids" entry.
        text: Passage to score.
        device: Device the inputs are moved to. CUDA memory statistics are
            only touched when `device.type` is "cuda".

    Returns:
        (peak_mem_mib, perplexity). `peak_mem_mib` is
        `torch.cuda.max_memory_allocated(device)` in MiB, measured after
        resetting the peak counter just before the forward pass; it is 0.0
        when `device` is not a CUDA device. `perplexity` is `exp(loss)`.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    # The torch.cuda.* statistics calls reject non-CUDA devices, so guard
    # them the same way the latency helper guards its synchronize calls.
    on_cuda = device.type == "cuda"

    if on_cuda:
        torch.cuda.reset_peak_memory_stats(device)
    with torch.no_grad():
        out = model(**inputs, labels=inputs["input_ids"])

    if on_cuda:
        # Wait for queued kernels so the peak reflects the whole forward pass.
        torch.cuda.synchronize()
        peak_mem_mib = torch.cuda.max_memory_allocated(device) / 1024**2
    else:
        peak_mem_mib = 0.0

    perplexity = math.exp(out.loss.item())
    return peak_mem_mib, perplexity

In [9]:
def start():
    """Run the FP32-vs-8-bit comparison end to end, printing each result.

    Order of work:
      1. Load and save the FP32 model, then print its on-disk size.
      2. Load and save the 8-bit model, then print its on-disk size.
      3. Print latency / throughput for FP32 on CPU.
      4. If CUDA is available, print latency / throughput, peak GPU memory
         and perplexity for the 8-bit model; otherwise stop after step 3.
    """
    # 1) FP32 baseline (CPU): load, save, report size
    tokenizer, model_fp32 = load_and_save_fp32(MODEL_NAME, FP32_DIR)
    size_fp32 = get_dir_size_mib(FP32_DIR)
    print(f"FP32 size on disk: {size_fp32:.1f} MiB")

    # 2) 8-bit quantized model: load, save, report size
    model_8bit = load_and_save_8bit(MODEL_NAME, EIGHTBIT_DIR)
    size_8bit = get_dir_size_mib(EIGHTBIT_DIR)
    print(f"8-bit size on disk: {size_8bit:.1f} MiB\n")

    # 3) Latency & throughput for the CPU baseline
    cpu_device = torch.device("cpu")
    lat32, thr32 = measure_latency_and_throughput(model_fp32, tokenizer, PROMPT, cpu_device)
    print(f"FP32 (CPU)    → Latency: {lat32:.3f}s, Throughput: {thr32:.1f} tok/s")

    # Everything below needs a GPU; without one there is nothing left to do.
    if not torch.cuda.is_available():
        return
    gpu_device = torch.device("cuda")

    lat8, thr8 = measure_latency_and_throughput(model_8bit, tokenizer, PROMPT, gpu_device)
    print(f"8-bit (GPU)   → Latency: {lat8:.3f}s, Throughput: {thr8:.1f} tok/s\n")

    # 4) Memory & perplexity (8-bit on GPU)
    peak8, ppl8 = measure_peak_mem_and_perplexity(model_8bit, tokenizer, PERP_TEXT, gpu_device)
    print(f"8-bit Peak GPU mem: {peak8:.1f} MiB")
    print(f"8-bit Perplexity  : {ppl8:.3f}")

In [10]:
# Run the whole benchmark; `start()` prints every measurement itself.
start()

FP32 size on disk: 4196.4 MiB
8-bit size on disk: 1175.7 MiB



Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


FP32 (CPU)    → Latency: 9.270s, Throughput: 5.4 tok/s
8-bit (GPU)   → Latency: 3.974s, Throughput: 12.6 tok/s

8-bit Peak GPU mem: 1269.4 MiB
8-bit Perplexity  : 4.674
