In [1]:
import os
import time
import math
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
MODEL_NAME     = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MLP_PRUNE_FRAC = 0.5       # fraction of inner neurons to prune
MAX_NEW_TOKENS = 50
PROMPT = (
    "Over the next decade, sustainable energy solutions will revolutionize "
    "global power grids, reducing carbon footprints and fostering resilient "
    "communities through innovative storage and distribution technologies."
)
PERP_TEXT = (
    "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast "
    "to the natural intelligence displayed by humans and animals. Leading AI textbooks "
    "define the field as the study of intelligent agents: any system that perceives "
    "its environment and takes actions that maximize its chance of achieving its goals."
)


In [3]:
def load_model_and_tokenizer(model_name: str, device: torch.device):
    """
    TODO:
      - Load AutoTokenizer.from_pretrained(model_name, use_fast=True)
      - Load AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
      - Move model to `device` and set to .eval()
      - Return tokenizer, model
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # load model in FP16 for faster inference
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16
    )
    # move to device and set to eval
    model = model.to(device)
    model.eval()
    return tokenizer, model

In [4]:
def measure_baseline(model: nn.Module, tokenizer, prompt: str, perp_text: str, device: torch.device):
    """
    TODO:
      - Warm up & measure generation latency & throughput on `prompt`
      - Measure peak GPU memory & perplexity on `perp_text`
      - Print or return these baseline metrics
    """
    # 1) Measure latency & throughput
    latency, throughput = measure_latency_and_throughput(model, tokenizer, prompt, device)
    # 2) Measure peak GPU memory & perplexity
    peak_mem, perplexity = measure_peak_mem_and_perplexity(model, tokenizer, perp_text, device)

    # 3) Print baseline metrics
    print(f"[Baseline] latency   = {latency:.3f}s")
    print(f"[Baseline] throughput= {throughput:.1f} tok/s")
    print(f"[Baseline] peak GPU  = {peak_mem:.1f} MiB")
    print(f"[Baseline] perplexity= {perplexity:.3f}")

    # Return in case caller wants to use them programmatically
    return {
        "latency": latency,
        "throughput": throughput,
        "peak_gpu_mem": peak_mem,
        "perplexity": perplexity
    }

In [5]:
def prune_mlp_rows_and_cols(model: nn.Module, prune_frac: float):
    """
    TODO:
      - Move model to CPU
      - For each layer in model.model.layers:
          • Zero out `prune_frac` of rows in gate_proj and up_proj
          • Zero out corresponding `prune_frac` of columns in down_proj
      - Remove pruning reparameterizations
    """
    # 1) Ensure we prune on CPU to avoid GPU OOM
    model.cpu()
    torch.cuda.empty_cache()

    # 2) Iterate through each decoder layer’s MLP
    for layer in model.model.layers:
        gate = layer.mlp.gate_proj   # [inner, hidden]
        up   = layer.mlp.up_proj     # [inner, hidden]
        down = layer.mlp.down_proj   # [hidden, inner]

        # 2a) Zero out rows in gate_proj and up_proj
        for proj in (gate, up):
            prune.ln_structured(
                proj,
                name="weight",
                amount=prune_frac,
                n=1,
                dim=0,           # prune entire rows
            )
            prune.remove(proj, "weight")

        # 2b) Zero out corresponding columns in down_proj
        prune.ln_structured(
            down,
            name="weight",
            amount=prune_frac,
            n=1,
            dim=1,               # prune entire columns
        )
        prune.remove(down, "weight")

    # 3) Return the model (now with zeros in place)
    return model

In [6]:
def rebuild_mlp_blocks(model: nn.Module):
    """
    TODO:
      - For each layer in model.model.layers:
          1) Identify kept neuron indices in gate_proj
          2) Construct new nn.Linear modules for gate_proj, up_proj, down_proj
             with reduced dimensions
          3) Copy over weights and biases
          4) Replace the old modules on the model
    """
    for layer in model.model.layers:
        # original modules (still on CPU, dtype=original)
        old_gate = layer.mlp.gate_proj
        old_up   = layer.mlp.up_proj
        old_down = layer.mlp.down_proj

        # discover surviving rows in gate_proj
        Wg = old_gate.weight.data     # [inner_orig, hidden], dtype say torch.half
        keep_idx = (Wg.abs().sum(dim=1) != 0).nonzero(as_tuple=False).view(-1)
        inner_new = keep_idx.numel()
        hidden    = Wg.size(1)
        dtype     = Wg.dtype
        device    = Wg.device

        # helper to build a new Linear with the same dtype/device
        def make_linear(in_f, out_f, bias, old_weight, old_bias=None):
            nl = nn.Linear(in_f, out_f, bias=bias)
            # init in correct dtype & device
            nl.weight.data = old_weight.clone().to(device=device, dtype=dtype)
            if bias and old_bias is not None:
                nl.bias.data = old_bias.clone().to(device=device, dtype=dtype)
            return nl

        # rebuild gate_proj: hidden -> inner_new
        new_gate = make_linear(
            hidden, inner_new, 
            bias=(old_gate.bias is not None),
            old_weight=old_gate.weight.data[keep_idx],
            old_bias=old_gate.bias.data[keep_idx] if old_gate.bias is not None else None
        )

        # rebuild up_proj: hidden -> inner_new
        new_up = make_linear(
            hidden, inner_new,
            bias=(old_up.bias is not None),
            old_weight=old_up.weight.data[keep_idx],
            old_bias=old_up.bias.data[keep_idx] if old_up.bias is not None else None
        )

        # rebuild down_proj: inner_new -> hidden
        new_down = make_linear(
            inner_new, hidden,
            bias=(old_down.bias is not None),
            old_weight=old_down.weight.data[:, keep_idx],
            old_bias=old_down.bias.data if old_down.bias is not None else None
        )

        # swap in-place
        layer.mlp.gate_proj = new_gate
        layer.mlp.up_proj   = new_up
        layer.mlp.down_proj = new_down

    return model

In [7]:
def measure_rebuilt(model: nn.Module, tokenizer, prompt: str, perp_text: str, device: torch.device):
    """
    TODO:
      - Move rebuilt model to `device` & .eval()
      - Re-measure latency, throughput, peak memory, perplexity
      - Print or return these metrics
    """
    # 1) Move to device and set to eval
    model.to(device)
    model.eval()

    # 2) Measure latency & throughput
    latency, throughput = measure_latency_and_throughput(
        model, tokenizer, prompt, device
    )

    # 3) Measure peak GPU memory & perplexity
    peak_mem, perplexity = measure_peak_mem_and_perplexity(
        model, tokenizer, perp_text, device
    )

    # 4) Print results
    print(f"[Rebuilt] latency   = {latency:.3f}s")
    print(f"[Rebuilt] throughput= {throughput:.1f} tok/s")
    print(f"[Rebuilt] peak GPU  = {peak_mem:.1f} MiB")
    print(f"[Rebuilt] perplexity= {perplexity:.3f}")

    # 5) Return for further use if needed
    return {
        "latency": latency,
        "throughput": throughput,
        "peak_gpu_mem": peak_mem,
        "perplexity": perplexity
    }

In [8]:
def save_and_report_size(model: nn.Module, output_dir: str):
    """
    TODO:
      - model.save_pretrained(output_dir)
      - Walk `output_dir` to sum file sizes (in MiB)
      - Print the on-disk size
    """
    # 1) Save
    model.save_pretrained(output_dir)

    # 2) Sum file sizes
    total_bytes = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_bytes += os.path.getsize(os.path.join(root, fname))

    # 3) Convert to MiB and print
    size_mb = total_bytes / 1024**2
    print(f"[Rebuilt] on-disk size = {size_mb:.1f} MiB")

    return size_mb

In [9]:
def measure_peak_mem_and_perplexity(model, tokenizer, text: str, device):
    """
    TODO:
      - Tokenize `text` to tensors on `device` with labels=input_ids
      - Reset peak GPU mem stats (if CUDA)
      - Run model(**inputs) under torch.no_grad()
      - Sync CUDA, read max_memory_allocated → MiB
      - Compute loss → perplexity = exp(loss)
      - Return (peak_mem_mib, perplexity)
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)

    torch.cuda.reset_peak_memory_stats(device)
    with torch.no_grad():
        out = model(**inputs, labels=inputs["input_ids"])
    torch.cuda.synchronize()
    peak_mem_mib = torch.cuda.max_memory_allocated(device) / 1024**2
    perplexity = math.exp(out.loss.item())
    return peak_mem_mib, perplexity

In [10]:
def measure_latency_and_throughput(model, tokenizer, prompt: str, device):
    """
    TODO:
      - Tokenize `prompt` to tensors on `device`
      - Warm up with a short generate
      - Time a full generate(max_new_tokens=MAX_NEW_TOKENS)
      - Return (latency_s, tokens_per_second)
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_len = inputs["input_ids"].size(1)

    # warm-up
    _ = model.generate(**inputs, max_new_tokens=5)
    if device.type == "cuda":
        torch.cuda.synchronize()

    # timed generation
    start = time.time()
    outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    if device.type == "cuda":
        torch.cuda.synchronize()
    end = time.time()

    latency = end - start
    gen_tokens = outputs.size(1) - input_len
    return latency, gen_tokens / latency

In [11]:
def start():
    device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = load_model_and_tokenizer(MODEL_NAME, device)

    # Baseline
    measure_baseline(model, tokenizer, PROMPT, PERP_TEXT, device)

    # Prune on CPU
    prune_mlp_rows_and_cols(model, MLP_PRUNE_FRAC)

    # Rebuild smaller MLPs
    rebuild_mlp_blocks(model)

    # Re-benchmark rebuilt model
    measure_rebuilt(model, tokenizer, PROMPT, PERP_TEXT, device)

    # Save & report on-disk size
    save_and_report_size(model, "llama_pruned_rebuilt")

In [12]:
start()

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


[Baseline] latency   = 1.071s
[Baseline] throughput= 46.7 tok/s
[Baseline] peak GPU  = 2125.4 MiB
[Baseline] perplexity= 4.557
[Rebuilt] latency   = 1.075s
[Rebuilt] throughput= 46.5 tok/s
[Rebuilt] peak GPU  = 1465.4 MiB
[Rebuilt] perplexity= 428383.216
[Rebuilt] on-disk size = 1372.2 MiB
