# Import Required Libraries
This cell:
- Imports standard libraries for file handling, timing, and mathematical operations.
- Imports PyTorch for deep learning operations and pruning utilities.
- Imports Hugging Face Transformers for model and tokenizer handling.

In [1]:
import os
import time
import math
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define Model and Evaluation Settings
This cell:
- Specifies the model name to be used for pruning and evaluation.
- Defines the fraction of neurons to prune in the MLP layers (`MLP_PRUNE_FRAC`).
- Sets the maximum number of tokens to generate during inference.
- Provides sample texts for benchmarking latency, throughput, and perplexity.

In [2]:
MODEL_NAME     = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MLP_PRUNE_FRAC = 0.5       # fraction of inner neurons to prune
MAX_NEW_TOKENS = 50
PROMPT = (
    "Over the next decade, sustainable energy solutions will revolutionize "
    "global power grids, reducing carbon footprints and fostering resilient "
    "communities through innovative storage and distribution technologies."
)
PERP_TEXT = (
    "Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast "
    "to the natural intelligence displayed by humans and animals. Leading AI textbooks "
    "define the field as the study of intelligent agents: any system that perceives "
    "its environment and takes actions that maximize its chance of achieving its goals."
)


# Load Model and Tokenizer
This function:
- Loads the tokenizer and model using Hugging Face Transformers.
- Configures the model to use FP16 precision for faster inference.
- Moves the model to the specified device (CPU or GPU).
- Sets the model to evaluation mode to disable gradient computations.

In [3]:
def load_model_and_tokenizer(model_name: str, device: torch.device):
    """
    TODO:
      - Load AutoTokenizer.from_pretrained(model_name, use_fast=True)
      - Load AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
      - Move model to `device` and set to .eval()
      - Return tokenizer, model
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # load model in FP16 for faster inference
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16
    )
    # move to device and set to eval
    model = model.to(device)
    model.eval()
    return tokenizer, model

# Measure Baseline Performance
This function:
- Measures the baseline performance of the model before pruning.
- Evaluates:
  - **Latency**: Time taken to generate tokens for a given prompt.
  - **Throughput**: Tokens generated per second.
  - **Peak GPU Memory Usage**: Maximum memory used during inference.
  - **Perplexity**: A measure of how well the model predicts the given text.
- Prints the baseline metrics for comparison with the pruned model.

In [None]:
def measure_latency_and_throughput(model, tokenizer, prompt: str, device: torch.device, max_new_tokens=50, runs=3):
    """
    Measure latency and throughput for text generation.

    Args:
        model: The language model to evaluate.
        tokenizer: The tokenizer associated with the model.
        prompt (str): Input prompt for text generation.
        device (torch.device): Device to run the model on.
        max_new_tokens (int): Maximum number of new tokens to generate.
        runs (int): Number of runs for averaging metrics.

    Returns:
        tuple: Average latency (seconds) and throughput (tokens per second).
    """
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)

    # Warmup
    with torch.inference_mode():
        _ = model.generate(**inputs, max_new_tokens=8)

    latencies = []
    throughputs = []
    for _ in range(runs):
        if device.type == "cuda":
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        with torch.inference_mode():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
        if device.type == "cuda":
            torch.cuda.synchronize()
        t1 = time.perf_counter()

        gen_len = outputs.shape[1] - inputs["input_ids"].shape[1]
        latency = t1 - t0
        throughput = gen_len / latency if latency > 0 else float("nan")

        latencies.append(latency)
        throughputs.append(throughput)

    avg_latency = sum(latencies) / len(latencies)
    avg_throughput = sum(throughputs) / len(throughputs)
    return avg_latency, avg_throughput

In [4]:
def measure_baseline(model: nn.Module, tokenizer, prompt: str, perp_text: str, device: torch.device):
    """
    TODO:
      - Warm up & measure generation latency & throughput on `prompt`
      - Measure peak GPU memory & perplexity on `perp_text`
      - Print or return these baseline metrics
    """
    # 1) Measure latency & throughput
    latency, throughput = measure_latency_and_throughput(model, tokenizer, prompt, device)
    # 2) Measure peak GPU memory & perplexity
    peak_mem, perplexity = measure_peak_mem_and_perplexity(model, tokenizer, perp_text, device)

    # 3) Print baseline metrics
    print(f"[Baseline] latency   = {latency:.3f}s")
    print(f"[Baseline] throughput= {throughput:.1f} tok/s")
    print(f"[Baseline] peak GPU  = {peak_mem:.1f} MiB")
    print(f"[Baseline] perplexity= {perplexity:.3f}")

    # Return in case caller wants to use them programmatically
    return {
        "latency": latency,
        "throughput": throughput,
        "peak_gpu_mem": peak_mem,
        "perplexity": perplexity
    }

# Prune MLP Rows and Columns
This function:
- Prunes the MLP layers in the model by:
  - Zeroing out a fraction of rows in the `gate_proj` and `up_proj` layers.
  - Zeroing out the corresponding columns in the `down_proj` layer.
- Uses structured pruning to remove entire rows or columns.
- Ensures pruning is performed on the CPU to avoid GPU memory issues.
- Removes the pruning reparameterizations after applying the masks.

In [5]:
def prune_mlp_rows_and_cols(model: nn.Module, prune_frac: float):
    """
    TODO:
      - Move model to CPU
      - For each layer in model.model.layers:
          • Zero out `prune_frac` of rows in gate_proj and up_proj
          • Zero out corresponding `prune_frac` of columns in down_proj
      - Remove pruning reparameterizations
    """
    # 1) Ensure we prune on CPU to avoid GPU OOM
    model.cpu()
    torch.cuda.empty_cache()

    # 2) Iterate through each decoder layer’s MLP
    for layer in model.model.layers:
        gate = layer.mlp.gate_proj   # [inner, hidden]
        up   = layer.mlp.up_proj     # [inner, hidden]
        down = layer.mlp.down_proj   # [hidden, inner]

        # 2a) Zero out rows in gate_proj and up_proj
        for proj in (gate, up):
            prune.ln_structured(
                proj,
                name="weight",
                amount=prune_frac,
                n=1,
                dim=0,           # prune entire rows
            )
            prune.remove(proj, "weight")

        # 2b) Zero out corresponding columns in down_proj
        prune.ln_structured(
            down,
            name="weight",
            amount=prune_frac,
            n=1,
            dim=1,               # prune entire columns
        )
        prune.remove(down, "weight")

    # 3) Return the model (now with zeros in place)
    return model

# Rebuild MLP Blocks
This function:
- Reconstructs the pruned MLP layers with reduced dimensions.
- Identifies the neurons that were not pruned in the `gate_proj` layer.
- Creates new `nn.Linear` modules for `gate_proj`, `up_proj`, and `down_proj` with updated dimensions.
- Copies the weights and biases from the original layers to the new layers.
- Replaces the old modules with the new ones in the model.

In [6]:
def rebuild_mlp_blocks(model: nn.Module):
    """
    TODO:
      - For each layer in model.model.layers:
          1) Identify kept neuron indices in gate_proj
          2) Construct new nn.Linear modules for gate_proj, up_proj, down_proj
             with reduced dimensions
          3) Copy over weights and biases
          4) Replace the old modules on the model
    """
    for layer in model.model.layers:
        # original modules (still on CPU, dtype=original)
        old_gate = layer.mlp.gate_proj
        old_up   = layer.mlp.up_proj
        old_down = layer.mlp.down_proj

        # discover surviving rows in gate_proj
        Wg = old_gate.weight.data     # [inner_orig, hidden], dtype say torch.half
        keep_idx = (Wg.abs().sum(dim=1) != 0).nonzero(as_tuple=False).view(-1)
        inner_new = keep_idx.numel()
        hidden    = Wg.size(1)
        dtype     = Wg.dtype
        device    = Wg.device

        # helper to build a new Linear with the same dtype/device
        def make_linear(in_f, out_f, bias, old_weight, old_bias=None):
            nl = nn.Linear(in_f, out_f, bias=bias)
            # init in correct dtype & device
            nl.weight.data = old_weight.clone().to(device=device, dtype=dtype)
            if bias and old_bias is not None:
                nl.bias.data = old_bias.clone().to(device=device, dtype=dtype)
            return nl

        # rebuild gate_proj: hidden -> inner_new
        new_gate = make_linear(
            hidden, inner_new, 
            bias=(old_gate.bias is not None),
            old_weight=old_gate.weight.data[keep_idx],
            old_bias=old_gate.bias.data[keep_idx] if old_gate.bias is not None else None
        )

        # rebuild up_proj: hidden -> inner_new
        new_up = make_linear(
            hidden, inner_new,
            bias=(old_up.bias is not None),
            old_weight=old_up.weight.data[keep_idx],
            old_bias=old_up.bias.data[keep_idx] if old_up.bias is not None else None
        )

        # rebuild down_proj: inner_new -> hidden
        new_down = make_linear(
            inner_new, hidden,
            bias=(old_down.bias is not None),
            old_weight=old_down.weight.data[:, keep_idx],
            old_bias=old_down.bias.data if old_down.bias is not None else None
        )

        # swap in-place
        layer.mlp.gate_proj = new_gate
        layer.mlp.up_proj   = new_up
        layer.mlp.down_proj = new_down

    return model

# Measure Performance After Rebuilding
This function:
- Evaluates the performance of the rebuilt model after pruning and reconstruction.
- Measures:
  - **Latency**: Time taken to generate tokens for a given prompt.
  - **Throughput**: Tokens generated per second.
  - **Peak GPU Memory Usage**: Maximum memory used during inference.
  - **Perplexity**: A measure of how well the model predicts the given text.
- Prints the metrics for comparison with the baseline model.

In [7]:
def measure_rebuilt(model: nn.Module, tokenizer, prompt: str, perp_text: str, device: torch.device):
    """
    TODO:
      - Move rebuilt model to `device` & .eval()
      - Re-measure latency, throughput, peak memory, perplexity
      - Print or return these metrics
    """
    # 1) Move to device and set to eval
    model.to(device)
    model.eval()

    # 2) Measure latency & throughput
    latency, throughput = measure_latency_and_throughput(
        model, tokenizer, prompt, device
    )

    # 3) Measure peak GPU memory & perplexity
    peak_mem, perplexity = measure_peak_mem_and_perplexity(
        model, tokenizer, perp_text, device
    )

    # 4) Print results
    print(f"[Rebuilt] latency   = {latency:.3f}s")
    print(f"[Rebuilt] throughput= {throughput:.1f} tok/s")
    print(f"[Rebuilt] peak GPU  = {peak_mem:.1f} MiB")
    print(f"[Rebuilt] perplexity= {perplexity:.3f}")

    # 5) Return for further use if needed
    return {
        "latency": latency,
        "throughput": throughput,
        "peak_gpu_mem": peak_mem,
        "perplexity": perplexity
    }

# Save Model and Report Size
This function:
- Saves the pruned and rebuilt model to the specified output directory.
- Calculates the total size of the saved model files on disk.
- Prints the on-disk size of the model for comparison with the original model.

In [8]:
def save_and_report_size(model: nn.Module, output_dir: str):
    """
    TODO:
      - model.save_pretrained(output_dir)
      - Walk `output_dir` to sum file sizes (in MiB)
      - Print the on-disk size
    """
    # 1) Save
    model.save_pretrained(output_dir)

    # 2) Sum file sizes
    total_bytes = 0
    for root, _, files in os.walk(output_dir):
        for fname in files:
            total_bytes += os.path.getsize(os.path.join(root, fname))

    # 3) Convert to MiB and print
    size_mb = total_bytes / 1024**2
    print(f"[Rebuilt] on-disk size = {size_mb:.1f} MiB")

    return size_mb

# Main Execution Flow
This function:
- Loads the model and tokenizer.
- Measures the baseline performance of the model.
- Applies structured pruning to the MLP layers.
- Rebuilds the pruned MLP layers with reduced dimensions.
- Measures the performance of the rebuilt model.
- Saves the pruned and rebuilt model to disk and reports its size.

In [8]:
def start():
    device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer, model = load_model_and_tokenizer(MODEL_NAME, device)

    # Baseline
    measure_baseline(model, tokenizer, PROMPT, PERP_TEXT, device)

    # Prune on CPU
    prune_mlp_rows_and_cols(model, MLP_PRUNE_FRAC)

    # Rebuild smaller MLPs
    rebuild_mlp_blocks(model)

    # Re-benchmark rebuilt model
    measure_rebuilt(model, tokenizer, PROMPT, PERP_TEXT, device)

    # Save & report on-disk size
    save_and_report_size(model, "llama_pruned_rebuilt")

# Start the Pruning and Evaluation Process
This cell:
- Calls the `start` function to execute the entire pruning and evaluation pipeline.
- Outputs the baseline and post-pruning metrics, as well as the on-disk size of the pruned model.

In [12]:
start()

2025-08-31 00:54:22.776464: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-31 00:54:22.791036: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-31 00:54:22.809382: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-31 00:54:22.815104: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-31 00:54:22.828245: I tensorflow/core/platform/cpu_feature_guar

[Baseline] latency   = 1.164s
[Baseline] throughput= 43.0 tok/s
[Baseline] peak GPU  = 2129.6 MiB
[Baseline] perplexity= 4.557
[Rebuilt] latency   = 1.141s
[Rebuilt] throughput= 43.8 tok/s
[Rebuilt] peak GPU  = 1469.6 MiB
[Rebuilt] perplexity= 428383.216
[Rebuilt] on-disk size = 1372.2 MiB
