# Import Required Libraries
This cell:
- Imports standard libraries for file handling, timing, and mathematical operations.
- Imports PyTorch for deep learning operations and pruning utilities.
- Imports Hugging Face Transformers for model and tokenizer handling.

In [1]:
import os
import time
import math
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define Model and Evaluation Settings
This cell:
- Specifies the model name to be used for pruning and evaluation.
- Defines the fraction of neurons to prune in the MLP layers (`MLP_PRUNE_FRAC`).
- Sets the maximum number of tokens to generate during inference.
- Provides sample texts for benchmarking latency, throughput, and perplexity.

In [2]:
# Checkpoint that is loaded, pruned and benchmarked.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Fraction of inner (intermediate) MLP neurons to prune in every layer.
MLP_PRUNE_FRAC = 0.5

# Upper bound on the number of tokens generated per benchmark run.
MAX_NEW_TOKENS = 50

# Prompt used for the latency / throughput measurement.
PROMPT = (
    "Over the next decade, sustainable energy solutions "
    "will revolutionize global power grids, "
    "reducing carbon footprints and fostering resilient communities "
    "through innovative storage and distribution technologies."
)

# Reference passage used for the perplexity measurement.
PERP_TEXT = (
    "Artificial intelligence (AI) is intelligence demonstrated by machines, "
    "in contrast to the natural intelligence displayed by humans and animals. "
    "Leading AI textbooks define the field as the study of intelligent agents: "
    "any system that perceives its environment and takes actions "
    "that maximize its chance of achieving its goals."
)


# Load Model and Tokenizer
This function:
- Loads the tokenizer and model using Hugging Face Transformers.
- Configures the model to use FP16 precision for faster inference.
- Moves the model to the specified device (CPU or GPU).
- Sets the model to evaluation mode, which switches off training-only behavior such as dropout (gradient tracking is disabled separately with `torch.no_grad()`).
- Returns the loaded tokenizer and model.

In [3]:
def load_model_and_tokenizer(model_name: str, device: torch.device):
    """Load a causal-LM checkpoint and its tokenizer, ready for inference.

    Args:
        model_name: Hugging Face hub id (or local path) of the checkpoint.
        device: Device the model weights are moved to.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is loaded in FP16, lives on
        ``device`` and is in evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
    )
    model.to(device)
    # eval() turns off training-only behavior (e.g. dropout); callers still
    # need torch.no_grad() to skip gradient tracking.
    model.eval()
    return tokenizer, model

# Measure Baseline Performance
This function:
- Measures the baseline performance of the model before pruning.
- Evaluates:
  - **Latency**: Time taken to generate tokens for a given prompt.
  - **Throughput**: Tokens generated per second.
  - **Peak GPU Memory Usage**: Maximum memory used during inference.
  - **Perplexity**: A measure of how well the model predicts the given text.
- Prints the baseline metrics for comparison with the pruned model.

In [4]:
def measure_baseline(model: nn.Module, tokenizer, prompt: str, perp_text: str, device: torch.device):
    """Benchmark the model before pruning and print the results.

    One untimed warm-up generation is run first so one-off start-up costs do
    not pollute the timed run. On CUDA the device is synchronized around the
    timed region because kernels are launched asynchronously.

    Args:
        model: Causal LM exposing ``generate`` and returning ``.loss`` when
            called with ``labels``; expected to already live on ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text used for the generation latency / throughput run.
        perp_text: Text whose perplexity is measured.
        device: Device the input tensors are placed on.

    Returns:
        dict with keys ``latency_s``, ``tokens_per_s``, ``peak_mem_mib``
        (``None`` when not running on CUDA) and ``perplexity``.
    """
    device = torch.device(device)
    on_cuda = device.type == "cuda"
    model.eval()

    prompt_inputs = tokenizer(prompt, return_tensors="pt").to(device)
    perp_inputs = tokenizer(perp_text, return_tensors="pt").to(device)
    gen_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": False,  # greedy decoding keeps runs comparable
        "pad_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        # Warm-up run (not timed).
        model.generate(**prompt_inputs, **gen_kwargs)
        if on_cuda:
            torch.cuda.synchronize(device)
            # Start peak-memory accounting from a clean slate.
            torch.cuda.reset_peak_memory_stats(device)

        started = time.perf_counter()
        output_ids = model.generate(**prompt_inputs, **gen_kwargs)
        if on_cuda:
            torch.cuda.synchronize(device)
        latency_s = time.perf_counter() - started

        loss = model(**perp_inputs, labels=perp_inputs["input_ids"]).loss

    new_tokens = output_ids.shape[-1] - prompt_inputs["input_ids"].shape[-1]
    tokens_per_s = new_tokens / latency_s if latency_s > 0 else float("inf")
    # torch.exp in FP32 yields inf for a huge loss instead of raising
    # OverflowError the way math.exp would.
    perplexity = torch.exp(loss.float()).item()
    peak_mem_mib = (
        torch.cuda.max_memory_allocated(device) / (1024 ** 2) if on_cuda else None
    )

    peak_txt = f"{peak_mem_mib:.1f} MiB" if peak_mem_mib is not None else "n/a (CPU)"
    print("=== Baseline ===")
    print(f"Latency:         {latency_s:.3f} s ({new_tokens} new tokens)")
    print(f"Throughput:      {tokens_per_s:.2f} tokens/s")
    print(f"Peak GPU memory: {peak_txt}")
    print(f"Perplexity:      {perplexity:.3f}")

    return {
        "latency_s": latency_s,
        "tokens_per_s": tokens_per_s,
        "peak_mem_mib": peak_mem_mib,
        "perplexity": perplexity,
    }

# Prune MLP Rows and Columns
This function:
- Prunes the MLP layers in the model by:
  - Zeroing out a fraction of rows in the `gate_proj` and `up_proj` layers.
  - Zeroing out the corresponding columns in the `down_proj` layer.
- Uses structured pruning: entire rows or columns are masked to zero (rather than individual weights), so they can be physically removed in the rebuild step.
- Ensures pruning is performed on the CPU to avoid GPU memory issues.
- Removes the pruning reparameterizations after applying the masks.

In [5]:
def prune_mlp_rows_and_cols(model: nn.Module, prune_frac: float):
    """Zero out the least important inner MLP neurons of every decoder layer.

    For each layer in ``model.model.layers`` a single set of neurons is chosen
    and the *same* set is masked in all three projections: the rows of
    ``gate_proj`` and ``up_proj`` and the matching columns of ``down_proj``.
    Selecting the set once (rather than ranking each matrix independently)
    is what makes the zeroed rows and columns correspond to one another.

    Neuron importance is the sum of the L2 norms of its ``gate_proj`` row,
    ``up_proj`` row and ``down_proj`` column.

    The model is moved to the CPU and modified in place. Masks are applied
    through ``torch.nn.utils.prune`` and made permanent immediately, so no
    ``weight_orig`` / ``weight_mask`` entries are left behind.

    Args:
        model: Model exposing ``model.layers[i].mlp`` with ``gate_proj``,
            ``up_proj`` and ``down_proj`` linear layers.
        prune_frac: Fraction of inner neurons to zero, in ``[0, 1)``.

    Raises:
        ValueError: If ``prune_frac`` is outside ``[0, 1)``.
    """
    if not 0.0 <= prune_frac < 1.0:
        raise ValueError(f"prune_frac must be in [0, 1), got {prune_frac!r}")

    model.to("cpu")

    for layer in model.model.layers:
        mlp = layer.mlp
        gate, up, down = mlp.gate_proj, mlp.up_proj, mlp.down_proj

        inner_dim = gate.weight.shape[0]
        n_prune = int(prune_frac * inner_dim)
        if n_prune == 0:
            continue

        with torch.no_grad():
            # Norms are taken in FP32 so half-precision weights are ranked
            # without overflow or missing CPU half kernels.
            importance = (
                gate.weight.float().norm(dim=1)
                + up.weight.float().norm(dim=1)
                + down.weight.float().norm(dim=0)
            )
            drop = torch.topk(importance, n_prune, largest=False).indices
            keep = torch.ones(inner_dim, dtype=torch.bool)
            keep[drop] = False

            # A pruned neuron must be fully silent, so clear its bias as well.
            for proj in (gate, up):
                if proj.bias is not None:
                    proj.bias[drop] = 0

        row_mask = keep.unsqueeze(1).expand_as(gate.weight)
        col_mask = keep.unsqueeze(0).expand_as(down.weight)
        for proj, mask in ((gate, row_mask), (up, row_mask), (down, col_mask)):
            prune.custom_from_mask(proj, name="weight", mask=mask)
            # Fold the mask into the weight and drop the reparameterization.
            prune.remove(proj, "weight")

# Rebuild MLP Blocks
This function:
- Reconstructs the pruned MLP layers with reduced dimensions.
- Identifies the neurons that were not pruned in the `gate_proj` layer.
- Creates new `nn.Linear` modules for `gate_proj`, `up_proj`, and `down_proj` with updated dimensions.
- Copies the weights and biases from the original layers to the new layers.
- Replaces the old modules with the new ones in the model.

In [6]:
def _slice_linear(linear: nn.Linear, keep: torch.Tensor, dim: int) -> nn.Linear:
    """Return a new ``nn.Linear`` holding only indices ``keep`` along weight axis ``dim``.

    ``dim=0`` keeps output neurons (weight rows and matching bias entries);
    ``dim=1`` keeps input features (weight columns, bias unchanged).
    """
    keep = keep.to(linear.weight.device)
    weight = linear.weight.detach().index_select(dim, keep)
    out_features, in_features = weight.shape
    has_bias = linear.bias is not None

    sliced = nn.Linear(in_features, out_features, bias=has_bias)
    # Replace the freshly initialised parameters wholesale so dtype and
    # device follow the source layer.
    sliced.weight = nn.Parameter(weight, requires_grad=linear.weight.requires_grad)
    if has_bias:
        bias = linear.bias.detach()
        bias = bias.index_select(0, keep) if dim == 0 else bias.clone()
        sliced.bias = nn.Parameter(bias, requires_grad=linear.bias.requires_grad)
    return sliced


def rebuild_mlp_blocks(model: nn.Module):
    """Physically shrink every MLP block to its surviving (non-zero) neurons.

    For each layer in ``model.model.layers`` the neurons whose ``gate_proj``
    row (and bias entry, if any) is not entirely zero are kept. New, smaller
    ``nn.Linear`` modules are built for ``gate_proj`` / ``up_proj`` (rows
    kept) and ``down_proj`` (columns kept) and swapped into the model in
    place. Layers with nothing to remove are left untouched.

    If the model has a ``config`` with an ``intermediate_size`` attribute and
    all layers end up with the same width, that attribute is updated so the
    config matches the new weight shapes.

    Args:
        model: Model exposing ``model.layers[i].mlp`` with ``gate_proj``,
            ``up_proj`` and ``down_proj`` linear layers.

    Raises:
        ValueError: If a layer has no surviving neurons at all.
    """
    widths = set()

    for index, layer in enumerate(model.model.layers):
        mlp = layer.mlp
        gate, up, down = mlp.gate_proj, mlp.up_proj, mlp.down_proj

        with torch.no_grad():
            alive = gate.weight.ne(0).any(dim=1)
            if gate.bias is not None:
                alive |= gate.bias.ne(0)
            keep = alive.nonzero(as_tuple=True)[0]

        n_keep = keep.numel()
        if n_keep == 0:
            raise ValueError(
                f"layer {index}: gate_proj has no surviving neurons; "
                "refusing to build an empty MLP"
            )
        widths.add(n_keep)
        if n_keep == gate.weight.shape[0]:
            continue  # nothing was pruned in this layer

        mlp.gate_proj = _slice_linear(gate, keep, dim=0)
        mlp.up_proj = _slice_linear(up, keep, dim=0)
        mlp.down_proj = _slice_linear(down, keep, dim=1)
        if hasattr(mlp, "intermediate_size"):
            mlp.intermediate_size = n_keep

    config = getattr(model, "config", None)
    if config is not None and hasattr(config, "intermediate_size") and len(widths) == 1:
        config.intermediate_size = widths.pop()

# Measure Performance After Rebuilding
This function:
- Evaluates the performance of the rebuilt model after pruning and reconstruction.
- Measures:
  - **Latency**: Time taken to generate tokens for a given prompt.
  - **Throughput**: Tokens generated per second.
  - **Peak GPU Memory Usage**: Maximum memory used during inference.
  - **Perplexity**: A measure of how well the model predicts the given text.
- Prints the metrics for comparison with the baseline model.

In [7]:
def measure_rebuilt(model: nn.Module, tokenizer, prompt: str, perp_text: str, device: torch.device):
    """Benchmark the pruned-and-rebuilt model and print the results.

    The model is first moved to ``device`` and put in evaluation mode. One
    untimed warm-up generation precedes the timed run, and on CUDA the device
    is synchronized around the timed region because kernels are launched
    asynchronously.

    Args:
        model: Causal LM exposing ``generate`` and returning ``.loss`` when
            called with ``labels``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text used for the generation latency / throughput run.
        perp_text: Text whose perplexity is measured.
        device: Device to run the benchmark on.

    Returns:
        dict with keys ``latency_s``, ``tokens_per_s``, ``peak_mem_mib``
        (``None`` when not running on CUDA) and ``perplexity``.
    """
    device = torch.device(device)
    on_cuda = device.type == "cuda"
    model.to(device)
    model.eval()

    prompt_inputs = tokenizer(prompt, return_tensors="pt").to(device)
    perp_inputs = tokenizer(perp_text, return_tensors="pt").to(device)
    gen_kwargs = {
        "max_new_tokens": MAX_NEW_TOKENS,
        "do_sample": False,  # greedy decoding keeps runs comparable
        "pad_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        # Warm-up run (not timed).
        model.generate(**prompt_inputs, **gen_kwargs)
        if on_cuda:
            torch.cuda.synchronize(device)
            # Start peak-memory accounting from a clean slate.
            torch.cuda.reset_peak_memory_stats(device)

        started = time.perf_counter()
        output_ids = model.generate(**prompt_inputs, **gen_kwargs)
        if on_cuda:
            torch.cuda.synchronize(device)
        latency_s = time.perf_counter() - started

        loss = model(**perp_inputs, labels=perp_inputs["input_ids"]).loss

    new_tokens = output_ids.shape[-1] - prompt_inputs["input_ids"].shape[-1]
    tokens_per_s = new_tokens / latency_s if latency_s > 0 else float("inf")
    # A heavily pruned model can have a very large loss; torch.exp in FP32
    # yields inf instead of raising OverflowError the way math.exp would.
    perplexity = torch.exp(loss.float()).item()
    peak_mem_mib = (
        torch.cuda.max_memory_allocated(device) / (1024 ** 2) if on_cuda else None
    )

    peak_txt = f"{peak_mem_mib:.1f} MiB" if peak_mem_mib is not None else "n/a (CPU)"
    print("=== Pruned + rebuilt ===")
    print(f"Latency:         {latency_s:.3f} s ({new_tokens} new tokens)")
    print(f"Throughput:      {tokens_per_s:.2f} tokens/s")
    print(f"Peak GPU memory: {peak_txt}")
    print(f"Perplexity:      {perplexity:.3f}")

    return {
        "latency_s": latency_s,
        "tokens_per_s": tokens_per_s,
        "peak_mem_mib": peak_mem_mib,
        "perplexity": perplexity,
    }

# Save Model and Report Size
This function:
- Saves the pruned and rebuilt model to the specified output directory.
- Calculates the total size of the saved model files on disk.
- Prints the on-disk size of the model for comparison with the original model.

In [8]:
def save_and_report_size(model: nn.Module, output_dir: str):
    """Save the model and report how much disk space the checkpoint uses.

    Args:
        model: Object exposing ``save_pretrained(output_dir)``.
        output_dir: Directory the checkpoint is written to.

    Returns:
        Total size of all files under ``output_dir`` (searched recursively),
        in MiB.
    """
    model.save_pretrained(output_dir)

    total_bytes = sum(
        os.path.getsize(os.path.join(root, filename))
        for root, _dirs, filenames in os.walk(output_dir)
        for filename in filenames
    )
    size_mib = total_bytes / (1024 ** 2)

    print(f"On-disk size of '{output_dir}': {size_mib:.2f} MiB")
    return size_mib

# Main Execution Flow
This function:
- Loads the model and tokenizer.
- Measures the baseline performance of the model.
- Applies structured pruning to the MLP layers.
- Rebuilds the pruned MLP layers with reduced dimensions.
- Measures the performance of the rebuilt model.
- Saves the pruned and rebuilt model to disk and reports its size.

In [None]:
def start():
    """Run the experiment end to end.

    Order of operations: load the model, benchmark it, prune the MLP layers,
    rebuild them at reduced width, benchmark again, then save the result and
    report its size on disk.
    """
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    target = torch.device(backend)
    tok, net = load_model_and_tokenizer(MODEL_NAME, target)

    # Both benchmark passes receive exactly the same inputs.
    bench_args = (tok, PROMPT, PERP_TEXT, target)

    # Reference numbers for the untouched model.
    measure_baseline(net, *bench_args)

    # Prune on CPU, then swap in the smaller MLP modules.
    prune_mlp_rows_and_cols(net, MLP_PRUNE_FRAC)
    rebuild_mlp_blocks(net)

    # Same measurements on the rebuilt model.
    measure_rebuilt(net, *bench_args)

    # Persist the result and report its on-disk size.
    save_and_report_size(net, "llama_pruned_rebuilt")

# Start the Pruning and Evaluation Process
This cell:
- Calls the `start` function to execute the entire pruning and evaluation pipeline.
- Outputs the baseline and post-pruning metrics, as well as the on-disk size of the pruned model.

In [None]:
# Kick off the end-to-end pipeline defined in start(): baseline benchmark,
# pruning, rebuild, re-benchmark, and saving the rebuilt model.
start()