In [1]:
import os
import time
import math
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
# --- Experiment configuration -------------------------------------------
# Checkpoint identifier handed to the model/tokenizer loader.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Intended cap on the number of tokens generated per benchmark run.
MAX_NEW_TOKENS = 50

# Fraction of inner (intermediate) MLP neurons to prune.
MLP_PRUNE_FRAC = 0.5

# Prompt passed to the benchmark functions for the generation measurement.
PROMPT = (
    "Over the next decade, sustainable energy solutions "
    "will revolutionize global power grids, "
    "reducing carbon footprints and fostering resilient communities "
    "through innovative storage and distribution technologies."
)

# Reference passage passed to the benchmark functions for perplexity.
PERP_TEXT = (
    "Artificial intelligence (AI) is intelligence demonstrated by machines, "
    "in contrast to the natural intelligence displayed by humans and animals. "
    "Leading AI textbooks define the field as the study of intelligent agents: "
    "any system that perceives its environment and takes actions "
    "that maximize its chance of achieving its goals."
)


In [3]:
def load_model_and_tokenizer(model_name: str, device: torch.device):
    """Load a causal-LM checkpoint and its tokenizer, ready for inference.

    Args:
        model_name: Hugging Face Hub id (or local path) of the checkpoint.
        device: Device the model weights are moved to.

    Returns:
        A ``(tokenizer, model)`` tuple: the fast tokenizer and the model,
        loaded in float16, placed on ``device`` and switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # Half precision halves the weight footprint. NOTE(review): float16 on a
    # CPU-only machine can be slow depending on the installed torch build.
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

    model.to(device)
    model.eval()  # disable dropout etc. so benchmark runs are deterministic
    return tokenizer, model

In [4]:
def measure_baseline(model: nn.Module, tokenizer, prompt: str, perp_text: str, device: torch.device):
    """Benchmark generation speed, peak GPU memory and perplexity of ``model``.

    The model is expected to already live on ``device``. One untimed warm-up
    generation is run first so that one-off costs (CUDA context creation,
    kernel selection, cache allocation) do not pollute the timed run.

    Args:
        model: Causal LM exposing ``generate`` and a ``labels``-aware forward.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text used as the generation prompt.
        perp_text: Text whose language-modelling loss defines the perplexity.
        device: Device on which the inputs are placed.

    Returns:
        dict with keys ``latency_s`` (seconds for the timed generation),
        ``new_tokens`` (tokens produced), ``throughput_tok_s``,
        ``peak_mem_mib`` (``None`` when not running on CUDA) and
        ``perplexity``. The same values are printed.
    """
    device = torch.device(device)
    on_cuda = device.type == "cuda"

    def _wait_for_device():
        # CUDA kernels are launched asynchronously; without a sync the timer
        # would stop before the work has actually finished.
        if on_cuda:
            torch.cuda.synchronize(device)

    model.eval()

    prompt_batch = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = prompt_batch["input_ids"].shape[1]

    # Some tokenizers define no pad token; fall back to EOS so that
    # generate() does not have to guess one.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    gen_kwargs = dict(max_new_tokens=MAX_NEW_TOKENS, do_sample=False, pad_token_id=pad_id)

    with torch.no_grad():
        # Untimed warm-up run.
        model.generate(**prompt_batch, **gen_kwargs)
        _wait_for_device()

        # Start the peak-memory counter after warm-up so it reflects the
        # measured work only.
        if on_cuda:
            torch.cuda.reset_peak_memory_stats(device)

        started = time.perf_counter()
        generated = model.generate(**prompt_batch, **gen_kwargs)
        _wait_for_device()
        latency_s = time.perf_counter() - started

        # Perplexity = exp(mean next-token cross-entropy) on perp_text.
        text_batch = tokenizer(perp_text, return_tensors="pt").to(device)
        loss = model(**text_batch, labels=text_batch["input_ids"]).loss
        _wait_for_device()

    new_tokens = int(generated.shape[1] - prompt_len)
    throughput = new_tokens / latency_s if latency_s > 0 else float("inf")

    try:
        perplexity = math.exp(float(loss))
    except OverflowError:
        # A badly damaged model can produce a loss too large for exp().
        perplexity = float("inf")

    peak_mem_mib = torch.cuda.max_memory_allocated(device) / 2**20 if on_cuda else None

    print(f"Latency      : {latency_s:.3f} s for {new_tokens} new tokens")
    print(f"Throughput   : {throughput:.2f} tokens/s")
    if peak_mem_mib is None:
        print("Peak GPU mem : n/a (not running on CUDA)")
    else:
        print(f"Peak GPU mem : {peak_mem_mib:.1f} MiB")
    print(f"Perplexity   : {perplexity:.3f}")

    return {
        "latency_s": latency_s,
        "new_tokens": new_tokens,
        "throughput_tok_s": throughput,
        "peak_mem_mib": peak_mem_mib,
        "perplexity": perplexity,
    }

In [5]:
def prune_mlp_rows_and_cols(model: nn.Module, prune_frac: float):
    """Zero out the weakest inner MLP neurons of every decoder layer.

    The model is moved to the CPU. For each layer in ``model.model.layers``
    the inner neurons are ranked by the sum of the L2 norms of their
    ``gate_proj`` and ``up_proj`` weight rows; the lowest-ranked
    ``prune_frac`` share is pruned. The *same* neuron indices are zeroed as
    rows of ``gate_proj``/``up_proj`` and as columns of ``down_proj`` so the
    three projections stay consistent. Masks are applied through
    ``torch.nn.utils.prune`` and then made permanent, leaving plain ``weight``
    parameters behind. Tensor shapes are unchanged by this step.

    Args:
        model: Model whose layers expose ``mlp.gate_proj``, ``mlp.up_proj``
            and ``mlp.down_proj`` linear modules.
        prune_frac: Fraction of inner neurons to prune, in ``[0, 1)``.

    Returns:
        The same ``model`` object (modified in place), for convenience.

    Raises:
        ValueError: If ``prune_frac`` is outside ``[0, 1)``.
    """
    if not 0.0 <= prune_frac < 1.0:
        raise ValueError(f"prune_frac must be in [0, 1), got {prune_frac!r}")

    model.to("cpu")

    for layer in model.model.layers:
        mlp = layer.mlp
        gate, up, down = mlp.gate_proj, mlp.up_proj, mlp.down_proj

        n_inner = gate.out_features
        # Always leave at least one neuron alive, even after rounding.
        n_prune = min(int(round(prune_frac * n_inner)), n_inner - 1)
        if n_prune <= 0:
            continue

        with torch.no_grad():
            # Rank in float32: half-precision norms are imprecise and not
            # available on every CPU build.
            score = gate.weight.float().norm(p=2, dim=1) + up.weight.float().norm(p=2, dim=1)
            pruned_idx = torch.topk(score, n_prune, largest=False).indices

            keep = torch.ones(n_inner, dtype=torch.bool, device=gate.weight.device)
            keep[pruned_idx] = False

            # A pruned neuron must contribute nothing, so clear its bias too
            # when the projection has one.
            for proj in (gate, up):
                if proj.bias is not None:
                    proj.bias[pruned_idx] = 0

        row_mask = keep.unsqueeze(1)  # broadcasts over input features
        col_mask = keep.unsqueeze(0)  # broadcasts over output features

        for proj, mask in ((gate, row_mask), (up, row_mask), (down, col_mask)):
            prune.custom_from_mask(proj, name="weight", mask=mask.expand_as(proj.weight))
            # Fold the mask into the weight and drop weight_orig/weight_mask.
            prune.remove(proj, "weight")

    return model

In [6]:
def _shrink_linear(old: nn.Linear, kept: torch.Tensor, dim: int) -> nn.Linear:
    """Return a new ``nn.Linear`` holding only the ``kept`` indices of ``old`` along weight dim ``dim``."""
    weight = old.weight.detach().index_select(dim, kept)  # fresh copy, same dtype/device

    # Build on the meta device so no throw-away random initialisation is
    # allocated; the real tensors are attached right below.
    new = nn.Linear(weight.shape[1], weight.shape[0], bias=old.bias is not None, device="meta")
    new.weight = nn.Parameter(weight, requires_grad=old.weight.requires_grad)
    if old.bias is not None:
        bias = old.bias.detach()
        # Bias is indexed by output feature: slice it only when rows were cut.
        bias = bias.index_select(0, kept) if dim == 0 else bias.clone()
        new.bias = nn.Parameter(bias, requires_grad=old.bias.requires_grad)
    return new


def rebuild_mlp_blocks(model: nn.Module):
    """Replace zeroed-out MLP projections with physically smaller ones.

    For every layer in ``model.model.layers`` the surviving inner neurons are
    the ``gate_proj`` rows that still hold a non-zero weight (or bias). New
    ``gate_proj``/``up_proj`` modules keep only those rows and a new
    ``down_proj`` keeps only the matching columns; weights and biases are
    copied and the modules are swapped onto the layer's ``mlp``.

    Dropping a neuron whose gate pre-activation is always zero leaves the MLP
    output unchanged provided the activation maps 0 to 0 (true for
    SiLU/GELU/ReLU) — NOTE(review): confirm for other activations.

    When all layers end up with the same width, ``model.config.intermediate_size``
    (if present) is updated so the saved config matches the new tensor shapes.

    Args:
        model: Model whose layers expose ``mlp.gate_proj``, ``mlp.up_proj``
            and ``mlp.down_proj`` linear modules.

    Returns:
        The same ``model`` object (modified in place), for convenience.

    Raises:
        ValueError: If a layer has no surviving neuron at all.
    """
    widths = set()

    for idx, layer in enumerate(model.model.layers):
        mlp = layer.mlp
        gate, up, down = mlp.gate_proj, mlp.up_proj, mlp.down_proj

        alive = gate.weight.detach().ne(0).any(dim=1)
        if gate.bias is not None:
            # A non-zero bias still drives the neuron; keep it.
            alive = alive | gate.bias.detach().ne(0)
        kept = alive.nonzero(as_tuple=True)[0]
        n_kept = int(kept.numel())

        if n_kept == 0:
            raise ValueError(f"layer {idx}: no non-zero gate_proj neuron left to keep")

        if n_kept < gate.out_features:
            mlp.gate_proj = _shrink_linear(gate, kept, dim=0)
            mlp.up_proj = _shrink_linear(up, kept, dim=0)
            mlp.down_proj = _shrink_linear(down, kept, dim=1)
            # Some MLP implementations cache the width as a plain attribute.
            if hasattr(mlp, "intermediate_size"):
                mlp.intermediate_size = n_kept

        widths.add(n_kept)

    # A single scalar in the config can only describe uniform layers.
    config = getattr(model, "config", None)
    if config is not None and len(widths) == 1 and hasattr(config, "intermediate_size"):
        config.intermediate_size = widths.pop()

    return model

In [7]:
def measure_rebuilt(model: nn.Module, tokenizer, prompt: str, perp_text: str, device: torch.device):
    """Benchmark the rebuilt model with the same procedure as the baseline.

    The model is moved to ``device`` and put in eval mode, then the
    measurement is delegated to ``measure_baseline`` so that both sets of
    numbers come from exactly the same code path and stay comparable.

    Args:
        model: The pruned-and-rebuilt model (may currently live on the CPU).
        tokenizer: Tokenizer matching ``model``.
        prompt: Text used as the generation prompt.
        perp_text: Text used for the perplexity measurement.
        device: Device to benchmark on.

    Returns:
        Whatever ``measure_baseline`` returns for this model.
    """
    device = torch.device(device)

    if device.type == "cuda":
        # Hand cached blocks from earlier runs back to the driver before the
        # smaller model is uploaded.
        torch.cuda.empty_cache()

    model.to(device)
    model.eval()

    print("Rebuilt model:")
    return measure_baseline(model, tokenizer, prompt, perp_text, device)

In [8]:
def save_and_report_size(model: nn.Module, output_dir: str):
    """Save ``model`` with ``save_pretrained`` and report its on-disk size.

    Every regular file found under ``output_dir`` (recursively) is counted,
    so files left there by an earlier run are included in the total.

    Args:
        model: Object exposing ``save_pretrained(output_dir)``.
        output_dir: Target directory; created if it does not exist.

    Returns:
        Total size of the files under ``output_dir`` in MiB (float). The same
        figure is printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)

    total_bytes = sum(
        os.path.getsize(os.path.join(root, fname))
        for root, _dirs, fnames in os.walk(output_dir)
        for fname in fnames
    )
    size_mib = total_bytes / (1024 ** 2)

    print(f"On-disk size of '{output_dir}': {size_mib:.2f} MiB")
    return size_mib

In [None]:
def start():
    """Run the experiment end to end.

    Sequence: load model and tokenizer, benchmark the untouched model, prune
    the MLP neurons, rebuild the MLPs at reduced width, benchmark again, then
    save the result and report its size on disk.
    """
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        run_device = torch.device("cuda")
    else:
        run_device = torch.device("cpu")

    tok, lm = load_model_and_tokenizer(MODEL_NAME, run_device)

    # 1) Reference numbers for the unmodified model.
    measure_baseline(lm, tok, PROMPT, PERP_TEXT, run_device)

    # 2) Zero out the selected share of inner MLP neurons.
    prune_mlp_rows_and_cols(lm, MLP_PRUNE_FRAC)

    # 3) Swap the zeroed projections for smaller ones.
    rebuild_mlp_blocks(lm)

    # 4) Same benchmark on the shrunken model.
    measure_rebuilt(lm, tok, PROMPT, PERP_TEXT, run_device)

    # 5) Persist the model and print how much disk space it takes.
    save_and_report_size(lm, "llama_pruned_rebuilt")

In [None]:
# Entry point of the notebook: runs the whole pipeline defined in start()
# (load -> baseline benchmark -> prune -> rebuild -> re-benchmark -> save).
start()