In [1]:
!pip install torch torchvision torchaudio transformers datasets accelerate sentencepiece protobuf lm-eval numpy pandas matplotlib seaborn tqdm huggingface-hub



In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Tue_Oct_29_23:50:19_PDT_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0


In [3]:
# ==========================================
# WANDA PRUNING EXPERIMENT
# Paper: "A Simple and Effective Pruning Approach for Large Language Models"
# Models: LLaMA-2-7B, LLaMA-3-8B
# Assignment: Demonstrate Wanda > Magnitude Pruning
# By: Zineb Abercha & Omar Alfarouq Bouhadi
# ==========================================

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import gc
import time
import os
import math
from tqdm.auto import tqdm
import json
import warnings
from collections import defaultdict

warnings.filterwarnings('ignore')

from huggingface_hub import login
HF_TOKEN = "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"  
login(token=HF_TOKEN)
print("✓ Logged in to HuggingFace")

# Import LM Evaluation Harness
try:
    from lm_eval import evaluator
    from lm_eval.models.huggingface import HFLM
    print("✓ lm-evaluation-harness imported successfully")
except ImportError:
    print("ERROR: lm-evaluation-harness not installed! Run: pip install lm-eval")

# Constants
DATASET_ID = "wikitext"
DATASET_CONFIG = "wikitext-2-raw-v1"
NSAMPLES = 64          
SEQ_LEN = 4096 

# Experiment Configuration (aligned with Wanda paper)
SPARSITY_RATIOS = [0.3, 0.5, 0.7] 
ZERO_SHOT_TASKS = ["piqa", "hellaswag", "arc_easy", "boolq", "rte"]
SEED = 0
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

torch.manual_seed(SEED)
np.random.seed(SEED)

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print(f"Device: {DEVICE}")

# --- UTILITIES ---

def get_wikitext2(tokenizer, nsamples, seq_len):
    print(f"\n[Data] Loading {DATASET_ID}...")
    traindata = load_dataset(DATASET_ID, DATASET_CONFIG, split='train')
    testdata = load_dataset(DATASET_ID, DATASET_CONFIG, split='test')
    trainenc = tokenizer("\n\n".join(traindata['text']), return_tensors='pt')
    testenc = tokenizer("\n\n".join(testdata['text']), return_tensors='pt')
    import random
    random.seed(SEED)
    trainloader = []
    print(f"[Data] Selecting {nsamples} calibration sequences...")
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seq_len - 1)
        j = i + seq_len
        inp = trainenc.input_ids[:, i:j]
        tar = inp.clone()
        tar[:, :-1] = -100
        trainloader.append((inp, tar))
    return trainloader, testenc

def find_layers(module, layers=[nn.Linear], name=''):
    """Recursively find linear layers (aligned with official Wanda repo)."""
    if type(module) in layers: 
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
    return res

def check_sparsity(model):
    """Verify actual sparsity matches target (from official Wanda repo)."""
    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers
    count = 0 
    total_params = 0
    layer_sparsities = []
    
    for i in range(len(layers)):
        layer = layers[i]
        subset = find_layers(layer)
        sub_count = 0
        sub_params = 0
        for name in subset:
            W = subset[name].weight.data
            count += (W==0).sum().item()
            total_params += W.numel()
            sub_count += (W==0).sum().item()
            sub_params += W.numel()
        layer_sparsities.append(float(sub_count)/sub_params if sub_params > 0 else 0)
    
    model.config.use_cache = use_cache
    return float(count)/total_params, layer_sparsities

# --- PRUNER CLASS ---

class WandaPruner:
    def __init__(self, model_id):
        self.model_id = model_id
        self.model_name = model_id.split('/')[-1]
        self.is_llama3 = "Llama-3" in model_id or "llama-3" in model_id
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        if self.tokenizer.pad_token_id is None:
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        self.model = None
        self.original_state_dict = None
        self.pruning_history = []
        self.timing_stats = defaultdict(float)
    
    def load_model(self):
        print(f"\n[Model] Loading {self.model_id}...")
        start = time.time()
        model_kwargs = {"torch_dtype": torch.float16, "device_map": "cuda", "low_cpu_mem_usage": True}
        if self.is_llama3: 
            model_kwargs["attn_implementation"] = "eager"
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id, **model_kwargs)
        self.model.seqlen = SEQ_LEN
        
        # Count parameters
        total_params = sum(p.numel() for p in self.model.parameters())
        print(f"[Model] Total Parameters: {total_params:,}")
        
        # CPU Caching for Fast Resets
        print("[Model] Caching weights to CPU...")
        self.original_state_dict = {k: v.cpu() for k, v in self.model.state_dict().items()}
        
        load_time = time.time() - start
        self.timing_stats['model_load'] = load_time
        print(f"[Model] Loaded in {load_time:.2f}s")
        return total_params
    
    def reset_model(self):
        print("\n[Model] Resetting model weights from cache...")
        self.model.load_state_dict(self.original_state_dict)
        torch.cuda.empty_cache()
    
    def prune(self, method="wanda", sparsity=0.5, dataloader=None):
        """
        Prune model using Wanda or Magnitude method.
        Implementation verified against official Wanda repo (locuslab/wanda).
        """
        if not self.model: self.load_model()
        print(f"\n{'='*40}\nPRUNING: {method.upper()} @ {sparsity:.0%}\n{'='*40}")
        
        prune_start = time.time()
        use_cache = self.model.config.use_cache
        self.model.config.use_cache = False
        
        if hasattr(self.model, 'model'): 
            layers = self.model.model.layers
        else: 
            layers = self.model.transformer.h

        dtype = next(iter(self.model.parameters())).dtype
        inps = torch.zeros((NSAMPLES, SEQ_LEN, self.model.config.hidden_size), dtype=dtype, device=DEVICE)
        cache = {'i': 0, 'args': None, 'kwargs': None}
        
        class Catcher(nn.Module):
            def __init__(self, module):
                super().__init__(); self.module = module
            def forward(self, inp, *args, **kwargs):
                inps[cache['i']] = inp; cache['i'] += 1
                cache['args'] = args; cache['kwargs'] = kwargs
                raise ValueError
        
        # Capture inputs (Phase 1)
        layers[0] = Catcher(layers[0])
        for batch_idx, batch in enumerate(tqdm(dataloader, desc="Calibration")):
            try: 
                self.model(batch[0].to(DEVICE))
            except ValueError: 
                pass
        layers[0] = layers[0].module
        torch.cuda.empty_cache()
        
        forward_args = cache['args'] if cache['args'] else ()
        forward_kwargs = cache['kwargs'] if cache['kwargs'] else {}
        
        # Layer-by-layer pruning (Phase 2)
        print("\n[Phase 2] Layer-by-layer pruning...")
        stats = {
            "total_params": 0, 
            "pruned_params": 0, 
            "layer_sparsities": [],
            "layer_names": []
        }
        
        for layer_idx, layer in enumerate(tqdm(layers, desc="Processing Layers")):
            subset = find_layers(layer)
            subset_sq_sums = {}
            
            # Compute activation norms for Wanda (matches official repo)
            if method == "wanda":
                def hook(name): 
                    def fn(m, i, o):
                        inp = i[0].reshape(-1, i[0].shape[-1])
                        sq = inp.pow(2).sum(dim=0)  # Sum of squares per feature
                        subset_sq_sums[name] = subset_sq_sums.get(name, 0) + sq
                    return fn
                handles = [subset[n].register_forward_hook(hook(n)) for n in subset]
                for j in range(NSAMPLES):
                    with torch.no_grad(): 
                        layer(inps[j].unsqueeze(0), *forward_args, **forward_kwargs)
                for h in handles: 
                    h.remove()
            
            # Prune each linear layer
            layer_zeros = 0
            layer_total = 0
            for name in subset:
                W = subset[name].weight.data
                if method == "wanda":
                    # Wanda metric: |W| * sqrt(sum of squared activations)
                    # Verified equivalent to official: |W| * sqrt(scaler_row)
                    norms = torch.sqrt(subset_sq_sums[name] + 1e-6)
                    metric = torch.abs(W) * norms.reshape(1, -1)
                else: 
                    # Magnitude pruning baseline
                    metric = torch.abs(W)
                
                # Compute threshold (stable sort as per official repo)
                thresh = torch.sort(metric, dim=1, stable=True)[0]
                thresh = thresh[:, int(W.shape[1] * sparsity)]
                mask = metric > thresh.unsqueeze(1)
                W.mul_(mask)
                
                zeros = (W == 0).sum().item()
                total = W.numel()
                stats["total_params"] += total
                stats["pruned_params"] += zeros
                layer_zeros += zeros
                layer_total += total
            
            stats["layer_sparsities"].append(layer_zeros / layer_total if layer_total > 0 else 0)
            stats["layer_names"].append(f"Layer {layer_idx}")
            
            # Forward pass through pruned layer
            for j in range(NSAMPLES):
                with torch.no_grad(): 
                    inps[j] = layer(inps[j].unsqueeze(0), *forward_args, **forward_kwargs)[0]
        
        self.model.config.use_cache = use_cache
        stats["actual_sparsity"] = stats["pruned_params"] / stats["total_params"]
        
        prune_time = time.time() - prune_start
        self.timing_stats[f'prune_{method}_{int(sparsity*100)}'] = prune_time
        
        # Verify sparsity matches target
        verified_sparsity, _ = check_sparsity(self.model)
        print(f"[Verification] Target: {sparsity:.2%}, Actual: {verified_sparsity:.2%}")
        
        return stats

# --- EVALUATION ---

def eval_perplexity(model, testenc, stride=SEQ_LEN):
    """Evaluate perplexity on WikiText-2 (aligned with official Wanda repo)."""
    print("\n[Perplexity] Evaluating on WikiText-2...")
    eval_start = time.time()
    model.eval()
    input_ids = testenc.input_ids.to(DEVICE)
    seq_len = input_ids.size(1)
    nlls = []
    prev_end_loc = 0
    
    for begin_loc in tqdm(range(0, seq_len, stride), desc="Perplexity"):
        end_loc = min(begin_loc + SEQ_LEN, seq_len)
        trg_len = end_loc - prev_end_loc
        inp = input_ids[:, begin_loc:end_loc]
        tar = inp.clone()
        tar[:, :-trg_len] = -100
        if inp.size(1) == 0: break
        
        with torch.no_grad():
            outputs = model(inp, labels=tar)
            nlls.append(outputs.loss * trg_len)
        prev_end_loc = end_loc
        if end_loc == seq_len: break
    
    ppl = torch.exp(torch.stack(nlls).sum() / end_loc).item()
    eval_time = time.time() - eval_start
    print(f"Perplexity: {ppl:.4f} (eval time: {eval_time:.1f}s)")
    return ppl, eval_time

def eval_zero_shot_with_harness(model, tokenizer, model_name, tasks=ZERO_SHOT_TASKS, limit=None):
    """Evaluate zero-shot accuracy using lm-evaluation-harness."""
    print(f"\n[Zero-Shot] Tasks: {tasks}")
    eval_start = time.time()
    
    # L40S optimized batch size
    lm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size="auto", max_batch_size=64, device="cuda")
    results = evaluator.simple_evaluate(model=lm, tasks=tasks, num_fewshot=0, limit=limit, log_samples=False)
    
    task_results = {}
    for task in tasks:
        if task in results['results']:
            data = results['results'][task]
            val = 0.0
            # Try multiple key formats from lm-eval
            for k, v in data.items():
                if k in ['acc_norm,none', 'acc,none', 'acc_norm', 'acc', 'accuracy']: 
                    val = v
                    break
            if val == 0.0:
                for k, v in data.items():
                    if not k.endswith('stderr') and not k =='alias':
                        try: 
                            float(v)
                            val = v
                            break
                        except: 
                            continue
            task_results[task] = float(val)
            print(f"  {task}: {task_results[task]*100:.2f}%")
    
    avg = np.mean(list(task_results.values()))
    task_results['average'] = avg
    
    eval_time = time.time() - eval_start
    print(f"Average: {avg*100:.2f}% (eval time: {eval_time:.1f}s)")
    return task_results, eval_time

# --- ENHANCED PLOTTING (8 PLOTS) ---

def plot_results(results, output_dir):
    """Generate comprehensive visualization suite (8 plots total)."""
    print(f"\n{'='*80}\nGENERATING VISUALIZATIONS -> {output_dir}\n{'='*80}")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    n_models = len(results)
    model_names = list(results.keys())
    
    # PLOT 1: Perplexity Comparison (Bar Chart)
    fig, axes = plt.subplots(1, n_models, figsize=(7*n_models, 6))
    if n_models == 1: axes = [axes]
    for idx, (model_name, ax) in enumerate(zip(model_names, axes)):
        ppl_data = results[model_name]['perplexity']
        methods = ['Dense']
        values = [ppl_data.get('dense', 0)]
        colors = ['#2ecc71']
        for s in SPARSITY_RATIOS:
            s_int = int(s*100)
            methods.extend([f'Mag\n{s_int}%', f'Wanda\n{s_int}%'])
            values.extend([ppl_data.get(f'magnitude_{s_int}', 0), ppl_data.get(f'wanda_{s_int}', 0)])
            colors.extend(['#e74c3c', '#3498db'])
        bars = ax.bar(methods, values, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
        ax.set_ylabel('Perplexity (Lower is Better)', fontsize=12, fontweight='bold')
        ax.set_title(f'{model_name}\nPerplexity vs Sparsity', fontsize=14, fontweight='bold')
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height, f'{height:.2f}', 
                   ha='center', va='bottom', fontsize=10, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '1_perplexity_comparison.png'), dpi=300, bbox_inches='tight')
    plt.close()

    # PLOT 2: Performance Degradation (Line Plot)
    fig, axes = plt.subplots(1, n_models, figsize=(7*n_models, 6))
    if n_models == 1: axes = [axes]
    for idx, (model_name, ax) in enumerate(zip(model_names, axes)):
        ppl_data = results[model_name]['perplexity']
        base_ppl = ppl_data.get('dense', 1)
        sparsities = [int(s*100) for s in SPARSITY_RATIOS]
        mag_deg = [(ppl_data.get(f'magnitude_{s}', base_ppl) - base_ppl)/base_ppl*100 for s in sparsities]
        wanda_deg = [(ppl_data.get(f'wanda_{s}', base_ppl) - base_ppl)/base_ppl*100 for s in sparsities]
        ax.plot(sparsities, mag_deg, marker='s', label='Magnitude', color='#e74c3c', linewidth=3, markersize=10)
        ax.plot(sparsities, wanda_deg, marker='o', label='Wanda', color='#3498db', linewidth=3, markersize=10)
        ax.set_ylabel('Perplexity Increase (%)', fontsize=12, fontweight='bold')
        ax.set_xlabel('Sparsity (%)', fontsize=12, fontweight='bold')
        ax.set_title(f'{model_name}\nPerformance Degradation', fontsize=14, fontweight='bold')
        ax.legend(fontsize=11, loc='best')
        ax.grid(True, alpha=0.3, linestyle='--')
        # Add improvement annotation
        if len(sparsities) > 0:
            for i, s in enumerate(sparsities):
                improvement = mag_deg[i] - wanda_deg[i]
                if improvement > 0:
                    ax.annotate(f'+{improvement:.1f}pp', 
                               xy=(s, wanda_deg[i]), 
                               xytext=(s, wanda_deg[i] - 2),
                               ha='center', fontsize=9, color='green', fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '2_degradation.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    # PLOT 3: Zero-Shot Summary (Bar Chart)
    fig, axes = plt.subplots(1, n_models, figsize=(7*n_models, 6))
    if n_models == 1: axes = [axes]
    for idx, (model_name, ax) in enumerate(zip(model_names, axes)):
        zs_data = results[model_name]['zero_shot']
        methods = ['Dense']
        values = [zs_data.get('dense', {}).get('average', 0)*100]
        colors = ['#2ecc71']
        for s in SPARSITY_RATIOS:
            s_int = int(s*100)
            methods.extend([f'Mag {s_int}%', f'Wanda {s_int}%'])
            values.extend([
                zs_data.get(f'magnitude_{s_int}', {}).get('average', 0)*100, 
                zs_data.get(f'wanda_{s_int}', {}).get('average', 0)*100
            ])
            colors.extend(['#e74c3c', '#3498db'])
        bars = ax.bar(methods, values, color=colors, alpha=0.8, edgecolor='black', linewidth=1.5)
        ax.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
        ax.set_title(f'{model_name}\nZero-Shot Accuracy (Average)', fontsize=14, fontweight='bold')
        ax.set_ylim([0, 100])
        ax.grid(axis='y', alpha=0.3, linestyle='--')
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height, f'{height:.1f}%', 
                   ha='center', va='bottom', fontsize=10, fontweight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '3_zero_shot_summary.png'), dpi=300, bbox_inches='tight')
    plt.close()

    # PLOT 4: Per-Task Breakdown (All Sparsities)
    for s in SPARSITY_RATIOS:
        s_int = int(s*100)
        fig, axes = plt.subplots(1, n_models, figsize=(8*n_models, 6))
        if n_models == 1: axes = [axes]
        for idx, (model_name, ax) in enumerate(zip(model_names, axes)):
            zs_data = results[model_name]['zero_shot']
            tasks = ZERO_SHOT_TASKS
            dense_scores = [zs_data.get('dense', {}).get(t, 0)*100 for t in tasks]
            mag_scores = [zs_data.get(f'magnitude_{s_int}', {}).get(t, 0)*100 for t in tasks]
            wanda_scores = [zs_data.get(f'wanda_{s_int}', {}).get(t, 0)*100 for t in tasks]
            
            x = np.arange(len(tasks))
            width = 0.25
            ax.bar(x - width, dense_scores, width, label='Dense', color='#2ecc71', alpha=0.8)
            ax.bar(x, mag_scores, width, label=f'Magnitude {s_int}%', color='#e74c3c', alpha=0.8)
            ax.bar(x + width, wanda_scores, width, label=f'Wanda {s_int}%', color='#3498db', alpha=0.8)
            
            ax.set_xticks(x)
            ax.set_xticklabels(tasks, fontsize=11)
            ax.set_ylabel('Accuracy (%)', fontsize=12, fontweight='bold')
            ax.set_title(f'{model_name}: Task Breakdown ({s_int}% Sparsity)', fontsize=14, fontweight='bold')
            ax.legend(fontsize=10)
            ax.set_ylim([0, 100])
            ax.grid(axis='y', alpha=0.3, linestyle='--')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'4_task_breakdown_{s_int}.png'), dpi=300, bbox_inches='tight')
        plt.close()
    
    # PLOT 5: Per-Layer Sparsity Distribution (ENHANCED - Magnitude vs Wanda)
    fig, axes = plt.subplots(1, n_models, figsize=(12*n_models, 6))
    if n_models == 1: axes = [axes]
    for idx, (model_name, ax) in enumerate(zip(model_names, axes)):
        # Compare Magnitude vs Wanda at 50% sparsity
        stats_mag = results[model_name]['sparsity_stats'].get('magnitude_50', {})
        stats_wanda = results[model_name]['sparsity_stats'].get('wanda_50', {})
        
        if 'layer_sparsities' in stats_mag and 'layer_sparsities' in stats_wanda:
            mag_sparsities = np.array(stats_mag['layer_sparsities']) * 100
            wanda_sparsities = np.array(stats_wanda['layer_sparsities']) * 100
            layers_idx = range(len(mag_sparsities))
            
            # Plot both methods
            ax.plot(layers_idx, mag_sparsities, marker='s', linewidth=2.5, 
                   markersize=6, label='Magnitude 50%', color='#e74c3c', alpha=0.8)
            ax.plot(layers_idx, wanda_sparsities, marker='o', linewidth=2.5, 
                   markersize=6, label='Wanda 50%', color='#3498db', alpha=0.8)
            
            # Add target sparsity line
            ax.axhline(y=50, color='black', linestyle='--', linewidth=2, 
                      label='Target (50%)', alpha=0.5)
            
            # Highlight layers with biggest difference
            diff = np.abs(mag_sparsities - wanda_sparsities)
            max_diff_idx = np.argmax(diff)
            if diff[max_diff_idx] > 5:  # Only annotate if difference > 5%
                ax.annotate(f'Max Δ: {diff[max_diff_idx]:.1f}%',
                           xy=(max_diff_idx, wanda_sparsities[max_diff_idx]),
                           xytext=(max_diff_idx, wanda_sparsities[max_diff_idx] + 10),
                           arrowprops=dict(arrowstyle='->', color='green', lw=2),
                           fontsize=10, fontweight='bold', color='green')
        
        ax.set_xlabel('Layer Index', fontsize=12, fontweight='bold')
        ax.set_ylabel('Sparsity (%)', fontsize=12, fontweight='bold')
        ax.set_title(f'{model_name}\nPer-Layer Sparsity: Magnitude vs Wanda', fontsize=14, fontweight='bold')
        ax.legend(fontsize=10, loc='best')
        ax.grid(True, alpha=0.3, linestyle='--')
        ax.set_ylim([0, 100])
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '5_layer_sparsity_comparison.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    # PLOT 6: Wanda vs Magnitude Heatmap
    fig, axes = plt.subplots(1, n_models, figsize=(7*n_models, 6))
    if n_models == 1: axes = [axes]
    for idx, (model_name, ax) in enumerate(zip(model_names, axes)):
        ppl_data = results[model_name]['perplexity']
        comparison_matrix = []
        for sparsity in SPARSITY_RATIOS:
            s_int = int(sparsity*100)
            mag_ppl = ppl_data.get(f'magnitude_{s_int}', 0)
            wanda_ppl = ppl_data.get(f'wanda_{s_int}', 0)
            improvement = ((mag_ppl - wanda_ppl) / mag_ppl) * 100
            comparison_matrix.append([mag_ppl, wanda_ppl, improvement])
        
        comparison_matrix = np.array(comparison_matrix)
        im = ax.imshow(comparison_matrix, cmap='RdYlGn', aspect='auto', vmin=0)
        
        ax.set_xticks([0, 1, 2])
        ax.set_xticklabels(['Mag PPL', 'Wanda PPL', 'Improvement (%)'], fontsize=11)
        ax.set_yticks(range(len(SPARSITY_RATIOS)))
        ax.set_yticklabels([f'{int(s*100)}%' for s in SPARSITY_RATIOS], fontsize=11)
        ax.set_ylabel('Sparsity Level', fontsize=12, fontweight='bold')
        ax.set_title(f'{model_name}\nWanda vs Magnitude Heatmap', fontsize=14, fontweight='bold')
        
        for i in range(len(SPARSITY_RATIOS)):
            for j in range(3):
                text = ax.text(j, i, f'{comparison_matrix[i, j]:.2f}',
                             ha="center", va="center", color="white" if j < 2 else "black", 
                             fontsize=12, fontweight='bold')
        
        plt.colorbar(im, ax=ax, label='Value')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '6_wanda_vs_magnitude_heatmap.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    # PLOT 7: Accuracy Retention (%)
    fig, axes = plt.subplots(1, n_models, figsize=(7*n_models, 6))
    if n_models == 1: axes = [axes]
    for idx, (model_name, ax) in enumerate(zip(model_names, axes)):
        zs_data = results[model_name]['zero_shot']
        base_acc = zs_data.get('dense', {}).get('average', 0) * 100
        sparsities = [int(s*100) for s in SPARSITY_RATIOS]
        
        mag_retention = [(zs_data.get(f'magnitude_{s}', {}).get('average', 0)*100 / base_acc)*100 for s in sparsities]
        wanda_retention = [(zs_data.get(f'wanda_{s}', {}).get('average', 0)*100 / base_acc)*100 for s in sparsities]
        
        ax.plot(sparsities, mag_retention, marker='s', label='Magnitude', color='#e74c3c', linewidth=3, markersize=10)
        ax.plot(sparsities, wanda_retention, marker='o', label='Wanda', color='#3498db', linewidth=3, markersize=10)
        ax.axhline(y=100, color='green', linestyle='--', linewidth=2, label='Dense Baseline', alpha=0.7)
        
        ax.set_xlabel('Sparsity (%)', fontsize=12, fontweight='bold')
        ax.set_ylabel('Accuracy Retention (%)', fontsize=12, fontweight='bold')
        ax.set_title(f'{model_name}\nZero-Shot Accuracy Retention', fontsize=14, fontweight='bold')
        ax.legend(fontsize=11)
        ax.grid(True, alpha=0.3, linestyle='--')
        ax.set_ylim([70, 105])
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '7_accuracy_retention.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    # PLOT 8: Timing Breakdown
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))
    timing_data = []
    labels = []
    for model_name in model_names:
        if 'timing' in results[model_name]:
            timing = results[model_name]['timing']
            for key, value in timing.items():
                if 'prune' in key or 'ppl' in key or 'zero_shot' in key:
                    timing_data.append(value)
                    labels.append(f"{model_name}_{key}")
    
    if timing_data:
        colors_timing = plt.cm.viridis(np.linspace(0, 1, len(timing_data)))
        bars = ax.barh(range(len(timing_data)), timing_data, color=colors_timing, alpha=0.8)
        ax.set_yticks(range(len(timing_data)))
        ax.set_yticklabels(labels, fontsize=9)
        ax.set_xlabel('Time (seconds)', fontsize=12, fontweight='bold')
        ax.set_title('Timing Breakdown', fontsize=14, fontweight='bold')
        ax.grid(axis='x', alpha=0.3, linestyle='--')
        
        for i, (bar, val) in enumerate(zip(bars, timing_data)):
            ax.text(val, i, f' {val:.1f}s', va='center', fontsize=9, fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, '8_timing_breakdown.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    print("✓ All 8 plots saved.")

def save_detailed_results(results, output_dir):
    """Export comprehensive results to CSV and JSON."""
    print(f"\n[Export] Saving detailed results to {output_dir}...")
    
    # CSV 1: Perplexity Results
    ppl_rows = []
    for model_name, data in results.items():
        ppl_data = data['perplexity']
        for key, value in ppl_data.items():
            ppl_rows.append({
                'Model': model_name,
                'Method': key,
                'Perplexity': value
            })
    pd.DataFrame(ppl_rows).to_csv(os.path.join(output_dir, 'perplexity_results.csv'), index=False)
    
    # CSV 2: Zero-Shot Results (Detailed)
    zs_rows = []
    for model_name, data in results.items():
        zs_data = data['zero_shot']
        for config, tasks in zs_data.items():
            if isinstance(tasks, dict):
                for task, acc in tasks.items():
                    zs_rows.append({
                        'Model': model_name,
                        'Configuration': config,
                        'Task': task,
                        'Accuracy': acc
                    })
    pd.DataFrame(zs_rows).to_csv(os.path.join(output_dir, 'zero_shot_results.csv'), index=False)
    
    # CSV 3: Per-Layer Sparsity
    layer_rows = []
    for model_name, data in results.items():
        if 'sparsity_stats' in data:
            for config, stats in data['sparsity_stats'].items():
                if 'layer_sparsities' in stats:
                    for i, sparsity in enumerate(stats['layer_sparsities']):
                        layer_rows.append({
                            'Model': model_name,
                            'Configuration': config,
                            'Layer': i,
                            'Sparsity': sparsity
                        })
    pd.DataFrame(layer_rows).to_csv(os.path.join(output_dir, 'layer_sparsity.csv'), index=False)
    
    # JSON: Complete Results
    with open(os.path.join(output_dir, 'results_complete.json'), 'w') as f:
        json.dump(results, f, indent=2, default=str)
    
    print("✓ Exported: perplexity_results.csv, zero_shot_results.csv, layer_sparsity.csv, results_complete.json")

def save_final_report(results, output_dir):
    """Save comprehensive final report to markdown file."""
    for model_name, data in results.items():
        report_path = os.path.join(output_dir, 'final_report.md')
        
        with open(report_path, 'w') as f:
            f.write(f"# Final Report: {model_name}\n\n")
            
            ppl = data['perplexity']
            f.write("## Perplexity Results (WikiText-2)\n\n")
            f.write("| Method | PPL | Δ from Dense |\n")
            f.write("|--------|-----|--------------|\n")
            
            base = ppl.get('dense', 0)
            f.write(f"| Dense (Baseline) | {base:.4f} | - |\n")
            
            for s in SPARSITY_RATIOS:
                s_int = int(s*100)
                for m in ['magnitude', 'wanda']:
                    key = f"{m}_{s_int}"
                    val = ppl.get(key, 0)
                    deg = ((val - base)/base)*100
                    f.write(f"| {m.title()} {s_int}% | {val:.4f} | {deg:+.2f}% |\n")
            
            # Zero-Shot Summary
            zs = data['zero_shot']
            f.write(f"\n## Zero-Shot Accuracy\n\n")
            f.write(f"*Average across {len(ZERO_SHOT_TASKS)} tasks: {', '.join(ZERO_SHOT_TASKS)}*\n\n")
            f.write("| Method | Accuracy | Retention |\n")
            f.write("|--------|----------|-----------|\n")
            
            base_acc = zs.get('dense', {}).get('average', 0) * 100
            f.write(f"| Dense (Baseline) | {base_acc:.2f}% | 100.00% |\n")
            
            for s in SPARSITY_RATIOS:
                s_int = int(s*100)
                for m in ['magnitude', 'wanda']:
                    key = f"{m}_{s_int}"
                    acc = zs.get(key, {}).get('average', 0) * 100
                    retention = (acc / base_acc) * 100 if base_acc > 0 else 0
                    f.write(f"| {m.title()} {s_int}% | {acc:.2f}% | {retention:.2f}% |\n")
            
            # Key Findings
            f.write("\n## Key Findings\n\n")
            for s in SPARSITY_RATIOS:
                s_int = int(s*100)
                mag_ppl = ppl.get(f'magnitude_{s_int}', 0)
                wanda_ppl = ppl.get(f'wanda_{s_int}', 0)
                ppl_improvement = ((mag_ppl - wanda_ppl) / mag_ppl) * 100
                
                mag_acc = zs.get(f'magnitude_{s_int}', {}).get('average', 0) * 100
                wanda_acc = zs.get(f'wanda_{s_int}', {}).get('average', 0) * 100
                acc_improvement = wanda_acc - mag_acc
                
                f.write(f"\n### {s_int}% Sparsity\n\n")
                f.write(f"- **Perplexity Improvement:** Wanda improves by {ppl_improvement:.2f}% over Magnitude\n")
                f.write(f"- **Accuracy Improvement:** Wanda improves by {acc_improvement:.2f} percentage points\n")
            
            # Per-task breakdown
            f.write("\n## Per-Task Zero-Shot Breakdown\n\n")
            for s in SPARSITY_RATIOS:
                s_int = int(s*100)
                f.write(f"\n### {s_int}% Sparsity\n\n")
                f.write("| Task | Dense | Magnitude | Wanda |\n")
                f.write("|------|-------|-----------|-------|\n")
                
                for task in ZERO_SHOT_TASKS:
                    if task != 'average':
                        dense_acc = zs.get('dense', {}).get(task, 0) * 100
                        mag_acc = zs.get(f'magnitude_{s_int}', {}).get(task, 0) * 100
                        wanda_acc = zs.get(f'wanda_{s_int}', {}).get(task, 0) * 100
                        f.write(f"| {task} | {dense_acc:.2f}% | {mag_acc:.2f}% | {wanda_acc:.2f}% |\n")
        
        print(f"✓ Saved: {report_path}")

def save_theoretical_analysis(results, output_dir):
    """Save theoretical analysis to markdown file."""
    for model_name, data in results.items():
        analysis_path = os.path.join(output_dir, 'theoretical_analysis.md')
        
        with open(analysis_path, 'w') as f:
            f.write(f"# Theoretical Analysis: {model_name}\n\n")
            
            f.write("## 1. Why Wanda Outperforms Magnitude Pruning\n\n")
            f.write("Wanda (Weight AND Activation) pruning improves upon magnitude-only pruning by ")
            f.write("incorporating **input activation statistics** into the pruning decision.\n\n")
            
            f.write("### Core Insight\n\n")
            f.write("A weight's importance ≠ just its magnitude, but rather:\n\n")
            f.write("```\nImportance = |Weight| × Input_Activation_Norm\n```\n\n")
            
            f.write("### Why This Matters\n\n")
            f.write("- **Magnitude pruning:** Removes smallest weights → assumes magnitude = importance\n")
            f.write("- **Wanda:** Considers BOTH weight size AND how much it's \"used\" by real data\n\n")
            
            f.write("### Mathematical Formulation\n\n")
            f.write("```\n")
            f.write("Magnitude metric: S_mag(w_ij) = |w_ij|\n")
            f.write("Wanda metric:     S_wanda(w_ij) = |w_ij| × ||X_j||₂\n")
            f.write("```\n\n")
            f.write("where `X_j` is the j-th input feature across calibration samples.\n\n")
            
            f.write("### Practical Example\n\n")
            f.write("Consider two weights:\n\n")
            f.write("- **Weight A:** Large magnitude (0.5), rarely activated (low X norm)\n")
            f.write("- **Weight B:** Medium magnitude (0.3), frequently activated (high X norm)\n\n")
            f.write("**Magnitude pruning** → Keeps A, removes B ❌ (wrong!)\n\n")
            f.write("**Wanda** → Keeps B, removes A ✅ (correct!)\n\n")
            
            f.write("### Result\n\n")
            f.write("Wanda preserves weights that are actually important for the model's computations ")
            f.write("on real data, leading to better performance retention.\n\n")
            
            # Layer-wise behavior
            f.write("## 2. Layer-Wise Behavior\n\n")
            stats_wanda_50 = data['sparsity_stats'].get('wanda_50', {})
            if 'layer_sparsities' in stats_wanda_50:
                layer_sparsities = np.array(stats_wanda_50['layer_sparsities']) * 100
                mean_sparsity = np.mean(layer_sparsities)
                std_sparsity = np.std(layer_sparsities)
                min_layer = np.argmin(layer_sparsities)
                max_layer = np.argmax(layer_sparsities)
                
                f.write("### Observed Layer Statistics (Wanda 50%)\n\n")
                f.write(f"- **Mean sparsity:** {mean_sparsity:.2f}%\n")
                f.write(f"- **Std deviation:** {std_sparsity:.2f}%\n")
                f.write(f"- **Min sparsity:** {layer_sparsities[min_layer]:.2f}% (Layer {min_layer})\n")
                f.write(f"- **Max sparsity:** {layer_sparsities[max_layer]:.2f}% (Layer {max_layer})\n\n")
                
                f.write("### Interpretation\n\n")
                early_late = 'lower' if min_layer < len(layer_sparsities)//2 else 'higher'
                f.write(f"- **Lower layers (early):** Tend to have {early_late} sparsity\n")
                f.write("  - More critical for basic feature extraction\n")
                late_early = 'higher' if max_layer > len(layer_sparsities)//2 else 'lower'
                f.write(f"- **Higher layers (late):** Tend to have {late_early} sparsity\n")
                f.write("  - More redundancy in high-level representations\n\n")
                
                variation = 'significant' if std_sparsity > 5 else 'relatively uniform'
                f.write(f"Standard deviation of {std_sparsity:.2f}% indicates **{variation}** variation ")
                f.write("across layers, showing Wanda adapts pruning to layer importance.\n\n")
            
            # Failure modes
            f.write("## 3. Failure Modes & Limitations\n\n")
            
            zs_data = data['zero_shot']
            dense_zs = zs_data.get('dense', {})
            wanda_50_zs = zs_data.get('wanda_50', {})
            
            task_degradations = {}
            for task in ZERO_SHOT_TASKS:
                if task != 'average':
                    dense_acc = dense_zs.get(task, 0) * 100
                    wanda_acc = wanda_50_zs.get(task, 0) * 100
                    degradation = dense_acc - wanda_acc
                    task_degradations[task] = degradation
            
            if task_degradations:
                most_affected_task = max(task_degradations, key=task_degradations.get)
                least_affected_task = min(task_degradations, key=task_degradations.get)
                
                f.write("### Failure Mode 1: Task-Specific Degradation\n\n")
                f.write(f"- **Most affected task:** {most_affected_task} ({task_degradations[most_affected_task]:.2f}% accuracy drop)\n")
                f.write(f"- **Least affected task:** {least_affected_task} ({task_degradations[least_affected_task]:.2f}% accuracy drop)\n\n")
                f.write(f"**Why:** Tasks requiring more complex reasoning (e.g., {most_affected_task}) are more ")
                f.write("sensitive to pruning. Wanda preserves perplexity well but may lose some higher-order ")
                f.write("reasoning capacity.\n\n")
                
                f.write("### Failure Mode 2: Calibration Data Dependency\n\n")
                f.write("Wanda requires calibration data (WikiText-2) to compute activation norms.\n\n")
                f.write("**Risk:** If calibration data distribution ≠ target task distribution, Wanda may ")
                f.write("optimize for wrong activations.\n\n")
                f.write("**Example:** WikiText (formal text) vs Code generation → activation patterns differ\n\n")
                f.write("**Mitigation:** Use diverse calibration data or task-specific calibration.\n\n")
                
                f.write("### Failure Mode 3: High Sparsity Regime\n\n")
                f.write("At 70% sparsity and beyond, BOTH magnitude and Wanda degrade significantly.\n\n")
                f.write("**Root cause:** Too many weights removed → model capacity fundamentally limited\n\n")
                
                ppl_data = data['perplexity']
                dense_ppl = ppl_data.get('dense', 0)
                wanda_70_ppl = ppl_data.get('wanda_70', 0)
                wanda_50_ppl = ppl_data.get('wanda_50', 0)
                
                deg_50 = ((wanda_50_ppl - dense_ppl) / dense_ppl) * 100
                deg_70 = ((wanda_70_ppl - dense_ppl) / dense_ppl) * 100
                
                f.write("**Observation from results:**\n\n")
                f.write(f"- 50% sparsity: {deg_50:.2f}% perplexity increase (manageable)\n")
                severity = 'severe' if deg_70 > 50 else 'significant'
                f.write(f"- 70% sparsity: {deg_70:.2f}% perplexity increase ({severity})\n\n")
                f.write("**Insight:** Wanda helps, but cannot overcome fundamental capacity limits.\n\n")
                
                f.write("### Failure Mode 4: Unstructured Pruning Overhead\n\n")
                f.write("Wanda (in this implementation) uses **unstructured** pruning → sparse weights ")
                f.write("scattered throughout matrices.\n\n")
                f.write("**Problem:** Modern hardware (GPUs, NPUs) doesn't efficiently accelerate unstructured ")
                f.write("sparsity without specialized kernels.\n\n")
                f.write("**Real-world implication:** 50% sparsity may only yield ~10-20% speedup, not 2x.\n\n")
                f.write("**Solution:** Structured pruning (N:M sparsity) trades some accuracy for guaranteed ")
                f.write("hardware acceleration.\n\n")
                
                f.write("### Failure Mode 5: No Fine-Tuning Recovery\n\n")
                f.write("This experiment uses \"one-shot\" pruning without post-pruning fine-tuning.\n\n")
                f.write("**Limitation:** Model doesn't adapt to sparsity pattern → performance ceiling.\n\n")
                f.write("**Potential improvement:** Brief fine-tuning (few epochs) could recover significant ")
                f.write("accuracy, especially at high sparsity.\n\n")
            
            # Recommendations
            f.write("## 4. Practical Recommendations\n\n")
            f.write("### For Production Deployment\n\n")
            f.write("1. Use Wanda with structured (2:4 or 4:8) sparsity for hardware efficiency\n")
            f.write("2. Calibrate on data matching your target distribution\n")
            f.write("3. Consider brief fine-tuning after pruning for recovery\n")
            f.write("4. Start conservatively (30-50% sparsity) before pushing higher\n")
            f.write("5. Monitor task-specific degradation, not just overall metrics\n\n")
            
            f.write("### For Research Extensions\n\n")
            f.write("1. Test with multiple calibration datasets to study distribution sensitivity\n")
            f.write("2. Compare layer-wise vs global sparsity allocation\n")
            f.write("3. Investigate learned pruning schedules (varying sparsity by layer)\n")
            f.write("4. Combine Wanda with quantization for compound compression\n")
        
        print(f"✓ Saved: {analysis_path}")

# ==========================================
# EXPERIMENT ENGINE
# ==========================================

def run_experiment(model_id, output_dir):
    """Run comprehensive experiment for a single model."""
    print(f"\n{'!'*80}\nSTARTING EXPERIMENT\nModel: {model_id}\nOutput: {output_dir}\n{'!'*80}")
    
    # Initialize
    pruner = WandaPruner(model_id)
    pruner.load_model()
    trainloader, testenc = get_wikitext2(pruner.tokenizer, NSAMPLES, SEQ_LEN)
    
    all_results = {
        pruner.model_name: {
            'perplexity': {}, 
            'zero_shot': {},
            'sparsity_stats': {},
            'timing': {}
        }
    }

    # Baseline Evaluation
    print("\n" + "="*80)
    print("EVALUATING DENSE BASELINE")
    print("="*80)
    base_ppl, ppl_time = eval_perplexity(pruner.model, testenc)
    base_zs, zs_time = eval_zero_shot_with_harness(pruner.model, pruner.tokenizer, pruner.model_name)
    all_results[pruner.model_name]['perplexity']['dense'] = base_ppl
    all_results[pruner.model_name]['zero_shot']['dense'] = base_zs
    all_results[pruner.model_name]['timing']['ppl_dense'] = ppl_time
    all_results[pruner.model_name]['timing']['zero_shot_dense'] = zs_time

    # Pruning Loop
    for sparsity in SPARSITY_RATIOS:
        s_int = int(sparsity*100)
        
        # Magnitude Pruning
        print(f"\n{'='*80}\nMAGNITUDE PRUNING @ {s_int}%\n{'='*80}")
        pruner.reset_model()
        stats_mag = pruner.prune("magnitude", sparsity, trainloader)
        ppl_mag, ppl_time = eval_perplexity(pruner.model, testenc)
        zs_mag, zs_time = eval_zero_shot_with_harness(pruner.model, pruner.tokenizer, "mag")
        
        key = f"magnitude_{s_int}"
        all_results[pruner.model_name]['perplexity'][key] = ppl_mag
        all_results[pruner.model_name]['zero_shot'][key] = zs_mag
        all_results[pruner.model_name]['sparsity_stats'][key] = stats_mag
        all_results[pruner.model_name]['timing'][f'ppl_{key}'] = ppl_time
        all_results[pruner.model_name]['timing'][f'zero_shot_{key}'] = zs_time
        
        # Wanda Pruning
        print(f"\n{'='*80}\nWANDA PRUNING @ {s_int}%\n{'='*80}")
        pruner.reset_model()
        stats_wanda = pruner.prune("wanda", sparsity, trainloader)
        ppl_wanda, ppl_time = eval_perplexity(pruner.model, testenc)
        zs_wanda, zs_time = eval_zero_shot_with_harness(pruner.model, pruner.tokenizer, "wanda")
        
        key = f"wanda_{s_int}"
        all_results[pruner.model_name]['perplexity'][key] = ppl_wanda
        all_results[pruner.model_name]['zero_shot'][key] = zs_wanda
        all_results[pruner.model_name]['sparsity_stats'][key] = stats_wanda
        all_results[pruner.model_name]['timing'][f'ppl_{key}'] = ppl_time
        all_results[pruner.model_name]['timing'][f'zero_shot_{key}'] = zs_time

    # Add pruner timing stats
    all_results[pruner.model_name]['timing'].update(pruner.timing_stats)
    
    # Generate outputs
    plot_results(all_results, output_dir)
    save_detailed_results(all_results, output_dir)
    save_final_report(all_results, output_dir)
    save_theoretical_analysis(all_results, output_dir)
        
    # Cleanup
    print("\n[Cleanup] Freeing GPU memory...")
    del pruner.model
    del pruner
    gc.collect()
    torch.cuda.empty_cache()
    print("✓ Experiment Complete.\n")
    
    return all_results

✓ Logged in to HuggingFace
✓ lm-evaluation-harness imported successfully
Device: cuda


In [4]:
if __name__ == "__main__":
    # Run LLaMA-2
    print("\n\n" + "█"*80)
    print("EXPERIMENT 1: LLaMA-2-7B")
    print("█"*80)
    results_llama2 = run_experiment("meta-llama/Llama-2-7b-hf", "results_llama2")



████████████████████████████████████████████████████████████████████████████████
EXPERIMENT 1: LLaMA-2-7B
████████████████████████████████████████████████████████████████████████████████

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
STARTING EXPERIMENT
Model: meta-llama/Llama-2-7b-hf
Output: results_llama2
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]


[Model] Loading meta-llama/Llama-2-7b-hf...


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

[Model] Total Parameters: 6,738,415,616
[Model] Caching weights to CPU...
[Model] Loaded in 53.67s

[Data] Loading wikitext...
[Data] Selecting 64 calibration sequences...

EVALUATING DENSE BASELINE

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/84 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 5.1140 (eval time: 33.3s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2571.11it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2627.91it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1468.94it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3353.69it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1435.52it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 25


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:32<00:00, 117.94it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 78.13%
  hellaswag: 57.10%
  arc_easy: 75.46%
  boolq: 79.33%
  rte: 63.90%
Average: 70.78% (eval time: 569.6s)

MAGNITUDE PRUNING @ 30%

[Model] Resetting model weights from cache...

PRUNING: MAGNITUDE @ 30%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 30.00%, Actual: 30.08%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/84 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 5.7350 (eval time: 33.1s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2663.03it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2656.81it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1515.71it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3669.75it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1414.12it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 25


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:16<00:00, 121.79it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 78.40%
  hellaswag: 58.13%
  arc_easy: 75.97%
  boolq: 75.20%
  rte: 57.76%
Average: 69.09% (eval time: 550.2s)

WANDA PRUNING @ 30%

[Model] Resetting model weights from cache...

PRUNING: WANDA @ 30%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 30.00%, Actual: 30.56%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/84 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 5.5244 (eval time: 33.4s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2634.96it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2666.53it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1525.31it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3671.87it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1409.01it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 25


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:14<00:00, 122.10it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 78.02%
  hellaswag: 56.56%
  arc_easy: 75.59%
  boolq: 78.56%
  rte: 56.68%
Average: 69.08% (eval time: 548.8s)

MAGNITUDE PRUNING @ 50%

[Model] Resetting model weights from cache...

PRUNING: MAGNITUDE @ 50%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 50.00%, Actual: 50.13%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/84 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 11.8324 (eval time: 32.5s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2623.43it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2666.54it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1468.62it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3657.17it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1385.19it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 25


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:04<00:00, 124.64it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 74.59%
  hellaswag: 53.09%
  arc_easy: 67.85%
  boolq: 63.43%
  rte: 52.35%
Average: 62.26% (eval time: 539.2s)

WANDA PRUNING @ 50%

[Model] Resetting model weights from cache...

PRUNING: WANDA @ 50%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 50.00%, Actual: 50.04%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/84 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 6.4154 (eval time: 32.5s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2545.29it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2637.57it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1505.88it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3568.79it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1407.82it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 25


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:00<00:00, 125.87it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 76.06%
  hellaswag: 51.82%
  arc_easy: 72.56%
  boolq: 76.09%
  rte: 54.51%
Average: 66.21% (eval time: 534.0s)

MAGNITUDE PRUNING @ 70%

[Model] Resetting model weights from cache...

PRUNING: MAGNITUDE @ 70%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 70.00%, Actual: 70.16%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/84 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: nan (eval time: 31.1s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2624.55it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2649.47it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1507.70it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3584.44it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1408.71it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 25


Running loglikelihood requests: 100%|██████████| 60439/60439 [07:38<00:00, 131.86it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 52.77%
  hellaswag: 25.87%
  arc_easy: 25.63%
  boolq: 37.86%
  rte: 51.99%
Average: 38.82% (eval time: 512.7s)

WANDA PRUNING @ 70%

[Model] Resetting model weights from cache...

PRUNING: WANDA @ 70%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 70.00%, Actual: 70.03%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/84 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 68.8045 (eval time: 31.1s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2648.18it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2666.84it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1465.35it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3619.05it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1353.92it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 25


Running loglikelihood requests: 100%|██████████| 60439/60439 [07:41<00:00, 131.07it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).
posx and posy should be finite values
posx and posy should be finite values


  piqa: 54.24%
  hellaswag: 27.89%
  arc_easy: 29.55%
  boolq: 47.03%
  rte: 52.71%
Average: 42.28% (eval time: 514.8s)

GENERATING VISUALIZATIONS -> results_llama2


posx and posy should be finite values


✓ All 8 plots saved.

[Export] Saving detailed results to results_llama2...
✓ Exported: perplexity_results.csv, zero_shot_results.csv, layer_sparsity.csv, results_complete.json
✓ Saved: results_llama2/final_report.md
✓ Saved: results_llama2/theoretical_analysis.md

[Cleanup] Freeing GPU memory...
✓ Experiment Complete.



In [5]:
if __name__ == "__main__":
    # Run LLaMA-3.1
    print("\n\n" + "█"*80)
    print("EXPERIMENT 2: LLaMA-3.1-8B")
    print("█"*80)
    results_llama3 = run_experiment("meta-llama/Llama-3.1-8B", "results_llama3")



████████████████████████████████████████████████████████████████████████████████
EXPERIMENT 2: LLaMA-3.1-8B
████████████████████████████████████████████████████████████████████████████████

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
STARTING EXPERIMENT
Model: meta-llama/Llama-3.1-8B
Output: results_llama3
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

[Model] Loading meta-llama/Llama-3.1-8B...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[Model] Total Parameters: 8,030,261,248
[Model] Caching weights to CPU...
[Model] Loaded in 24.86s

[Data] Loading wikitext...


Token indices sequence length is longer than the specified maximum sequence length for this model (2436214 > 131072). Running this sequence through the model will result in indexing errors


[Data] Selecting 64 calibration sequences...

EVALUATING DENSE BASELINE

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/71 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 5.8464 (eval time: 72.9s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2434.83it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2453.92it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1456.40it/s]
100%|██████████| 10042/10042 [00:03<00:00, 3200.93it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1350.09it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 22


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:59<00:00, 112.03it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 79.27%
  hellaswag: 60.67%
  arc_easy: 82.20%
  boolq: 83.09%
  rte: 71.12%
Average: 75.27% (eval time: 600.3s)

MAGNITUDE PRUNING @ 30%

[Model] Resetting model weights from cache...

PRUNING: MAGNITUDE @ 30%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 30.00%, Actual: 30.09%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/71 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 8.5034 (eval time: 73.1s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2547.44it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2521.87it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1470.69it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3467.60it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1355.24it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 22


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:44<00:00, 115.20it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 76.61%
  hellaswag: 56.37%
  arc_easy: 78.37%
  boolq: 80.18%
  rte: 67.15%
Average: 71.74% (eval time: 586.0s)

WANDA PRUNING @ 30%

[Model] Resetting model weights from cache...

PRUNING: WANDA @ 30%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 30.00%, Actual: 30.01%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/71 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 6.3185 (eval time: 72.9s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2442.82it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2580.80it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1422.93it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3540.82it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1304.49it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 22


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:48<00:00, 114.39it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 78.35%
  hellaswag: 59.55%
  arc_easy: 80.51%
  boolq: 82.94%
  rte: 71.12%
Average: 74.49% (eval time: 589.4s)

MAGNITUDE PRUNING @ 50%

[Model] Resetting model weights from cache...

PRUNING: MAGNITUDE @ 50%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 50.00%, Actual: 50.15%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/71 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 37.8210 (eval time: 72.3s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2519.29it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2595.93it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1457.83it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3484.96it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1308.41it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 22


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:31<00:00, 118.05it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 70.46%
  hellaswag: 43.27%
  arc_easy: 62.21%
  boolq: 52.42%
  rte: 53.79%
Average: 56.43% (eval time: 574.4s)

WANDA PRUNING @ 50%

[Model] Resetting model weights from cache...

PRUNING: WANDA @ 50%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 50.00%, Actual: 50.03%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/71 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 8.9966 (eval time: 72.2s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2485.84it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2549.81it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1455.37it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3439.94it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1352.21it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 22


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:28<00:00, 118.93it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 73.88%
  hellaswag: 50.53%
  arc_easy: 71.84%
  boolq: 79.14%
  rte: 54.87%
Average: 66.05% (eval time: 569.0s)

MAGNITUDE PRUNING @ 70%

[Model] Resetting model weights from cache...

PRUNING: MAGNITUDE @ 70%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 70.00%, Actual: 70.14%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/71 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 263197.6250 (eval time: 71.6s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2490.71it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2566.16it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1455.56it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3543.21it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1359.04it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 22


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:10<00:00, 123.16it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 54.73%
  hellaswag: 26.44%
  arc_easy: 29.71%
  boolq: 37.83%
  rte: 52.71%
Average: 40.28% (eval time: 551.9s)

WANDA PRUNING @ 70%

[Model] Resetting model weights from cache...

PRUNING: WANDA @ 70%


Calibration:   0%|          | 0/64 [00:00<?, ?it/s]


[Phase 2] Layer-by-layer pruning...


Processing Layers:   0%|          | 0/32 [00:00<?, ?it/s]

[Verification] Target: 70.00%, Actual: 70.03%

[Perplexity] Evaluating on WikiText-2...


Perplexity:   0%|          | 0/71 [00:00<?, ?it/s]

`pretrained` model kwarg is not of type `str`. Many other model arguments may be ignored. Please do not launch via accelerate or use `parallelize=True` if passing an existing model this way.
Passed an already-initialized model through `pretrained`, assuming single-process call to evaluate() or custom distributed integration


Perplexity: 96.7458 (eval time: 71.3s)

[Zero-Shot] Tasks: ['piqa', 'hellaswag', 'arc_easy', 'boolq', 'rte']


[Task: boolq] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
[Task: rte] metric acc is defined, but aggregation is not. using default aggregation=mean
[Task: rte] metric acc is defined, but higher_is_better is not. using default higher_is_better=True
Overwriting default num_fewshot of rte from None to 0
Overwriting default num_fewshot of boolq from None to 0
Overwriting default num_fewshot of arc_easy from None to 0
Overwriting default num_fewshot of hellaswag from None to 0
Overwriting default num_fewshot of piqa from None to 0
100%|██████████| 277/277 [00:00<00:00, 2511.50it/s]
100%|██████████| 3270/3270 [00:01<00:00, 2587.37it/s]
100%|██████████| 2376/2376 [00:01<00:00, 1439.93it/s]
100%|██████████| 10042/10042 [00:02<00:00, 3491.92it/s]
100%|██████████| 1838/1838 [00:01<00:00, 1316.90it/s]
Running loglikelihood requests:   0%|          | 0/60439 [00:0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 22


Running loglikelihood requests: 100%|██████████| 60439/60439 [08:06<00:00, 124.26it/s]
fatal: not a git repository (or any parent up to mount point /teamspace/studios)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


  piqa: 55.17%
  hellaswag: 27.48%
  arc_easy: 32.32%
  boolq: 44.43%
  rte: 52.71%
Average: 42.42% (eval time: 547.5s)

GENERATING VISUALIZATIONS -> results_llama3
✓ All 8 plots saved.

[Export] Saving detailed results to results_llama3...
✓ Exported: perplexity_results.csv, zero_shot_results.csv, layer_sparsity.csv, results_complete.json
✓ Saved: results_llama3/final_report.md
✓ Saved: results_llama3/theoretical_analysis.md

[Cleanup] Freeing GPU memory...
✓ Experiment Complete.



In [6]:
print("\n\n" + "█"*80)
print("ALL EXPERIMENTS COMPLETED SUCCESSFULLY")
print("Results saved to: results_llama2/ and results_llama3/")
print("█"*80)



████████████████████████████████████████████████████████████████████████████████
ALL EXPERIMENTS COMPLETED SUCCESSFULLY
Results saved to: results_llama2/ and results_llama3/
████████████████████████████████████████████████████████████████████████████████
