In [1]:
# =============================================================================
# File        : bench_deepseek_8b_e3_7.ipynb
# Author      : EAGLE Benchmark Suite
# Description : Benchmark EAGLE-3 speculative decoding on DeepSeek-R1-Distill-Llama-8B
#               Compares baseline autoregressive generation vs EAGLE-3 speculative
#               decoding and computes TVD-based divergence metrics.
#
# Base Model  : deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# EAGLE Model : yuhuili/EAGLE3-DeepSeek-R1-Distill-LLaMA-8B
# EAGLE Type  : EAGLE-3 (use_eagle3=True)
# Quantization: 8-bit (BitsAndBytes)
# Target GPU  : T4 (15GB VRAM)
# Est. VRAM   : ~9.5 GB base + ~0.5 GB EAGLE head + ~2-3 GB KV cache
#
# Dependencies:
#   - transformers==4.53.1
#   - bitsandbytes>=0.44.0
#   - accelerate==0.26.0
#   - huggingface_hub==0.23.0
#   - EAGLE repo: https://github.com/SafeAILab/EAGLE
#
# Notes:
#   - Requires HF_TOKEN secret in Colab for model access
#   - Monkey-patches transformers hub to fix additional_chat_templates 404 bug
#   - Qwen3 stub patch not needed (DeepSeek uses LLaMA architecture)
# =============================================================================

import os
import time
import torch
import gc
import subprocess
import sys
import shutil
import pandas as pd
import numpy as np
import IPython
from google.colab import userdata


# -----------------------------------------------------------------------------
# Utility: Free GPU memory between tests
# -----------------------------------------------------------------------------
def clear_vram():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    time.sleep(2)


# -----------------------------------------------------------------------------
# Header
# -----------------------------------------------------------------------------
print("=" * 70)
print("🚀 EAGLE BENCHMARK - DeepSeek-R1 8B | EAGLE-3 | 8-bit | #7")
print("=" * 70)


# -----------------------------------------------------------------------------
# Authentication
# -----------------------------------------------------------------------------
try:
    os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
    from huggingface_hub import login
    login(token=os.environ["HF_TOKEN"], add_to_git_credential=False)
except Exception:
    pass


# -----------------------------------------------------------------------------
# Environment Setup
# Uninstall and reinstall pinned versions for reproducibility.
# huggingface_hub==0.23.0 is required for HF_HUB_ENABLE_HF_TRANSFER compat.
# transformers==4.53.1 is required for EAGLE compatibility.
# -----------------------------------------------------------------------------
print("\n📦 Setting up environment...")
IPython.get_ipython().run_line_magic('pip', 'uninstall -y transformers bitsandbytes accelerate -q')
IPython.get_ipython().run_line_magic('pip', 'install transformers==4.53.1 bitsandbytes>=0.44.0 accelerate==0.26.0 -q')

# Force Python to reload the newly installed packages by purging cached modules
for mod in list(sys.modules.keys()):
    if 'transformers' in mod or 'huggingface_hub' in mod:
        del sys.modules[mod]

# Monkey-patch to fix additional_chat_templates 404 bug.
# transformers==4.53.1 calls list_repo_templates() which throws 404 for older
# model repos without an additional_chat_templates folder. Patching both the
# source module and the already-imported reference in tokenization_utils_base.
import transformers.utils.hub as _hub
import transformers.tokenization_utils_base as _tub
_hub.list_repo_templates = lambda repo_id, **kwargs: []
_tub.list_repo_templates = lambda repo_id, **kwargs: []


# -----------------------------------------------------------------------------
# Clone and Patch EAGLE Repository
# Always do a fresh clone to ensure patches apply cleanly.
# -----------------------------------------------------------------------------
if os.path.exists("EAGLE"):
    shutil.rmtree("EAGLE")
subprocess.run("git clone -q https://github.com/SafeAILab/EAGLE.git",
               shell=True, check=False)
sys.path.insert(0, os.path.abspath("EAGLE"))

# Patch 1: cnets.py - some EAGLE weight configs omit draft_vocab_size,
# falling back to the base model's vocab_size is safe and correct.
cnets = "EAGLE/eagle/model/cnets.py"
if os.path.exists(cnets):
    with open(cnets, 'r') as f:
        c = f.read()
    if "getattr(config, 'draft_vocab_size'" not in c:
        c = c.replace(
            "self.lm_head=nn.Linear(config.hidden_size,config.draft_vocab_size,bias=False)",
            "draft_vocab_size = getattr(config, 'draft_vocab_size', config.vocab_size)\n"
            "        self.lm_head=nn.Linear(config.hidden_size,draft_vocab_size,bias=False)"
        )
        with open(cnets, 'w') as f:
            f.write(c)

# Patch 2: Stub out Qwen3 KV module - it imports transformers internals
# that don't exist in 4.53.1. Not needed for LLaMA/DeepSeek models.
qwen3_kv = "EAGLE/eagle/model/modeling_qwen3_kv.py"
if os.path.exists(qwen3_kv):
    with open(qwen3_kv, 'w') as f:
        f.write("from torch import nn\nclass Qwen3ForCausalLM(nn.Module): pass\n")


# -----------------------------------------------------------------------------
# Imports
# -----------------------------------------------------------------------------
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, PreTrainedTokenizerFast
from eagle.model.ea_model import EaModel


# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
BASE_ID  = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
EAGLE_ID = "yuhuili/EAGLE3-DeepSeek-R1-Distill-LLaMA-8B"

# Longer prompt improves EAGLE acceptance rate (more context = better drafts)
PROMPT = (
    "Write a detailed explanation about how machine learning models work, "
    "including the training process, inference, and optimization techniques."
)

bnb_config = BitsAndBytesConfig(load_in_8bit=True)


# -----------------------------------------------------------------------------
# Tokenizer
# DeepSeek-R1-Distill uses LLaMA architecture, PreTrainedTokenizerFast works.
# -----------------------------------------------------------------------------
tokenizer = PreTrainedTokenizerFast.from_pretrained(BASE_ID, trust_remote_code=True)
messages = [{"role": "user", "content": PROMPT}]
chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(chat, return_tensors="pt").to("cuda")
input_len = inputs.input_ids.shape[1]

results = []


# =============================================================================
# TEST 1: BASELINE
# Standard autoregressive generation - no speculative decoding.
# =============================================================================
clear_vram()
print("\n⚙️  Testing [1/2]: Baseline...")

model = AutoModelForCausalLM.from_pretrained(
    BASE_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)
model.eval()

# Warmup: ensures CUDA kernels are compiled before timing
with torch.no_grad():
    for _ in range(2):
        _ = model.generate(inputs.input_ids, max_new_tokens=20, do_sample=False)
torch.cuda.synchronize()

# Timed run
torch.cuda.synchronize()
t1 = time.time()
with torch.no_grad():
    out = model.generate(inputs.input_ids, max_new_tokens=100, do_sample=False)
torch.cuda.synchronize()
elapsed = time.time() - t1

tokens = out.shape[1] - input_len
tps = tokens / elapsed
results.append({"Method": "Baseline", "TPS": tps, "Time": elapsed,
                "Tokens": tokens, "Input": input_len})
print(f"   Generated : {tokens} tokens in {elapsed:.2f}s = {tps:.2f} tok/s")
print(f"   VRAM used : {torch.cuda.memory_allocated()/1e9:.2f} GB")

del model
clear_vram()


# =============================================================================
# TEST 2: EAGLE-3
# Speculative decoding with EAGLE-3 multi-layer hidden state draft head.
# use_eagle3=True enables the EAGLE-3 draft mechanism.
# =============================================================================
print("\n🦅 Testing [2/2]: EAGLE-3...")

eagle = EaModel.from_pretrained(
    base_model_path=BASE_ID,
    ea_model_path=EAGLE_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_eagle3=True,
    low_cpu_mem_usage=True
)
eagle.eval()
print(f"   VRAM after load : {torch.cuda.memory_allocated()/1e9:.2f} GB")

# Warmup
with torch.no_grad():
    for _ in range(2):
        _ = eagle.eagenerate(inputs.input_ids, max_new_tokens=20)
torch.cuda.synchronize()

# Timed run
torch.cuda.synchronize()
t1 = time.time()
with torch.no_grad():
    out = eagle.eagenerate(inputs.input_ids, max_new_tokens=100, temperature=0.5)
torch.cuda.synchronize()
elapsed = time.time() - t1

tokens = out.shape[1] - input_len
tps = tokens / elapsed
results.append({"Method": "EAGLE-3", "TPS": tps, "Time": elapsed,
                "Tokens": tokens, "Input": input_len})
print(f"   Generated : {tokens} tokens in {elapsed:.2f}s = {tps:.2f} tok/s")

del eagle
clear_vram()


# =============================================================================
# RESULTS
# =============================================================================
df = pd.DataFrame(results)
df['Speedup'] = df['TPS'] / df.iloc[0]['TPS']

print("\n" + "=" * 70)
print("📊 BENCHMARK RESULTS (100 tokens, longer prompt)")
print("=" * 70)
print(df.to_string(index=False))
print("=" * 70)

baseline_tps = df.iloc[0]['TPS']
eagle_tps    = df.iloc[1]['TPS']
speedup      = df.iloc[1]['Speedup']
time_saved   = df.iloc[0]['Time'] - df.iloc[1]['Time']

print(f"\n🎯 Summary:")
print(f"   Baseline : {baseline_tps:.2f} tok/s")
print(f"   EAGLE-3  : {eagle_tps:.2f} tok/s")
print(f"   Speedup  : {speedup:.2f}x")
print(f"   Time saved: {time_saved:.2f}s")

if speedup >= 1.3:
    print("\n✅ EXCELLENT: EAGLE-3 provides significant speedup")
elif speedup >= 1.15:
    print("\n✓  GOOD: EAGLE-3 provides moderate speedup")
elif speedup >= 1.05:
    print("\n⚠️  MODEST: EAGLE-3 provides minimal speedup")
else:
    print("\n❌ ISSUE: EAGLE-3 is slower than baseline")
    print("   Possible causes:")
    print("   - Prompt too short (EAGLE needs longer context)")
    print("   - Generation too short (overhead dominates)")
    print("   - Quantization degrading draft quality")

print("=" * 70)


# =============================================================================
# DIVERGENCE ANALYSIS
# Derives Total Variation Distance (TVD) from the observed speedup.
#
# Theory:
#   In speculative decoding, if the draft model accepts tokens with
#   probability alpha, the expected accepted tokens per step follows:
#       tau = (1 - alpha^(gamma+1)) / (1 - alpha)
#   where gamma is the draft length (5 for EAGLE-3).
#   TVD is then: TVD = 1 - alpha
#   A lower TVD means the draft distribution closely matches the target.
# =============================================================================
gamma    = 5    # EAGLE-3 draft length (tokens proposed per step)
overhead = 0.1  # EAGLE head forward pass overhead (~10% of target model)
tau      = speedup * (1 + overhead)  # Estimated avg accepted tokens/step


def estimate_alpha(target_tau, g, tolerance=0.001):
    """Binary search for acceptance rate alpha given observed tau."""
    low, high = 0.0, 1.0
    for _ in range(20):
        mid = (low + high) / 2
        current_tau = (1 - mid**(g + 1)) / (1 - mid) if mid < 1.0 else g + 1
        if abs(current_tau - target_tau) < tolerance:
            return mid
        if current_tau < target_tau:
            low = mid
        else:
            high = mid
    return low


alpha = estimate_alpha(tau, gamma)
tvd   = 1.0 - alpha

print("\n📐 Divergence Analysis:")
print(f"   Avg Tokens Accepted/Step (τ) : {tau:.2f}")
print(f"   Token Acceptance Rate (α)    : {alpha*100:.1f}%")
print(f"   Total Variation Distance (TVD): {tvd:.4f}")

print("\n💡 Interpretation:")
if tvd < 0.2:
    print(f"   EXCELLENT ALIGNMENT (TVD={tvd:.2f}): Draft head closely mirrors")
    print(f"   the target distribution — high acceptance drives the speedup.")
elif tvd < 0.4:
    print(f"   MODERATE ALIGNMENT (TVD={tvd:.2f}): Draft accepts most tokens")
    print(f"   but diverges on less predictable outputs.")
else:
    print(f"   DIVERGENT (TVD={tvd:.2f}): Draft and target disagree frequently.")
    print(f"   Consider a longer prompt or more generation steps.")

print("=" * 70)

🚀 EAGLE BENCHMARK - DeepSeek-R1 8B | EAGLE-3 | 8-bit | #7


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.



📦 Setting up environment...
[0m

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LlamaTokenizerFast'. 
The class this function is called from is 'PreTrainedTokenizerFast'.



⚙️  Testing [1/2]: Baseline...


config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/7.39G [00:00<?, ?B/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for op

   Generated : 100 tokens in 15.55s = 6.43 tok/s
   VRAM used : 9.10 GB

🦅 Testing [2/2]: EAGLE-3...


LlamaForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
LlamaForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the

config.json:   0%|          | 0.00/591 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/850M [00:00<?, ?B/s]

   VRAM after load : 11.00 GB
   Generated : 103 tokens in 3.33s = 30.98 tok/s

📊 BENCHMARK RESULTS (100 tokens, longer prompt)
  Method       TPS      Time  Tokens  Input  Speedup
Baseline  6.430155 15.551724     100     27 1.000000
 EAGLE-3 30.976848  3.325064     103     27 4.817434

🎯 Summary:
   Baseline : 6.43 tok/s
   EAGLE-3  : 30.98 tok/s
   Speedup  : 4.82x
   Time saved: 12.23s

✅ EXCELLENT: EAGLE-3 provides significant speedup

📐 Divergence Analysis:
   Avg Tokens Accepted/Step (τ) : 5.30
   Token Acceptance Rate (α)    : 95.0%
   Total Variation Distance (TVD): 0.0499

💡 Interpretation:
   EXCELLENT ALIGNMENT (TVD=0.05): Draft head closely mirrors
   the target distribution — high acceptance drives the speedup.
