In [4]:


# File        : bench_llama3_8b_e1_3.ipynb
# Base Model  : meta-llama/Meta-Llama-3-8B-Instruct
# EAGLE Model : yuhuili/EAGLE-LLaMA3-Instruct-8B
# EAGLE Type  : EAGLE (use_eagle3=False)
# Quantization: 8-bit (BitsAndBytes)
# Target GPU  : T4 (15GB VRAM)
# Est. VRAM   :
import os
import time
import torch
import gc
import subprocess
import sys
import pandas as pd
from google.colab import userdata

def clear_vram():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    time.sleep(2)

print("=" * 70)
print("üöÄ EAGLE BENCHMARK - LLaMA3 8B | EAGLE-1 | 8-bit | #8")
print("=" * 70)

try:
    os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
    from huggingface_hub import login
    login(token=os.environ["HF_TOKEN"], add_to_git_credential=False)
except: pass

print("\nüì¶ Setting up environment...")
subprocess.run("pip uninstall -y transformers bitsandbytes accelerate -q", shell=True, check=False)
#subprocess.run("pip install transformers==4.53.1 bitsandbytes>=0.44.0 accelerate==0.26.0 -q", shell=True, check=False)

#### ADDED BY SHASVAT (CLAUDE)
subprocess.run("pip install transformers==4.53.1 bitsandbytes>=0.44.0 accelerate==0.26.0 huggingface_hub>=0.25.0 -q", shell=True, check=False)
# Force-reload huggingface_hub so the new version (or patches) take effect
import importlib
import huggingface_hub
import huggingface_hub.constants
importlib.reload(huggingface_hub.constants)
importlib.reload(huggingface_hub)

# Fallback: if the installed version STILL doesn't have it, set it manually
if not hasattr(huggingface_hub.constants, 'HF_HUB_ENABLE_HF_TRANSFER'):
    huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER = False

###########


if not os.path.exists("EAGLE"):
    subprocess.run("git clone -q https://github.com/SafeAILab/EAGLE.git", shell=True, check=False)
sys.path.insert(0, os.path.abspath("EAGLE"))

# Patch cnets
cnets = "EAGLE/eagle/model/cnets.py"
if os.path.exists(cnets):
    with open(cnets, 'r') as f: c = f.read()
    if "getattr(config, 'draft_vocab_size'" not in c:
        c = c.replace("self.lm_head=nn.Linear(config.hidden_size,config.draft_vocab_size,bias=False)", "draft_vocab_size = getattr(config, 'draft_vocab_size', config.vocab_size)\n        self.lm_head=nn.Linear(config.hidden_size,draft_vocab_size,bias=False)")
        with open(cnets, 'w') as f: f.write(c)

# Monkey-patch to fix additional_chat_templates 404 bug.
# transformers==4.53.1 calls list_repo_templates() which throws 404 for older
# model repos without an additional_chat_templates folder. Patching both the
# source module and the already-imported reference in tokenization_utils_base.
import transformers.utils.hub as _hub
import transformers.tokenization_utils_base as _tub
_hub.list_repo_templates = lambda repo_id, **kwargs: []
_tub.list_repo_templates = lambda repo_id, **kwargs: []

from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from eagle.model.ea_model import EaModel


BASE_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
EAGLE_ID = "yuhuili/EAGLE-LLaMA3-Instruct-8B"

# Use longer prompt for better EAGLE performance
PROMPT = "Write a detailed explanation about how machine learning models work, including the training process, inference, and optimization techniques."

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True
)

tokenizer = AutoTokenizer.from_pretrained(BASE_ID, trust_remote_code=True)
messages = [{"role": "user", "content": PROMPT}]
chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(chat, return_tensors="pt").to("cuda")
input_len = inputs.input_ids.shape[1]

results = []

# =============================================================================
# TEST 1: BASELINE
# =============================================================================
clear_vram()
print("\n‚öôÔ∏è  Testing [1/2]: Baseline...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)
model.eval()

# Proper warmup
with torch.no_grad():
    for _ in range(2):
        _ = model.generate(inputs.input_ids, max_new_tokens=20, do_sample=False)
torch.cuda.synchronize()

# Benchmark with EXACTLY 100 tokens
torch.cuda.synchronize()
t1 = time.time()
with torch.no_grad():
    out = model.generate(inputs.input_ids, max_new_tokens=100, do_sample=False)
torch.cuda.synchronize()
elapsed = time.time() - t1

tokens = out.shape[1] - input_len
tps = tokens / elapsed
results.append({
    "Method": "Baseline",
    "TPS": tps,
    "Time": elapsed,
    "Tokens": tokens,
    "Input": input_len
})
print(f"   Generated: {tokens} tokens in {elapsed:.2f}s = {tps:.2f} tok/s")

del model
clear_vram()

# =============================================================================
# TEST 2: EAGLE
# =============================================================================
print("\nü¶Ö Testing [2/2]: EAGLE...")
eagle = EaModel.from_pretrained(
    base_model_path=BASE_ID,
    ea_model_path=EAGLE_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_eagle3=False,
    low_cpu_mem_usage=True
)
eagle.eval()

# Proper warmup
with torch.no_grad():
    for _ in range(2):
        _ = eagle.eagenerate(inputs.input_ids, max_new_tokens=20)
torch.cuda.synchronize()

# Benchmark with EXACTLY 100 tokens
torch.cuda.synchronize()
t1 = time.time()
with torch.no_grad():
    out = eagle.eagenerate(inputs.input_ids, max_new_tokens=100, temperature=0.5)
torch.cuda.synchronize()
elapsed = time.time() - t1

tokens = out.shape[1] - input_len
tps = tokens / elapsed
results.append({
    "Method": "EAGLE",
    "TPS": tps,
    "Time": elapsed,
    "Tokens": tokens,
    "Input": input_len
})
print(f"   Generated: {tokens} tokens in {elapsed:.2f}s = {tps:.2f} tok/s")

del eagle
clear_vram()

# =============================================================================
# RESULTS
# =============================================================================
df = pd.DataFrame(results)
df['Speedup'] = df['TPS'] / df.iloc[0]['TPS']

print("\n" + "="*70)
print("üìä BENCHMARK RESULTS (100 tokens, longer prompt)")
print("="*70)
print(df.to_string(index=False))
print("="*70)

baseline_tps = df.iloc[0]['TPS']
eagle_tps = df.iloc[1]['TPS']
speedup = df.iloc[1]['Speedup']
time_saved = df.iloc[0]['Time'] - df.iloc[1]['Time']

print(f"\nüéØ Results:")
print(f"   Baseline: {baseline_tps:.2f} tok/s")
print(f"   EAGLE:    {eagle_tps:.2f} tok/s")
print(f"   Speedup:  {speedup:.2f}x")
print(f"   Time saved: {time_saved:.2f}s")

if speedup >= 1.3:
    print("\n‚úÖ EXCELLENT: EAGLE provides significant speedup")
elif speedup >= 1.15:
    print("\n‚úì GOOD: EAGLE provides moderate speedup")
elif speedup >= 1.05:
    print("\n‚ö†Ô∏è MODEST: EAGLE provides minimal speedup")
else:
    print("\n‚ùå ISSUE: EAGLE is slower than baseline")
    print("   Possible causes:")
    print("   - Prompt too short (EAGLE needs longer context)")
    print("   - Generation too short (overhead dominates)")
    print("   - Quantization degrading draft quality too much")

print("="*70)


# =============================================================================
# DIVERGENCE ANALYSIS
# Derives Total Variation Distance (TVD) from the observed speedup.
#
# Theory:
#   In speculative decoding, if the draft model accepts tokens with
#   probability alpha, the expected accepted tokens per step follows:
#       tau = (1 - alpha^(gamma+1)) / (1 - alpha)
#   where gamma is the draft length (5 for EAGLE-1).
#   TVD is then: TVD = 1 - alpha
#   A lower TVD means the draft distribution closely matches the target.
# =============================================================================
gamma    = 5    # EAGLE-1 draft length (tokens proposed per step)
overhead = 0.1  # EAGLE head forward pass overhead (~10% of target model)
tau      = speedup * (1 + overhead)  # Estimated avg accepted tokens/step


def estimate_alpha(target_tau, g, tolerance=0.001):
    """Binary search for acceptance rate alpha given observed tau."""
    low, high = 0.0, 1.0
    for _ in range(20):
        mid = (low + high) / 2
        current_tau = (1 - mid**(g + 1)) / (1 - mid) if mid < 1.0 else g + 1
        if abs(current_tau - target_tau) < tolerance:
            return mid
        if current_tau < target_tau:
            low = mid
        else:
            high = mid
    return low


alpha = estimate_alpha(tau, gamma)
tvd   = 1.0 - alpha

print("\nüìê Divergence Analysis:")
print(f"   Avg Tokens Accepted/Step (œÑ) : {tau:.2f}")
print(f"   Token Acceptance Rate (Œ±)    : {alpha*100:.1f}%")
print(f"   Total Variation Distance (TVD): {tvd:.4f}")

print("\nüí° Interpretation:")
if tvd < 0.2:
    print(f"   EXCELLENT ALIGNMENT (TVD={tvd:.2f}): Draft head closely mirrors")
    print(f"   the target distribution ‚Äî high acceptance drives the speedup.")
elif tvd < 0.4:
    print(f"   MODERATE ALIGNMENT (TVD={tvd:.2f}): Draft accepts most tokens")
    print(f"   but diverges on less predictable outputs.")
else:
    print(f"   DIVERGENT (TVD={tvd:.2f}): Draft and target disagree frequently.")
    print(f"   Consider a longer prompt or more generation steps.")

print("="*70)


üöÄ EAGLE BENCHMARK - LLaMA3 8B | EAGLE-1 | 8-bit | #8

üì¶ Setting up environment...

‚öôÔ∏è  Testing [1/2]: Baseline...


  `use_auth_token` will definitely not be supported.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for op

   Generated: 100 tokens in 16.18s = 6.18 tok/s

ü¶Ö Testing [2/2]: EAGLE...


LlamaForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From üëâv4.50üëà onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct and are newly initialized: ['model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.19.self_attn.rotary_emb.inv_freq', 'model.layers.2.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_a

config.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.55G [00:00<?, ?B/s]

   Generated: 103 tokens in 8.58s = 12.00 tok/s

üìä BENCHMARK RESULTS (100 tokens, longer prompt)
  Method       TPS      Time  Tokens  Input  Speedup
Baseline  6.181451 16.177431     100     33 1.000000
   EAGLE 12.001922  8.581959     103     33 1.941603

üéØ Results:
   Baseline: 6.18 tok/s
   EAGLE:    12.00 tok/s
   Speedup:  1.94x
   Time saved: 7.60s

‚úÖ EXCELLENT: EAGLE provides significant speedup

üìê Divergence Analysis:
   Avg Tokens Accepted/Step (œÑ) : 2.14
   Token Acceptance Rate (Œ±)    : 54.4%
   Total Variation Distance (TVD): 0.4561

üí° Interpretation:
   DIVERGENT (TVD=0.46): Draft and target disagree frequently.
   Consider a longer prompt or more generation steps.
