# Speculative Decoding Baseline Testing

This notebook provides an interactive interface for running speculative decoding benchmarks across multiple model configurations and datasets.

## Setup

First, ensure all dependencies are installed.

In [1]:
# Install dependencies (run once)
!pip install torch transformers accelerate datasets numpy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


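## Sync Files to Colab (Automatic Download or Manual Upload)

If you're running this notebook in Google Colab, `test_speculative_decoding.py` must be present in the working directory. The next cell downloads it from GitHub when it is missing; if the download fails, upload the file manually through Colab's file browser. Outside Colab, the local copy is used.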

In [2]:
# Sync test_speculative_decoding.py (for Colab users)
import os

# Detect Colab by probing for its runtime-only package.
try:
    import google.colab  # imported only as an environment probe
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB and not os.path.exists('test_speculative_decoding.py'):
    import urllib.request
    github_url = "https://raw.githubusercontent.com/tsurbs/SpecDec/main/test_speculative_decoding.py"
    # Download under a temporary name first: an interrupted transfer must not
    # leave a truncated module behind that the existence check above would
    # accept on the next run.
    partial_path = 'test_speculative_decoding.py.part'
    try:
        urllib.request.urlretrieve(github_url, partial_path)
        os.replace(partial_path, 'test_speculative_decoding.py')
        print("✓ Downloaded test_speculative_decoding.py from GitHub")
    except Exception as e:
        # Best-effort: discard any partial download, report the failure and
        # fall back to a manual upload.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"✗ Download failed: {e}")
        print("Manually upload the file via Colab's file browser")
elif IN_COLAB:
    print("✓ test_speculative_decoding.py already exists")
else:
    print("✓ Using local files")

✓ Using local files


In [3]:
# Import required modules
import sys
import json
import torch
import warnings
# Silence every Python warning for the rest of the session: no category or
# module filter is given, so this keeps cell output short but also hides
# deprecation and numerical warnings raised through the `warnings` module.
warnings.filterwarnings('ignore')

# Import testing framework
from test_speculative_decoding import (
    SpeculativeDecodingTester,
    load_pile_samples,
    load_stack_samples,
    generate_latex_table
)

# Report the PyTorch build and, when a CUDA device is visible, its name.
print("PyTorch version: {}".format(torch.__version__))
print("CUDA available: {}".format(torch.cuda.is_available()))
if torch.cuda.is_available():
    print("CUDA device: {}".format(torch.cuda.get_device_name(0)))

PyTorch version: 2.9.1
CUDA available: False


## Configuration

In [4]:
# Model Configurations
# One (family name, verifier checkpoint, draft checkpoint) triple per model
# family; the dict below preserves this order.
_MODEL_PAIRS = (
    ('GPT-2', 'gpt2-large', 'distilgpt2'),
    ('Qwen', 'Qwen/Qwen2.5-7B', 'Qwen/Qwen2.5-0.5B'),
    ('Pythia', 'EleutherAI/pythia-12b', 'EleutherAI/pythia-70m'),
)
MODEL_CONFIGS = {
    family: {'verifier': verifier_ckpt, 'draft': draft_ckpt}
    for family, verifier_ckpt, draft_ckpt in _MODEL_PAIRS
}

# Test Parameters
# Generation settings and per-source sample counts shared by every run.
TEST_PARAMS = dict(
    max_new_tokens=100,
    gamma=5,
    num_nl_samples=3,
    num_code_samples=2,
)

CODE_LANGUAGES = ['python', 'c', 'go', 'rust']

print("Models: {}, Gamma: {}, Languages: {}".format(
    list(MODEL_CONFIGS), TEST_PARAMS['gamma'], CODE_LANGUAGES
))

Models: ['GPT-2', 'Qwen', 'Pythia'], Gamma: 5, Languages: ['python', 'c', 'go', 'rust']


## Load Test Data

In [5]:
# Load Natural Language samples from The Pile
# The number of samples requested comes from the shared test parameters.
num_nl_requested = TEST_PARAMS['num_nl_samples']
nl_prompts = load_pile_samples(num_nl_requested)
print("Loaded {} NL samples".format(len(nl_prompts)))

Loading samples from The Pile...
Error loading The Pile: Compression type zstd not supported
Using fallback NL prompts...
Loaded 3 NL samples
Error loading The Pile: Compression type zstd not supported
Using fallback NL prompts...
Loaded 3 NL samples


In [6]:
# Load Code samples from The Stack
# The per-call sample count comes from the shared test parameters.
num_code_requested = TEST_PARAMS['num_code_samples']
code_prompts = load_stack_samples(CODE_LANGUAGES, num_code_requested)
print("Loaded {} code samples".format(len(code_prompts)))

Loading code samples from The Stack for languages: ['python', 'c', 'go', 'rust']...
  Error loading python: Dataset 'bigcode/the-stack-dedup' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/bigcode/the-stack-dedup to ask for access.
  Using fallback prompts for python
  Error loading c: Dataset 'bigcode/the-stack-dedup' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/bigcode/the-stack-dedup to ask for access.
  Using fallback prompts for c
  Error loading go: Dataset 'bigcode/the-stack-dedup' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/bigcode/the-stack-dedup to ask for access.
  Using fallback prompts for go
  Error loading rust: Dataset 'bigcode/the-stack-dedup' is a gated dataset on the Hub. Visit the dataset page at https://huggingface.co/datasets/bigcode/the-stack-dedup to ask for access.
  Using fallback prompts for rust
Total code samples loaded: 4
Loaded 4 code samples

In [7]:
# Combine all prompts (natural-language prompts first, then code prompts)
all_prompts = nl_prompts + code_prompts

# A later cell indexes all_prompts[0], so fail here with a clear message
# instead of an IndexError further down if nothing was loaded.
assert all_prompts, "No prompts loaded; check the data-loading cells above"

print(f"Total: {len(all_prompts)} prompts ({len(nl_prompts)} NL + {len(code_prompts)} Code)")

Total: 7 prompts (3 NL + 4 Code)


## Test on Individual Model

In [8]:
# Select model to test
MODEL_TO_TEST = 'GPT-2'  # Options: 'GPT-2', 'Qwen', 'Pythia'

# Look up the verifier/draft checkpoint pair for the chosen family.
config = MODEL_CONFIGS[MODEL_TO_TEST]
print("Testing {}: {} + {}".format(MODEL_TO_TEST, config['verifier'], config['draft']))

Testing GPT-2: gpt2-large + distilgpt2


In [9]:
# Initialize tester
# Builds a SpeculativeDecodingTester for the verifier/draft checkpoint pair
# held in `config`. The recorded output shows "Loading models..." during
# construction, so this cell may take a while for large checkpoints.
tester = SpeculativeDecodingTester(
    verifier_checkpoint=config['verifier'],
    draft_checkpoint=config['draft']
)

Using device: cpu
Loading models...
  Verifier: gpt2-large
  Draft: distilgpt2


`torch_dtype` is deprecated! Use `dtype` instead!
Some parameters are on the meta device because they were offloaded to the disk.
Some parameters are on the meta device because they were offloaded to the disk.


Models loaded successfully!



In [10]:
# Run quick validation test
# Smoke-test a single generation on the first prompt before the full benchmark,
# with a 50-token budget instead of TEST_PARAMS['max_new_tokens'].
# NOTE(review): the recorded output of this cell ends in
# "RuntimeError: Placeholder storage has not been allocated on MPS device!"
# although the tester reported "Using device: cpu" - presumably a device
# mismatch inside test_speculative_decoding.py; confirm and fix it there before
# relying on the cells below.
result = tester.run_single_test(
    prompt=all_prompts[0]['text'],
    max_new_tokens=50,
    gamma=TEST_PARAMS['gamma'],
    verbose=True
)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [None]:
# Run full benchmark on selected model
print("Running full benchmark on {}...".format(MODEL_TO_TEST))

# Every prompt is run with the generation settings from the configuration cell.
benchmark_settings = {
    'prompts': all_prompts,
    'max_new_tokens': TEST_PARAMS['max_new_tokens'],
    'gamma': TEST_PARAMS['gamma'],
}
results = tester.run_benchmark_suite(**benchmark_settings)

In [None]:
# Display results summary
banner = '=' * 80
print("\n" + banner)
print("RESULTS SUMMARY FOR {}".format(MODEL_TO_TEST))
print(banner + "\n")

# One indented stanza per prompt type found in the benchmark summary.
for prompt_type, stats in results['summary'].items():
    print("{}:".format(prompt_type))
    print("  Acceptance Rate: {:.2%} ± {:.2%}".format(stats['avg_acceptance_rate'], stats['std_acceptance_rate']))
    print("  Speedup: {:.2f}x ± {:.2f}x".format(stats['avg_speedup'], stats['std_speedup']))
    print("  Baseline Time: {:.3f}s".format(stats['avg_baseline_time']))
    print("  Speculative Time: {:.3f}s".format(stats['avg_speculative_time']))
    print("  Samples: {}".format(stats['num_samples']))
    print()

In [None]:
# Save results for this model
# The file name is derived from the lower-cased model family name.
output_file = "results_{}.json".format(MODEL_TO_TEST.lower())

def make_serializable(obj):
    """Recursively convert ``obj`` into plain Python types that ``json`` can encode.

    NumPy scalars become ``float``/``int``/``bool``, NumPy arrays become nested
    lists, and dicts, lists and tuples are converted element by element. Tuples
    come back as lists, which is how JSON stores them anyway. Dict keys are left
    untouched. Any other object is returned unchanged.
    """
    import numpy as np
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.floating):
        # np.floating already covers float32/float64 and the other widths.
        return float(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.ndarray):
        # tolist() yields nested Python containers; recurse in case of object arrays.
        return make_serializable(obj.tolist())
    if isinstance(obj, dict):
        return {k: make_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [make_serializable(i) for i in obj]
    return obj

# Serialize before opening the file: opening with 'w' truncates it, so a
# failure during conversion or encoding would otherwise leave an empty or
# partial results file behind.
results_json = json.dumps(make_serializable(results), indent=2)
with open(output_file, 'w') as f:
    f.write(results_json)

print(f"Results saved to: {output_file}")

## Run All Model Configurations

In [None]:
# Benchmark every configured verifier/draft pair and collect the results by
# model family name. A failing configuration is reported and skipped.
all_results = {}

for model_name, model_config in MODEL_CONFIGS.items():
    print(f"\n{'='*60}\nTesting {model_name}: {model_config['verifier']} + {model_config['draft']}\n{'='*60}")

    # Loop-local names keep this cell from overwriting (or deleting) the
    # `config`, `tester` and `results` variables of the single-model section.
    model_tester = None
    try:
        model_tester = SpeculativeDecodingTester(
            verifier_checkpoint=model_config['verifier'],
            draft_checkpoint=model_config['draft']
        )

        model_results = model_tester.run_benchmark_suite(
            prompts=all_prompts,
            max_new_tokens=TEST_PARAMS['max_new_tokens'],
            gamma=TEST_PARAMS['gamma']
        )

        all_results[model_name] = model_results

        # Print summary
        for ptype, metrics in model_results['summary'].items():
            print(f"{ptype}: Acc={metrics['avg_acceptance_rate']:.1%}, Speedup={metrics['avg_speedup']:.2f}x")

        print(f"✓ {model_name} complete")

    except Exception as e:
        # Best-effort sweep: report the failure and move on to the next model.
        print(f"✗ Error: {e}")
    finally:
        # Clean up even when the run failed, so a broken configuration does not
        # keep its models referenced while the next pair is loaded.
        model_tester = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print(f"\n{'='*60}\n✓ All tests complete\n{'='*60}")

## Results Analysis

In [None]:
# Create comprehensive comparison table
heavy_rule = "=" * 100
light_rule = "-" * 100

print("\n" + heavy_rule)
print("RESULTS")
print(heavy_rule + "\n")

print("{:<20} {:<15} {:<20} {:<20} {:<10}".format('Model', 'Type', 'Acc. Rate', 'Speedup', 'Samples'))
print(light_rule)


def _nl_first(prompt_type):
    """Sort key that puts 'NL' ahead of every other prompt type, then alphabetical."""
    return (prompt_type != 'NL', prompt_type)


for model_name, model_data in all_results.items():
    summary = model_data.get('summary', {})

    # The model name is shown only on the first row of each group.
    is_first_row = True
    for ptype in sorted(summary, key=_nl_first):
        metrics = summary[ptype]

        model_col = model_name if is_first_row else ""
        is_first_row = False

        acc_rate = "{:.1f}% ± {:.1f}%".format(
            metrics['avg_acceptance_rate'] * 100, metrics['std_acceptance_rate'] * 100
        )
        speedup = "{:.2f}x ± {:.2f}x".format(metrics['avg_speedup'], metrics['std_speedup'])
        samples = metrics['num_samples']

        print(f"{model_col:<20} {ptype:<15} {acc_rate:<20} {speedup:<20} {samples:<10}")

    print(light_rule)

In [None]:
# Compare models by prompt type
section_rule = "=" * 100
print("\n" + section_rule)
print("MODEL COMPARISON BY PROMPT TYPE")
print(section_rule)

# Union of every prompt type reported by any model
all_types = {
    prompt_type
    for per_model in all_results.values()
    for prompt_type in per_model.get('summary', {})
}

# 'NL' is listed first; the remaining prompt types follow alphabetically.
for ptype in sorted(all_types, key=lambda t: (t != 'NL', t)):
    print(f"\n{ptype}:")
    print("  {:<15} {:<20} {:<20}".format('Model', 'Acc. Rate', 'Speedup'))
    print("  " + "-" * 60)

    for model_name, model_data in all_results.items():
        summary = model_data.get('summary', {})
        if ptype not in summary:
            # This model produced no results for the prompt type.
            continue
        metrics = summary[ptype]
        acc_rate = "{:.1f}% ± {:.1f}%".format(
            metrics['avg_acceptance_rate'] * 100, metrics['std_acceptance_rate'] * 100
        )
        speedup = "{:.2f}x ± {:.2f}x".format(metrics['avg_speedup'], metrics['std_speedup'])
        print(f"  {model_name:<15} {acc_rate:<20} {speedup:<20}")

In [None]:
# Save all results
# Serialize before opening the file: opening with 'w' truncates it, so a
# failure during conversion or encoding would otherwise leave an empty or
# partial JSON file behind.
all_results_json = json.dumps(make_serializable(all_results), indent=2)
with open("baseline_results_all.json", 'w') as f:
    f.write(all_results_json)
print("✓ Results saved to baseline_results_all.json")

## Generate LaTeX Table

In [None]:
# Generate LaTeX table
def _latex_escape(text):
    """Backslash-escape &, %, #, _ and $ so ``text`` is safe inside a table cell."""
    escaped = str(text)
    for special in ('&', '%', '#', '_', '$'):
        escaped = escaped.replace(special, '\\' + special)
    return escaped


# Header uses "Acc. Rate", matching the plain-text tables printed earlier.
latex_lines = ["\\begin{tabular}{|l|l|c|c|}", "\\hline", "Model & Completion & Acc. Rate & Speedup \\\\", "\\hline"]

for model_name, model_data in all_results.items():
    summary = model_data.get('summary', {})
    # Keep only the final path component of each checkpoint id (drops the org prefix).
    verifier = model_data['verifier_model'].split('/')[-1]
    draft = model_data['draft_model'].split('/')[-1]
    # 'NL' first, then the remaining prompt types alphabetically
    sorted_types = sorted(summary.keys(), key=lambda x: (0 if x == 'NL' else 1, x))

    for i, ptype in enumerate(sorted_types):
        metrics = summary[ptype]
        # The model label appears only on the first row of its group.
        if i == 0:
            model_col = f"{_latex_escape(model_name)} ({_latex_escape(verifier)}+{_latex_escape(draft)})"
        else:
            model_col = ""
        acc_rate = f"{metrics['avg_acceptance_rate']*100:.1f}\\%"
        speedup = f"{metrics['avg_speedup']:.2f}x"
        latex_lines.append(f"{model_col} & {_latex_escape(ptype)} & {acc_rate} & {speedup} \\\\")

    latex_lines.append("\\hline")

latex_lines.append("\\end{tabular}")
latex_table = "\n".join(latex_lines)

print(latex_table)

with open("baseline_latex_table.tex", 'w') as f:
    f.write(latex_table)
print("\n✓ LaTeX table saved to baseline_latex_table.tex")