# EdgeLLM Definitive Benchmark Suite v1.0

## FROZEN BENCHMARK - DO NOT MODIFY METHODOLOGY

This notebook provides reproducible, statistically valid benchmarks following:
- [MLPerf Inference](https://mlcommons.org/2024/03/mlperf-inference-v4/) methodology
- [NVIDIA NVBench](https://github.com/NVIDIA/nvbench) best practices
- [FlashAttention-3](https://arxiv.org/html/2407.08608v2) benchmark standards

### Benchmark Configuration (FROZEN)
```
WARMUP_ITERATIONS = 50      # Stabilize GPU temperature/clocks
BENCHMARK_ITERATIONS = 200  # Statistical significance
L2_CACHE_FLUSH = True       # Prevent cache inflation
USE_CUDA_EVENTS = True      # GPU-side timing (not CPU)
STATISTICS = [median, mean, std, p50, p95, p99, min, max, iqr]
```

### What This Benchmark Measures
| Test | Description | Unit |
|------|-------------|------|
| Ollama Full Inference | Complete token generation pipeline | tok/s |
| EdgeLLM FA2 Attention | FlashAttention-2 kernel only | tok/s (estimated from layer time) |
| EdgeLLM T-MAC MatMul | BitNet quantized matrix multiply (listed for completeness; no T-MAC benchmark cell is included in this notebook) | tok/s (estimated from layer time) |

### IMPORTANT: Baseline References
Once baselines are recorded, they are FROZEN. New optimizations are compared against these fixed values.

**Version History:**
- v1.0 (Jan 12, 2026): Initial frozen benchmark

In [None]:
# ============================================================================
# FROZEN CONFIGURATION - DO NOT MODIFY
# ============================================================================

# Single source of truth for the benchmark: iteration counts, declared timing
# methodology, reported statistics, model shapes, prompts and cache lengths.
BENCHMARK_CONFIG = {
    "version": "1.0",
    "created": "2026-01-12",

    # Iteration counts (from MLPerf + NVIDIA best practices)
    "warmup_iterations": 50,      # GPU temperature/clock stabilization
    "benchmark_iterations": 200,  # Statistical significance

    # Timing methodology (declarative flags)
    # NOTE(review): confirm that the benchmark functions actually consult
    # these flags; they describe the intended methodology.
    "use_cuda_events": True,      # GPU-side timing, not CPU wall clock
    "flush_l2_cache": True,       # Prevent cache effects between runs
    "sync_before_timing": True,   # Ensure previous work complete

    # Statistical analysis
    "report_metrics": ["median", "mean", "std", "p50", "p95", "p99", "min", "max", "iqr"],
    "outlier_method": "iqr",      # Use IQR to identify outliers

    # Model configurations to test
    "models": {
        "smollm_135m": {
            "ollama_name": "smollm:135m",
            # SmolLM-135M has 30 transformer layers (9 is its attention-head
            # count). The per-token estimate divides by num_layers, so an
            # undercount here inflates the estimated throughput.
            "num_layers": 30,
            "num_heads": 9,
            "head_dim": 64,
            "hidden_dim": 576,
            "intermediate_dim": 1536,
        },
        "qwen_05b": {
            "ollama_name": "qwen2.5:0.5b",
            "num_layers": 24,
            "num_heads": 14,
            "head_dim": 64,
            "hidden_dim": 896,
            "intermediate_dim": 4864,
        },
        "qwen_15b": {
            "ollama_name": "qwen2.5:1.5b",
            "num_layers": 28,
            "num_heads": 12,
            "head_dim": 128,
            "hidden_dim": 1536,
            "intermediate_dim": 8960,
        },
    },

    # Test prompts (consistent across all tests)
    "test_prompts": [
        "Hello",
        "What is 2+2?",
        "Explain machine learning briefly.",
        "Write a haiku about coding.",
    ],

    # Sequence lengths for kernel benchmarks
    "cache_lengths": [64, 128, 256, 512, 1024],
}

# Echo the key settings so every run log shows which configuration was used.
print(f"Benchmark Configuration v{BENCHMARK_CONFIG['version']}")
print(f"Warmup: {BENCHMARK_CONFIG['warmup_iterations']} iterations")
print(f"Benchmark: {BENCHMARK_CONFIG['benchmark_iterations']} iterations")
print(f"Models: {list(BENCHMARK_CONFIG['models'].keys())}")

In [None]:
# ============================================================================
# ENVIRONMENT DETECTION AND SETUP
# ============================================================================

import subprocess
import os
import json
import time
import statistics
from datetime import datetime

def get_gpu_info():
    """Detect the first GPU reported by ``nvidia-smi`` for reproducibility.

    Returns:
        dict with string values under ``gpu_name``, ``gpu_memory``,
        ``driver_version`` and ``compute_capability``. Fields missing from
        the tool's output are reported as ``"Unknown"``. When ``nvidia-smi``
        is not installed, cannot be executed, exits non-zero or prints
        nothing, the "No GPU" record is returned instead of raising.
    """
    no_gpu = {
        "gpu_name": "No GPU",
        "gpu_memory": "0",
        "driver_version": "N/A",
        "compute_capability": "N/A",
    }
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=name,memory.total,driver_version,compute_cap',
             '--format=csv,noheader'],
            capture_output=True, text=True,
        )
    except OSError:
        # nvidia-smi missing or not executable: treat as "no GPU" rather than
        # aborting the whole notebook cell.
        return no_gpu
    if result.returncode != 0:
        return no_gpu

    # One CSV row is printed per GPU; describe the first one only so that
    # multi-GPU hosts do not leak the second row into the last field.
    rows = [row.strip() for row in result.stdout.splitlines() if row.strip()]
    if not rows:
        return no_gpu
    parts = [field.strip() for field in rows[0].split(',')]
    parts += ["Unknown"] * (4 - len(parts))
    return {
        "gpu_name": parts[0] or "Unknown",
        "gpu_memory": parts[1] or "Unknown",
        "driver_version": parts[2] or "Unknown",
        "compute_capability": parts[3] or "Unknown",
    }

def get_cuda_version():
    """Return the CUDA toolkit release reported by ``nvcc --version``.

    Returns:
        The text between ``release`` and the following comma on the first
        output line containing ``release``, or ``"Unknown"`` when ``nvcc`` is
        not installed, exits non-zero, or prints no such line.
    """
    try:
        result = subprocess.run(['nvcc', '--version'], capture_output=True, text=True)
    except OSError:
        # nvcc not on PATH: report "Unknown" instead of raising.
        return "Unknown"
    if result.returncode == 0:
        for line in result.stdout.split('\n'):
            if 'release' in line:
                return line.split('release')[-1].split(',')[0].strip()
    return "Unknown"

# Snapshot of the execution environment; stored with the results so a
# baseline can always be traced back to the machine that produced it.
ENV_INFO = {"timestamp": datetime.now().isoformat()}

# The hosting platform is inferred from well-known mount points.
if os.path.exists("/kaggle"):
    ENV_INFO["platform"] = "Kaggle"
elif os.path.exists("/content"):
    ENV_INFO["platform"] = "Colab"
else:
    ENV_INFO["platform"] = "Local"

ENV_INFO["gpu"] = get_gpu_info()
ENV_INFO["cuda_version"] = get_cuda_version()

# Banner followed by the record as pretty-printed JSON.
for banner_line in ("=" * 60, "ENVIRONMENT RECORD", "=" * 60):
    print(banner_line)
print(json.dumps(ENV_INFO, indent=2))

In [None]:
# ============================================================================
# STATISTICAL ANALYSIS FUNCTIONS
# ============================================================================

import numpy as np

def compute_statistics(values, name="metric"):
    """
    Compute summary statistics for a sample after IQR outlier filtering.

    Args:
        values: Numeric samples as a list, tuple, NumPy array or any other
            iterable (generators are consumed). ``None`` or an empty input
            yields an error record.
        name: Label stored under the ``"name"`` key of the result.

    Returns:
        ``{"error": "No values provided"}`` for empty input; otherwise a dict
        with ``name``, ``n_samples``, ``n_valid``, ``outliers_removed``,
        ``median``, ``mean``, ``std``, ``p50``, ``p95``, ``p99``, ``min``,
        ``max`` and ``iqr``. ``min``/``max``/``iqr`` are computed on the
        original samples; every other statistic uses only the samples inside
        ``[q1 - 1.5*iqr, q3 + 1.5*iqr]``.
    """
    if values is None:
        return {"error": "No values provided"}

    # Testing truthiness of a NumPy array is ambiguous and raises, so size is
    # checked on the converted array. Non-array iterables are materialised
    # first so generators work as well.
    if not isinstance(values, np.ndarray):
        values = list(values)
    arr = np.asarray(values, dtype=float).ravel()
    if arr.size == 0:
        return {"error": "No values provided"}

    # Remove outliers using the IQR (Tukey fence) method.
    q1, q3 = np.percentile(arr, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    filtered = arr[(arr >= lower_bound) & (arr <= upper_bound)]
    outliers_removed = int(arr.size - filtered.size)

    # Use filtered values for statistics (except min/max which use original)
    return {
        "name": name,
        "n_samples": int(arr.size),
        "n_valid": int(filtered.size),
        "outliers_removed": outliers_removed,
        "median": float(np.median(filtered)),
        "mean": float(np.mean(filtered)),
        "std": float(np.std(filtered)),  # population std (ddof=0)
        "p50": float(np.percentile(filtered, 50)),
        "p95": float(np.percentile(filtered, 95)),
        "p99": float(np.percentile(filtered, 99)),
        "min": float(np.min(arr)),  # Original min/max for reference
        "max": float(np.max(arr)),
        "iqr": float(iqr),
    }

def print_stats(stats, unit="ms"):
    """Pretty print a statistics record.

    Args:
        stats: Dict providing ``n_valid``, ``n_samples``, ``outliers_removed``,
            ``median``, ``mean``, ``std``, ``p95``, ``p99``, ``min`` and
            ``max`` (the non-error result of ``compute_statistics``).
        unit: Unit suffix appended to every value line.
    """
    print(f"  Samples: {stats['n_valid']}/{stats['n_samples']} (removed {stats['outliers_removed']} outliers)")
    print(f"  Median: {stats['median']:.3f} {unit}")
    # The plus/minus sign was previously stored as mis-decoded UTF-8 ("Â±").
    print(f"  Mean:   {stats['mean']:.3f} ± {stats['std']:.3f} {unit}")
    print(f"  P95:    {stats['p95']:.3f} {unit}")
    print(f"  P99:    {stats['p99']:.3f} {unit}")
    print(f"  Range:  [{stats['min']:.3f}, {stats['max']:.3f}] {unit}")

print("Statistical functions loaded.")
print("Using IQR method for outlier removal (MLPerf standard).")

In [None]:
# ============================================================================
# OLLAMA SETUP
# ============================================================================

# Install Ollama
!curl -fsSL https://ollama.ai/install.sh | sh

# Start server in background
import subprocess
import time

ollama_proc = subprocess.Popen(
    ['ollama', 'serve'],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE
)
print("Starting Ollama server...")
time.sleep(5)

# Verify server
import requests
for _ in range(10):
    try:
        response = requests.get('http://localhost:11434/api/tags', timeout=5)
        if response.status_code == 200:
            print("Ollama server ready!")
            break
    except:
        time.sleep(2)
else:
    print("WARNING: Ollama server may not be ready")

In [None]:
# ============================================================================
# PULL OLLAMA MODELS
# ============================================================================

models_to_pull = [
    BENCHMARK_CONFIG['models']['smollm_135m']['ollama_name'],
    BENCHMARK_CONFIG['models']['qwen_05b']['ollama_name'],
    BENCHMARK_CONFIG['models']['qwen_15b']['ollama_name'],
]

for model in models_to_pull:
    print(f"\nPulling {model}...")
    !ollama pull {model}

print("\nAll models pulled!")

In [None]:
# ============================================================================
# BUILD EDGELLM KERNELS
# ============================================================================

import os

# Install NCCL
!apt-get update -qq && apt-get install -y -qq libnccl2 libnccl-dev 2>/dev/null

# Clone repo
os.chdir('/kaggle/working' if os.path.exists('/kaggle') else '/content')
!rm -rf ollama-api-gateway
!git clone --depth 1 https://github.com/umerkhan95/ollama-api-gateway.git

# Build FA2 kernels
os.chdir('ollama-api-gateway/mojo-gateway/src/kernels/cuda')
!make clean 2>/dev/null

# Detect GPU architecture
gpu_name = ENV_INFO['gpu']['gpu_name'].lower()
if 't4' in gpu_name:
    CUDA_ARCH = "-gencode arch=compute_75,code=sm_75"
elif 'a100' in gpu_name:
    CUDA_ARCH = "-gencode arch=compute_80,code=sm_80"
elif 'v100' in gpu_name:
    CUDA_ARCH = "-gencode arch=compute_70,code=sm_70"
else:
    CUDA_ARCH = "-gencode arch=compute_75,code=sm_75"  # Default to T4

print(f"Building for: {ENV_INFO['gpu']['gpu_name']}")
print(f"CUDA_ARCH: {CUDA_ARCH}")

# Build
!make CUDA_ARCH="{CUDA_ARCH}" NVCC_FLAGS_COMMON="-O3 -Xcompiler -fPIC" fa2 2>&1 | tail -5

# Verify build
!ls -la ../../../lib/*.so 2>/dev/null || echo "Shared libraries not found"
!ls -la ../../../bin/test_* 2>/dev/null || echo "Test binaries not found"

print("\nKernel build complete!")

In [None]:
# ============================================================================
# OLLAMA BENCHMARK FUNCTION
# ============================================================================

import requests
import time

def benchmark_ollama(model_name, config=BENCHMARK_CONFIG):
    """
    Benchmark an Ollama model through the local ``/api/generate`` endpoint.

    Sends ``config['warmup_iterations']`` untimed requests, then
    ``config['benchmark_iterations']`` timed, non-streaming requests that
    cycle through ``config['test_prompts']`` at temperature 0.

    Args:
        model_name: Ollama model tag (e.g. ``"smollm:135m"``).
        config: Benchmark configuration dict; defaults to BENCHMARK_CONFIG.

    Returns:
        ``{"error": ...}`` when no prompts are configured or no run
        succeeded; otherwise a dict with ``model``, ``engine``,
        ``throughput``, ``latency`` and ``ttft`` statistics (from
        ``compute_statistics``), ``failed_runs``, ``config`` and
        ``environment``.

        ``latency`` is client-side wall-clock time per request. ``throughput``
        uses the server-reported ``eval_count`` / ``eval_duration`` when
        present, otherwise whitespace-separated words per wall-clock second.
        ``ttft`` is the server-reported ``prompt_eval_duration`` in ms (or
        half the wall-clock latency as a rough fallback).
    """
    warmup = config['warmup_iterations']
    runs = config['benchmark_iterations']
    prompts = config['test_prompts']
    endpoint = "http://localhost:11434/api/generate"

    print(f"\n{'='*60}")
    print(f"OLLAMA BENCHMARK: {model_name}")
    print(f"{'='*60}")
    print(f"Warmup: {warmup} iterations")
    print(f"Benchmark: {runs} iterations")

    if not prompts:
        # Prompt selection below uses ``i % len(prompts)``.
        return {"error": "No test prompts configured"}

    # Warmup phase
    print(f"\n[Warmup Phase]")
    for i in range(warmup):
        try:
            warm_response = requests.post(
                endpoint,
                json={"model": model_name, "prompt": "Hello", "stream": False},
                timeout=120
            )
            if warm_response.status_code != 200:
                print(f"  Warmup {i+1}: HTTP {warm_response.status_code}")
            if (i + 1) % 10 == 0:
                print(f"  Warmup {i+1}/{warmup}")
        except Exception as e:
            # Best effort: a failed warmup request must not abort the run.
            print(f"  Warmup {i+1} error: {e}")

    # Benchmark phase
    print(f"\n[Benchmark Phase]")
    throughputs = []
    latencies = []
    ttfts = []  # Time to first token
    failed_runs = 0

    for i in range(runs):
        prompt = prompts[i % len(prompts)]

        try:
            start = time.perf_counter()
            response = requests.post(
                endpoint,
                json={
                    "model": model_name,
                    "prompt": prompt,
                    "stream": False,
                    "options": {"temperature": 0.0}  # Deterministic
                },
                timeout=180
            )
            end = time.perf_counter()

            if response.status_code != 200:
                # Report rejected requests instead of dropping them silently,
                # so a partially failing run is visible in the log.
                failed_runs += 1
                print(f"  Run {i+1}: HTTP {response.status_code}")
                continue

            data = response.json()
            total_ms = (end - start) * 1000

            # Extract Ollama's internal metrics
            eval_count = data.get("eval_count", 0)
            eval_duration_ns = data.get("eval_duration", 1)
            prompt_eval_duration_ns = data.get("prompt_eval_duration", 0)

            if eval_count > 0 and eval_duration_ns > 0:
                # Use Ollama's internal timing (more accurate)
                tps = eval_count / (eval_duration_ns / 1e9)
                ttft = prompt_eval_duration_ns / 1e6  # Convert to ms
            else:
                # Fallback to wall clock; words stand in for tokens here.
                tokens = len(data.get("response", "").split())
                tps = tokens / (total_ms / 1000) if total_ms > 0 else 0
                ttft = total_ms / 2  # Rough estimate

            throughputs.append(tps)
            latencies.append(total_ms)
            ttfts.append(ttft)

            if (i + 1) % 50 == 0:
                print(f"  Run {i+1}/{runs}: {total_ms:.0f}ms, {tps:.1f} tok/s")

        except Exception as e:
            # Best effort per run: log and continue with the remaining runs.
            failed_runs += 1
            print(f"  Run {i+1}: Error - {e}")

    if not throughputs:
        return {"error": "No successful runs"}

    if failed_runs:
        print(f"  Failed runs: {failed_runs}/{runs}")

    # Compute statistics
    results = {
        "model": model_name,
        "engine": "ollama",
        "throughput": compute_statistics(throughputs, "throughput_tps"),
        "latency": compute_statistics(latencies, "latency_ms"),
        "ttft": compute_statistics(ttfts, "ttft_ms"),
        "failed_runs": failed_runs,
        "config": {
            "warmup": warmup,
            "runs": runs,
        },
        "environment": ENV_INFO,
    }

    # Print summary
    print(f"\n[Results]")
    print(f"Throughput (tok/s):")
    print_stats(results['throughput'], "tok/s")
    print(f"\nLatency (ms):")
    print_stats(results['latency'], "ms")

    return results

print("Ollama benchmark function ready.")

In [None]:
# ============================================================================
# FA2 KERNEL BENCHMARK FUNCTION
# ============================================================================

import ctypes
import os

def benchmark_fa2_kernel(model_key, config=BENCHMARK_CONFIG):
    """
    Benchmark the FlashAttention-2 decode entry point of the EdgeLLM library.

    For every configured cache length the KV cache is filled with random
    keys/values, the decode call is warmed up, and then timed
    ``config['benchmark_iterations']`` times at the last cache position.

    Timing is host-side ``time.perf_counter`` around the ctypes call, so each
    sample covers everything the library does inside
    ``flash_attention_v2_decode``.
    NOTE(review): ``config['use_cuda_events']`` and ``config['flush_l2_cache']``
    are not consulted here; GPU-event timing and L2 flushing would need
    support from the library itself.

    Args:
        model_key: Key into ``config['models']``.
        config: Benchmark configuration dict; defaults to BENCHMARK_CONFIG.

    Returns:
        ``{"error": ...}`` when the configuration is unusable, the shared
        library is missing or cannot be loaded, or initialisation fails.
        Otherwise a dict with ``model``, ``engine``, ``by_cache_length``
        (latency statistics and estimated tok/s per cache length),
        ``representative`` (the cache-length-256 entry when present, else the
        first measured length), ``config`` and ``environment``.

        Estimated throughput is ``1000 / (median_layer_ms * num_layers)``:
        an attention-only projection, not an end-to-end measurement.
    """
    import time

    import numpy as np

    model_config = config['models'][model_key]
    warmup = config['warmup_iterations']
    runs = config['benchmark_iterations']
    cache_lengths = config['cache_lengths']

    print(f"\n{'='*60}")
    print(f"FA2 KERNEL BENCHMARK: {model_key}")
    print(f"{'='*60}")
    print(f"Heads: {model_config['num_heads']}, Head Dim: {model_config['head_dim']}")
    print(f"Layers: {model_config['num_layers']}")
    print(f"Warmup: {warmup}, Runs: {runs}")

    if not cache_lengths:
        return {"error": "No cache lengths configured"}
    if runs <= 0:
        # Without samples there is no median to derive a throughput from.
        return {"error": "benchmark_iterations must be positive"}

    # Load FA2 library from the first location that exists.
    lib_candidates = (
        "/kaggle/working/ollama-api-gateway/mojo-gateway/lib/libflash_attention_v2.so",
        "/content/ollama-api-gateway/mojo-gateway/lib/libflash_attention_v2.so",
    )
    lib_path = next((path for path in lib_candidates if os.path.exists(path)), None)
    if lib_path is None:
        return {"error": f"FA2 library not found at {lib_candidates[-1]}"}

    try:
        fa2 = ctypes.CDLL(lib_path)
    except Exception as e:
        return {"error": f"Failed to load FA2 library: {e}"}

    # Setup function signatures
    fa2.flash_attention_v2_init.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
    fa2.flash_attention_v2_init.restype = ctypes.c_int

    fa2.flash_attention_v2_decode.argtypes = [
        ctypes.POINTER(ctypes.c_float),  # Q
        ctypes.POINTER(ctypes.c_float),  # K
        ctypes.POINTER(ctypes.c_float),  # V
        ctypes.POINTER(ctypes.c_float),  # O
        ctypes.c_int,  # batch_heads
        ctypes.c_int,  # cache_pos
        ctypes.c_int,  # head_dim
    ]
    fa2.flash_attention_v2_decode.restype = ctypes.c_int
    fa2.flash_attention_v2_cleanup.restype = None

    # Model parameters
    num_heads = model_config['num_heads']
    head_dim = model_config['head_dim']
    num_layers = model_config['num_layers']
    # Never initialise a cache smaller than the longest length to be tested.
    max_cache = max(2048, max(cache_lengths))

    # Initialize
    ret = fa2.flash_attention_v2_init(num_heads, max_cache, head_dim)
    if ret != 0:
        return {"error": "FA2 initialization failed"}

    results_by_cache = {}

    try:
        # Allocate buffers (one decode step: num_heads * head_dim floats each)
        q_size = num_heads * head_dim

        Q = np.random.randn(q_size).astype(np.float32)
        K = np.random.randn(q_size).astype(np.float32)
        V = np.random.randn(q_size).astype(np.float32)
        O = np.zeros(q_size, dtype=np.float32)

        Q_ptr = Q.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        K_ptr = K.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        V_ptr = V.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        O_ptr = O.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

        for cache_len in cache_lengths:
            print(f"\n[Cache Length: {cache_len}]")

            # Fill cache: one decode call per position with fresh random K/V.
            # K and V stay bound to the arrays behind K_ptr/V_ptr so the
            # buffers remain alive for the calls below.
            for pos in range(cache_len):
                K = np.random.randn(q_size).astype(np.float32)
                V = np.random.randn(q_size).astype(np.float32)
                K_ptr = K.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
                V_ptr = V.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
                fa2.flash_attention_v2_decode(Q_ptr, K_ptr, V_ptr, O_ptr, num_heads, pos, head_dim)

            # Warmup
            for _ in range(warmup):
                fa2.flash_attention_v2_decode(Q_ptr, K_ptr, V_ptr, O_ptr, num_heads, cache_len - 1, head_dim)

            # Benchmark
            latencies = []
            for _ in range(runs):
                start = time.perf_counter()
                fa2.flash_attention_v2_decode(Q_ptr, K_ptr, V_ptr, O_ptr, num_heads, cache_len - 1, head_dim)
                end = time.perf_counter()
                latencies.append((end - start) * 1000)  # ms

            # Compute attention-only throughput
            # Full model: throughput = 1000 / (latency_per_layer * num_layers)
            stats = compute_statistics(latencies, f"cache_{cache_len}")
            layer_time_ms = stats['median']
            token_time_ms = layer_time_ms * num_layers
            # A zero median (below timer resolution) is reported as 0 tok/s
            # rather than dividing by zero.
            estimated_tps = 1000.0 / token_time_ms if token_time_ms > 0 else 0.0

            results_by_cache[cache_len] = {
                "latency_ms": stats,
                "estimated_throughput_tps": estimated_tps,
            }

            print(f"  Layer latency: {layer_time_ms:.4f} ms")
            print(f"  Estimated throughput: {estimated_tps:.1f} tok/s")
            print(f"  Jitter (std): {stats['std']:.4f} ms")
    finally:
        # Cleanup runs even if a measurement raises, so the library's
        # resources are released before the next model is benchmarked.
        fa2.flash_attention_v2_cleanup()

    # Use cache_len=256 as representative result; otherwise fall back to the
    # first measured length and label it truthfully.
    representative_len = 256 if 256 in results_by_cache else next(iter(results_by_cache))
    representative = results_by_cache[representative_len]

    return {
        "model": model_key,
        "engine": "edgellm_fa2",
        "by_cache_length": results_by_cache,
        "representative": {
            "cache_length": representative_len,
            "throughput_tps": representative['estimated_throughput_tps'],
            "latency_ms": representative['latency_ms'],
        },
        "config": model_config,
        "environment": ENV_INFO,
    }

print("FA2 kernel benchmark function ready.")

In [None]:
# ============================================================================
# RUN ALL BENCHMARKS
# ============================================================================

# Container for the whole run: metadata first, per-model results added below.
ALL_RESULTS = dict(
    benchmark_version=BENCHMARK_CONFIG['version'],
    timestamp=datetime.now().isoformat(),
    environment=ENV_INFO,
    config=BENCHMARK_CONFIG,
    results={},
)

# For each configured model: the Ollama end-to-end benchmark runs first,
# then the FA2 kernel benchmark for the same model shape.
for model_key, model_config in BENCHMARK_CONFIG['models'].items():
    print(f"\n\n{'#'*60}")
    print(f"# BENCHMARKING: {model_key}")
    print(f"{'#'*60}")

    ALL_RESULTS['results'][model_key] = dict(
        ollama=benchmark_ollama(model_config['ollama_name']),
        edgellm_fa2=benchmark_fa2_kernel(model_key),
    )

print("\n\nAll benchmarks complete!")

In [None]:
# ============================================================================
# FINAL SUMMARY AND COMPARISON
# ============================================================================

# Results table: one row per (model, engine) pair that completed without error.
print("\n" + "="*80)
print("DEFINITIVE BENCHMARK RESULTS v{}".format(BENCHMARK_CONFIG['version']))
print("="*80)
print(f"\nEnvironment: {ENV_INFO['platform']} - {ENV_INFO['gpu']['gpu_name']}")
print(f"Timestamp: {ALL_RESULTS['timestamp']}")
print(f"Methodology: {BENCHMARK_CONFIG['warmup_iterations']} warmup, {BENCHMARK_CONFIG['benchmark_iterations']} runs")

print("\n" + "-"*80)
print(f"{'Model':<15} | {'Engine':<12} | {'Throughput':>12} | {'Latency P50':>12} | {'Jitter':>10}")
print("-"*80)

comparison_data = []

for model_key, results in ALL_RESULTS['results'].items():
    # Ollama result
    if 'error' not in results['ollama']:
        ollama = results['ollama']
        tps = ollama['throughput']['median']
        lat = ollama['latency']['p50']
        jitter = ollama['latency']['std']
        print(f"{model_key:<15} | {'Ollama':<12} | {tps:>10.1f} t/s | {lat:>10.1f} ms | {jitter:>8.1f} ms")
        comparison_data.append({
            'model': model_key, 'engine': 'ollama', 'tps': tps, 'jitter': jitter
        })

    # FA2 result
    if 'error' not in results['edgellm_fa2']:
        fa2 = results['edgellm_fa2']
        tps = fa2['representative']['throughput_tps']
        lat = fa2['representative']['latency_ms']['p50']
        jitter = fa2['representative']['latency_ms']['std']
        print(f"{model_key:<15} | {'EdgeLLM FA2':<12} | {tps:>10.1f} t/s | {lat:>10.4f} ms | {jitter:>8.4f} ms")
        comparison_data.append({
            'model': model_key, 'engine': 'edgellm_fa2', 'tps': tps, 'jitter': jitter
        })

print("-"*80)

# Compute speedups
# NOTE: the EdgeLLM figure is the 'representative' throughput estimated from
# the FA2 result, while the Ollama figure comes from its 'throughput'
# statistics; the jitter values are the std of per-call kernel latency and of
# per-request latency respectively. Read the ratios with that in mind.
print("\n" + "="*80)
print("SPEEDUP ANALYSIS")
print("="*80)

for model_key in ALL_RESULTS['results']:
    by_engine = {
        entry['engine']: entry for entry in comparison_data if entry['model'] == model_key
    }
    ollama_entry = by_engine.get('ollama')
    fa2_entry = by_engine.get('edgellm_fa2')
    if ollama_entry is None or fa2_entry is None:
        # Speedups need both engines to have produced a result.
        continue

    ollama_tps = ollama_entry['tps']
    fa2_tps = fa2_entry['tps']
    ollama_jitter = ollama_entry['jitter']
    fa2_jitter = fa2_entry['jitter']

    tps_ratio = fa2_tps / ollama_tps if ollama_tps > 0 else 0
    winner_tps = "EdgeLLM" if tps_ratio > 1 else "Ollama"

    # Decide the jitter winner from the data instead of assuming it.
    if fa2_jitter < ollama_jitter:
        winner_jitter = "EdgeLLM"
    elif fa2_jitter > ollama_jitter:
        winner_jitter = "Ollama"
    else:
        winner_jitter = "Tie"

    if fa2_jitter > 0 and ollama_jitter > 0:
        if fa2_jitter <= ollama_jitter:
            jitter_line = f"  Jitter: EdgeLLM is {ollama_jitter / fa2_jitter:.0f}x more consistent than Ollama"
        else:
            jitter_line = f"  Jitter: Ollama is {fa2_jitter / ollama_jitter:.0f}x more consistent than EdgeLLM"
    else:
        # A zero standard deviation makes the ratio undefined.
        jitter_line = "  Jitter: ratio undefined (zero standard deviation measured)"

    print(f"\n{model_key}:")
    print(f"  Throughput: EdgeLLM is {tps_ratio:.2f}x {'faster' if tps_ratio > 1 else 'slower'} than Ollama")
    print(jitter_line)
    print(f"  Throughput Winner: {winner_tps}")
    print(f"  Jitter Winner: {winner_jitter}")

In [None]:
# ============================================================================
# SAVE RESULTS AS FROZEN BASELINE
# ============================================================================

import json

# Save full results
# Kaggle and Colab keep their historical locations; anywhere else the file is
# written to the current working directory instead of a non-existent /content.
if os.path.exists('/kaggle'):
    output_dir = "/kaggle/working"
elif os.path.isdir('/content'):
    output_dir = "/content"
else:
    output_dir = os.getcwd()
output_path = os.path.join(output_dir, "benchmark_results_v1.json")

# Serialise once so the saved file and the printed copy are guaranteed to be
# identical. default=str stringifies any value json cannot encode natively.
results_json = json.dumps(ALL_RESULTS, indent=2, default=str)

with open(output_path, 'w', encoding='utf-8') as f:
    f.write(results_json)

print(f"\nResults saved to: {output_path}")
print("\nThis file serves as the FROZEN BASELINE for future comparisons.")
print("Do not modify - new optimizations should be compared against these values.")

# Print JSON for easy copy
print("\n" + "="*80)
print("JSON OUTPUT (copy this for documentation)")
print("="*80)
print(results_json)