# EdgeLLM Comprehensive Benchmark Suite

This notebook collects the performance metrics listed below so that EdgeLLM can be compared rigorously against other inference engines.

## Metrics Collected

| Category | Metrics |
|----------|--------|
| **GPU Hardware** | Model, Driver, CUDA, Power (avg/peak), VRAM |
| **Memory** | VRAM used, Host RAM, PCIe transfers |
| **Kernel Stats** | Launches, Execution time, SM occupancy |
| **Latency** | Mean, P50, P90, P95, P99, Max, Jitter |
| **Throughput Scaling** | 1k, 4k, 8k, 16k context |
| **CPU Overhead** | Utilization, Time per token, Threads |
| **Reproducibility** | OS, Compiler, Build flags |

## 1. Environment Setup

In [None]:
# Install dependencies
!pip install -q pynvml psutil numpy

In [None]:
import ctypes
import numpy as np
import time
import os
import platform
import subprocess
import psutil
import json
from datetime import datetime

# NVML is optional: NVML_AVAILABLE gates every pynvml call in the later cells,
# so the notebook still runs (with fewer GPU stats) when it cannot be set up.
try:
    import pynvml
    pynvml.nvmlInit()
    NVML_AVAILABLE = True
except Exception as exc:
    # Covers a missing package (ImportError) as well as an initialisation
    # failure raised by pynvml, without trapping KeyboardInterrupt/SystemExit.
    NVML_AVAILABLE = False
    print('Warning: pynvml not available, some GPU stats will be limited')
    print(f'  reason: {type(exc).__name__}: {exc}')

print(f'Setup complete. NVML available: {NVML_AVAILABLE}')

In [None]:
# Clone and build
# Fetches the project repository (or pulls the latest commits into an existing
# checkout), then runs the `cuda` make target in the kernels directory and
# lists ../../lib/, the directory the library-loading cell reads from.
import os
if not os.path.exists('ollama-api-gateway'):
    !git clone https://github.com/umerkhan95/ollama-api-gateway.git
else:
    print('Repository exists, pulling latest...')
    !cd ollama-api-gateway && git pull

# %cd changes the kernel's working directory for every later cell; the relative
# paths below and in the library-loading cell depend on it.
# NOTE(review): re-running this cell without a restart starts from the kernels
# directory, where the existence check above no longer sees the checkout —
# confirm whether the cell needs to be re-runnable.
%cd ollama-api-gateway/mojo-gateway/src/kernels
!make cuda
!ls -la ../../lib/

## 2. GPU Hardware Info

In [None]:
def get_gpu_info():
    """Collect detailed GPU hardware information.

    Three best-effort sources are combined: ``nvidia-smi`` (static device
    properties), ``nvcc --version`` (CUDA toolkit release) and, when
    ``NVML_AVAILABLE`` is true, NVML (UUID, compute capability, memory and
    power). A source that fails is reported and skipped instead of aborting.

    Returns:
        dict: The fields that could be collected. ``cuda_version`` is always
        present (``'unknown'`` when it cannot be determined). A single
        property that cannot be parsed or read is stored as ``'N/A'``.
    """
    info = {}

    # (nvidia-smi query field, key in the returned dict, converter).
    # The query string is generated from this table so the two cannot drift.
    smi_fields = (
        ('name', 'gpu_model', str),
        ('driver_version', 'driver_version', str),
        ('memory.total', 'vram_total_mb', int),
        ('power.limit', 'power_limit_w', float),
        ('power.max_limit', 'power_max_w', float),
        ('clocks.max.sm', 'max_sm_clock_mhz', int),
        ('clocks.max.mem', 'max_mem_clock_mhz', int),
        ('pcie.link.gen.current', 'pcie_gen', str),
        ('pcie.link.width.current', 'pcie_width', str),
    )

    # Basic info from nvidia-smi
    try:
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=' + ','.join(field for field, _, _ in smi_fields),
             '--format=csv,noheader,nounits'],
            capture_output=True, text=True, check=True
        )
        rows = result.stdout.strip().splitlines()
        if not rows:
            raise ValueError('no output')
        # nvidia-smi prints one CSV row per GPU; only the first row is used so
        # a multi-GPU host does not leak a second row into the last field.
        parts = [part.strip() for part in rows[0].split(',')]
        if len(parts) != len(smi_fields):
            raise ValueError(f'expected {len(smi_fields)} fields, got {len(parts)}')
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        print(f'nvidia-smi error: {e}')
    else:
        for (_, key, convert), raw in zip(smi_fields, parts):
            try:
                info[key] = convert(raw)
            except ValueError:
                # A non-numeric placeholder for one property must not discard
                # the properties that follow it.
                info[key] = 'N/A'

    # CUDA version: defaults to 'unknown' so the key exists even when nvcc runs
    # but prints no "release" line.
    info['cuda_version'] = 'unknown'
    try:
        result = subprocess.run(['nvcc', '--version'], capture_output=True, text=True)
    except OSError:
        pass  # nvcc is not installed or not on PATH
    else:
        for line in result.stdout.split('\n'):
            if 'release' in line:
                info['cuda_version'] = line.split('release')[1].split(',')[0].strip()

    # Detailed info from NVML
    if NVML_AVAILABLE:
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(0)
            uuid = pynvml.nvmlDeviceGetUUID(handle)
            # Some pynvml releases return bytes here; normalise to str.
            info['gpu_uuid'] = uuid.decode() if isinstance(uuid, bytes) else uuid
            info['compute_capability'] = '.'.join(
                map(str, pynvml.nvmlDeviceGetCudaComputeCapability(handle)))

            # NOTE(review): the key says "sm_count" but the value comes from
            # nvmlDeviceGetNumGpuCores — confirm which quantity is intended.
            if hasattr(pynvml, 'nvmlDeviceGetNumGpuCores'):
                try:
                    info['sm_count'] = pynvml.nvmlDeviceGetNumGpuCores(handle)
                except pynvml.NVMLError:
                    info['sm_count'] = 'N/A'
            else:
                info['sm_count'] = 'N/A'

            # Memory info
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            info['vram_used_mb'] = mem_info.used // (1024 * 1024)
            info['vram_free_mb'] = mem_info.free // (1024 * 1024)

            # Power info (NVML reports milliwatts)
            try:
                info['current_power_w'] = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
            except pynvml.NVMLError:
                info['current_power_w'] = 'N/A'
        except pynvml.NVMLError as e:
            print(f'NVML error: {e}')

    return info

# Query the hardware once, then list every collected field under a banner.
gpu_info = get_gpu_info()
print('=' * 60, 'GPU HARDWARE INFO', '=' * 60, sep='\n')
for field, value in gpu_info.items():
    print(f'{field:25}: {value}')

## 3. Reproducibility Info

In [None]:
def _tool_output_line(command, line_index):
    """Return one line of ``command``'s stdout, or 'unknown' if unavailable."""
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError:
        return 'unknown'  # tool is not installed or not on PATH
    lines = result.stdout.strip().split('\n')
    return lines[line_index] or 'unknown'


def get_reproducibility_info():
    """Collect system info for reproducibility.

    Returns:
        dict: OS, CPU and RAM details, the ``nvcc`` and ``gcc`` version lines,
        the recorded CUDA build flags, the GPU persistence mode, the
        clock-locking note and an ISO timestamp. Every key is always present;
        values that cannot be determined are ``'unknown'``.
    """
    info = {}

    # OS info
    info['os'] = platform.system()
    info['os_version'] = platform.release()
    info['os_detail'] = platform.platform()

    # CPU info
    info['cpu_model'] = platform.processor()
    info['cpu_cores'] = psutil.cpu_count(logical=False)
    info['cpu_threads'] = psutil.cpu_count(logical=True)
    info['ram_total_gb'] = round(psutil.virtual_memory().total / (1024**3), 2)

    # Compiler info: last line of `nvcc --version`, first line of `gcc --version`
    info['nvcc_version'] = _tool_output_line(['nvcc', '--version'], -1)
    info['gcc_version'] = _tool_output_line(['gcc', '--version'], 0)

    # Build flags.
    # NOTE(review): this is a hard-coded string, not read from the Makefile —
    # keep it in sync with the actual build by hand.
    info['cuda_build_flags'] = '-O3 -Xcompiler -fPIC -gencode arch=compute_75,code=sm_75'

    # Power management: ask nvidia-smi for the field directly so the key is set
    # on every path instead of only when a marker happens to be in the output.
    info['persistence_mode'] = 'unknown'
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=persistence_mode', '--format=csv,noheader'],
            capture_output=True, text=True
        )
    except OSError:
        pass  # nvidia-smi is not installed or not on PATH
    else:
        rows = result.stdout.strip().splitlines()
        if result.returncode == 0 and rows:
            info['persistence_mode'] = 'Enabled' if 'Enabled' in rows[0] else 'Disabled'

    info['clock_locking'] = 'No'  # Default for cloud GPUs; not measured
    info['timestamp'] = datetime.now().isoformat()

    return info

# Capture the environment description and show it as an aligned listing.
repro_info = get_reproducibility_info()
print('=' * 60, 'REPRODUCIBILITY INFO', '=' * 60, sep='\n')
for field, value in repro_info.items():
    print(f'{field:25}: {value}')

## 4. Load CUDA Library

In [None]:
# Open the kernel shared library. The path is relative to the current working
# directory, so this cell relies on the directory change made by the build cell.
lib_path = '../../lib/libtmac_kernel_cuda.so'
cuda_lib = ctypes.CDLL(lib_path)
print(f'Loaded: {lib_path}')

# Short aliases for the C types used in the signatures below.
_c_int = ctypes.c_int
_c_float = ctypes.c_float
_float_ptr = ctypes.POINTER(ctypes.c_float)
_int8_ptr = ctypes.POINTER(ctypes.c_int8)

# Basic functions: availability, device name, init/cleanup, synchronisation
cuda_lib.cuda_available.restype = _c_int
cuda_lib.cuda_device_name.restype = ctypes.c_char_p
cuda_lib.cuda_init.argtypes = [_c_int, _c_int, _c_int]
cuda_lib.cuda_init.restype = _c_int
cuda_lib.cuda_cleanup.restype = None
cuda_lib.cuda_sync.restype = None

# Phase 1: Persistent memory API (weights, scales and norm weights)
cuda_lib.cuda_load_weights.argtypes = [_int8_ptr, _float_ptr, _c_int, _c_int]
cuda_lib.cuda_load_weights.restype = _c_int
cuda_lib.cuda_unload_weights.restype = None
cuda_lib.cuda_load_norm_weights.argtypes = [_float_ptr, _c_int]
cuda_lib.cuda_load_norm_weights.restype = _c_int

# Phase 2.1: Optimized kernels
cuda_lib.streaming_fused_rmsnorm_matmul_cuda.argtypes = [
    _float_ptr, _float_ptr, _c_int, _c_int, _c_float,
]
cuda_lib.streaming_fused_rmsnorm_matmul_cuda.restype = _c_int

cuda_lib.fused_rmsnorm_matmul_cuda_adaptive.argtypes = [
    _float_ptr, _float_ptr, _c_int, _c_int, _c_int, _c_float,
]
cuda_lib.fused_rmsnorm_matmul_cuda_adaptive.restype = _c_int

print('Function signatures defined')

In [None]:
# Initialize CUDA: stop immediately if the library reports no usable device.
if not cuda_lib.cuda_available():
    raise RuntimeError('No CUDA device found!')
device_name = cuda_lib.cuda_device_name().decode('utf-8')
print(f'CUDA Device: {device_name}')

# Capacities handed to cuda_init, in the order (weights, activations, output).
max_weights = 100_000_000
max_activations = 10_000_000
max_output = 10_000_000
ret = cuda_lib.cuda_init(max_weights, max_activations, max_output)
if ret != 0:
    raise RuntimeError('CUDA initialization failed')
print('CUDA initialized successfully')

## 5. Workload Definition

In [None]:
# Workload description (SmolLM-135M architecture) shared by every benchmark
# cell below and copied into the final report.
# NOTE(review): the published SmolLM-135M configuration lists 30 hidden layers —
# confirm that num_layers=9 is intentional for this benchmark.
WORKLOAD = dict(
    model_name='SmolLM-135M',
    hidden_size=576,
    intermediate_size=1536,
    num_heads=9,
    head_dim=64,
    vocab_size=49152,
    num_layers=9,
    quantization='BitNet 1.58-bit',
    batch_size=1,
    sampling_method='greedy',
    temperature=0.0,
    kv_cache_enabled=False,  # For this benchmark
    kv_cache_size_mb=0,
)

print('=' * 60, 'WORKLOAD DEFINITION', '=' * 60, sep='\n')
for setting, value in WORKLOAD.items():
    print(f'{setting:25}: {value}')

## 6. Memory & Transfer Stats

In [None]:
def calculate_transfer_stats(M, N, K):
    """Estimate PCIe traffic in bytes for one layer.

    Args:
        M: Number of output rows of the layer.
        N: Number of activation columns (batch size).
        K: Number of input features.

    Returns:
        dict: Host-to-device bytes per token (the K x N float32 activations),
        device-to-host bytes per token (the M x N float32 outputs), and the
        one-time upload size (packed weights + per-row scales + norm weights).
    """
    float32_size = 4

    # Weights are persistent (transferred once, not per token): each row packs
    # four weights per byte along K, rounded up.
    packed_weights = M * ((K + 3) // 4)
    row_scales = M * float32_size
    norm_weights = K * float32_size

    return {
        'pcie_h2d_per_token_bytes': K * N * float32_size,
        'pcie_d2h_per_token_bytes': M * N * float32_size,
        'weight_bytes_one_time': packed_weights + row_scales + norm_weights,
    }

# Layer shapes derived from the workload; reused by the benchmark cells below.
hidden = WORKLOAD['hidden_size']
intermediate = WORKLOAD['intermediate_size']

# Each entry: (label, M = output rows, N = batch columns, K = input features)
layer_configs = [
    ('QKV Projection', 3 * hidden, 1, hidden),
    ('Output Projection', hidden, 1, hidden),
    ('FFN Up', intermediate, 1, hidden),
    ('FFN Down', hidden, 1, intermediate),
]

print('=' * 60, 'MEMORY & TRANSFER STATS (per layer)', '=' * 60, sep='\n')
total_h2d = total_d2h = 0
for name, M, N, K in layer_configs:
    stats = calculate_transfer_stats(M, N, K)
    h2d_bytes = stats['pcie_h2d_per_token_bytes']
    d2h_bytes = stats['pcie_d2h_per_token_bytes']
    total_h2d += h2d_bytes
    total_d2h += d2h_bytes
    print(f"{name}:", f"  H→D: {h2d_bytes:,} bytes", f"  D→H: {d2h_bytes:,} bytes", sep='\n')

# The per-layer sums cover one transformer layer; scale by the layer count.
layer_count = WORKLOAD['num_layers']
print(
    f"\nTotal per token (all layers x {layer_count}):",
    f"  H→D: {total_h2d * layer_count:,} bytes",
    f"  D→H: {total_d2h * layer_count:,} bytes",
    sep='\n',
)

## 7. Detailed Latency Benchmark

In [None]:
def benchmark_latency_detailed(cuda_lib, M, K, iterations=1000):
    """Collect detailed latency statistics for one layer shape.

    Loads random weights for an ``M`` x ``K`` layer, runs 50 warm-up calls of
    ``streaming_fused_rmsnorm_matmul_cuda`` and then times ``iterations``
    individual calls, each followed by ``cuda_sync``.

    Args:
        cuda_lib: Loaded kernel library (see the library-loading cell).
        M: Number of output rows.
        K: Number of input features.
        iterations: Number of timed calls; must be at least 1.

    Returns:
        tuple: ``(stats, latencies)`` where ``stats`` maps metric names (mean,
        std, min, max, percentiles, variance, gap variance, longest stall) to
        values in milliseconds (or ms² for variances) and ``latencies`` is the
        NumPy array of per-call latencies in milliseconds.

    Raises:
        ValueError: If ``iterations`` is less than 1.
    """
    if iterations < 1:
        # Fail before touching the GPU instead of after the warm-up, when the
        # statistics on an empty sample would raise anyway.
        raise ValueError('iterations must be at least 1')

    N = 1  # batch size

    # Setup
    weight_bytes = M * ((K + 3) // 4)
    weights = np.random.randint(-1, 2, size=weight_bytes, dtype=np.int8)
    activations = np.random.randn(K * N).astype(np.float32)
    output = np.zeros(M * N, dtype=np.float32)
    scales = np.ones(M, dtype=np.float32)
    norm_weights = np.ones(K, dtype=np.float32)

    weights_ptr = weights.ctypes.data_as(ctypes.POINTER(ctypes.c_int8))
    act_ptr = activations.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    out_ptr = output.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    scales_ptr = scales.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    norm_ptr = norm_weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

    # Built once so the timed region does not include constructing it.
    # Presumably the RMSNorm epsilon — confirm against the kernel source.
    eps = ctypes.c_float(1e-6)

    # Load weights; the finally block unloads them even if a call below raises.
    cuda_lib.cuda_load_weights(weights_ptr, scales_ptr, weight_bytes, M)
    try:
        cuda_lib.cuda_load_norm_weights(norm_ptr, K)

        # Warmup
        for _ in range(50):
            cuda_lib.streaming_fused_rmsnorm_matmul_cuda(act_ptr, out_ptr, M, K, eps)
        cuda_lib.cuda_sync()

        # Collect individual latencies
        latencies = []
        for _ in range(iterations):
            start = time.perf_counter()
            cuda_lib.streaming_fused_rmsnorm_matmul_cuda(act_ptr, out_ptr, M, K, eps)
            cuda_lib.cuda_sync()
            end = time.perf_counter()
            latencies.append((end - start) * 1000)  # ms
    finally:
        cuda_lib.cuda_unload_weights()

    latencies = np.array(latencies)

    # Calculate stats
    stats = {
        'mean_ms': np.mean(latencies),
        'std_ms': np.std(latencies),
        'min_ms': np.min(latencies),
        'max_ms': np.max(latencies),
        'p50_ms': np.percentile(latencies, 50),
        'p90_ms': np.percentile(latencies, 90),
        'p95_ms': np.percentile(latencies, 95),
        'p99_ms': np.percentile(latencies, 99),
        'variance_ms2': np.var(latencies),
        'iterations': iterations,
    }

    # Jitter: spread of the change in latency between consecutive calls.
    # A single sample has no gaps, so both gap metrics fall back to 0.
    gaps = np.diff(latencies)
    has_gaps = len(gaps) > 0
    stats['gap_variance_ms2'] = np.var(gaps) if has_gaps else 0
    stats['longest_stall_ms'] = np.max(gaps) if has_gaps else 0

    return stats, latencies

print('Running detailed latency benchmark...')

In [None]:
# Run the detailed latency benchmark for every layer shape and tabulate it.
latency_results = {}

print('=' * 80, 'LATENCY STATS (1000 iterations per layer)', '=' * 80, sep='\n')
print(f'{"Layer":<20} '
      + ' '.join(f'{title:>8}' for title in ('Mean', 'P50', 'P90', 'P95', 'P99', 'Max'))
      + f' {"Jitter":>10}')
print(f'{"":<20} ' + ' '.join([f'{"(ms)":>8}'] * 6) + f' {"(ms²)":>10}')
print('-' * 80)

for name, M, N, K in layer_configs:
    stats, raw_latencies = benchmark_latency_detailed(cuda_lib, M, K, iterations=1000)
    latency_results[name] = stats
    timing_columns = ' '.join(
        f'{stats[key]:>8.4f}'
        for key in ('mean_ms', 'p50_ms', 'p90_ms', 'p95_ms', 'p99_ms', 'max_ms')
    )
    print(f'{name:<20} {timing_columns} {stats["gap_variance_ms2"]:>10.6f}')

print('-' * 80)

## 8. Power Monitoring During Benchmark

In [None]:
def benchmark_with_power(cuda_lib, M, K, iterations=500):
    """Benchmark one layer shape while sampling GPU power draw.

    After 50 warm-up calls, each of the ``iterations`` timed calls is preceded
    by one NVML power reading (when ``NVML_AVAILABLE`` is true).

    Args:
        cuda_lib: Loaded kernel library (see the library-loading cell).
        M: Number of output rows.
        K: Number of input features.
        iterations: Number of timed calls.

    Returns:
        dict: ``avg_latency_ms`` and ``throughput_ops_per_sec`` always; when at
        least one power reading succeeded, also ``avg_power_w``,
        ``peak_power_w``, ``min_power_w`` and ``energy_per_op_j``
        (average power x average latency).
    """
    N = 1

    # Setup
    weight_bytes = M * ((K + 3) // 4)
    weights = np.random.randint(-1, 2, size=weight_bytes, dtype=np.int8)
    activations = np.random.randn(K * N).astype(np.float32)
    output = np.zeros(M * N, dtype=np.float32)
    scales = np.ones(M, dtype=np.float32)
    norm_weights = np.ones(K, dtype=np.float32)

    weights_ptr = weights.ctypes.data_as(ctypes.POINTER(ctypes.c_int8))
    act_ptr = activations.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    out_ptr = output.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    scales_ptr = scales.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    norm_ptr = norm_weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

    # Built once so the timed region does not include constructing it.
    eps = ctypes.c_float(1e-6)

    power_readings = []
    latencies = []

    handle = pynvml.nvmlDeviceGetHandleByIndex(0) if NVML_AVAILABLE else None

    # The finally block unloads the weights even if a call below raises.
    cuda_lib.cuda_load_weights(weights_ptr, scales_ptr, weight_bytes, M)
    try:
        cuda_lib.cuda_load_norm_weights(norm_ptr, K)

        # Warmup
        for _ in range(50):
            cuda_lib.streaming_fused_rmsnorm_matmul_cuda(act_ptr, out_ptr, M, K, eps)
        cuda_lib.cuda_sync()

        # Benchmark with power sampling
        for _ in range(iterations):
            if handle is not None:
                try:
                    # NVML reports milliwatts
                    power_readings.append(pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0)
                except pynvml.NVMLError:
                    # Best effort: a failed reading is skipped, not fatal.
                    pass

            start = time.perf_counter()
            cuda_lib.streaming_fused_rmsnorm_matmul_cuda(act_ptr, out_ptr, M, K, eps)
            cuda_lib.cuda_sync()
            end = time.perf_counter()
            latencies.append((end - start) * 1000)
    finally:
        cuda_lib.cuda_unload_weights()

    results = {
        'avg_latency_ms': np.mean(latencies),
        'throughput_ops_per_sec': 1000 / np.mean(latencies),
    }

    if power_readings:
        results['avg_power_w'] = np.mean(power_readings)
        results['peak_power_w'] = np.max(power_readings)
        results['min_power_w'] = np.min(power_readings)
        # Energy per operation: watts x seconds
        results['energy_per_op_j'] = results['avg_power_w'] * results['avg_latency_ms'] / 1000

    return results

# Run the power-sampling benchmark for every layer shape and print its metrics.
print('Running power benchmark...')
power_results = {}

print('=' * 80, 'POWER STATS', '=' * 80, sep='\n')

for name, M, N, K in layer_configs:
    results = benchmark_with_power(cuda_lib, M, K, iterations=500)
    power_results[name] = results
    print(f'\n{name}:')
    for metric, value in results.items():
        # Floats get four decimals; anything else is shown as-is.
        rendered = f'{value:.4f}' if isinstance(value, float) else value
        print(f'  {metric}: {rendered}')

## 9. Full Token Generation Simulation

In [None]:
def simulate_token_generation(cuda_lib, num_tokens=100, num_layers=9):
    """Simulate full token generation through all layers.

    For every token, each of ``num_layers`` layers runs four projections (QKV,
    output, FFN up, FFN down). Each projection loads its weights, launches
    ``streaming_fused_rmsnorm_matmul_cuda`` and unloads the weights again; one
    ``cuda_sync`` closes the token.

    All host buffers and ctypes pointers are created once, before timing
    starts, so the per-token latency excludes host-side random-number
    generation and allocation, and every buffer handed to the library stays
    alive until after the final ``cuda_sync``.

    Args:
        cuda_lib: Loaded kernel library (see the library-loading cell).
        num_tokens: Number of tokens to simulate; must be at least 1.
        num_layers: Number of transformer layers per token.

    Returns:
        tuple: ``(results, token_latencies)`` where ``results`` holds latency
        percentiles, gap variance, throughput and (when power readings are
        available) power and energy figures, and ``token_latencies`` is the
        NumPy array of per-token latencies in milliseconds.

    Raises:
        ValueError: If ``num_tokens`` is less than 1.
    """
    if num_tokens < 1:
        raise ValueError('num_tokens must be at least 1')

    hidden = WORKLOAD['hidden_size']
    intermediate = WORKLOAD['intermediate_size']

    # (label, M = output rows, K = input features)
    layers = [
        ('QKV', 3 * hidden, hidden),
        ('Out', hidden, hidden),
        ('FFN_Up', intermediate, hidden),
        ('FFN_Down', hidden, intermediate),
    ]

    float_ptr = ctypes.POINTER(ctypes.c_float)
    int8_ptr = ctypes.POINTER(ctypes.c_int8)
    eps = ctypes.c_float(1e-6)

    # Prepare weights, I/O buffers and pointers for each projection once.
    buffers = []   # keeps the NumPy arrays referenced for the whole run
    launches = []  # per projection: shapes plus ready-made ctypes pointers
    for _label, M, K in layers:
        weight_bytes = M * ((K + 3) // 4)
        weights = np.random.randint(-1, 2, size=weight_bytes, dtype=np.int8)
        scales = np.ones(M, dtype=np.float32)
        norm_weights = np.ones(K, dtype=np.float32)
        activations = np.random.randn(K).astype(np.float32)
        output = np.zeros(M, dtype=np.float32)
        buffers.append((weights, scales, norm_weights, activations, output))
        launches.append((
            M, K, weight_bytes,
            weights.ctypes.data_as(int8_ptr),
            scales.ctypes.data_as(float_ptr),
            norm_weights.ctypes.data_as(float_ptr),
            activations.ctypes.data_as(float_ptr),
            output.ctypes.data_as(float_ptr),
        ))

    token_latencies = []
    power_readings = []

    handle = pynvml.nvmlDeviceGetHandleByIndex(0) if NVML_AVAILABLE else None

    # Generate tokens
    for _ in range(num_tokens):
        token_start = time.perf_counter()

        # Process through all transformer layers
        for _ in range(num_layers):
            for M, K, weight_bytes, weights_ptr, scales_ptr, norm_ptr, act_ptr, out_ptr in launches:
                cuda_lib.cuda_load_weights(weights_ptr, scales_ptr, weight_bytes, M)
                try:
                    cuda_lib.cuda_load_norm_weights(norm_ptr, K)
                    cuda_lib.streaming_fused_rmsnorm_matmul_cuda(act_ptr, out_ptr, M, K, eps)
                finally:
                    # Never leave weights loaded if a call above raises.
                    cuda_lib.cuda_unload_weights()

        cuda_lib.cuda_sync()
        token_end = time.perf_counter()
        token_latencies.append((token_end - token_start) * 1000)

        if handle is not None:
            try:
                # NVML reports milliwatts
                power_readings.append(pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0)
            except pynvml.NVMLError:
                # Best effort: a failed reading is skipped, not fatal.
                pass

    token_latencies = np.array(token_latencies)

    # A single token has no gaps; report 0 instead of NaN plus a warning.
    gaps = np.diff(token_latencies)
    gap_variance = np.var(gaps) if len(gaps) > 0 else 0.0

    results = {
        'num_tokens': num_tokens,
        'num_layers': num_layers,
        'mean_token_latency_ms': np.mean(token_latencies),
        'p50_token_latency_ms': np.percentile(token_latencies, 50),
        'p90_token_latency_ms': np.percentile(token_latencies, 90),
        'p95_token_latency_ms': np.percentile(token_latencies, 95),
        'p99_token_latency_ms': np.percentile(token_latencies, 99),
        'max_token_latency_ms': np.max(token_latencies),
        'token_gap_variance_ms2': gap_variance,
        'throughput_tok_s': 1000 / np.mean(token_latencies),
    }

    if power_readings:
        results['avg_power_w'] = np.mean(power_readings)
        results['peak_power_w'] = np.max(power_readings)
        # watts x seconds per token
        results['energy_per_token_j'] = results['avg_power_w'] * results['mean_token_latency_ms'] / 1000

    return results, token_latencies

# Run the full-model simulation and list each resulting metric.
print('Simulating full token generation (100 tokens)...')
token_gen_results, token_latencies_raw = simulate_token_generation(cuda_lib, num_tokens=100)

print('=' * 80, 'TOKEN GENERATION STATS (Full Model Simulation)', '=' * 80, sep='\n')
for metric, value in token_gen_results.items():
    # Floats get four decimals; counts are shown as-is.
    rendered = f'{value:.4f}' if isinstance(value, float) else value
    print(f'{metric:30}: {rendered}')

## 10. Kernel Stats

In [None]:
def get_kernel_stats():
    """Derive kernel launch counts per token and the mean kernel time.

    The launch count is analytical, not measured: one fused kernel each for
    the QKV projection, the output projection, FFN up and FFN down gives four
    launches per layer. The execution time is the average of the per-layer
    mean latencies stored in ``latency_results``.

    Returns:
        dict: ``kernels_per_layer``, ``total_kernels_per_token`` and
        ``avg_kernel_exec_time_ms``.
    """
    layer_count = WORKLOAD['num_layers']
    launches_per_layer = 4  # QKV + output projection + FFN up + FFN down

    per_layer_means = [entry['mean_ms'] for entry in latency_results.values()]

    return {
        'kernels_per_layer': launches_per_layer,
        'total_kernels_per_token': launches_per_layer * layer_count,
        'avg_kernel_exec_time_ms': np.mean(per_layer_means),
    }

# Compute the kernel statistics and list them under a banner.
kernel_stats = get_kernel_stats()
print('=' * 60, 'KERNEL STATS', '=' * 60, sep='\n')
for metric, value in kernel_stats.items():
    rendered = f'{value:.4f}' if isinstance(value, float) else value
    print(f'{metric:30}: {rendered}')

## 11. GPU Utilization & Memory

In [None]:
def get_gpu_utilization():
    """Get GPU utilization metrics.

    Takes one reading at call time; it is not an average over a benchmark run.

    Returns:
        dict: ``host_ram_used_mb`` and ``host_ram_total_mb`` always. When
        ``NVML_AVAILABLE`` is true, also GPU and memory utilization
        percentages (``'N/A'`` if NVML cannot report them) and VRAM
        used/total/free in MB.
    """
    stats = {}

    if NVML_AVAILABLE:
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)

        # Utilization
        try:
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
        except pynvml.NVMLError:
            # Only NVML failures are downgraded to 'N/A'; anything else
            # (including KeyboardInterrupt) propagates.
            stats['gpu_utilization_pct'] = 'N/A'
            stats['memory_utilization_pct'] = 'N/A'
        else:
            stats['gpu_utilization_pct'] = util.gpu
            stats['memory_utilization_pct'] = util.memory

        # Memory
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        stats['vram_used_mb'] = mem.used // (1024 * 1024)
        stats['vram_total_mb'] = mem.total // (1024 * 1024)
        stats['vram_free_mb'] = mem.free // (1024 * 1024)

    # Host RAM
    ram = psutil.virtual_memory()
    stats['host_ram_used_mb'] = ram.used // (1024 * 1024)
    stats['host_ram_total_mb'] = ram.total // (1024 * 1024)

    return stats

# Take one utilization/memory reading and list it under a banner.
utilization_stats = get_gpu_utilization()
print('=' * 60, 'GPU UTILIZATION & MEMORY', '=' * 60, sep='\n')
for metric, value in utilization_stats.items():
    print(f'{metric:30}: {value}')

## 12. CPU Overhead

In [None]:
def measure_cpu_overhead():
    """Measure CPU overhead during GPU inference.

    A background thread samples ``psutil.cpu_percent`` (system-wide CPU
    utilization, 10 ms per sample) while 1000 QKV-projection kernel calls are
    issued. Sampling covers only the inference loop, not the host-side buffer
    setup, and the sampler is always stopped and the weights always unloaded,
    even if a kernel call raises.

    Returns:
        dict: ``cpu_utilization_pct`` (mean of the samples),
        ``cpu_peak_pct`` (maximum sample), ``threads_used`` and
        ``host_sync_calls_per_token``.
    """
    import threading

    # QKV projection shape
    M, K = 3 * WORKLOAD['hidden_size'], WORKLOAD['hidden_size']
    N = 1

    weight_bytes = M * ((K + 3) // 4)
    weights = np.random.randint(-1, 2, size=weight_bytes, dtype=np.int8)
    activations = np.random.randn(K * N).astype(np.float32)
    output = np.zeros(M * N, dtype=np.float32)
    scales = np.ones(M, dtype=np.float32)
    norm_weights = np.ones(K, dtype=np.float32)

    weights_ptr = weights.ctypes.data_as(ctypes.POINTER(ctypes.c_int8))
    act_ptr = activations.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    out_ptr = output.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    scales_ptr = scales.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    norm_ptr = norm_weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
    eps = ctypes.c_float(1e-6)

    cpu_samples = []
    stop_sampling = threading.Event()

    def sample_cpu():
        while not stop_sampling.is_set():
            cpu_samples.append(psutil.cpu_percent(interval=0.01))

    # Daemon thread: it can never keep the kernel process alive on its own.
    sampler = threading.Thread(target=sample_cpu, daemon=True)

    cuda_lib.cuda_load_weights(weights_ptr, scales_ptr, weight_bytes, M)
    try:
        cuda_lib.cuda_load_norm_weights(norm_ptr, K)

        # Start CPU sampling thread right before the work being measured
        sampler.start()
        try:
            # Run inference
            for _ in range(1000):
                cuda_lib.streaming_fused_rmsnorm_matmul_cuda(act_ptr, out_ptr, M, K, eps)
            cuda_lib.cuda_sync()
        finally:
            # Stop sampling
            stop_sampling.set()
            sampler.join()
    finally:
        cuda_lib.cuda_unload_weights()

    return {
        'cpu_utilization_pct': np.mean(cpu_samples) if cpu_samples else 0,
        'cpu_peak_pct': np.max(cpu_samples) if cpu_samples else 0,
        # NOTE(review): this is the number of logical CPUs on the host, not a
        # measured thread count — confirm the intended meaning of the metric.
        'threads_used': psutil.cpu_count(logical=True),
        'host_sync_calls_per_token': 1,  # One cuda_sync per token (constant, not measured)
    }

# Measure host CPU load during inference and list the results.
cpu_overhead = measure_cpu_overhead()
print('=' * 60, 'CPU OVERHEAD', '=' * 60, sep='\n')
for metric, value in cpu_overhead.items():
    # Floats get two decimals here; counts are shown as-is.
    rendered = f'{value:.2f}' if isinstance(value, float) else value
    print(f'{metric:30}: {rendered}')

## 12.5 Throughput Scaling Tests

Measure single-token (batch size 1) throughput at simulated context lengths of 1k, 4k, 8k, and 16k tokens.

In [None]:
def benchmark_throughput_scaling(cuda_lib, context_lengths=(1024, 4096, 8192, 16384)):
    """Benchmark throughput at different context lengths.

    Every context length runs the same batch-1 QKV projection (50 warm-up
    calls, then 500 timed calls closed by one ``cuda_sync``). The context
    length only changes the size of two placeholder KV-cache arrays that are
    allocated with NumPy in host memory for contexts up to 8192 tokens; they
    are never passed to the kernel library. The reported
    ``kv_cache_size_mb`` is the computed size of a float16 cache
    (2 x layers x context x hidden x 2 bytes), whether or not it was allocated.

    Args:
        cuda_lib: Loaded kernel library (see the library-loading cell).
        context_lengths: Iterable of context lengths, in tokens.

    Returns:
        tuple: ``(results, slope)`` where ``results`` maps each context length
        to its KV-cache size, average latency (ms) and throughput, and
        ``slope`` is the throughput loss in percent per 1000 tokens between
        the shortest and longest context (0 with fewer than two results).
    """
    results = {}
    hidden = WORKLOAD['hidden_size']
    num_layers = WORKLOAD['num_layers']

    # For generation phase, we process one token at a time, so the kernel
    # shape is the same for every context length.
    M, K = 3 * hidden, hidden  # QKV projection (largest layer)
    N = 1
    weight_bytes = M * ((K + 3) // 4)
    iterations = 500
    eps = ctypes.c_float(1e-6)

    for ctx_len in context_lengths:
        # KV cache size = 2 * num_layers * ctx_len * hidden_size * sizeof(float16)
        kv_cache_size_mb = 2 * num_layers * ctx_len * hidden * 2 / (1024 * 1024)

        weights = np.random.randint(-1, 2, size=weight_bytes, dtype=np.int8)
        activations = np.random.randn(K * N).astype(np.float32)
        output = np.zeros(M * N, dtype=np.float32)
        scales = np.ones(M, dtype=np.float32)
        norm_weights = np.ones(K, dtype=np.float32)

        # Placeholder KV cache in host memory; skipped above 8192 tokens.
        kv_cache_k = None
        kv_cache_v = None
        if ctx_len <= 8192:
            try:
                kv_cache_k = np.zeros((num_layers, ctx_len, hidden), dtype=np.float16)
                kv_cache_v = np.zeros((num_layers, ctx_len, hidden), dtype=np.float16)
            except MemoryError:
                kv_cache_k = None
                kv_cache_v = None

        weights_ptr = weights.ctypes.data_as(ctypes.POINTER(ctypes.c_int8))
        act_ptr = activations.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        out_ptr = output.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        scales_ptr = scales.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
        norm_ptr = norm_weights.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

        # The finally block unloads the weights even if a call below raises.
        cuda_lib.cuda_load_weights(weights_ptr, scales_ptr, weight_bytes, M)
        try:
            cuda_lib.cuda_load_norm_weights(norm_ptr, K)

            # Warmup
            for _ in range(50):
                cuda_lib.streaming_fused_rmsnorm_matmul_cuda(act_ptr, out_ptr, M, K, eps)
            cuda_lib.cuda_sync()

            # Benchmark: one sync for the whole batch of calls
            start = time.perf_counter()
            for _ in range(iterations):
                cuda_lib.streaming_fused_rmsnorm_matmul_cuda(act_ptr, out_ptr, M, K, eps)
            cuda_lib.cuda_sync()
            elapsed = time.perf_counter() - start
        finally:
            cuda_lib.cuda_unload_weights()

        # Free KV cache
        del kv_cache_k, kv_cache_v

        avg_latency_ms = (elapsed / iterations) * 1000
        throughput = 1000 / avg_latency_ms

        results[ctx_len] = {
            'context_length': ctx_len,
            'kv_cache_size_mb': kv_cache_size_mb,
            'avg_latency_ms': avg_latency_ms,
            'throughput_tok_s': throughput,
        }

    # Degradation slope: percent throughput lost per 1k tokens, measured
    # between the shortest and the longest context.
    slope = 0
    if len(results) >= 2:
        ctx_lens = sorted(results)
        base_throughput = results[ctx_lens[0]]['throughput_tok_s']
        last_throughput = results[ctx_lens[-1]]['throughput_tok_s']
        span_k = (ctx_lens[-1] - ctx_lens[0]) / 1000
        if span_k > 0:
            degradation_pct = (base_throughput - last_throughput) / base_throughput * 100
            slope = degradation_pct / span_k

    return results, slope

# Run the context-length sweep and print one table row per context length.
print('Running throughput scaling tests...')
scaling_results, degradation_slope = benchmark_throughput_scaling(cuda_lib)

print('=' * 80, 'THROUGHPUT SCALING', '=' * 80, sep='\n')
print(f'{"Context":>10} {"KV Cache":>12} {"Latency":>12} {"Throughput":>12}')
print(f'{"(tokens)":>10} {"(MB)":>12} {"(ms)":>12} {"(tok/s)":>12}')
print('-' * 50)

# Keys are unique, so sorting the items orders the rows by context length.
for ctx_len, r in sorted(scaling_results.items()):
    print(f'{ctx_len:>10} {r["kv_cache_size_mb"]:>12.2f} {r["avg_latency_ms"]:>12.4f} {r["throughput_tok_s"]:>12.2f}')

print(f'\nDegradation slope: {degradation_slope:.4f}% per 1k tokens')

## 13. Compile Full Report

In [None]:
# Compile all results into final report. Key order is the order in which the
# report is printed and saved, so the sections below must stay in sequence.
full_report = {
    'timestamp': datetime.now().isoformat(),
    'benchmark_version': '1.0.0',

    # GPU Hardware ('N/A' when a field could not be collected)
    **{key: gpu_info.get(key, 'N/A')
       for key in ('gpu_model', 'driver_version', 'cuda_version', 'compute_capability')},
    'vram_total_mb': gpu_info.get('vram_total_mb', 0),

    # Power (measured during the token generation simulation)
    'power_limit_w': gpu_info.get('power_limit_w', 0),
    **{key: token_gen_results.get(key, 'N/A')
       for key in ('avg_power_w', 'peak_power_w', 'energy_per_token_j')},

    # Memory
    **{key: utilization_stats.get(key, 0)
       for key in ('vram_used_mb', 'host_ram_used_mb')},

    # PCIe Transfer Stats (per-layer totals scaled by the layer count)
    'pcie_h2d_per_token_bytes': total_h2d * WORKLOAD['num_layers'],
    'pcie_d2h_per_token_bytes': total_d2h * WORKLOAD['num_layers'],

    # Latency and throughput of the token generation simulation
    **{key: token_gen_results[key]
       for key in ('mean_token_latency_ms', 'p50_token_latency_ms',
                   'p90_token_latency_ms', 'p95_token_latency_ms',
                   'p99_token_latency_ms', 'max_token_latency_ms',
                   'token_gap_variance_ms2', 'throughput_tok_s')},

    # Throughput Scaling ('N/A' for a context length that was not run)
    **{f'throughput_{label}_ctx_tok_s':
           scaling_results.get(ctx, {}).get('throughput_tok_s', 'N/A')
       for label, ctx in (('1k', 1024), ('4k', 4096), ('8k', 8192), ('16k', 16384))},
    'throughput_degradation_slope_pct_per_1k': degradation_slope,

    # Kernel Stats
    'kernels_per_token': kernel_stats['total_kernels_per_token'],
    'avg_kernel_exec_time_ms': kernel_stats['avg_kernel_exec_time_ms'],

    # GPU Utilization
    'gpu_utilization_pct': utilization_stats.get('gpu_utilization_pct', 'N/A'),

    # CPU Overhead
    **{key: cpu_overhead[key]
       for key in ('cpu_utilization_pct', 'cpu_peak_pct', 'threads_used',
                   'host_sync_calls_per_token')},

    # Workload
    **{key: WORKLOAD[key]
       for key in ('model_name', 'batch_size', 'num_layers', 'hidden_size',
                   'quantization', 'sampling_method', 'temperature',
                   'kv_cache_enabled')},

    # Reproducibility
    **{key: repro_info[key]
       for key in ('os', 'os_version', 'nvcc_version', 'gcc_version')},
    'build_flags': repro_info['cuda_build_flags'],
    'clock_locking': repro_info['clock_locking'],
    'persistence_mode': repro_info.get('persistence_mode', 'N/A'),
}

print('=' * 80, 'FULL BENCHMARK REPORT', '=' * 80, sep='\n')
for metric, value in full_report.items():
    rendered = f'{value:.4f}' if isinstance(value, float) else value
    print(f'{metric:40}: {rendered}')

In [None]:
# Save report to JSON under a timestamped file name in the working directory.
# default=str turns any value json cannot encode natively into its string form.
report_filename = 'benchmark_report_' + datetime.now().strftime("%Y%m%d_%H%M%S") + '.json'
with open(report_filename, 'w') as report_file:
    report_file.write(json.dumps(full_report, indent=2, default=str))
print(f'\nReport saved to: {report_filename}')

# Also print as formatted table for easy copying
print(
    '',
    '=' * 80,
    'MARKDOWN TABLE FORMAT',
    '=' * 80,
    '| Metric | Value |',
    '|--------|-------|',
    sep='\n',
)
for metric, value in full_report.items():
    rendered = f'{value:.4f}' if isinstance(value, float) else value
    print(f'| {metric} | {rendered} |')

## 14. Cleanup

In [None]:
# Tear down in order: ask the kernel library to clean up first, then close the
# NVML session, which exists only if nvmlInit() succeeded during setup.
cuda_lib.cuda_cleanup()
if NVML_AVAILABLE:
    pynvml.nvmlShutdown()
print('Resources cleaned up')