# Zenith Transformer End-to-End Benchmark

Benchmarks FP32 vs. FP16 inference latency for a BERT-style transformer encoder (a single block and a full 12-layer stack), covering:
- Self-Attention (Q, K, V projections + attention scores)
- Feed-Forward Network (Linear + GELU + Linear)
- LayerNorm + Residual connections

**GPU**: NVIDIA T4

In [None]:
# Cell 1: Setup
import torch
import torch.nn as nn
import numpy as np
import time

# Every later cell allocates models and tensors on 'cuda', so stop here with
# an explicit message instead of an opaque failure from get_device_name().
if not torch.cuda.is_available():
    raise RuntimeError(
        "This benchmark requires a CUDA-capable GPU, but "
        "torch.cuda.is_available() returned False."
    )

# Record the hardware/software environment alongside the results.
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.version.cuda}")

In [None]:
# Cell 2: Define BERT-style Transformer Block
class TransformerBlock(nn.Module):
    """Transformer encoder block: multi-head self-attention then feed-forward.

    Both sub-layers use the same layout: LayerNorm is applied to the sub-layer
    input, and the un-normalised input is added back as a residual after the
    sub-layer (and dropout) has run.

    Args:
        hidden_size: Size of the last dimension of the input and output.
        num_heads: Number of attention heads; must be positive and divide
            ``hidden_size`` evenly.
        intermediate_size: Width of the feed-forward hidden layer.
        dropout: Dropout probability applied to each sub-layer output before
            the residual addition.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``hidden_size``.
    """

    def __init__(self, hidden_size, num_heads, intermediate_size, dropout=0.0):
        super().__init__()
        # Fail at construction time: otherwise the floor division below
        # silently truncates and the mismatch only appears as a confusing
        # view() shape error on the first forward pass.
        if num_heads <= 0 or hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by a positive "
                f"num_heads (got {num_heads})"
            )

        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads

        # Self-Attention
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.out_proj = nn.Linear(hidden_size, hidden_size)

        self.attn_norm = nn.LayerNorm(hidden_size)

        # Feed-Forward
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, hidden_size)
        self.ffn_norm = nn.LayerNorm(hidden_size)

        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch, seq_len):
        """Reshape (batch, seq, hidden) to (batch, heads, seq, head_dim)."""
        return projected.view(
            batch, seq_len, self.num_heads, self.head_dim
        ).transpose(1, 2)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq_len, hidden_size).

        Returns:
            A tensor with the same shape as ``x``.
        """
        batch, seq_len, _ = x.shape

        # Self-Attention
        residual = x
        x = self.attn_norm(x)

        q = self._split_heads(self.q_proj(x), batch, seq_len)
        k = self._split_heads(self.k_proj(x), batch, seq_len)
        v = self._split_heads(self.v_proj(x), batch, seq_len)

        # Scaled dot-product attention: scores are divided by sqrt(head_dim)
        # before the softmax over the key dimension.
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = torch.softmax(attn_weights, dim=-1)
        attn_output = torch.matmul(attn_weights, v)

        # Merge heads back to (batch, seq, hidden); contiguous() is required
        # because view() cannot operate on the transposed layout.
        attn_output = attn_output.transpose(1, 2).contiguous().view(
            batch, seq_len, self.hidden_size
        )
        attn_output = self.out_proj(attn_output)
        x = residual + self.dropout(attn_output)

        # Feed-Forward
        residual = x
        x = self.ffn_norm(x)
        x = self.fc1(x)
        x = torch.nn.functional.gelu(x)
        x = self.fc2(x)
        x = residual + self.dropout(x)

        return x

print("TransformerBlock defined.")

In [None]:
# Cell 3: Define benchmark function
def benchmark_model(model, input_tensor, warmup=10, runs=50):
    """Time repeated forward passes of ``model`` on ``input_tensor``.

    The model is switched to eval mode (and left in eval mode) and every pass
    runs under ``torch.no_grad()``. For CUDA inputs the device is synchronised
    before and after each timed pass so the wall-clock interval covers the
    whole kernel execution rather than just the launch; for CPU inputs no
    synchronisation is needed and none is attempted.

    Args:
        model: Callable module invoked as ``model(input_tensor)``.
        input_tensor: Input for every pass. Its first dimension is taken to
            be the batch size when computing throughput.
        warmup: Number of untimed passes executed first.
        runs: Number of timed passes; must be at least 1.

    Returns:
        dict with per-pass latency statistics in milliseconds (``mean_ms``,
        ``std_ms``, ``min_ms``) and ``throughput`` in samples per second
        (batch size * passes per second).

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError(f"runs must be >= 1, got {runs}")

    on_cuda = input_tensor.is_cuda

    def _sync():
        # Synchronise the device that actually holds the input, not merely
        # whichever device happens to be current.
        if on_cuda:
            torch.cuda.synchronize(input_tensor.device)

    model.eval()

    with torch.no_grad():
        # Warmup
        for _ in range(warmup):
            model(input_tensor)

        _sync()
        times = []

        for _ in range(runs):
            _sync()
            start = time.perf_counter()
            model(input_tensor)
            _sync()
            times.append((time.perf_counter() - start) * 1000)

    mean_ms = np.mean(times)
    # One forward pass processes a whole batch, so samples/s is the batch
    # size times the number of passes per second.
    batch_size = input_tensor.shape[0] if input_tensor.dim() > 0 else 1

    return {
        'mean_ms': mean_ms,
        'std_ms': np.std(times),
        'min_ms': np.min(times),
        'throughput': batch_size * 1000 / mean_ms,
    }

print("Benchmark function ready.")

In [None]:
# Cell 4: single-layer sweep at BERT-Base dimensions, FP32 vs FP16
print("=" * 60)
print("BERT-BASE CONFIGURATION")
print("=" * 60)

# Model dimensions shared by this cell and the full-encoder cell.
config_base = dict(
    hidden_size=768,
    num_heads=12,
    intermediate_size=3072,
    num_layers=12,
)

batch_sizes = [1, 8, 32]
seq_lengths = [128, 512]

# One dict per (batch, seq_len) combination; consumed by the summary cell.
results_base = []

# Constructor arguments are identical for every layer built below.
layer_width = config_base['hidden_size']
layer_args = (
    layer_width,
    config_base['num_heads'],
    config_base['intermediate_size'],
)

for batch, seq_len in [(b, s) for b in batch_sizes for s in seq_lengths]:
    # Single-precision layer and matching random input
    layer_fp32 = TransformerBlock(*layer_args).cuda().float()
    input_fp32 = torch.randn(
        batch, seq_len, layer_width, device='cuda', dtype=torch.float32
    )
    result_fp32 = benchmark_model(layer_fp32, input_fp32)

    # Half-precision layer and matching random input
    layer_fp16 = TransformerBlock(*layer_args).cuda().half()
    input_fp16 = torch.randn(
        batch, seq_len, layer_width, device='cuda', dtype=torch.float16
    )
    result_fp16 = benchmark_model(layer_fp16, input_fp16)

    speedup = result_fp32['mean_ms'] / result_fp16['mean_ms']

    print(f"Batch={batch}, Seq={seq_len}:")
    for label, stats in (("FP32", result_fp32), ("FP16", result_fp16)):
        print(f"  {label}: {stats['mean_ms']:.2f} ms ({stats['throughput']:.1f} samples/s)")
    print(f"  Speedup: {speedup:.2f}x")

    results_base.append(dict(
        batch=batch,
        seq_len=seq_len,
        fp32_ms=result_fp32['mean_ms'],
        fp16_ms=result_fp16['mean_ms'],
        speedup=speedup,
    ))

    # Drop this configuration's GPU objects before building the next one.
    del layer_fp32, layer_fp16, input_fp32, input_fp16
    torch.cuda.empty_cache()

# Remove the scratch names introduced by this cell.
del layer_width, layer_args, label, stats

In [None]:
# Cell 5: Full 12-Layer BERT Test
# Section banner: a blank line followed by a 60-character rule, the section
# title, and a closing rule.
print("\n" + "=" * 60)
print("FULL 12-LAYER BERT-BASE")
print("=" * 60)

class BERTEncoder(nn.Module):
    """Sequential stack of ``num_layers`` identically configured TransformerBlocks.

    Args:
        hidden_size: Passed to every TransformerBlock.
        num_heads: Passed to every TransformerBlock.
        intermediate_size: Passed to every TransformerBlock.
        num_layers: How many blocks to stack.
    """

    def __init__(self, hidden_size, num_heads, intermediate_size, num_layers):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            stack.append(TransformerBlock(hidden_size, num_heads, intermediate_size))
        # ModuleList registers each block so its parameters follow .cuda()/.half().
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Feed ``x`` through each block in order and return the final output."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return hidden

# Test with batch=8, seq=128 (common inference scenario)
batch, seq_len = 8, 128


def _peak_inference_mb(model, inputs):
    """Return the peak GPU footprint (MB) of one inference pass of ``model``.

    The forward pass runs under ``torch.no_grad()`` so activations are not
    retained for a backward pass. The transient peak is measured relative to
    what was already allocated before the pass, then the model's own
    parameters/buffers and the input are added back. Other tensors resident
    on the GPU (such as the model of the other precision) therefore do not
    inflate the figure.
    """
    torch.cuda.synchronize()
    baseline = torch.cuda.memory_allocated()
    # After the reset the peak counter starts from the current allocation.
    torch.cuda.reset_peak_memory_stats()
    with torch.no_grad():
        model(inputs)  # output is discarded immediately
    torch.cuda.synchronize()
    transient = torch.cuda.max_memory_allocated() - baseline

    resident = sum(p.numel() * p.element_size() for p in model.parameters())
    resident += sum(b.numel() * b.element_size() for b in model.buffers())
    resident += inputs.numel() * inputs.element_size()
    return (resident + transient) / 1e6


# FP32 Full Model
bert_fp32 = BERTEncoder(
    config_base['hidden_size'],
    config_base['num_heads'],
    config_base['intermediate_size'],
    config_base['num_layers']
).cuda().float()

x_fp32 = torch.randn(batch, seq_len, config_base['hidden_size'], device='cuda', dtype=torch.float32)
result_fp32 = benchmark_model(bert_fp32, x_fp32, warmup=5, runs=20)

# FP16 Full Model
bert_fp16 = BERTEncoder(
    config_base['hidden_size'],
    config_base['num_heads'],
    config_base['intermediate_size'],
    config_base['num_layers']
).cuda().half()

x_fp16 = torch.randn(batch, seq_len, config_base['hidden_size'], device='cuda', dtype=torch.float16)
result_fp16 = benchmark_model(bert_fp16, x_fp16, warmup=5, runs=20)

speedup = result_fp32['mean_ms'] / result_fp16['mean_ms']

print(f"Configuration: Batch={batch}, Seq={seq_len}, Layers={config_base['num_layers']}")
print(f"FP32: {result_fp32['mean_ms']:.2f} ms")
print(f"FP16: {result_fp16['mean_ms']:.2f} ms")
print(f"Speedup: {speedup:.2f}x")

# Memory usage: measured per model under no_grad and relative to a baseline,
# so autograd bookkeeping and the other model's weights are excluded.
mem_fp32 = _peak_inference_mb(bert_fp32, x_fp32)
mem_fp16 = _peak_inference_mb(bert_fp16, x_fp16)

print(f"\nMemory (peak):")
print(f"FP32: {mem_fp32:.1f} MB")
print(f"FP16: {mem_fp16:.1f} MB")
print(f"Savings: {(1 - mem_fp16/mem_fp32)*100:.1f}%")

In [None]:
# Cell 6: Summary
# Re-prints the single-layer sweep as a table, followed by the full-encoder
# latency, speedup and memory figures computed in the earlier cells.
print("\n" + "=" * 60)
print("ZENITH TRANSFORMER BENCHMARK SUMMARY")
print("=" * 60)

print("\nSingle Layer Results:")
print(f"{'Batch':<8} {'Seq':<8} {'FP32 (ms)':<12} {'FP16 (ms)':<12} {'Speedup':<10}")
print("-" * 50)
for r in results_base:
    # Attach the 'x' suffix before padding; padding the number first leaves a
    # gap between the value and the suffix (e.g. "1.85      x").
    speedup_cell = f"{r['speedup']:.2f}x"
    print(f"{r['batch']:<8} {r['seq_len']:<8} {r['fp32_ms']:<12.2f} {r['fp16_ms']:<12.2f} {speedup_cell:<10}")

print("\nFull 12-Layer BERT-Base (batch=8, seq=128):")
print(f"  FP32: {result_fp32['mean_ms']:.2f} ms")
print(f"  FP16: {result_fp16['mean_ms']:.2f} ms (Tensor Cores)")
print(f"  Speedup: {speedup:.2f}x")
print(f"  Memory Savings: {(1 - mem_fp16/mem_fp32)*100:.1f}%")

print("\n" + "=" * 60)
print("This benchmark validates Zenith's optimization targets:")
print("- FP16 Tensor Core utilization")
print("- Memory efficiency improvement")
print("- End-to-end transformer acceleration")
print("=" * 60)