# Zenith 0.3.5 - L2 Cache Optimized
**NEW:** `GROUP_SIZE_M` block grouping for improved L2 cache locality.

In [None]:
# Show the first 15 lines of `nvidia-smi` output (IPython shell escape).
!nvidia-smi | head -15
import torch
# Print the name PyTorch reports for CUDA device index 0.
print(f"\nGPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Remove any installed pyzenith without prompting (-y); stderr is discarded.
!pip uninstall pyzenith -y 2>/dev/null
# Install exactly pyzenith 0.3.5, quietly (-q) and bypassing pip's cache.
!pip install pyzenith==0.3.5 --no-cache-dir -q
# Send signal 9 to the current process. NOTE(review): presumably this forces
# the notebook runtime to restart so the fresh install is the one imported
# by the following cells -- confirm for the target notebook environment.
import os; os.kill(os.getpid(), 9)

In [None]:
import zenith

# Confirm that the running kernel imported the pinned pyzenith release.
# The assertion message states what was found and how to recover, since a
# stale version here usually means the runtime did not restart after install.
print(f"Version: {zenith.__version__}")
assert zenith.__version__ == '0.3.5', (
    f"expected pyzenith 0.3.5, found {zenith.__version__}; "
    "re-run the install cell and wait for the runtime to restart"
)

In [None]:
# Fused Kernel Benchmark
from zenith.runtime.triton_kernels import benchmark_fused_linear_gelu

# Run the packaged benchmark once, then print each reported metric with two
# decimals. The (label, result key, unit suffix) table fixes the print order.
result = benchmark_fused_linear_gelu(M=1024, N=4096, K=1024, runs=50)
for label, key, unit in (
    ("Fused", "fused_ms", "ms"),
    ("Separate", "separate_ms", "ms"),
    ("Speedup", "speedup", "x"),
):
    print(f"{label}: {result[key]:.2f}{unit}")

In [None]:
# torch.compile Benchmark
import torch
import time

class MLP(torch.nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        in_features: Size of the last dimension of the input.
        hidden_features: Output width of the first linear layer.
        out_features: Size of the last dimension of the output.

    The defaults reproduce the fixed 512 -> 1024 -> 512 layout, so ``MLP()``
    builds the same model as before the sizes became configurable.
    """

    def __init__(
        self,
        in_features: int = 512,
        hidden_features: int = 1024,
        out_features: int = 512,
    ):
        super().__init__()
        # Layers are created in the same order as before (fc1, then fc2).
        self.fc1 = torch.nn.Linear(in_features, hidden_features)
        self.fc2 = torch.nn.Linear(hidden_features, out_features)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))``."""
        return self.fc2(torch.relu(self.fc1(x)))

def _time_forward(fn, inputs, warmup=10, iters=100):
    """Return the total wall-clock milliseconds for ``iters`` calls of ``fn(inputs)``.

    Args:
        fn: Callable invoked as ``fn(inputs)``; its result is discarded.
        inputs: Argument passed unchanged to every call.
        warmup: Number of untimed calls made before the clock starts.
        iters: Number of timed calls.

    Returns:
        Elapsed time in milliseconds for all ``iters`` calls together
        (a total, not a per-call average).
    """
    with torch.no_grad():
        for _ in range(warmup):
            fn(inputs)
        # Wait for outstanding CUDA work so warm-up does not leak into the
        # timed region.
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            fn(inputs)
        # Wait again so the clock is read only after the timed calls finish.
        torch.cuda.synchronize()
        return (time.perf_counter() - start) * 1000


model = MLP().cuda()
x = torch.randn(32, 512).cuda()

# Baseline
baseline = _time_forward(model, x)

# Zenith
compiled = torch.compile(model, backend='zenith')
zenith_time = _time_forward(compiled, x)

# Both figures are totals over the 100 timed iterations.
print(f"Baseline: {baseline:.2f}ms | Zenith: {zenith_time:.2f}ms | Speedup: {baseline/zenith_time:.2f}x")