# Zenith 0.3.4 Verification

**NEW in 0.3.4:** T4-optimized Triton autotune configs

In [None]:
# 1. GPU Check
!nvidia-smi | head -15
import torch
print(f"\nPyTorch: {torch.__version__}")
print(f"GPU: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "No GPU")

In [None]:
# 2. Install 0.3.4 (auto-restart)
# Remove any previously installed pyzenith. -y skips the confirmation prompt;
# 2>/dev/null hides whatever pip writes to stderr.
!pip uninstall pyzenith -y 2>/dev/null
# Install the exact release under test, bypassing pip's cache (--no-cache-dir)
# and with reduced output (-q).
!pip install pyzenith==0.3.4 --no-cache-dir -q
# Send signal 9 to the current process so the running kernel terminates and
# the next cell imports the freshly installed package in a new process.
# NOTE(review): this relies on the notebook host restarting the kernel
# automatically (as the "auto-restart" in the cell title suggests) -- confirm
# for the target environment. All in-memory state from earlier cells is lost.
import os; os.kill(os.getpid(), 9)

In [None]:
# 3. Verify
# Import the package in the fresh kernel and confirm the pinned release is
# the one that actually got loaded.
import zenith

expected_version = '0.3.4'
print(f"Zenith: {zenith.__version__}")
assert zenith.__version__ == expected_version

In [None]:
# 4. Triton Fused Kernel Benchmark
from zenith.runtime.triton_kernels import (
    benchmark_fused_linear_gelu,
    is_available,
)

# Print what the module's is_available() helper reports.
print(f"Triton: {is_available()}")

# Benchmark arguments, kept together in one place.
# NOTE(review): M/N/K are presumably the problem dimensions and `runs` the
# number of timed repetitions -- confirm against the zenith API.
bench_kwargs = {"M": 1024, "N": 4096, "K": 1024, "runs": 50}
result = benchmark_fused_linear_gelu(**bench_kwargs)

# The result is indexed by 'fused_ms', 'separate_ms' and 'speedup'.
print(f"Fused: {result['fused_ms']:.2f}ms | Separate: {result['separate_ms']:.2f}ms | Speedup: {result['speedup']:.2f}x")

In [None]:
# 5. torch.compile Benchmark
import torch, time

class MLP(torch.nn.Module):
    """Two-layer perceptron: Linear(512, 1024) -> ReLU -> Linear(1024, 512)."""

    def __init__(self):
        super().__init__()
        nn = torch.nn
        # Layers are created in this order so parameter initialisation
        # consumes the RNG in the same sequence as before.
        self.fc1 = nn.Linear(512, 1024)
        self.fc2 = nn.Linear(1024, 512)

    def forward(self, x):
        """Return fc2(relu(fc1(x)))."""
        hidden = self.fc1(x)
        activated = torch.relu(hidden)
        return self.fc2(activated)

def _total_ms(fn, inp, warmup=10, iters=100):
    """Return the wall-clock time, in milliseconds, of `iters` calls to `fn(inp)`.

    Runs `warmup` untimed calls first, then times `iters` calls as a single
    batch. Everything executes under `torch.no_grad()`.

    Args:
        fn: Callable invoked as `fn(inp)` (a module or its compiled wrapper).
        inp: Input passed unchanged to every call.
        warmup: Number of untimed calls made before the measurement.
        iters: Number of timed calls.

    Returns:
        Total elapsed milliseconds for all `iters` calls (not per call).
    """
    with torch.no_grad():
        for _ in range(warmup):
            fn(inp)
        # Drain queued GPU work so warm-up does not leak into the timed region.
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(iters):
            fn(inp)
        # Wait for the GPU to finish before reading the clock.
        torch.cuda.synchronize()
        return (time.perf_counter() - start) * 1000


model = MLP().cuda()
x = torch.randn(32, 512).cuda()

# Baseline: the eager model, measured before compilation.
baseline = _total_ms(model, x)

# Zenith: the same model wrapped by torch.compile with the 'zenith' backend.
compiled = torch.compile(model, backend='zenith')
zenith_time = _total_ms(compiled, x)

# Both figures are totals over the 100 timed iterations.
print(f"Baseline: {baseline:.2f}ms | Zenith: {zenith_time:.2f}ms | Speedup: {baseline/zenith_time:.2f}x")

In [None]:
# 6. Summary
# Import here as well so this cell works when run on its own: the install
# step kills the kernel, so names bound by earlier cells may be gone.
import torch
import zenith

# Guard the device query the same way the GPU check in step 1 does, so the
# summary still prints on a runtime without CUDA instead of raising.
if torch.cuda.is_available():
    device_label = torch.cuda.get_device_name(0)
else:
    device_label = "CPU (no GPU detected)"

print(f"\nZenith {zenith.__version__} on {device_label}")