# Zenith Phase 5 - Tensor Core WMMA Testing

**Target GPU**: NVIDIA T4 (65 TFLOPS FP16)

**Goal**: Verify that Tensor Cores are being used, by comparing FP16 matmul throughput on the GPU against the FP32 GPU and NumPy CPU baselines

In [None]:
# Cell 1: Check GPU
# Runs the `nvidia-smi` command-line tool through the notebook's `!` shell
# escape so its report is shown as this cell's output.
# NOTE(review): the later cells compare against NVIDIA T4 figures -- confirm
# the GPU reported here is a T4 before reading the efficiency numbers.
!nvidia-smi

In [None]:
# Cell 2: Install CuPy for GPU operations
# Installs the `cupy-cuda11x` package with pip via the notebook's `!` shell
# escape; `-q` keeps pip's output quiet.
# NOTE(review): presumably `cupy-cuda11x` is the wheel built for CUDA 11.x --
# confirm that matches the CUDA version of this runtime, otherwise install the
# wheel that does.
!pip install cupy-cuda11x -q

In [None]:
# Cell 3: NumPy Baseline (CPU)
# Times float32 matrix multiplication with NumPy to give a CPU reference
# point. The names `np`, `time` and `sizes` bound here are reused by the
# GPU cells that follow.
import numpy as np
import time

rule = "=" * 50
print(rule)
print("NUMPY BASELINE (CPU)")
print(rule)

# Each entry is (m, n, k): an (m x k) matrix multiplied by a (k x n) matrix.
sizes = [(1024, 1024, 1024), (2048, 2048, 2048), (4096, 4096, 4096)]

for m, n, k in sizes:
    lhs = np.random.randn(m, k).astype(np.float32)
    rhs = np.random.randn(k, n).astype(np.float32)

    # One untimed product before measuring (warmup).
    warmup_out = lhs @ rhs

    times = []
    for _run in range(5):
        t_begin = time.perf_counter()
        product = lhs @ rhs
        t_end = time.perf_counter()
        # Seconds -> milliseconds.
        times.append((t_end - t_begin) * 1000)

    mean_ms = np.mean(times)
    # A matmul of these shapes is counted as 2*m*n*k floating-point operations.
    flop_count = 2 * m * n * k
    mean_seconds = mean_ms / 1000
    tflops = flop_count / mean_seconds / 1e12
    print(f"{m}x{n}x{k}: {mean_ms:.1f} ms, {tflops:.3f} TFLOPS")

In [None]:
# Cell 4: cuBLAS FP32 (GPU)
# Times float32 matrix multiplication on the GPU through CuPy, for comparison
# with the CPU baseline. Relies on `np`, `time` and `sizes` from Cell 3; the
# `cp` alias imported here is reused by the FP16 cell.
import cupy as cp

divider = "=" * 50
print(divider)
print("CUBLAS FP32 (GPU)")
print(divider)

# Shorthand: wait on the host until the default (null) stream has finished.
wait_for_gpu = cp.cuda.Stream.null.synchronize

for m, n, k in sizes:
    lhs_gpu = cp.random.randn(m, k).astype(cp.float32)
    rhs_gpu = cp.random.randn(k, n).astype(cp.float32)

    # One untimed product, fenced by synchronizations (warmup).
    wait_for_gpu()
    warmup_out = lhs_gpu @ rhs_gpu
    wait_for_gpu()

    times = []
    for _trial in range(20):
        # Synchronize on both sides of the product so the wall-clock interval
        # brackets this matmul's execution on the device.
        wait_for_gpu()
        t_begin = time.perf_counter()
        product_gpu = lhs_gpu @ rhs_gpu
        wait_for_gpu()
        t_end = time.perf_counter()
        # Seconds -> milliseconds.
        times.append((t_end - t_begin) * 1000)

    mean_ms = np.mean(times)
    # A matmul of these shapes is counted as 2*m*n*k floating-point operations.
    flop_count = 2 * m * n * k
    mean_seconds = mean_ms / 1000
    tflops = flop_count / mean_seconds / 1e12
    print(f"{m}x{n}x{k}: {mean_ms:.2f} ms, {tflops:.2f} TFLOPS")

print("\nT4 FP32 Theoretical: 8.1 TFLOPS")

In [None]:
# Cell 5: FP16 TENSOR CORE (GPU) - Main Test
# Same measurement as the FP32 GPU cell, but with float16 operands, and the
# achieved throughput is also reported as a percentage of the 65 TFLOPS
# figure. Relies on `cp` from Cell 4 and on `np`, `time`, `sizes` from Cell 3.
divider = "=" * 50
print(divider)
print("FP16 TENSOR CORE (GPU)")
print(divider)

# Reference figure for the efficiency percentage; keep it in step with the
# "Theoretical" line printed at the end of this cell.
peak_fp16_tflops = 65

# Shorthand: wait on the host until the default (null) stream has finished.
wait_for_gpu = cp.cuda.Stream.null.synchronize

for m, n, k in sizes:
    lhs_half = cp.random.randn(m, k).astype(cp.float16)
    rhs_half = cp.random.randn(k, n).astype(cp.float16)

    # One untimed product, fenced by synchronizations (warmup).
    wait_for_gpu()
    warmup_out = lhs_half @ rhs_half
    wait_for_gpu()

    times = []
    for _trial in range(20):
        # Synchronize on both sides of the product so the wall-clock interval
        # brackets this matmul's execution on the device.
        wait_for_gpu()
        t_begin = time.perf_counter()
        product_half = lhs_half @ rhs_half
        wait_for_gpu()
        t_end = time.perf_counter()
        # Seconds -> milliseconds.
        times.append((t_end - t_begin) * 1000)

    mean_ms = np.mean(times)
    # A matmul of these shapes is counted as 2*m*n*k floating-point operations.
    flop_count = 2 * m * n * k
    mean_seconds = mean_ms / 1000
    tflops = flop_count / mean_seconds / 1e12
    efficiency = (tflops / peak_fp16_tflops) * 100
    print(f"{m}x{n}x{k}: {mean_ms:.2f} ms, {tflops:.1f} TFLOPS ({efficiency:.0f}% efficiency)")

print("\nT4 FP16 Theoretical: 65 TFLOPS")
print("Target: >40 TFLOPS (>60% efficiency)")

In [None]:
# Cell 6: Summary
# Prints fixed guidance for reading the FP16 numbers from the previous cell.
# Nothing is computed here; the thresholds are plain text.
rule = "=" * 50
summary_lines = [
    "\n" + rule,  # leading blank line, then the top rule
    "ZENITH PHASE 5 VERIFICATION",
    rule,
    "",
    "If FP16 TFLOPS > 40: Tensor Cores WORKING",
    "If FP16 TFLOPS < 20: Tensor Cores NOT USED",
    "",
    "Report these results to continue to Phase 6.",
]
# One print of the joined lines emits the same text as printing each in turn.
print("\n".join(summary_lines))