# EdgeLLM CUDA Inference Test

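This notebook builds the CUDA T-MAC kernel library for GPU-accelerated BitNet inference, runs its bundled unit tests, and then exercises and benchmarks the RMSNorm and Softmax kernels from Python.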

**Requirements:**
- NVIDIA GPU (Jetson, RTX, etc.)
- CUDA Toolkit 11.0+
- nvcc compiler

## 1. Check GPU Environment

In [None]:
# Check NVIDIA GPU: list the GPU(s) and driver visible to this environment
!nvidia-smi

In [None]:
# Check CUDA version: the Requirements section asks for CUDA Toolkit 11.0+ and the nvcc compiler
!nvcc --version

In [None]:
# Get GPU details: query name, total memory and compute capability, printed as CSV
# NOTE(review): the compute_cap query field may be unavailable in older nvidia-smi releases — confirm on the target device
!nvidia-smi --query-gpu=name,memory.total,compute_cap --format=csv

## 2. Clone Repository and Build CUDA Kernels

In [None]:
# Clone the repository (if not already present)
import os
if not os.path.exists('ollama-api-gateway'):
    !git clone https://github.com/umerkhan95/ollama-api-gateway.git
else:
    print('Repository already exists, pulling latest changes...')
    !cd ollama-api-gateway && git pull

In [None]:
# Navigate to kernels directory
%cd ollama-api-gateway/mojo-gateway/src/kernels

In [None]:
# Build CUDA kernels by running the `cuda` make target in the current (kernels) directory.
# The Python test section later loads ../../lib/libtmac_kernel_cuda.so and tells the user to run this step if it is missing.
!make cuda

In [None]:
# Verify build output: list the lib directory two levels up, where the CUDA shared library is expected
!ls -la ../../lib/

## 3. Run CUDA Kernel Tests

In [None]:
# Run CUDA unit tests via the `cuda-test` make target (run from the kernels directory)
!make cuda-test

## 4. Python CUDA Kernel Test

Test the CUDA kernels directly from Python using ctypes.

In [None]:
import ctypes
import numpy as np
import os

# Shared library location, relative to the current working directory
lib_path = '../../lib/libtmac_kernel_cuda.so'

# Stop early with a build hint when the library has not been produced yet
library_missing = not os.path.exists(lib_path)
if library_missing:
    raise FileNotFoundError(f'CUDA library not found at {lib_path}. Run make cuda first.')

# Open the shared library through the standard ctypes loader
cuda_lib = ctypes.cdll.LoadLibrary(lib_path)
print(f'Loaded CUDA library: {lib_path}')

In [None]:
# Define function signatures so ctypes converts arguments and return values
# correctly. The zero-argument functions get an explicit empty argtypes list so
# that accidentally passing arguments raises instead of being silently accepted.
cuda_lib.cuda_available.argtypes = []
cuda_lib.cuda_available.restype = ctypes.c_int
cuda_lib.cuda_device_name.argtypes = []
cuda_lib.cuda_device_name.restype = ctypes.c_char_p
cuda_lib.cuda_init.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
cuda_lib.cuda_init.restype = ctypes.c_int
cuda_lib.cuda_cleanup.argtypes = []
cuda_lib.cuda_cleanup.restype = None

# Check CUDA availability
if cuda_lib.cuda_available():
    # With a c_char_p restype ctypes returns None for a NULL pointer, so guard
    # before decoding instead of failing with AttributeError.
    raw_device_name = cuda_lib.cuda_device_name()
    if raw_device_name:
        device_name = raw_device_name.decode('utf-8', errors='replace')
    else:
        device_name = 'unknown'
    print(f'CUDA Available: Yes')
    print(f'Device: {device_name}')
else:
    print('CUDA Not Available')

In [None]:
# Initialize CUDA
# Size limits passed to cuda_init (presumably buffer capacities — confirm).
# NOTE(review): max_weights was previously annotated "10MB"; whether these
# values are byte counts or element counts is defined by cuda_init — confirm.
max_weights = 10_000_000
max_activations = 1_000_000
max_output = 1_000_000

ret = cuda_lib.cuda_init(max_weights, max_activations, max_output)
if ret != 0:
    # Later cells call kernels that rely on a successful init, so stop here
    # rather than printing a message and continuing.
    raise RuntimeError(f'CUDA initialization failed (cuda_init returned {ret})')
print('CUDA initialized successfully')

In [None]:
# Test RMSNorm kernel: run it once on a small batch and compare the result
# with a NumPy reference, instead of trusting the return code alone.
c_float_p = ctypes.POINTER(ctypes.c_float)
cuda_lib.rmsnorm_cuda.argtypes = [
    c_float_p,       # output
    c_float_p,       # input
    c_float_p,       # weight
    ctypes.c_int,    # batch_size
    ctypes.c_int,    # size
    ctypes.c_float   # eps
]
cuda_lib.rmsnorm_cuda.restype = ctypes.c_int

# Create test data (seeded so the printed statistics are reproducible)
batch_size = 4
size = 256
eps = 1e-6

rng = np.random.default_rng(0)
input_data = rng.standard_normal((batch_size, size)).astype(np.float32)
weight_data = np.ones(size, dtype=np.float32)
output_data = np.zeros((batch_size, size), dtype=np.float32)

# Run RMSNorm on GPU
ret = cuda_lib.rmsnorm_cuda(
    output_data.ctypes.data_as(c_float_p),
    input_data.ctypes.data_as(c_float_p),
    weight_data.ctypes.data_as(c_float_p),
    batch_size,
    size,
    ctypes.c_float(eps)
)

if ret == 0:
    print('RMSNorm CUDA: SUCCESS')
    print(f'Input mean: {input_data.mean():.4f}')
    print(f'Output mean: {output_data.mean():.4f}')
    print(f'Output std: {output_data.std():.4f}')

    # Reference: x / sqrt(mean(x^2) + eps) * weight, computed per row.
    # NOTE(review): assumes the kernel implements this standard RMSNorm
    # definition — confirm against the CUDA source if a mismatch is reported.
    mean_square = np.mean(input_data.astype(np.float64) ** 2, axis=1, keepdims=True)
    expected = (input_data / np.sqrt(mean_square + eps) * weight_data).astype(np.float32)
    max_abs_err = float(np.abs(output_data - expected).max())
    verdict = 'OK' if np.allclose(output_data, expected, rtol=1e-4, atol=1e-4) else 'MISMATCH'
    print(f'Max abs error vs NumPy reference: {max_abs_err:.2e} ({verdict})')
else:
    print('RMSNorm CUDA: FAILED')

In [None]:
# Test Softmax kernel: run it once and check the result numerically.
# Reuses batch_size and size from the RMSNorm test cell.
cuda_lib.softmax_cuda.argtypes = [
    ctypes.POINTER(ctypes.c_float),  # output
    ctypes.POINTER(ctypes.c_float),  # input
    ctypes.c_int,                     # batch_size
    ctypes.c_int                      # size
]
cuda_lib.softmax_cuda.restype = ctypes.c_int

# Create test data
logits = np.random.randn(batch_size, size).astype(np.float32) * 2
probs = np.zeros((batch_size, size), dtype=np.float32)

# Run Softmax on GPU
ret = cuda_lib.softmax_cuda(
    probs.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    logits.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    batch_size,
    size
)

if ret == 0:
    print('Softmax CUDA: SUCCESS')
    # Verify softmax sums to 1
    for b in range(batch_size):
        row_sum = probs[b].sum()
        print(f'  Batch {b} sum: {row_sum:.6f} (should be ~1.0)')

    # Check the row sums programmatically rather than by eye
    sums_ok = np.allclose(probs.sum(axis=1), 1.0, atol=1e-4)
    print(f'  Row sums within tolerance: {sums_ok}')

    # Row-wise reference softmax; the row maximum is subtracted before exp
    # for numerical stability.
    shifted = logits.astype(np.float64) - logits.max(axis=1, keepdims=True)
    expected = np.exp(shifted)
    expected /= expected.sum(axis=1, keepdims=True)
    max_abs_err = float(np.abs(probs - expected).max())
    verdict = 'OK' if np.allclose(probs, expected, rtol=1e-4, atol=1e-6) else 'MISMATCH'
    print(f'  Max abs error vs NumPy reference: {max_abs_err:.2e} ({verdict})')
else:
    print('Softmax CUDA: FAILED')

## 5. Performance Benchmark

In [None]:
import time

# Benchmark RMSNorm
batch_size = 32
size = 4096  # Typical hidden size
iterations = 1000

input_data = np.random.randn(batch_size, size).astype(np.float32)
weight_data = np.ones(size, dtype=np.float32)
output_data = np.zeros((batch_size, size), dtype=np.float32)

# Build the ctypes arguments once, so the timed loop measures the library call
# rather than repeated pointer construction in Python.
# NOTE(review): the buffers are host NumPy arrays, so the measured time
# presumably includes any host/device copies done inside rmsnorm_cuda — confirm.
float_ptr_t = ctypes.POINTER(ctypes.c_float)
rmsnorm_args = (
    output_data.ctypes.data_as(float_ptr_t),
    input_data.ctypes.data_as(float_ptr_t),
    weight_data.ctypes.data_as(float_ptr_t),
    batch_size, size, ctypes.c_float(1e-6),
)

# Warmup; fail fast on a non-zero return code, since timing failing calls
# would produce meaningless numbers.
for _ in range(10):
    ret = cuda_lib.rmsnorm_cuda(*rmsnorm_args)
    if ret != 0:
        raise RuntimeError(f'rmsnorm_cuda failed during warmup (returned {ret})')

# Benchmark
cuda_lib.cuda_sync()  # Ensure warmup is done
start = time.perf_counter()
for _ in range(iterations):
    cuda_lib.rmsnorm_cuda(*rmsnorm_args)
cuda_lib.cuda_sync()  # Wait for outstanding GPU work before stopping the clock
end = time.perf_counter()

total_time = end - start
per_call = total_time / iterations * 1000  # ms
throughput = iterations / total_time

print(f'RMSNorm Benchmark ({batch_size}x{size}):')
print(f'  Total time: {total_time:.3f}s for {iterations} iterations')
print(f'  Per call: {per_call:.3f}ms')
print(f'  Throughput: {throughput:.1f} calls/sec')

In [None]:
# Benchmark Softmax
# Reuses batch_size, size, iterations and the time import from the RMSNorm
# benchmark cell.
logits = np.random.randn(batch_size, size).astype(np.float32)
probs = np.zeros((batch_size, size), dtype=np.float32)

# Build the ctypes arguments once, so the timed loop measures the library call
# rather than repeated pointer construction in Python.
softmax_args = (
    probs.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    logits.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
    batch_size, size,
)

# Warmup; fail fast on a non-zero return code, since timing failing calls
# would produce meaningless numbers.
for _ in range(10):
    ret = cuda_lib.softmax_cuda(*softmax_args)
    if ret != 0:
        raise RuntimeError(f'softmax_cuda failed during warmup (returned {ret})')

# Benchmark
cuda_lib.cuda_sync()  # Ensure warmup is done
start = time.perf_counter()
for _ in range(iterations):
    cuda_lib.softmax_cuda(*softmax_args)
cuda_lib.cuda_sync()  # Wait for outstanding GPU work before stopping the clock
end = time.perf_counter()

total_time = end - start
per_call = total_time / iterations * 1000  # ms
throughput = iterations / total_time

print(f'Softmax Benchmark ({batch_size}x{size}):')
print(f'  Total time: {total_time:.3f}s for {iterations} iterations')
print(f'  Per call: {per_call:.3f}ms')
print(f'  Throughput: {throughput:.1f} calls/sec')

In [None]:
# Cleanup: call the library's cuda_cleanup() (declared earlier with no return value)
# NOTE(review): presumably releases what cuda_init allocated, so kernels should not be called
# again without re-running the initialization cell — confirm against the library source
cuda_lib.cuda_cleanup()
print('CUDA resources cleaned up')

## 6. Summary

This notebook tested:
1. CUDA environment detection
2. Building CUDA kernels
3. CUDA kernel unit tests (`make cuda-test`)
4. RMSNorm kernel functionality
5. Softmax kernel functionality
6. Performance benchmarks

**Next Steps:**
- Test T-MAC matmul kernel with real model weights
- Compare performance vs CPU (AVX2/NEON)
- Run full inference pipeline