# Zenith Testing Notebook

Tests Triton integration, load testing, and QAT on Google Colab.

**Features Tested:**
- Triton Client (Mock)
- Load Testing
- QAT (Quantization-Aware Training)
- Benchmark Utilities

## 1. Setup

In [None]:
# Clone the ZENITH repository and switch the notebook's working directory
# into it, so that later cells can import the `zenith` and `tests` packages
# relative to the current directory.
# NOTE(review): re-running this cell after the `%cd` has taken effect will
# attempt the clone/cd again from inside ZENITH — run it once per session.
!git clone https://github.com/vibeswithkk/ZENITH.git
%cd ZENITH

In [None]:
# Install the third-party packages for this notebook (-q keeps pip quiet).
# numpy and pytest are used directly by the cells below; requests is
# presumably needed by the zenith client code — verify against the repo.
!pip install -q numpy pytest requests

In [None]:
import sys

# Put the current directory first on the import path so that packages in
# the working directory (e.g. `zenith`, `tests`) are importable.
sys.path[:0] = ['.']

import numpy as np

# Report the NumPy build in use, then confirm that setup finished.
numpy_version = np.__version__
print(f"NumPy version: {numpy_version}")
print("Setup complete!")

## 2. Triton Client Testing

In [None]:
from zenith.serving.triton_client import (
    MockTritonClient,
    InferenceInput,
    ModelMetadata
)

# Instantiate the mock Triton client, passing it the server address string
server_url = "localhost:8000"
client = MockTritonClient(server_url)

# Register a test model
def model_handler(inputs):
    """Return the first input's data multiplied by two, keyed as "output".

    Only ``inputs[0]`` is read; any further inputs are ignored.
    """
    first_input = inputs[0]
    doubled = first_input.data * 2
    return {"output": doubled}

# Describe the model first, then attach it and its handler to the client
test_model_metadata = ModelMetadata(
    name="test_model",
    platform="python",
    versions=["1"],
)
client.register_model(
    "test_model",
    metadata=test_model_metadata,
    handler=model_handler,
)

# Query each status endpoint in turn and print its answer
server_live = client.is_server_live()
print(f"Server Live: {server_live}")
server_ready = client.is_server_ready()
print(f"Server Ready: {server_ready}")
model_ready = client.is_model_ready('test_model')
print(f"Model Ready: {model_ready}")
available_models = client.list_models()
print(f"Models: {available_models}")

In [None]:
# Send one float32 input through "test_model" on the mock client
input_data = np.array(
    [[1.0, 2.0, 3.0]],
    dtype=np.float32,
)
inputs = [InferenceInput(name="input", data=input_data)]
result = client.infer("test_model", inputs)

# Report status and model name, then timing, then the input/output pair
for label, attribute in (("Success", "success"), ("Model", "model_name")):
    print(f"{label}: {getattr(result, attribute)}")
latency_ms = result.latency_ms
print(f"Latency: {latency_ms:.3f} ms")
print(f"Input: {input_data}")
output_data = result.get_output('output')
print(f"Output: {output_data}")

## 3. Load Testing

In [None]:
from tests.integration.triton_load_test import run_mock_load_test

# Mock load test: num_requests=100 with concurrent_workers=10, verbose output
load_test_settings = {
    "model_name": "load_test_model",
    "num_requests": 100,
    "concurrent_workers": 10,
    "verbose": True,
}
result = run_mock_load_test(**load_test_settings)

In [None]:
# Print a banner, then repeat the mock load test with larger settings
# (num_requests=500, concurrent_workers=50)
separator = "=" * 60
print("\n" + separator)
print("HIGH CONCURRENCY TEST")
print(separator)

high_concurrency_settings = {
    "model_name": "high_concurrency_model",
    "num_requests": 500,
    "concurrent_workers": 50,
    "verbose": True,
}
result = run_mock_load_test(**high_concurrency_settings)

## 4. QAT (Quantization-Aware Training) Testing

In [None]:
from zenith.optimization.qat import (
    FakeQuantize,
    QATConfig,
    fold_bn_into_conv
)

# Build an 8-bit symmetric fake quantizer
fq = FakeQuantize(num_bits=8, symmetric=True)

# 1000 standard-normal float32 samples, scaled by 3
data = 3 * np.random.randn(1000).astype(np.float32)

# Let the quantizer observe the data (calibration), then quantize it
fq.observe(data)
quantized = fq.forward(data)

# Element-wise absolute difference between the input and its quantized form
error = np.abs(data - quantized)
max_error = error.max()
mean_error = error.mean()
print(f"Max Error: {max_error:.6f}")
print(f"Mean Error: {mean_error:.6f}")

# Quantization parameters held by the quantizer after observe()/forward()
print(f"Scale: {fq.scale}")
print(f"Zero Point: {fq.zero_point}")

In [None]:
# Per-channel variant of the 8-bit symmetric quantizer, channels on axis 0
fq_perchannel = FakeQuantize(
    num_bits=8,
    symmetric=True,
    per_channel=True,
    channel_axis=0,
)

# Three rows of 100 float32 samples, each row drawn at a different magnitude
channel_scales = (1, 10, 100)
channel_data = np.stack(
    [np.random.randn(100).astype(np.float32) * scale for scale in channel_scales],
    axis=0,
)

fq_perchannel.observe(channel_data)
quantized = fq_perchannel.forward(channel_data)

print(f"Per-channel scales: {fq_perchannel.scale}")

# Mean absolute quantization error, reported row by row
for channel in range(len(channel_scales)):
    channel_error = np.mean(np.abs(channel_data[channel] - quantized[channel]))
    print(f"Channel {channel} error: {channel_error:.6f}")

In [None]:
# Smoke-test fold_bn_into_conv on random float32 parameters
print("\nBatch Normalization Folding Test:")

# Every per-channel vector below has the same length as weight's first axis
num_channels = 4

# Convolution parameters
# (weight layout presumably (out, in, kH, kW) — confirm against fold_bn_into_conv)
weight = np.random.randn(num_channels, 3, 3, 3).astype(np.float32)
bias = np.random.randn(num_channels).astype(np.float32)

# BatchNorm statistics and affine parameters; the variance is made
# strictly positive by taking |x| + 0.1
bn_mean = np.random.randn(num_channels).astype(np.float32)
bn_var = np.abs(np.random.randn(num_channels).astype(np.float32)) + 0.1
bn_gamma = np.random.randn(num_channels).astype(np.float32)
bn_beta = np.random.randn(num_channels).astype(np.float32)

# Fold the BatchNorm parameters into the convolution
folded = fold_bn_into_conv(weight, bias, bn_mean, bn_var, bn_gamma, bn_beta)
folded_weight, folded_bias = folded

# Show the shapes before and after folding, side by side
shape_report = (
    ("Original weight shape", weight),
    ("Folded weight shape", folded_weight),
    ("Original bias shape", bias),
    ("Folded bias shape", folded_bias),
)
for label, tensor in shape_report:
    print(f"{label}: {tensor.shape}")
print("BN folding successful!")

## 5. Run All Tests

In [None]:
# Run the Triton integration test module with pytest in a subprocess
# (-v: verbose per-test output, --tb=short: shortened tracebacks)
!python -m pytest tests/test_triton_integration.py -v --tb=short

In [None]:
# Run the QAT test module with pytest in a subprocess
# (-v: verbose per-test output, --tb=short: shortened tracebacks)
!python -m pytest tests/test_qat.py -v --tb=short

In [None]:
# Run the Triton backend test module with pytest in a subprocess
# (-v: verbose per-test output, --tb=short: shortened tracebacks)
!python -m pytest tests/test_triton_backend.py -v --tb=short

## 6. QAT Benchmark

In [None]:
# Run the QAT benchmark script as a subprocess, selecting the resnet50
# model and requesting 50 iterations
!python benchmarks/qat_benchmark.py --model resnet50 --iterations 50

In [None]:
# Run the same QAT benchmark script as a subprocess, this time selecting
# the bert-base model and requesting 50 iterations
!python benchmarks/qat_benchmark.py --model bert-base --iterations 50

## Summary

Once every cell above has run, check the outputs for:
- Triton Client: Mock inference working
- Load Testing: Throughput and latency metrics
- QAT: Quantization error bounds
- BN Folding: Weight transformation
- Full test suite: pytest results