# Zenith Testing Notebook

Testing Triton integration, load testing, and QAT on Google Colab.


## 1. Setup

In [None]:
# Clone repository
!git clone https://github.com/vibeswithkk/ZENITH.git
%cd ZENITH

In [None]:
# Install dependencies
!pip install -q numpy pytest requests

In [None]:
import sys

# Push the repository root and the test folders onto the front of sys.path.
# Each entry is inserted at index 0, so the last one listed is searched first.
for search_dir in ('.', './tests', './tests/integration'):
    sys.path.insert(0, search_dir)

import numpy as np

print(f"NumPy version: {np.__version__}")
print("Setup complete!")

## 2. Triton Client Testing

In [None]:
from zenith.serving.triton_client import (
    MockTritonClient,
    InferenceInput,
    ModelMetadata
)

# Build the mock Triton client, handing it the server address string
server_url = "localhost:8000"
client = MockTritonClient(server_url)

# Register a test model
def model_handler(inputs):
    """Return the first input's data multiplied by 2 under the "output" key.

    Args:
        inputs: Sequence whose first element exposes a ``data`` attribute
            that supports ``* 2``.

    Returns:
        A single-entry dict ``{"output": <doubled data>}``.
    """
    first_input = inputs[0]
    doubled = first_input.data * 2
    return {"output": doubled}

# Register "test_model" on the mock client together with its handler
test_model_metadata = ModelMetadata(name="test_model", platform="python", versions=["1"])
client.register_model("test_model", metadata=test_model_metadata, handler=model_handler)

# Query and print each status in turn
server_live = client.is_server_live()
print(f"Server Live: {server_live}")

server_ready = client.is_server_ready()
print(f"Server Ready: {server_ready}")

model_ready = client.is_model_ready('test_model')
print(f"Model Ready: {model_ready}")

registered_models = client.list_models()
print(f"Models: {registered_models}")

In [None]:
# Send one inference request to "test_model" and print the response fields
input_data = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
single_input = InferenceInput(name="input", data=input_data)
inputs = [single_input]

result = client.infer("test_model", inputs)

print(f"Success: {result.success}")
print(f"Model: {result.model_name}")
print(f"Latency: {result.latency_ms:.3f} ms")
print(f"Input: {input_data}")

# Fetch the tensor named "output" from the result just before printing it
model_output = result.get_output('output')
print(f"Output: {model_output}")

## 3. Load Testing

In [None]:
# triton_load_test is imported by bare module name, so its folder must
# already be on sys.path
from triton_load_test import run_mock_load_test

# Baseline run: num_requests=100, concurrent_workers=10
baseline_load_args = {
    "model_name": "load_test_model",
    "num_requests": 100,
    "concurrent_workers": 10,
    "verbose": True,
}
result = run_mock_load_test(**baseline_load_args)

In [None]:
# Heavier run: num_requests=500, concurrent_workers=50
print("\nHIGH CONCURRENCY TEST")
high_concurrency_args = {
    "model_name": "high_concurrency_model",
    "num_requests": 500,
    "concurrent_workers": 50,
    "verbose": True,
}
result = run_mock_load_test(**high_concurrency_args)

## 4. QAT Testing

In [None]:
from zenith.optimization.qat import FakeQuantize, fold_bn_into_conv

# Test FakeQuantize
# Seeded generator so the printed error figures are the same on every run.
rng = np.random.default_rng(42)

fq = FakeQuantize(num_bits=8, symmetric=True)
data = rng.standard_normal(1000).astype(np.float32) * 3
fq.observe(data)
quantized = fq.forward(data)

# Element-wise absolute difference between the input and fq.forward's output
error = np.abs(data - quantized)
print(f"Max Error: {np.max(error):.6f}")
print(f"Mean Error: {np.mean(error):.6f}")
print(f"Scale: {fq.scale}")

In [None]:
# Test BN Folding
# Seeded generator so the inputs are the same on every run.
rng = np.random.default_rng(42)

weight = rng.standard_normal((4, 3, 3, 3)).astype(np.float32)
bias = rng.standard_normal(4).astype(np.float32)
bn_mean = rng.standard_normal(4).astype(np.float32)
# abs(...) + 0.1 keeps every variance entry strictly positive
bn_var = np.abs(rng.standard_normal(4).astype(np.float32)) + 0.1
bn_gamma = rng.standard_normal(4).astype(np.float32)
bn_beta = rng.standard_normal(4).astype(np.float32)

folded_weight, folded_bias = fold_bn_into_conv(weight, bias, bn_mean, bn_var, bn_gamma, bn_beta)
print(f"Weight shape: {folded_weight.shape}")
print(f"Bias shape: {folded_bias.shape}")

# Only report success after checking the result.
# NOTE(review): assumes folding preserves the conv weight/bias shapes, as the
# standard per-output-channel rescale does; confirm against fold_bn_into_conv.
assert folded_weight.shape == weight.shape, (
    f"folded weight shape {folded_weight.shape} != {weight.shape}"
)
assert folded_bias.shape == bias.shape, (
    f"folded bias shape {folded_bias.shape} != {bias.shape}"
)
assert np.all(np.isfinite(folded_weight)), "folded weight contains NaN/inf"
assert np.all(np.isfinite(folded_bias)), "folded bias contains NaN/inf"
print("BN folding success!")

## 5. Run All Tests

In [None]:
!python -m pytest tests/test_triton_integration.py -v --tb=short 2>&1 | head -50

In [None]:
!python -m pytest tests/test_qat.py -v --tb=short 2>&1 | head -50

In [None]:
!python -m pytest tests/test_triton_backend.py -v --tb=short

## 6. QAT Benchmark

In [None]:
# Run benchmarks/qat_benchmark.py with --model resnet50 and --iterations 50
!python benchmarks/qat_benchmark.py --model resnet50 --iterations 50

In [None]:
# Run benchmarks/qat_benchmark.py with --model bert-base and --iterations 50
!python benchmarks/qat_benchmark.py --model bert-base --iterations 50

## Done!

All tests should pass. Check outputs above.