# EdgeLLM INT8 Tensor Core Benchmark

This notebook benchmarks the INT8 Tensor Core implementation for EdgeLLM on a Tesla T4 GPU.

**Tests:**
1. GPU hardware detection and Tensor Core capability
2. Kernel compilation with WMMA support
3. Weight expansion (2-bit to INT8)
4. Activation quantization (FP32 to INT8)
5. INT8 Tensor Core matmul performance
6. Comparison: Phase 2.1 vs INT8 TC
7. Precision validation

**Target:** 1000+ tok/s (vs 630 tok/s Phase 2.1)

## 1. Environment Setup

In [None]:
import subprocess
import os
import sys
import time
import json
from datetime import datetime

# Check GPU
def parse_gpu_info(csv_line):
    """Parse one row of ``nvidia-smi --format=csv,noheader`` output.

    Args:
        csv_line: A single output row whose fields are, in order, name,
            compute_cap, memory.total and driver_version. Trailing fields
            may be missing.

    Returns:
        Tuple ``(name, compute_cap, vram_mb)``. Fields that are missing, empty
        or not parseable fall back to ``'Unknown'``, ``'0.0'`` and ``0``.
    """
    fields = [field.strip() for field in csv_line.split(',')]
    name = fields[0] if fields[0] else 'Unknown'
    compute_cap = fields[1] if len(fields) > 1 and fields[1] else '0.0'
    try:
        vram_mb = int(fields[2].replace('MiB', '').strip()) if len(fields) > 2 else 0
    except ValueError:
        # The field is present but not a number (for example "[N/A]").
        vram_mb = 0
    return name, compute_cap, vram_mb


try:
    result = subprocess.run(['nvidia-smi', '--query-gpu=name,compute_cap,memory.total,driver_version',
                             '--format=csv,noheader'], capture_output=True, text=True)
    # A non-zero status means stdout holds no usable rows (no driver, or a
    # query field this nvidia-smi does not know).
    gpu_rows = result.stdout.strip().splitlines() if result.returncode == 0 else []
except OSError:
    # nvidia-smi is not installed or cannot be executed.
    result = None
    gpu_rows = []

# nvidia-smi prints one row per GPU; only the first GPU is described below.
gpu_info = gpu_rows[0].strip() if gpu_rows else ''
print(f"GPU: {gpu_info}")
if not gpu_rows:
    print("WARNING: nvidia-smi returned no GPU information")
elif len(gpu_rows) > 1:
    print(f"NOTE: {len(gpu_rows)} GPUs detected; reporting the first one")

# Parse GPU info
GPU_NAME, COMPUTE_CAP, VRAM_MB = parse_gpu_info(gpu_info)

# Check if INT8 Tensor Cores are available (compute 7.5+)
try:
    cc_major, cc_minor = map(int, COMPUTE_CAP.split('.'))
except ValueError:
    # Not in "major.minor" form; treat as no capability.
    cc_major, cc_minor = 0, 0
cc_int = cc_major * 10 + cc_minor
HAS_INT8_TC = cc_int >= 75

print(f"\nCompute Capability: {COMPUTE_CAP} (cc={cc_int})")
print(f"INT8 Tensor Cores: {'Available' if HAS_INT8_TC else 'NOT AVAILABLE'}")
print(f"VRAM: {VRAM_MB} MB")

In [None]:
# Clone repository
# Start from a fresh shallow clone (--depth 1 fetches only the latest commit),
# then move into the mojo-gateway subdirectory; the build cell below changes
# directory relative to it.
# NOTE(review): every path here is relative to the current working directory
# and %cd persists across cells, so re-running this cell after a later %cd
# would clone into that directory instead. Restart the kernel before a re-run.
!rm -rf ollama-api-gateway
!git clone --depth 1 https://github.com/umerkhan95/ollama-api-gateway.git
%cd ollama-api-gateway/mojo-gateway

## 2. Build CUDA Kernels with Tensor Core Support

In [None]:
# Check CUDA version
# Prints the nvcc release banner so the compiler used for the builds below is
# recorded in the notebook output.
!nvcc --version

In [None]:
# Build with T4 Tensor Core support
%cd src/kernels/cuda
!make clean
!make t4
print("\nBuild complete!")
!ls -la ../../../lib/*.so 2>/dev/null || echo "Library not found"

## 3. Create Test Harness

In [None]:
%%writefile test_int8_tc.cu
/**
 * INT8 Tensor Core Benchmark Test
 * Tests weight expansion, activation quantization, and TC matmul
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda_runtime.h>
#include <chrono>
#include "tmac_kernel_cuda.h"

#define WARMUP_RUNS 10
#define BENCHMARK_RUNS 100

// Test configurations matching SmolLM-135M layers
// One matmul shape to benchmark: M outputs produced from K inputs.
// Field order matters: LAYERS is initialised positionally as {name, M, K}.
struct LayerConfig {
    const char* name;  // Label shown in the first column of the results table
    int M;  // Output rows
    int K;  // Inner dimension
};

// Shapes benchmarked by main(), written as {name, M, K}.
// main()'s throughput estimate sums the first four entries as one transformer
// layer, so those four must stay at the front of the array.
LayerConfig LAYERS[] = {
    {"QKV Projection", 1728, 576},
    {"Output Projection", 576, 576},
    {"FFN Up", 1536, 576},
    {"FFN Down", 576, 1536},
    {"Large Test", 4096, 4096},
};
// Number of entries, derived from the array so it follows any additions.
const int NUM_LAYERS = sizeof(LAYERS) / sizeof(LAYERS[0]);

// Generate random ternary weights
//
// Writes M * ((K + 3) / 4) bytes to `packed`, each holding four 2-bit codes
// drawn from {0, 1, 2} (0,1,2 -> -1,0,+1), and M values in [0.1, 0.2] to
// `scales`. All randomness comes from rand(), so call srand() first when
// reproducible data is needed.
void generate_ternary_weights(int8_t* packed, float* scales, int M, int K) {
    int packed_size = M * ((K + 3) / 4);
    for (int i = 0; i < packed_size; i++) {
        // Draw the four codes in separate statements, lowest bit pair first.
        // Putting four rand() calls into a single `|` expression leaves their
        // evaluation order unspecified, so the same seed could pack different
        // bytes depending on the compiler.
        int byte = 0;
        for (int shift = 0; shift < 8; shift += 2) {
            byte |= (rand() % 3) << shift;
        }
        packed[i] = (int8_t)byte;
    }
    for (int i = 0; i < M; i++) {
        scales[i] = 0.1f + (rand() / (float)RAND_MAX) * 0.1f;
    }
}

// Fill the first K entries of `act` with pseudo-random values in [-1, 1],
// one rand() draw per entry.
void generate_activations(float* act, int K) {
    for (int remaining = K; remaining > 0; --remaining) {
        const float unit = rand() / (float)RAND_MAX;  // in [0, 1]
        *act++ = 2.0f * unit - 1.0f;
    }
}

// Measure the mean host-side latency of one kernel launcher.
//
// kernel_fn is called WARMUP_RUNS times untimed and then BENCHMARK_RUNS times
// between two host clock readings; cudaDeviceSynchronize() is called after each
// batch, so the second reading is taken only after the device has gone idle.
//
// Returns the average wall-clock time per call in milliseconds. `name` is not
// used by the body, and kernel_fn's return value is ignored.
double benchmark_kernel(const char* name, int (*kernel_fn)(const float*, float*, int, int, int),
                        float* activations, float* output, int M, int N, int K) {
    using bench_clock = std::chrono::high_resolution_clock;

    // Untimed warmup batch.
    for (int left = WARMUP_RUNS; left > 0; --left) {
        kernel_fn(activations, output, M, N, K);
    }
    cudaDeviceSynchronize();

    // Timed batch.
    const bench_clock::time_point t_begin = bench_clock::now();
    for (int left = BENCHMARK_RUNS; left > 0; --left) {
        kernel_fn(activations, output, M, N, K);
    }
    cudaDeviceSynchronize();
    const bench_clock::time_point t_end = bench_clock::now();

    const std::chrono::duration<double, std::milli> elapsed = t_end - t_begin;
    return elapsed.count() / BENCHMARK_RUNS;
}

// Print `count` copies of `ch` with no trailing newline.
static void print_repeated(char ch, int count) {
    for (int i = 0; i < count; i++) {
        putchar(ch);
    }
}

// Print a 79-character horizontal rule made of `ch`, followed by a newline.
static void print_rule(char ch) {
    print_repeated(ch, 79);
    putchar('\n');
}

// Print the separator row of the results table: one run of dashes per column,
// sized to that column's header field width plus its two padding spaces.
static void print_table_rule() {
    const int column_widths[] = {22, 10, 10, 10, 10, 10};
    putchar('|');
    for (int width : column_widths) {
        print_repeated('-', width);
        putchar('|');
    }
    putchar('\n');
}

// Benchmark driver.
//
// For every entry in LAYERS it times the Phase 2.1 kernel and the INT8 Tensor
// Core kernel on the same random data, prints a comparison table, and warns when
// the two outputs differ by more than 2% relative error. It then re-times the
// first four layers with INT8 TC to estimate tokens per second, and prints the
// headline numbers as JSON.
//
// Returns 0 on success, 1 if CUDA initialisation or host allocation fails.
int main() {
    // C++ has no string-repetition operator, so rules are drawn by helpers.
    printf("\n");
    print_rule('=');
    printf("EdgeLLM INT8 Tensor Core Benchmark\n");
    print_rule('=');
    printf("\n");

    // Get device info
    int cc = cuda_get_compute_capability();
    int has_tc = cuda_has_int8_tensorcore();
    int num_gpus = cuda_get_device_count();

    printf("Device Info:\n");
    printf("  GPU: %s\n", cuda_device_name());
    printf("  Compute Capability: %d.%d (cc=%d)\n", cc/10, cc%10, cc);
    printf("  INT8 Tensor Cores: %s\n", has_tc ? "YES" : "NO");
    printf("  Number of GPUs: %d\n\n", num_gpus);

    // Initialize CUDA with capacities large enough for the biggest LAYERS entry
    int max_M = 4096;
    int max_K = 4096;
    int max_weights = max_M * ((max_K + 3) / 4);

    if (cuda_init(max_weights, max_K, max_M) != 0) {
        fprintf(stderr, "Failed to initialize CUDA\n");
        return 1;
    }

    // Allocate host buffers once at the maximum size and reuse them per layer
    int8_t* packed_weights = (int8_t*)malloc(max_weights);
    float* scales = (float*)malloc(max_M * sizeof(float));
    float* activations = (float*)malloc(max_K * sizeof(float));
    float* output_phase21 = (float*)malloc(max_M * sizeof(float));
    float* output_int8tc = (float*)malloc(max_M * sizeof(float));
    float* norm_weights = (float*)malloc(max_K * sizeof(float));

    if (!packed_weights || !scales || !activations ||
        !output_phase21 || !output_int8tc || !norm_weights) {
        fprintf(stderr, "Failed to allocate host buffers\n");
        // free(NULL) is a no-op, so every pointer can be released unconditionally.
        free(packed_weights);
        free(scales);
        free(activations);
        free(output_phase21);
        free(output_int8tc);
        free(norm_weights);
        cuda_cleanup();
        return 1;
    }

    // Initialize norm weights
    for (int i = 0; i < max_K; i++) {
        norm_weights[i] = 0.9f + 0.2f * (rand() / (float)RAND_MAX);
    }

    printf("\n");
    print_rule('-');
    printf("KERNEL BENCHMARK RESULTS\n");
    print_rule('-');
    printf("\n");

    printf("| %-20s | %8s | %8s | %8s | %8s | %8s |\n",
           "Layer", "M", "K", "Phase2.1", "INT8 TC", "Speedup");
    print_table_rule();

    double total_phase21 = 0;
    double total_int8tc = 0;

    for (int layer = 0; layer < NUM_LAYERS; layer++) {
        int M = LAYERS[layer].M;
        int K = LAYERS[layer].K;
        int N = 1;  // batch size = 1
        int weight_bytes = M * ((K + 3) / 4);

        // Generate test data
        generate_ternary_weights(packed_weights, scales, M, K);
        generate_activations(activations, K);

        // Load weights for Phase 2.1 (persistent)
        cuda_load_weights(packed_weights, scales, weight_bytes, M);
        cuda_load_norm_weights(norm_weights, K);

        // Benchmark Phase 2.1 (streaming fused).
        // The captureless lambda adapts the kernel's (act, out, M, K, eps)
        // signature to the five-int-argument shape benchmark_kernel expects.
        double ms_phase21 = benchmark_kernel("Phase2.1",
            [](const float* act, float* out, int m, int n, int k) -> int {
                return streaming_fused_rmsnorm_matmul_cuda(act, out, m, k, 1e-5f);
            },
            activations, output_phase21, M, N, K);

        // Load weights for INT8 TC
        cuda_load_weights_int8_tc(packed_weights, scales, weight_bytes, M, K);

        // Benchmark INT8 TC
        double ms_int8tc = benchmark_kernel("INT8 TC",
            [](const float* act, float* out, int m, int n, int k) -> int {
                return streaming_fused_rmsnorm_matmul_int8_tc(act, out, m, k, 1e-5f);
            },
            activations, output_int8tc, M, N, K);

        double speedup = ms_phase21 / ms_int8tc;
        total_phase21 += ms_phase21;
        total_int8tc += ms_int8tc;

        printf("| %-20s | %8d | %8d | %7.3fms | %7.3fms | %7.2fx |\n",
               LAYERS[layer].name, M, K, ms_phase21, ms_int8tc, speedup);

        // Precision validation: relative error of INT8 TC against Phase 2.1.
        // The 1e-6 term keeps the division finite when the reference is zero.
        float max_error = 0;
        float mean_error = 0;
        for (int i = 0; i < M; i++) {
            float err = fabsf(output_phase21[i] - output_int8tc[i]);
            float rel_err = err / (fabsf(output_phase21[i]) + 1e-6f);
            max_error = fmaxf(max_error, rel_err);
            mean_error += rel_err;
        }
        mean_error /= M;

        if (max_error > 0.02f) {
            printf("  WARNING: Precision issue! Max error: %.4f, Mean: %.6f\n", max_error, mean_error);
        }

        cuda_unload_weights_int8_tc();
        cuda_unload_weights();
    }

    print_table_rule();
    printf("| %-20s | %8s | %8s | %7.3fms | %7.3fms | %7.2fx |\n",
           "TOTAL", "", "", total_phase21, total_int8tc, total_phase21/total_int8tc);

    // Calculate throughput
    printf("\n");
    print_rule('-');
    printf("THROUGHPUT ESTIMATE\n");
    print_rule('-');
    printf("\n");

    // The first LAYERS_PER_BLOCK entries of LAYERS (QKV + Out + FFN_up +
    // FFN_down) are treated as one transformer layer.
    const int LAYERS_PER_BLOCK = 4;
    // NOTE(review): the published SmolLM-135M configuration appears to list 30
    // decoder layers (9 is its attention-head count) - confirm which figure this
    // estimate and the 630 tok/s baseline are meant to use before changing it.
    const int NUM_TRANSFORMER_LAYERS = 9;
    const double PHASE21_BASELINE_TOK_S = 630.0;
    const double OLLAMA_BASELINE_TOK_S = 423.0;

    double total_layer_ms = 0;
    for (int i = 0; i < LAYERS_PER_BLOCK && i < NUM_LAYERS; i++) {
        int M = LAYERS[i].M;
        int K = LAYERS[i].K;
        int weight_bytes = M * ((K + 3) / 4);

        generate_ternary_weights(packed_weights, scales, M, K);
        generate_activations(activations, K);
        cuda_load_weights_int8_tc(packed_weights, scales, weight_bytes, M, K);
        cuda_load_norm_weights(norm_weights, K);

        double ms = benchmark_kernel("INT8 TC",
            [](const float* act, float* out, int m, int n, int k) -> int {
                return streaming_fused_rmsnorm_matmul_int8_tc(act, out, m, k, 1e-5f);
            },
            activations, output_int8tc, M, 1, K);
        total_layer_ms += ms;

        cuda_unload_weights_int8_tc();
    }

    double per_token_ms = total_layer_ms * NUM_TRANSFORMER_LAYERS;
    double tokens_per_sec = 1000.0 / per_token_ms;

    printf("Per-layer latency (INT8 TC): %.3f ms\n", total_layer_ms);
    printf("Per-token latency (%d layers): %.3f ms\n", NUM_TRANSFORMER_LAYERS, per_token_ms);
    printf("Estimated throughput: %.1f tok/s\n", tokens_per_sec);
    printf("\nPhase 2.1 baseline: %.0f tok/s\n", PHASE21_BASELINE_TOK_S);
    printf("Speedup vs Phase 2.1: %.2fx\n", tokens_per_sec / PHASE21_BASELINE_TOK_S);
    printf("\nOllama baseline: %.0f tok/s\n", OLLAMA_BASELINE_TOK_S);
    printf("Speedup vs Ollama: %.2fx\n", tokens_per_sec / OLLAMA_BASELINE_TOK_S);

    // Output JSON results
    printf("\n");
    print_rule('-');
    printf("JSON OUTPUT\n");
    print_rule('-');
    printf("\n");

    printf("{\n");
    printf("  \"gpu\": \"%s\",\n", cuda_device_name());
    printf("  \"compute_capability\": %d,\n", cc);
    printf("  \"has_int8_tensorcore\": %s,\n", has_tc ? "true" : "false");
    printf("  \"per_token_ms\": %.4f,\n", per_token_ms);
    printf("  \"throughput_tok_s\": %.2f,\n", tokens_per_sec);
    printf("  \"speedup_vs_phase21\": %.2f,\n", tokens_per_sec / PHASE21_BASELINE_TOK_S);
    printf("  \"speedup_vs_ollama\": %.2f\n", tokens_per_sec / OLLAMA_BASELINE_TOK_S);
    printf("}\n");

    // Cleanup
    free(packed_weights);
    free(scales);
    free(activations);
    free(output_phase21);
    free(output_int8tc);
    free(norm_weights);
    cuda_cleanup();

    printf("\nBenchmark complete!\n");
    return 0;
}

In [None]:
# Compile and run benchmark
!nvcc -O3 -gencode arch=compute_75,code=sm_75 --expt-relaxed-constexpr \
    -o test_int8_tc test_int8_tc.cu tmac_kernel.cu -lcudart
print("Compilation complete!")

In [None]:
# Run benchmark
# Executes the binary built by the previous cell. Its output (results table,
# throughput estimate, JSON summary) appears in this cell's output.
!./test_int8_tc

## 4. Precision Validation

In [None]:
%%writefile test_precision.cu
/**
 * INT8 TC Precision Validation
 * Compares FP32 reference vs INT8 Tensor Core output
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "tmac_kernel_cuda.h"

// Precision validation driver.
//
// For each (M, K) shape it runs the Phase 2.1 kernel and the INT8 Tensor Core
// kernel on identical seeded random data and reports absolute and relative
// differences between the two outputs. A shape passes when its maximum relative
// error is below 2%.
//
// Returns 0 when every shape passes, 1 if CUDA initialisation or a host
// allocation fails or any shape reports FAIL.
int main() {
    printf("INT8 Tensor Core Precision Validation\n");
    // C++ has no string-repetition operator; draw the 50-character rule by hand.
    for (int i = 0; i < 50; i++) {
        putchar('=');
    }
    printf("\n\n");

    // Test sizes, as {M, K}
    int test_sizes[][2] = {
        {576, 576},
        {1728, 576},
        {1536, 576},
        {576, 1536},
    };
    int num_tests = sizeof(test_sizes) / sizeof(test_sizes[0]);

    if (cuda_init(1024*1024, 4096, 4096) != 0) {
        fprintf(stderr, "Failed to initialize CUDA\n");
        return 1;
    }

    int failed_tests = 0;

    for (int t = 0; t < num_tests; t++) {
        int M = test_sizes[t][0];
        int K = test_sizes[t][1];
        int weight_bytes = M * ((K + 3) / 4);

        // Allocate
        int8_t* weights = (int8_t*)malloc(weight_bytes);
        float* scales = (float*)malloc(M * sizeof(float));
        float* activations = (float*)malloc(K * sizeof(float));
        float* norm_weights = (float*)malloc(K * sizeof(float));
        float* output_fp32 = (float*)malloc(M * sizeof(float));
        float* output_int8 = (float*)malloc(M * sizeof(float));

        if (!weights || !scales || !activations ||
            !norm_weights || !output_fp32 || !output_int8) {
            fprintf(stderr, "Failed to allocate host buffers\n");
            // free(NULL) is a no-op, so every pointer can be released unconditionally.
            free(weights);
            free(scales);
            free(activations);
            free(norm_weights);
            free(output_fp32);
            free(output_int8);
            cuda_cleanup();
            return 1;
        }

        // Initialize
        srand(42);  // Reproducible
        for (int i = 0; i < weight_bytes; i++) {
            // Pack four 2-bit codes from {0, 1, 2}, the same encoding the
            // benchmark harness generates. Arbitrary random bytes would also
            // contain the code 3, which that encoding does not use.
            // NOTE(review): confirm how the kernels decode code 3 if it is
            // ever meant to be covered here.
            int byte = 0;
            for (int shift = 0; shift < 8; shift += 2) {
                byte |= (rand() % 3) << shift;
            }
            weights[i] = (int8_t)byte;
        }
        for (int i = 0; i < M; i++) {
            scales[i] = 0.1f + 0.1f * (rand() / (float)RAND_MAX);
        }
        for (int i = 0; i < K; i++) {
            activations[i] = -1.0f + 2.0f * (rand() / (float)RAND_MAX);
            norm_weights[i] = 0.9f + 0.2f * (rand() / (float)RAND_MAX);
        }

        // FP32 reference (Phase 2.1)
        cuda_load_weights(weights, scales, weight_bytes, M);
        cuda_load_norm_weights(norm_weights, K);
        streaming_fused_rmsnorm_matmul_cuda(activations, output_fp32, M, K, 1e-5f);
        cuda_unload_weights();

        // INT8 TC
        cuda_load_weights_int8_tc(weights, scales, weight_bytes, M, K);
        cuda_load_norm_weights(norm_weights, K);
        streaming_fused_rmsnorm_matmul_int8_tc(activations, output_int8, M, K, 1e-5f);
        cuda_unload_weights_int8_tc();

        // Compare
        float max_abs_err = 0;
        float max_rel_err = 0;
        float sum_rel_err = 0;
        int num_large_errors = 0;

        for (int i = 0; i < M; i++) {
            float abs_err = fabsf(output_fp32[i] - output_int8[i]);
            // The 1e-6 term keeps the division finite when the reference is zero.
            float rel_err = abs_err / (fabsf(output_fp32[i]) + 1e-6f);

            max_abs_err = fmaxf(max_abs_err, abs_err);
            max_rel_err = fmaxf(max_rel_err, rel_err);
            sum_rel_err += rel_err;

            if (rel_err > 0.01f) num_large_errors++;
        }
        float mean_rel_err = sum_rel_err / M;
        bool passed = max_rel_err < 0.02f;
        if (!passed) failed_tests++;

        printf("Test %d: M=%d, K=%d\n", t+1, M, K);
        printf("  Max absolute error: %.6f\n", max_abs_err);
        printf("  Max relative error: %.4f (%.2f%%)\n", max_rel_err, max_rel_err*100);
        printf("  Mean relative error: %.6f (%.4f%%)\n", mean_rel_err, mean_rel_err*100);
        printf("  Errors > 1%%: %d / %d (%.2f%%)\n", num_large_errors, M, 100.0f*num_large_errors/M);
        printf("  Status: %s\n\n", passed ? "PASS" : "FAIL");

        free(weights);
        free(scales);
        free(activations);
        free(norm_weights);
        free(output_fp32);
        free(output_int8);
    }

    cuda_cleanup();
    // Make a failed validation visible to callers through the exit status.
    return failed_tests ? 1 : 0;
}

In [None]:
!nvcc -O3 -gencode arch=compute_75,code=sm_75 --expt-relaxed-constexpr \
    -o test_precision test_precision.cu tmac_kernel.cu -lcudart
!./test_precision

## 5. Summary Report

In [None]:
import json
from datetime import datetime

# Create summary report
# Hardware facts come from the variables set in the environment-setup cell;
# the target and baseline figures are fixed reference strings.
detected_hardware = {
    "gpu": GPU_NAME,
    "compute_capability": COMPUTE_CAP,
    "vram_mb": VRAM_MB,
    "has_int8_tensorcore": HAS_INT8_TC,
}
reference_figures = {
    "benchmark": "INT8 Tensor Core vs Phase 2.1",
    "target_throughput": "1000+ tok/s",
    "baseline_phase21": "630 tok/s",
    "baseline_ollama": "423 tok/s",
}
# Unpacking keeps the key order: timestamp, hardware, then reference figures.
report = {
    "timestamp": datetime.now().isoformat(),
    **detected_hardware,
    **reference_figures,
}

# Serialise once so the printed text and the saved file are the same string.
report_json = json.dumps(report, indent=2)
print(report_json)

# Save report (relative path: lands in the current working directory)
with open('int8_tc_benchmark_report.json', 'w') as report_file:
    report_file.write(report_json)
print("\nReport saved to int8_tc_benchmark_report.json")

## 6. Cleanup

In [None]:
# Cleanup test files
# Removes the two compiled test binaries and their generated .cu sources from
# the current directory; -f keeps rm silent when a file is already gone.
# The JSON report written by the summary cell is not in this list and is kept.
!rm -f test_int8_tc test_precision test_int8_tc.cu test_precision.cu
print("Cleanup complete!")