# Conv2D vs Matmul ULP Comparison

This notebook compares ULP errors between TTNN and two references:
- **PyTorch CUDA reference** (generated on a CUDA machine with high precision)
- **PyTorch FP32 ground truth** (local CPU float32 computation)

Both comparisons cover:
- Conv2D operation (3x3 kernel, padding (1,1))
- Equivalent matmul operation

## Data Flow:
1. Load CUDA reference outputs from `torch_reference_outputs/` (generated on CUDA)
2. Regenerate and validate input tensors using convenience functions
3. Compute PyTorch float32 ground truth using the validated inputs
4. Run TTNN operations with the same inputs
5. Compare TTNN outputs using ULP metrics:
   - **vs CUDA Reference**: Compare against high-precision CUDA outputs
   - **vs PyTorch FP32 Ground Truth**: Compare against local CPU float32 computation

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple
from models.common.utility_functions import ulp
import os
from loguru import logger
import ttnn

# Import convenience functions for loading reference data
from load_torch_reference_outputs import (
    ReferenceDataLoader,
    get_conv2d_inputs,
    get_matmul_inputs,
    get_reference_outputs
)

# Disable TT logging
# NOTE(review): these variables are set after `import ttnn` above; if ttnn
# reads them at import time they have no effect here — confirm.
os.environ['TT_LOGGER_LEVEL'] = 'off'
# NOTE(review): machine-specific absolute path; adjust for your own checkout.
os.environ["TT_METAL_CACHE"] = "/localdev/astancov/tt-metal/built" 
# Silence loguru records emitted under the 'ttnn' logger name.
logger.disable('ttnn')

# Default figsize (width, height) passed to matplotlib by the plotting helpers below.
default_fig_size = (10, 6)

## Load Reference Data Configuration

Check what reference data is available from the CUDA generation.

In [None]:
# Point the loader at the reference outputs and print its own overview.
loader = ReferenceDataLoader("torch_reference_outputs")
loader.print_summary()

# Enumerate every (dtype, input_channels, method) case and preview the first few.
cases = loader.list_available_cases()
preview_count = 5
print(f"\nAvailable test cases: {len(cases)}")
for dtype, ic, method in cases[:preview_count]:
    k = loader.ic_to_k[ic]  # K that the loader associates with this channel count
    print(f"  - dtype={dtype}, input_channels={ic}, method={method}, K={k}")

# Mention how many cases were left out of the preview, if any.
hidden_count = len(cases) - preview_count
if hidden_count > 0:
    print(f"  ... and {hidden_count} more")

## Utility Functions

In [None]:
def ulp_error(res, ref):
    """
    Element-wise distance between ``res`` and ``ref`` measured in ULPs of ``res``.

    The reference is first cast to the dtype of ``res``; the difference and the
    division by ``ulp(res)`` are then carried out in float64.

    Args:
        res: Result tensor
        ref: Reference tensor

    Returns:
        Tensor of absolute ULP errors (in ULPs of res)
    """
    wide = torch.float64
    unit = ulp(res).to(wide)
    # Round the reference to the result dtype before comparing.
    ref_in_res_dtype = ref.to(res.dtype)
    delta = res.to(wide) - ref_in_res_dtype.to(wide)
    return torch.abs(delta / unit)

def compute_ulp_statistics(res, ref):
    """
    Summarise the ULP error between a result and a reference.

    Args:
        res: Result tensor
        ref: Reference tensor (numpy array or torch tensor)

    Returns:
        Dictionary with keys 'mean', 'median', 'max', 'min', 'p95', 'p99'
        (in that order), each a Python float.
    """
    # Accept numpy references by wrapping them as torch tensors.
    reference = torch.from_numpy(ref) if isinstance(ref, np.ndarray) else ref

    flat_errors = ulp_error(res, reference).detach().cpu().numpy().flatten()

    reducers = (
        ('mean', np.mean),
        ('median', np.median),
        ('max', np.max),
        ('min', np.min),
    )
    summary = {label: float(reduce(flat_errors)) for label, reduce in reducers}
    for percentile in (95, 99):
        summary[f'p{percentile}'] = float(np.percentile(flat_errors, percentile))
    return summary

def plot_ulp_comparison_dual(conv2d_stats, matmul_stats, config_name, ref_type):
    """
    Draw a grouped bar chart of ULP statistics for conv2d next to matmul.

    Args:
        conv2d_stats: Statistics dict for conv2d (needs 'mean', 'median', 'max', 'p95', 'p99')
        matmul_stats: Statistics dict for matmul (same keys)
        config_name: Configuration label shown in the title
        ref_type: Name of the reference shown in the title
    """
    metric_names = ['mean', 'median', 'max', 'p95', 'p99']
    conv2d_heights = [conv2d_stats[name] for name in metric_names]
    matmul_heights = [matmul_stats[name] for name in metric_names]

    positions = np.arange(len(metric_names))
    bar_width = 0.35

    fig, ax = plt.subplots(figsize=default_fig_size)
    bar_groups = [
        ax.bar(positions - bar_width/2, conv2d_heights, bar_width, label='Conv2D', alpha=0.8),
        ax.bar(positions + bar_width/2, matmul_heights, bar_width, label='Matmul', alpha=0.8),
    ]

    ax.set_ylabel('ULP Error')
    ax.set_title(f'ULP Error: TTNN vs {ref_type}\n{config_name}')
    ax.set_xticks(positions)
    ax.set_xticklabels(metric_names)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # Annotate each bar with its value, centred just above the bar.
    for group in bar_groups:
        for rect in group:
            top = rect.get_height()
            centre_x = rect.get_x() + rect.get_width()/2.
            ax.text(centre_x, top, f'{top:.2f}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    plt.show()

def plot_ulp_histogram_comparison(conv2d_res, conv2d_ref, matmul_res, matmul_ref, config_name, ref_type, n_bins=100):
    """
    Show the conv2d and matmul ULP error histograms in two side-by-side panels.

    Args:
        conv2d_res / conv2d_ref: Conv2D result and reference (reference may be numpy)
        matmul_res / matmul_ref: Matmul result and reference (reference may be numpy)
        config_name: Configuration label shown in the titles
        ref_type: Name of the reference shown in the titles
        n_bins: Number of histogram bins
    """
    def _as_tensor(value):
        # numpy references are wrapped; anything else is passed through untouched.
        return torch.from_numpy(value) if isinstance(value, np.ndarray) else value

    conv2d_ref = _as_tensor(conv2d_ref)
    matmul_ref = _as_tensor(matmul_ref)

    conv2d_errors = ulp_error(conv2d_res, conv2d_ref).detach().cpu().numpy().flatten()
    matmul_errors = ulp_error(matmul_res, matmul_ref).detach().cpu().numpy().flatten()

    fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(14, 5))

    panels = (
        (left_ax, conv2d_errors, 'Conv2D', 'blue'),
        (right_ax, matmul_errors, 'Matmul', 'green'),
    )
    for axis, errors, op_name, colour in panels:
        axis.hist(errors, bins=n_bins, alpha=0.7, edgecolor='black', color=colour)
        axis.set_xlabel('ULP Error')
        axis.set_ylabel('Frequency')
        axis.set_title(f'{op_name} ULP Error Histogram\n{config_name} vs {ref_type}')
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

def plot_ulp_cdf_comparison(conv2d_res, conv2d_ref, matmul_res, matmul_ref, config_name, ref_type):
    """
    Overlay the empirical CDFs of the conv2d and matmul ULP errors on one plot.

    Args:
        conv2d_res / conv2d_ref: Conv2D result and reference (reference may be numpy)
        matmul_res / matmul_ref: Matmul result and reference (reference may be numpy)
        config_name: Configuration label shown in the title
        ref_type: Name of the reference shown in the title
    """
    # numpy references are wrapped as torch tensors before computing errors.
    if isinstance(conv2d_ref, np.ndarray):
        conv2d_ref = torch.from_numpy(conv2d_ref)
    if isinstance(matmul_ref, np.ndarray):
        matmul_ref = torch.from_numpy(matmul_ref)

    curves = []
    for label, result, reference in (
        ('Conv2D', conv2d_res, conv2d_ref),
        ('Matmul', matmul_res, matmul_ref),
    ):
        errors = ulp_error(result, reference).detach().cpu().numpy().flatten()
        ordered = np.sort(errors)
        # Empirical CDF: the i-th smallest value maps to i / N.
        cumulative = np.arange(1, len(ordered) + 1) / len(ordered)
        curves.append((label, ordered, cumulative))

    plt.figure(figsize=default_fig_size)
    for label, ordered, cumulative in curves:
        plt.plot(ordered, cumulative, linewidth=2, label=label, alpha=0.8)
    plt.xlabel('ULP Error')
    plt.ylabel('Cumulative Probability')
    plt.title(f'ULP Error CDF: TTNN vs {ref_type}\n{config_name}')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.show()

## PyTorch Ground Truth Functions

In [None]:
def run_torch_conv2d_ground_truth(input_tensor, weight_tensor, bias_tensor, padding, stride, use_bias=True):
    """
    Run conv2d in PyTorch float32 as ground truth.

    Args:
        input_tensor: Input tensor (torch)
        weight_tensor: Weight tensor (torch)
        bias_tensor: Bias tensor (torch). Any layout holding one value per
            output channel is accepted, e.g. (C,), (1, C) or (1, 1, 1, C);
            it is flattened to 1-D. Ignored when ``use_bias`` is False.
        padding: Padding tuple
        stride: Stride tuple
        use_bias: Whether to use bias

    Returns:
        Output tensor in float32
    """
    # Convert all inputs to float32
    input_f32 = input_tensor.to(torch.float32)
    weight_f32 = weight_tensor.to(torch.float32)

    bias_f32 = None
    if use_bias:
        # conv2d takes a 1-D bias, so flatten every layout, not only the 4-D one.
        bias_f32 = bias_tensor.to(torch.float32).reshape(-1)

    return torch.nn.functional.conv2d(
        input_f32,
        weight_f32,
        bias=bias_f32,
        stride=stride,
        padding=padding
    )

def run_torch_matmul_ground_truth(A, B):
    """
    Compute the float32 matmul ground truth with PyTorch.

    Args:
        A: First input tensor
        B: Second input tensor

    Returns:
        Product of A and B, both cast to float32 first
    """
    # Cast both operands to float32 before multiplying.
    lhs, rhs = (operand.to(torch.float32) for operand in (A, B))
    return torch.matmul(lhs, rhs)

## TTNN Operation Wrappers

In [None]:
def run_ttnn_conv2d(input_tensor, weight_tensor, bias_tensor, padding, stride, fp32_acc=False, 
                    device=None, use_bias=True):
    """
    Run conv2d in ttnn.

    When ``device`` is None a device is opened for this call and is always
    closed again, even if the operation raises. A device passed in by the
    caller is never closed here.

    Args:
        input_tensor: Input tensor (torch) in NCHW format
        weight_tensor: Weight tensor (torch) in OIHW format
        bias_tensor: Bias tensor (torch); reshaped to (1, 1, 1, out_channels)
        padding: Padding as an int or a tuple (h, w)
        stride: Stride as an int or a tuple (h, w)
        fp32_acc: Whether to use fp32 accumulation
        device: TTNN device
        use_bias: Whether to use bias

    Returns:
        Output tensor (torch) in NCHW format
    """
    create_local_device = device is None
    if create_local_device:
        device = ttnn.CreateDevice(0, l1_small_size=16384)

    try:
        # Extract dimensions
        batch_size, in_channels, input_height, input_width = (
            input_tensor.shape[0],
            input_tensor.shape[1],
            input_tensor.shape[2],
            input_tensor.shape[3],
        )
        out_channels = weight_tensor.shape[0]
        kernel_h = weight_tensor.shape[2]
        kernel_w = weight_tensor.shape[3]

        # Handle padding and stride (an int applies to both spatial axes)
        pad_h, pad_w = (padding, padding) if isinstance(padding, int) else (padding[0], padding[1])
        str_h, str_w = (stride, stride) if isinstance(stride, int) else (stride[0], stride[1])

        # Convert input from NCHW to NHWC format
        input_nhwc = input_tensor.permute(0, 2, 3, 1).contiguous()

        # Prepare bias - reshape to (1, 1, 1, out_channels) if needed
        if use_bias and bias_tensor.shape != (1, 1, 1, out_channels):
            bias_tensor = bias_tensor.reshape(1, 1, 1, -1)

        # Determine ttnn dtype: bfloat16 inputs stay bfloat16, everything else runs as float32
        ttnn_dtype = ttnn.bfloat16 if input_tensor.dtype == torch.bfloat16 else ttnn.float32

        # Convert to ttnn tensors (only the input is placed on the device here)
        tt_input = ttnn.from_torch(input_nhwc, dtype=ttnn_dtype, layout=ttnn.ROW_MAJOR_LAYOUT, device=device)
        tt_weight = ttnn.from_torch(weight_tensor, dtype=ttnn_dtype)
        tt_bias = ttnn.from_torch(bias_tensor, dtype=ttnn_dtype) if use_bias else None

        # Configure compute kernel
        compute_config = ttnn.init_device_compute_kernel_config(
            device.arch(),
            math_fidelity=ttnn.MathFidelity.HiFi4,
            math_approx_mode=False,
            fp32_dest_acc_en=fp32_acc,
            packer_l1_acc=True,  # l1_acc is turned on
        )

        # Run conv2d
        # NOTE(review): the 4-element padding is presumably (top, bottom, left, right) — confirm.
        [tt_output, out_dims, _] = ttnn.conv2d(
            input_tensor=tt_input,
            weight_tensor=tt_weight,
            bias_tensor=tt_bias,
            device=device,
            in_channels=in_channels,
            out_channels=out_channels,
            batch_size=batch_size,
            input_height=input_height,
            input_width=input_width,
            kernel_size=(kernel_h, kernel_w),
            stride=(str_h, str_w),
            padding=(pad_h, pad_h, pad_w, pad_w),
            dilation=(1, 1),
            groups=1,
            compute_config=compute_config,
            return_output_dim=True,
            return_weights_and_bias=True,
        )

        # Convert back to torch and reshape to NCHW
        output_torch = ttnn.to_torch(tt_output)
        out_height, out_width = out_dims
        output_nchw = output_torch.reshape(batch_size, out_height, out_width, out_channels).permute(0, 3, 1, 2)
    finally:
        # Release a device we opened ourselves, including on failure.
        if create_local_device:
            ttnn.close_device(device)

    return output_nchw

def run_ttnn_matmul(A, B, fp32_acc=False, device=None):
    """
    Run matmul in ttnn.

    When ``device`` is None a device is opened for this call and is always
    closed again, even if the operation raises. A device passed in by the
    caller is never closed here.

    Args:
        A: First input tensor (torch)
        B: Second input tensor (torch)
        fp32_acc: Whether to use fp32 accumulation
        device: TTNN device

    Returns:
        Output tensor (torch)
    """
    create_local_device = device is None
    if create_local_device:
        device = ttnn.CreateDevice(0)

    try:
        # Both operands go to the device in tile layout.
        tt_a = ttnn.from_torch(A, layout=ttnn.TILE_LAYOUT, device=device)
        tt_b = ttnn.from_torch(B, layout=ttnn.TILE_LAYOUT, device=device)

        # Same compute-kernel settings as the conv2d wrapper, so the two are comparable.
        compute_kernel_config = ttnn.init_device_compute_kernel_config(
            device.arch(),
            math_fidelity=ttnn.MathFidelity.HiFi4,
            math_approx_mode=False,
            fp32_dest_acc_en=fp32_acc,
            packer_l1_acc=True,  # l1_acc is turned on
        )

        tt_c = ttnn.matmul(
            tt_a,
            tt_b,
            core_grid=device.core_grid,
            compute_kernel_config=compute_kernel_config,
        )

        result = ttnn.to_torch(tt_c)
    finally:
        # Release a device we opened ourselves, including on failure.
        if create_local_device:
            ttnn.close_device(device)

    return result

## Test Configuration

Select which test cases to run from the available reference data.

In [None]:
# Define test configurations
# Format: (dtype, input_channels, method, fp32_acc, config_name)
# Results are stored per config_name, so each name must appear only once:
# a repeated entry reruns the same case and overwrites its earlier result.
test_configs = [
    # Optional cases; uncomment to include them in the run.
    # ('bfloat16', 8, 'rand', False, 'bfloat16_ic8_rand'),
    # ('bfloat16', 8, 'rand', True, 'bfloat16_ic8_rand_fp32acc'),
    # ('bfloat16', 256, 'rand', False, 'bfloat16_ic256_rand'),
    # ('bfloat16', 256, 'rand', True, 'bfloat16_ic256_rand_fp32acc'),
    # ('float32', 8, 'rand', False, 'float32_ic8_rand'),
    # ('float32', 8, 'rand', True, 'float32_ic8_rand_fp32acc'),

    # bfloat16 sweep over input channels with fp32 accumulation enabled
    ('bfloat16', 8, 'rand', True, 'bfloat16_ic8_rand_fp32acc'),
    ('bfloat16', 256, 'rand', True, 'bfloat16_ic256_rand_fp32acc'),
    ('bfloat16', 1024, 'rand', True, 'bfloat16_ic1024_rand_fp32acc'),
    ('bfloat16', 4096, 'rand', True, 'bfloat16_ic4096_rand_fp32acc'),
    ('bfloat16', 5120, 'rand', True, 'bfloat16_ic5120_rand_fp32acc'),
    ('bfloat16', 6144, 'rand', True, 'bfloat16_ic6144_rand_fp32acc'),
    ('bfloat16', 7168, 'rand', True, 'bfloat16_ic7168_rand_fp32acc'),
    ('bfloat16', 8192, 'rand', True, 'bfloat16_ic8192_rand_fp32acc'),
]

# Print configuration
print("Test Configurations:")
print("=" * 80)
for i, (dtype, ic, method, fp32_acc, name) in enumerate(test_configs, 1):
    k = loader.ic_to_k[ic]  # K that the loader associates with this channel count
    print(f"{i}. {name}")
    print(f"   dtype={dtype}, input_channels={ic}, K={k}, method={method}, fp32_acc={fp32_acc}")
print("=" * 80)

## Run Experiments

For each configuration:
1. Load CUDA reference outputs
2. Get validated input tensors
3. Compute PyTorch FP32 ground truth
4. Run TTNN operations
5. Compare ULP errors (vs CUDA and vs PyTorch FP32)

In [None]:
# Per-configuration results, keyed by config_name.
results = {}

# Get configuration from loader (assumes all test cases use the same config)
config = loader.config
padding = tuple(config['padding'])
stride = tuple(config['stride'])
use_bias = config['use_bias']

print(f"\nReference data configuration:")
print(f"  Padding: {padding}")
print(f"  Stride: {stride}")
print(f"  Use bias: {use_bias}")
print(f"  Batch: {config['batch']}")
print(f"  Input size: {config['input_height']}x{config['input_width']}")
print(f"  Output channels: {config['out_channels']}")
print(f"  Kernel: {config['kernel_h']}x{config['kernel_w']}")

# One device is shared by every configuration. The try/finally guarantees it is
# closed even when a configuration fails part-way through.
device = ttnn.CreateDevice(0, l1_small_size=16384)
try:
    for dtype, input_channels, method, fp32_acc, config_name in test_configs:
        print(f"\n{'='*80}")
        print(f"Running: {config_name}")
        print(f"{'='*80}")

        # Load CUDA reference outputs
        print("Loading CUDA reference outputs...")
        reference = get_reference_outputs(dtype, input_channels, method)
        conv2d_cuda_ref = reference['conv2d_output']
        matmul_cuda_ref = reference['matmul_output']
        metadata = reference['metadata']

        print(f"  Conv2D CUDA reference shape: {conv2d_cuda_ref.shape}")
        print(f"  Matmul CUDA reference shape: {matmul_cuda_ref.shape}")
        print(f"  Generation method: {metadata.get('gen_method', 'unknown')}")

        # Get validated inputs
        print("\nLoading and validating input tensors...")
        conv_input, conv_weight, conv_bias = get_conv2d_inputs(dtype, input_channels, method)
        matmul_A, matmul_B = get_matmul_inputs(dtype, input_channels, method)

        print(f"  ✓ Conv2D inputs loaded and verified")
        print(f"    Input: {conv_input.shape} {conv_input.dtype}")
        print(f"    Weight: {conv_weight.shape} {conv_weight.dtype}")
        print(f"    Bias: {conv_bias.shape} {conv_bias.dtype}")
        print(f"  ✓ Matmul inputs loaded and verified")
        print(f"    A: {matmul_A.shape} {matmul_A.dtype}")
        print(f"    B: {matmul_B.shape} {matmul_B.dtype}")

        # Compute PyTorch FP32 ground truth
        print("\nComputing PyTorch FP32 ground truth...")
        conv2d_torch_gt = run_torch_conv2d_ground_truth(
            conv_input, conv_weight, conv_bias, padding, stride, use_bias
        )
        matmul_torch_gt = run_torch_matmul_ground_truth(matmul_A, matmul_B)
        print(f"  ✓ Conv2D ground truth: {conv2d_torch_gt.shape}")
        print(f"  ✓ Matmul ground truth: {matmul_torch_gt.shape}")

        # Run TTNN operations on the shared device
        print("\nRunning TTNN operations...")
        conv2d_ttnn = run_ttnn_conv2d(
            conv_input, conv_weight, conv_bias,
            padding, stride, fp32_acc, device, use_bias
        )
        print(f"  ✓ Conv2D complete: {conv2d_ttnn.shape}")

        matmul_ttnn = run_ttnn_matmul(matmul_A, matmul_B, fp32_acc, device)
        print(f"  ✓ Matmul complete: {matmul_ttnn.shape}")

        # Compute ULP statistics vs CUDA reference
        print("\nComputing ULP statistics vs CUDA reference...")
        conv2d_stats_cuda = compute_ulp_statistics(conv2d_ttnn, conv2d_cuda_ref)
        matmul_stats_cuda = compute_ulp_statistics(matmul_ttnn, matmul_cuda_ref)

        # Compute ULP statistics vs PyTorch FP32 ground truth
        print("Computing ULP statistics vs PyTorch FP32 ground truth...")
        conv2d_stats_torch = compute_ulp_statistics(conv2d_ttnn, conv2d_torch_gt)
        matmul_stats_torch = compute_ulp_statistics(matmul_ttnn, matmul_torch_gt)

        # Store results (statistics, raw tensors and metadata) for the later cells
        results[config_name] = {
            # Stats vs CUDA
            'conv2d_stats_cuda': conv2d_stats_cuda,
            'matmul_stats_cuda': matmul_stats_cuda,
            # Stats vs PyTorch FP32
            'conv2d_stats_torch': conv2d_stats_torch,
            'matmul_stats_torch': matmul_stats_torch,
            # Tensors
            'conv2d_ttnn': conv2d_ttnn,
            'conv2d_cuda_ref': conv2d_cuda_ref,
            'conv2d_torch_gt': conv2d_torch_gt,
            'matmul_ttnn': matmul_ttnn,
            'matmul_cuda_ref': matmul_cuda_ref,
            'matmul_torch_gt': matmul_torch_gt,
            # Metadata
            'metadata': metadata,
            'dtype': dtype,
            'input_channels': input_channels,
            'method': method,
            'fp32_acc': fp32_acc
        }

        # Print statistics
        print(f"\n{'='*60}")
        print(f"ULP STATISTICS: TTNN vs CUDA Reference")
        print(f"{'='*60}")
        print(f"\nConv2D:")
        for key, value in conv2d_stats_cuda.items():
            print(f"  {key:8s}: {value:12.4f}")
        print(f"\nMatmul:")
        for key, value in matmul_stats_cuda.items():
            print(f"  {key:8s}: {value:12.4f}")

        print(f"\n{'='*60}")
        print(f"ULP STATISTICS: TTNN vs PyTorch FP32 Ground Truth")
        print(f"{'='*60}")
        print(f"\nConv2D:")
        for key, value in conv2d_stats_torch.items():
            print(f"  {key:8s}: {value:12.4f}")
        print(f"\nMatmul:")
        for key, value in matmul_stats_torch.items():
            print(f"  {key:8s}: {value:12.4f}")

        # Generate plots for both comparisons
        print("\nGenerating plots...")
        print("  Plots vs CUDA Reference...")
        plot_ulp_comparison_dual(conv2d_stats_cuda, matmul_stats_cuda, config_name, "CUDA Reference")
        plot_ulp_histogram_comparison(conv2d_ttnn, conv2d_cuda_ref, matmul_ttnn, matmul_cuda_ref, config_name, "CUDA Ref")
        plot_ulp_cdf_comparison(conv2d_ttnn, conv2d_cuda_ref, matmul_ttnn, matmul_cuda_ref, config_name, "CUDA Ref")

        print("  Plots vs PyTorch FP32 Ground Truth...")
        plot_ulp_comparison_dual(conv2d_stats_torch, matmul_stats_torch, config_name, "PyTorch FP32 GT")
        plot_ulp_histogram_comparison(conv2d_ttnn, conv2d_torch_gt, matmul_ttnn, matmul_torch_gt, config_name, "PyTorch FP32 GT")
        plot_ulp_cdf_comparison(conv2d_ttnn, conv2d_torch_gt, matmul_ttnn, matmul_torch_gt, config_name, "PyTorch FP32 GT")
finally:
    ttnn.close_device(device)

print(f"\n{'='*80}")
print("All experiments completed!")
print(f"{'='*80}")

## Summary Comparison: Both References

In [None]:
# Print one summary table per reference, then chart the mean ULP errors.
heavy_rule = "=" * 100
light_rule = "-" * 100
stat_columns = ('mean', 'median', 'max', 'p95', 'p99')

print("\n" + heavy_rule)
print("SUMMARY: ULP Error Comparison")
print(heavy_rule)

summary_tables = (
    ("TTNN vs CUDA Reference", 'conv2d_stats_cuda', 'matmul_stats_cuda'),
    ("TTNN vs PyTorch FP32 Ground Truth", 'conv2d_stats_torch', 'matmul_stats_torch'),
)
for table_title, conv_key, mm_key in summary_tables:
    print("\n" + light_rule)
    print(table_title)
    print(light_rule)
    print(f"{'Config':<30} {'Operation':<10} {'Mean':<12} {'Median':<12} {'Max':<12} {'P95':<12} {'P99':<12}")
    print(light_rule)

    for name, entry in results.items():
        # Two rows per configuration; the name is printed only on the Conv2D row.
        for first_cell, op_label, stats in (
            (name, 'Conv2D', entry[conv_key]),
            ('', 'Matmul', entry[mm_key]),
        ):
            cells = " ".join(f"{stats[column]:<12.4f}" for column in stat_columns)
            print(f"{first_cell:<30} {op_label:<10} {cells}")
        print(light_rule)

# Plot side-by-side comparison
config_names = list(results.keys())
fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(18, 6))

positions = np.arange(len(config_names))
bar_width = 0.35

chart_panels = (
    (left_ax, 'conv2d_stats_cuda', 'matmul_stats_cuda',
     'Mean ULP Error: TTNN vs CUDA Reference\nLower is better'),
    (right_ax, 'conv2d_stats_torch', 'matmul_stats_torch',
     'Mean ULP Error: TTNN vs PyTorch FP32 GT\nLower is better'),
)
for axis, conv_key, mm_key, panel_title in chart_panels:
    conv_means = [results[name][conv_key]['mean'] for name in config_names]
    mm_means = [results[name][mm_key]['mean'] for name in config_names]

    conv_bars = axis.bar(positions - bar_width/2, conv_means, bar_width, label='Conv2D', alpha=0.8)
    mm_bars = axis.bar(positions + bar_width/2, mm_means, bar_width, label='Matmul', alpha=0.8)
    axis.set_ylabel('Mean ULP Error')
    axis.set_title(panel_title)
    axis.set_xticks(positions)
    axis.set_xticklabels(config_names, rotation=25, ha='right')
    axis.legend()
    axis.grid(True, alpha=0.3, axis='y')

    # Add value labels
    for rect in [*conv_bars, *mm_bars]:
        top = rect.get_height()
        axis.text(rect.get_x() + rect.get_width()/2., top,
                  f'{top:.2f}',
                  ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

## Ratio Analysis: Conv2D vs Matmul (Both References)

In [None]:
def _ulp_ratio(conv2d_value, matmul_value):
    """Return conv2d_value / matmul_value; inf for x/0 with x > 0, nan for 0/0."""
    if matmul_value > 0:
        return conv2d_value / matmul_value
    # A zero denominator means "infinitely worse" only when the numerator is
    # non-zero; when both are zero the ratio is undefined.
    return float('inf') if conv2d_value > 0 else float('nan')


# Calculate ratios for both references
print("\n" + "="*90)
print("RATIO ANALYSIS: Conv2D ULP / Matmul ULP")
print("="*90)

# One (mean, median, max) ratio tuple per configuration, in results order.
ratios_cuda = []
ratios_torch = []

ratio_sections = (
    ("vs CUDA Reference", 'conv2d_stats_cuda', 'matmul_stats_cuda', ratios_cuda),
    ("vs PyTorch FP32 Ground Truth", 'conv2d_stats_torch', 'matmul_stats_torch', ratios_torch),
)
for section_title, conv_key, mm_key, ratios in ratio_sections:
    print("\n" + "-"*90)
    print(section_title)
    print("-"*90)
    print(f"{'Config':<30} {'Mean Ratio':<15} {'Median Ratio':<15} {'Max Ratio':<15}")
    print("-" * 90)

    for config_name, data in results.items():
        conv2d = data[conv_key]
        matmul = data[mm_key]

        mean_r = _ulp_ratio(conv2d['mean'], matmul['mean'])
        median_r = _ulp_ratio(conv2d['median'], matmul['median'])
        max_r = _ulp_ratio(conv2d['max'], matmul['max'])

        ratios.append((mean_r, median_r, max_r))
        print(f"{config_name:<30} {mean_r:<15.4f} {median_r:<15.4f} {max_r:<15.4f}")

print("\n" + "="*90)
print("INTERPRETATION:")
print("="*90)
print("Ratio > 1.0 → Conv2D has more ULP error than Matmul")
print("Ratio < 1.0 → Conv2D has less ULP error than Matmul")
print("Ratio ≈ 1.0 → Conv2D and Matmul have similar error characteristics")
print("Ratio = nan → both errors are zero (ratio undefined)")
print("="*90)

## K Sweep Analysis (Both References)

Analyze how ULP error scales with K (accumulation dimension) for both references.

In [None]:
# Group results by dtype, method, and fp32_acc
grouped_cuda = {}
grouped_torch = {}

reference_buckets = (
    (grouped_cuda, 'conv2d_stats_cuda', 'matmul_stats_cuda'),
    (grouped_torch, 'conv2d_stats_torch', 'matmul_stats_torch'),
)

for config_name, data in results.items():
    k = loader.ic_to_k[data['input_channels']]
    group_key = f"{data['dtype']}_{data['method']}_fp32acc={data['fp32_acc']}"

    # Append this configuration's K and mean errors to its group, per reference.
    for grouped, conv_key, mm_key in reference_buckets:
        bucket = grouped.setdefault(
            group_key, {'k_values': [], 'conv2d_mean': [], 'matmul_mean': []}
        )
        bucket['k_values'].append(k)
        bucket['conv2d_mean'].append(data[conv_key]['mean'])
        bucket['matmul_mean'].append(data[mm_key]['mean'])

# Sort every group's parallel lists by K
for grouped in (grouped_cuda, grouped_torch):
    for bucket in grouped.values():
        order = sorted(range(len(bucket['k_values'])), key=bucket['k_values'].__getitem__)
        for field in bucket:
            bucket[field] = [bucket[field][i] for i in order]

# Plot ULP vs K for both references: rows are references, columns are operations
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

reference_rows = (
    (0, grouped_cuda, 'CUDA Reference'),
    (1, grouped_torch, 'PyTorch FP32 GT'),
)
operation_columns = (
    (0, 'conv2d_mean', 'o', 'Conv2D'),
    (1, 'matmul_mean', 's', 'Matmul'),
)
for row, grouped, reference_label in reference_rows:
    for col, field, marker, op_label in operation_columns:
        panel = axes[row, col]
        for group_key, group in grouped.items():
            panel.plot(group['k_values'], group[field], marker=marker, linewidth=2, label=group_key)
        panel.set_xlabel('K (Accumulation Dimension)', fontsize=12)
        panel.set_ylabel('Mean ULP Error', fontsize=12)
        panel.set_title(f'{op_label}: TTNN vs {reference_label}', fontsize=13)
        panel.legend()
        panel.grid(True, alpha=0.3)
        panel.set_xscale('log')
        panel.set_yscale('log')

plt.tight_layout()
plt.show()

print("\nObservations:")
print("- ULP error generally increases with K (more accumulation = more rounding errors)")
print("- fp32acc configurations should show slower error growth")
print("- Compare how the two references (CUDA vs PyTorch FP32) differ")
print("- CUDA reference may show different behavior due to higher precision settings")