<a href="https://colab.research.google.com/github/wesslen/gpu-testing/blob/main/notebooks/transformers_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
from time import time
import numpy as np

class BenchmarkConfig(PretrainedConfig):
    """Configuration holding the size hyperparameters of the benchmark model.

    Args:
        hidden_size: Feature width stored on the config.
        num_hidden_layers: Number of layers stored on the config.
        num_attention_heads: Number of heads stored on the config.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "benchmark"

    def __init__(
        self,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        **kwargs
    ):
        # Own fields are assigned first; the base class then consumes kwargs.
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        super().__init__(**kwargs)

class BenchmarkModel(PreTrainedModel):
    """Stack of residual feed-forward blocks used as a synthetic workload.

    Every block is ``Linear(h, 4h) -> GELU -> Linear(4h, h) -> LayerNorm(h)``
    with ``h = config.hidden_size``; ``config.num_hidden_layers`` blocks are
    created and stored in ``self.layers``.
    """

    config_class = BenchmarkConfig

    def __init__(self, config):
        super().__init__(config)
        width = config.hidden_size
        expanded = width * 4

        blocks = []
        for _ in range(config.num_hidden_layers):
            blocks.append(
                torch.nn.Sequential(
                    torch.nn.Linear(width, expanded),
                    torch.nn.GELU(),
                    torch.nn.Linear(expanded, width),
                    torch.nn.LayerNorm(width),
                )
            )
        self.layers = torch.nn.ModuleList(blocks)

    def forward(self, x):
        """Apply each block in order, adding the block's input to its output."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden) + hidden
        return hidden

def run_benchmark(batch_size=32, seq_length=512, num_warmup=10, num_iterations=50):
    """Time forward passes of ``BenchmarkModel`` and report latency statistics.

    A default-configured model is built on CUDA when available (otherwise on
    CPU), fed a random ``(batch_size, seq_length, hidden_size)`` tensor, warmed
    up, and then timed for ``num_iterations`` forward passes under
    ``torch.no_grad()``.

    Args:
        batch_size: First dimension of the random input tensor.
        seq_length: Second dimension of the random input tensor.
        num_warmup: Untimed forward passes executed before measuring.
        num_iterations: Timed forward passes.

    Returns:
        dict with ``avg_latency``, ``median_latency``, ``p90_latency`` and
        ``p99_latency`` in milliseconds, plus ``throughput`` computed as
        ``batch_size * seq_length`` divided by the mean latency in seconds.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # short durations; the wall clock can jump and has coarser resolution.
    from time import perf_counter

    # Configure device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_cuda = device.type == "cuda"
    print(f"Running on device: {device}")
    if on_cuda:
        print(f"GPU: {torch.cuda.get_device_name()}")
        print(f"CUDA Version: {torch.version.cuda}")

    def synchronize():
        # CUDA work is queued asynchronously, so the clock may only be read
        # after the device has drained. On the CPU fallback there is nothing
        # to wait for, and torch.cuda.synchronize() would raise.
        if on_cuda:
            torch.cuda.synchronize()

    # Initialize model
    config = BenchmarkConfig()
    model = BenchmarkModel(config).to(device)
    model.eval()

    # Create dummy input
    dummy_input = torch.randn(batch_size, seq_length, config.hidden_size, device=device)

    # Warmup
    print("Warming up...")
    with torch.no_grad():
        for _ in range(num_warmup):
            _ = model(dummy_input)

    # Benchmark
    print(f"Running benchmark with {num_iterations} iterations...")
    times = []
    with torch.no_grad():
        # Make sure pending warmup work is not billed to the first iteration.
        synchronize()
        for i in range(num_iterations):
            start = perf_counter()
            _ = model(dummy_input)
            synchronize()
            end = perf_counter()
            times.append(end - start)
            if (i + 1) % 10 == 0:
                print(f"Iteration {i + 1}/{num_iterations}")

    # Calculate statistics
    times = np.array(times) * 1000  # Convert to milliseconds
    mean_ms = np.mean(times)
    stats = {
        'avg_latency': mean_ms,
        'median_latency': np.median(times),
        'p90_latency': np.percentile(times, 90),
        'p99_latency': np.percentile(times, 99),
        # tokens per batch / mean seconds per batch
        'throughput': batch_size * seq_length / (mean_ms / 1000)
    }

    # Print results
    print("\nResults:")
    print(f"Average latency: {stats['avg_latency']:.2f} ms")
    print(f"Median latency: {stats['median_latency']:.2f} ms")
    print(f"90th percentile latency: {stats['p90_latency']:.2f} ms")
    print(f"99th percentile latency: {stats['p99_latency']:.2f} ms")
    print(f"Throughput: {stats['throughput']:.2f} tokens/second")

    return stats

# Entry point: run the benchmark once with its default arguments.
if __name__ == "__main__":
    run_benchmark()

Running on device: cuda
GPU: NVIDIA A100-SXM4-40GB
CUDA Version: 12.1
Warming up...
Running benchmark with 50 iterations...
Iteration 10/50
Iteration 20/50
Iteration 30/50
Iteration 40/50
Iteration 50/50

Results:
Average latency: 106.65 ms
Median latency: 106.64 ms
90th percentile latency: 106.71 ms
99th percentile latency: 106.76 ms
Throughput: 153623.44 tokens/second


# Tests

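- **Batch Size Testing** (1 to 64 samples): Tests how many examples your GPU can process at once. This is crucial for training deep learning models - larger batch sizes can speed up training, but only if your GPU has enough memory and computational power. Understanding your GPU's optimal batch size helps you maximize training efficiency without running out of memory. Note that this benchmark times forward passes only (`model.eval()` under `torch.no_grad()`), so the numbers reflect inference cost; training additionally needs memory and time for gradients.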
- **Sequence Length Impact** (128 to 2048 tokens): Measures how your GPU handles different text lengths. This is vital for NLP tasks - longer sequences (like full documents) require more memory and computation than shorter ones (like sentences). Knowing these limits helps you design efficient text processing pipelines and choose appropriate text chunking strategies.
- **Throughput vs Latency Trade-offs**: The benchmark measures both how many tokens per second your GPU can process (throughput) and how long each batch takes (latency). This helps you balance between speed and responsiveness - crucial for deciding between batch processing (like training) and real-time applications (like inference in production).

In [3]:
import pandas as pd
import itertools
from datetime import datetime
import json
import os

def run_benchmark_matrix():
    """Run ``run_benchmark`` over a grid of batch sizes and sequence lengths.

    A timestamped ``benchmark_results_<YYYYmmdd_HHMMSS>`` directory is created
    in the current working directory. The test configuration is written to
    ``config.json`` before any run starts, so it is preserved even if the
    sweep is interrupted. After the sweep, the per-run statistics go to
    ``raw_results.csv`` and one ``<metric>_matrix.csv`` pivot (batch size by
    sequence length) is written and printed for each reported metric.

    A combination that raises is reported on stdout and left out of the
    results; the sweep continues with the next combination.

    Returns:
        pandas.DataFrame with one row per successful combination. The frame
        is empty (and no CSV files are written) when every combination failed.
    """
    # Test scenarios
    scenarios = {
        'batch_sizes': [1, 8, 32, 64],
        'seq_lengths': [128, 512, 1024, 2048],
        'num_warmup': 5,
        'num_iterations': 20
    }

    # Create results directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = f"benchmark_results_{timestamp}"
    os.makedirs(results_dir, exist_ok=True)

    # Save test configuration first so an aborted sweep still documents itself
    with open(os.path.join(results_dir, "config.json"), 'w') as f:
        json.dump(scenarios, f, indent=2)

    # Store all results
    results = []

    # Run all combinations
    combinations = list(itertools.product(
        scenarios['batch_sizes'],
        scenarios['seq_lengths']
    ))
    total_runs = len(combinations)

    print(f"Starting benchmark matrix with {total_runs} combinations...")
    print("Configuration:", json.dumps(scenarios, indent=2))

    for current_run, (batch_size, seq_length) in enumerate(combinations, start=1):
        print(f"\nRun {current_run}/{total_runs}")
        print(f"Testing batch_size={batch_size}, seq_length={seq_length}")

        try:
            # Run the benchmark
            stats = run_benchmark(
                batch_size=batch_size,
                seq_length=seq_length,
                num_warmup=scenarios['num_warmup'],
                num_iterations=scenarios['num_iterations']
            )
        except Exception as e:
            # Best effort: one failing combination must not abort the sweep.
            print(f"Error running benchmark with batch_size={batch_size}, "
                  f"seq_length={seq_length}: {str(e)}")
            continue

        # Add configuration to results
        results.append({
            'batch_size': batch_size,
            'seq_length': seq_length,
            'tokens_per_batch': batch_size * seq_length,
            **stats
        })

    # Convert results to DataFrame
    df = pd.DataFrame(results)

    if df.empty:
        # Without rows the frame has no columns, so pivot() below would raise
        # KeyError; report the situation instead of crashing.
        print("\nNo benchmark run succeeded; no result matrices were written.")
        print(f"\nBenchmark results saved in: {results_dir}")
        return df

    # Save raw results
    df.to_csv(os.path.join(results_dir, "raw_results.csv"), index=False)

    # Create pivot tables for different metrics
    metrics = ['avg_latency', 'throughput']
    for metric in metrics:
        pivot = df.pivot(
            index='batch_size',
            columns='seq_length',
            values=metric
        )
        pivot.to_csv(os.path.join(results_dir, f"{metric}_matrix.csv"))

        print(f"\n{metric.replace('_', ' ').title()} Matrix:")
        print(pivot)

    print(f"\nBenchmark results saved in: {results_dir}")
    return df

# Entry point: run the full benchmark matrix and keep the returned DataFrame
# in results_df.
if __name__ == "__main__":
    results_df = run_benchmark_matrix()

Starting benchmark matrix with 16 combinations...
Configuration: {
  "batch_sizes": [
    1,
    8,
    32,
    64
  ],
  "seq_lengths": [
    128,
    512,
    1024,
    2048
  ],
  "num_warmup": 5,
  "num_iterations": 20
}

Run 1/16
Testing batch_size=1, seq_length=128
Running on device: cuda
GPU: NVIDIA A100-SXM4-40GB
CUDA Version: 12.1
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 2.06 ms
Median latency: 2.03 ms
90th percentile latency: 2.04 ms
99th percentile latency: 2.52 ms
Throughput: 62114.47 tokens/second

Run 2/16
Testing batch_size=1, seq_length=512
Running on device: cuda
GPU: NVIDIA A100-SXM4-40GB
CUDA Version: 12.1
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 5.11 ms
Median latency: 5.54 ms
90th percentile latency: 5.61 ms
99th percentile latency: 5.76 ms
Throughput: 100248.05 tokens/second

Run 3/16
Testing batch_size=1, seq_length=1024


In [4]:
# Now check versions
import pkg_resources
import sys

def get_package_details():
    """Print details of specific packages and Python version.

    Prints the interpreter version followed by one line per package in a
    fixed list, showing the installed distribution version or
    ``Not installed`` when no distribution metadata is found.

    Returns:
        None. All results go to stdout.
    """
    # importlib.metadata is the standard-library replacement for the
    # deprecated pkg_resources API; imported locally so this function does
    # not depend on setuptools being installed.
    from importlib.metadata import PackageNotFoundError, version

    packages_to_check = [
        'torch',
        'transformers',
        'numpy',
        'sentencepiece'  # Often used by transformers
    ]

    print("Python version:", sys.version.split()[0])
    print("\nPackage versions:")
    print("-" * 50)

    for package in packages_to_check:
        try:
            installed = version(package)
        except PackageNotFoundError:
            print(f"{package:<15} Not installed")
        else:
            print(f"{package:<15} {installed}")

# Report what PyTorch sees of CUDA: availability first, then toolkit version
# and device name when a GPU is present.
import torch
print("\nCUDA Status:", "-" * 50, sep="\n")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current GPU: {torch.cuda.get_device_name()}")

# Finally print the interpreter and package versions
get_package_details()


CUDA Status:
--------------------------------------------------
CUDA available: True
CUDA version: 12.1
Current GPU: NVIDIA A100-SXM4-40GB
Python version: 3.10.12

Package versions:
--------------------------------------------------
torch           2.5.0+cu121
transformers    4.44.2
numpy           1.26.4
sentencepiece   0.2.0
