<a href="https://colab.research.google.com/github/wesslen/gpu-testing/blob/main/notebooks/transformers_actual_hf_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from time import time
import numpy as np


def run_benchmark(model_name="bert-base-uncased", batch_size=32, seq_length=512, num_warmup=10, num_iterations=50):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running on device: {device}")

    # Load model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()

    # Create input using tokenizer
    dummy_text = ["This is a test sentence"] * batch_size
    inputs = tokenizer(dummy_text, padding=True, truncation=True, max_length=seq_length, return_tensors="pt").to(device)

    # Warmup
    print("Warming up...")
    with torch.no_grad():
        for _ in range(num_warmup):
            _ = model(**inputs)


    # Benchmark
    print(f"Running benchmark with {num_iterations} iterations...")
    times = []
    with torch.no_grad():
        torch.cuda.synchronize()
        for i in range(num_iterations):
            start = time()
            _ = model(**inputs)
            torch.cuda.synchronize()
            end = time()
            times.append(end - start)
            if (i + 1) % 10 == 0:
                print(f"Iteration {i + 1}/{num_iterations}")

    # Calculate statistics
    times = np.array(times) * 1000  # Convert to milliseconds
    stats = {
        'avg_latency': np.mean(times),
        'median_latency': np.median(times),
        'p90_latency': np.percentile(times, 90),
        'p99_latency': np.percentile(times, 99),
        'throughput': batch_size * seq_length / (np.mean(times) / 1000)
    }

    # Print results
    print("\nResults:")
    print(f"Average latency: {stats['avg_latency']:.2f} ms")
    print(f"Median latency: {stats['median_latency']:.2f} ms")
    print(f"90th percentile latency: {stats['p90_latency']:.2f} ms")
    print(f"99th percentile latency: {stats['p99_latency']:.2f} ms")
    print(f"Throughput: {stats['throughput']:.2f} tokens/second")

    return stats

if __name__ == "__main__":
    run_benchmark()

Running on device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Warming up...
Running benchmark with 50 iterations...
Iteration 10/50
Iteration 20/50
Iteration 30/50
Iteration 40/50
Iteration 50/50

Results:
Average latency: 15.70 ms
Median latency: 14.62 ms
90th percentile latency: 20.44 ms
99th percentile latency: 25.90 ms
Throughput: 1043818.17 tokens/second


# Tests

- **Batch Size Testing** (1 to 64 samples): Tests how many examples your GPU can process at once. This is crucial for training deep learning models - larger batch sizes can speed up training, but only if your GPU has enough memory and computational power. Understanding your GPU's optimal batch size helps you maximize training efficiency without running out of memory.
- **Sequence Length Impact (128 to 2048 tokens)**: Measures how your GPU handles different text lengths. This is vital for NLP tasks - longer sequences (like full documents) require more memory and computation than shorter ones (like sentences). Knowing these limits helps you design efficient text processing pipelines and choose appropriate text chunking strategies.
- **Throughput vs Latency Trade-offs**: The benchmark measures both how many tokens per second your GPU can process (throughput) and how long each batch takes (latency). This helps you balance between speed and responsiveness - crucial for deciding between batch processing (like training) and real-time applications (like inference in production).

In [2]:
import pandas as pd
import itertools
from datetime import datetime
import json
import os

def run_benchmark_matrix():
    # Test scenarios
    scenarios = {
        'batch_sizes': [1, 8, 32, 64],
        'seq_lengths': [128, 512, 1024, 2048],
        'num_warmup': 5,
        'num_iterations': 20
    }

    # Create results directory
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = f"benchmark_results_{timestamp}"
    os.makedirs(results_dir, exist_ok=True)

    # Store all results
    results = []

    # Run all combinations
    total_runs = len(scenarios['batch_sizes']) * len(scenarios['seq_lengths'])
    current_run = 0

    print(f"Starting benchmark matrix with {total_runs} combinations...")
    print("Configuration:", json.dumps(scenarios, indent=2))

    for batch_size, seq_length in itertools.product(
        scenarios['batch_sizes'],
        scenarios['seq_lengths']
    ):
        current_run += 1
        print(f"\nRun {current_run}/{total_runs}")
        print(f"Testing batch_size={batch_size}, seq_length={seq_length}")

        try:
            # Run the benchmark
            stats = run_benchmark(
                batch_size=batch_size,
                seq_length=seq_length,
                num_warmup=scenarios['num_warmup'],
                num_iterations=scenarios['num_iterations']
            )

            # Add configuration to results
            result = {
                'batch_size': batch_size,
                'seq_length': seq_length,
                'tokens_per_batch': batch_size * seq_length,
                **stats
            }
            results.append(result)

        except Exception as e:
            print(f"Error running benchmark with batch_size={batch_size}, "
                  f"seq_length={seq_length}: {str(e)}")

    # Convert results to DataFrame
    df = pd.DataFrame(results)

    # Save raw results
    df.to_csv(f"{results_dir}/raw_results.csv", index=False)

    # Create pivot tables for different metrics
    metrics = ['avg_latency', 'throughput']
    for metric in metrics:
        pivot = df.pivot(
            index='batch_size',
            columns='seq_length',
            values=metric
        )
        pivot.to_csv(f"{results_dir}/{metric}_matrix.csv")

        print(f"\n{metric.replace('_', ' ').title()} Matrix:")
        print(pivot)

    # Save test configuration
    with open(f"{results_dir}/config.json", 'w') as f:
        json.dump(scenarios, f, indent=2)

    print(f"\nBenchmark results saved in: {results_dir}")
    return df

if __name__ == "__main__":
    results_df = run_benchmark_matrix()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting benchmark matrix with 16 combinations...
Configuration: {
  "batch_sizes": [
    1,
    8,
    32,
    64
  ],
  "seq_lengths": [
    128,
    512,
    1024,
    2048
  ],
  "num_warmup": 5,
  "num_iterations": 20
}

Run 1/16
Testing batch_size=1, seq_length=128
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 7.40 ms
Median latency: 7.17 ms
90th percentile latency: 7.69 ms
99th percentile latency: 10.47 ms
Throughput: 17300.25 tokens/second

Run 2/16
Testing batch_size=1, seq_length=512
Running on device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 7.76 ms
Median latency: 7.53 ms
90th percentile latency: 8.25 ms
99th percentile latency: 10.07 ms
Throughput: 66018.43 tokens/second


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Run 3/16
Testing batch_size=1, seq_length=1024
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 7.63 ms
Median latency: 7.21 ms
90th percentile latency: 7.98 ms
99th percentile latency: 12.10 ms
Throughput: 134255.70 tokens/second


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Run 4/16
Testing batch_size=1, seq_length=2048
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 7.30 ms
Median latency: 7.12 ms
90th percentile latency: 7.88 ms
99th percentile latency: 8.66 ms
Throughput: 280474.38 tokens/second

Run 5/16
Testing batch_size=8, seq_length=128
Running on device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Warming up...
Running benchmark with 20 iterations...
Iteration 10/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 20/20

Results:
Average latency: 8.56 ms
Median latency: 8.18 ms
90th percentile latency: 9.56 ms
99th percentile latency: 11.69 ms
Throughput: 119587.67 tokens/second

Run 6/16
Testing batch_size=8, seq_length=512
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 20/20

Results:
Average latency: 8.51 ms
Median latency: 8.14 ms
90th percentile latency: 9.79 ms
99th percentile latency: 11.62 ms
Throughput: 481063.76 tokens/second

Run 7/16
Testing batch_size=8, seq_length=1024
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 11.51 ms
Median latency: 11.24 ms
90th percentile latency: 13.16 ms
99th percentile latency: 14.72 ms
Throughput: 711913.84 tokens/second

Run 8/16
Testing batch_size=8, seq_length=2048
Running on device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 10.99 ms
Median latency: 10.27 ms
90th percentile latency: 13.26 ms
99th percentile latency: 15.23 ms
Throughput: 1490404.08 tokens/second

Run 9/16
Testing batch_size=32, seq_length=128
Running on device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 13.62 ms
Median latency: 13.62 ms
90th percentile latency: 13.79 ms
99th percentile latency: 17.40 ms
Throughput: 300721.95 tokens/second

Run 10/16
Testing batch_size=32, seq_length=512
Running on device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Warming up...
Running benchmark with 20 iterations...
Iteration 10/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 20/20

Results:
Average latency: 13.93 ms
Median latency: 13.66 ms
90th percentile latency: 14.61 ms
99th percentile latency: 17.84 ms
Throughput: 1175826.53 tokens/second

Run 11/16
Testing batch_size=32, seq_length=1024
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 20/20

Results:
Average latency: 13.71 ms
Median latency: 13.55 ms
90th percentile latency: 14.13 ms
99th percentile latency: 14.57 ms
Throughput: 2389214.27 tokens/second

Run 12/16
Testing batch_size=32, seq_length=2048
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 20/20

Results:
Average latency: 13.71 ms
Median latency: 13.75 ms
90th percentile latency: 14.37 ms
99th percentile latency: 14.48 ms
Throughput: 4780352.32 tokens/second

Run 13/16
Testing batch_size=64, seq_length=128
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 20/20

Results:
Average latency: 23.77 ms
Median latency: 23.73 ms
90th percentile latency: 23.84 ms
99th percentile latency: 25.63 ms
Throughput: 344649.08 tokens/second

Run 14/16
Testing batch_size=64, seq_length=512
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 20/20

Results:
Average latency: 23.81 ms
Median latency: 23.74 ms
90th percentile latency: 24.45 ms
99th percentile latency: 24.69 ms
Throughput: 1376071.78 tokens/second

Run 15/16
Testing batch_size=64, seq_length=1024
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Iteration 20/20

Results:
Average latency: 23.74 ms
Median latency: 23.72 ms
90th percentile latency: 24.15 ms
99th percentile latency: 24.52 ms
Throughput: 2760512.63 tokens/second

Run 16/16
Testing batch_size=64, seq_length=2048
Running on device: cuda
Warming up...
Running benchmark with 20 iterations...
Iteration 10/20
Iteration 20/20

Results:
Average latency: 23.68 ms
Median latency: 23.66 ms
90th percentile latency: 24.10 ms
99th percentile latency: 24.34 ms
Throughput: 5535314.52 tokens/second

Avg Latency Matrix:
seq_length       128        512        1024       2048
batch_size                                            
1            7.398736   7.755411   7.627237   7.301915
8            8.562756   8.514464  11.507010  10.992992
32          13.620555  13.934028  13.714969  13.709450
64          23.769104  23.812711  23.740518  23.679233

Throughput Matrix:
seq_length           128           512           1024          2048
batch_size                                           

In [3]:
# Now check versions
import pkg_resources
import sys

def get_package_details():
    """Print details of specific packages and Python version"""
    packages_to_check = [
        'torch',
        'transformers',
        'numpy',
        'sentencepiece'  # Often used by transformers
    ]

    print("Python version:", sys.version.split()[0])
    print("\nPackage versions:")
    print("-" * 50)

    for package in packages_to_check:
        try:
            version = pkg_resources.get_distribution(package).version
            print(f"{package:<15} {version}")
        except pkg_resources.DistributionNotFound:
            print(f"{package:<15} Not installed")

# Check CUDA availability for PyTorch
import torch
print("\nCUDA Status:")
print("-" * 50)
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Current GPU: {torch.cuda.get_device_name()}")

# Run the check
get_package_details()


CUDA Status:
--------------------------------------------------
CUDA available: True
CUDA version: 12.1
Current GPU: Tesla T4
Python version: 3.10.12

Package versions:
--------------------------------------------------
torch           2.5.1+cu121
transformers    4.46.2
numpy           1.26.4
sentencepiece   0.2.0


  import pkg_resources
