In [1]:
import torch
from transformers import BertTokenizer, BertModel
from torch.profiler import profile, record_function, ProfilerActivity
import os

In [2]:
# Choose the compute device up front: CUDA when a GPU is visible, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the pre-trained 'bert-base-uncased' tokenizer and encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Evaluation mode disables dropout so repeated forward passes behave consistently;
# afterwards the weights are placed on the selected device.
model = model.eval().to(device)

print(f"BERT model loaded with {model.config.num_hidden_layers} layers.")
print(f"Model is on device: {device}")

BERT model loaded with 12 layers.
Model is on device: cuda


In [3]:
# Sample sentence that is fed through the model.
text = "Hello, how are you doing today? This is an example sentence for BERT."

# Encode the sentence as PyTorch tensors ('pt'). Padding to `max_length` together
# with truncation keeps the sequence dimension fixed at 128 tokens.
inputs = tokenizer(
    text,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# The tensors must live on the same device as the model.
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Segment ids are only forwarded when the tokenizer actually produced them.
if 'token_type_ids' in inputs:
    token_type_ids = inputs['token_type_ids'].to(device)
else:
    token_type_ids = None

print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention Mask shape: {attention_mask.shape}")
if token_type_ids is not None:
    print(f"Token Type IDs shape: {token_type_ids.shape}")

Input IDs shape: torch.Size([1, 128])
Attention Mask shape: torch.Size([1, 128])
Token Type IDs shape: torch.Size([1, 128])


In [5]:
# Directory that receives the trace files written by the TensorBoard trace handler.
log_dir = "bert_trace_logs"
os.makedirs(log_dir, exist_ok=True)

print(f"Saving trace file to: {os.path.abspath(log_dir)}")

# Profiler schedule, expressed in `prof.step()` calls:
#   WAIT_STEPS   - steps skipped entirely (profiler idle)
#   WARMUP_STEPS - steps traced but discarded (absorbs profiler start-up overhead)
#   ACTIVE_STEPS - steps that are actually recorded into the trace
WAIT_STEPS = 1
WARMUP_STEPS = 1
ACTIVE_STEPS = 3
# Run exactly one full schedule cycle so the trace is complete and no iteration is wasted.
TOTAL_STEPS = WAIT_STEPS + WARMUP_STEPS + ACTIVE_STEPS

# Always profile the CPU; request CUDA activity only when a GPU is actually present,
# so the cell also runs cleanly on the CPU fallback path.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)

# Profiler arguments:
#   activities:     which device activity to capture (see above)
#   schedule:       which steps are skipped, warmed up, and recorded
#   on_trace_ready: callback invoked when a recorded cycle finishes; here it writes
#                   the trace into `log_dir` in a TensorBoard-readable format
#   record_shapes:  record input shapes of operators
#   profile_memory: record tensor memory allocation/release
#   with_stack:     record source (file/line) information for operators
with profile(
    activities=activities,
    schedule=torch.profiler.schedule(
        wait=WAIT_STEPS, warmup=WARMUP_STEPS, active=ACTIVE_STEPS, repeat=1
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(log_dir),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    # Inference only: disable autograd so the trace and the memory profile are not
    # inflated by graph bookkeeping and activations kept alive for a backward pass.
    with torch.no_grad():
        for step in range(TOTAL_STEPS):
            with record_function("BERT_inference"):
                # A plain forward pass is sufficient for tracing. Per-layer hidden
                # states are not requested here; enable `output_hidden_states=True`
                # if they are needed.
                output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            # Tell the profiler that one iteration finished so the schedule advances.
            prof.step()

print("Profiling complete. Trace files saved.")

Saving trace file to: C:\Users\vaibh\Downloads\pytorch\bert_trace_logs
Profiling complete. Trace files saved.
