In [1]:
import torch
from transformers import BertTokenizer, BertModel
from torch.profiler import profile, record_function, ProfilerActivity
import os

print("--- Initializing ---")

# Query CUDA support once and reuse the answer for device selection and reporting.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")

# Fetch the pre-trained BERT tokenizer and encoder weights.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Move the weights to the selected device, then switch to evaluation mode
# so that dropout is disabled and repeated runs behave consistently.
model.to(device)
model.eval()

# Report what was loaded and where it lives.
print(f"BERT model loaded with {model.config.num_hidden_layers} layers.")
print(f"Model is on device: {device}")
print(f"CUDA is available: {cuda_available}")
if cuda_available:
    print(f"Current CUDA device name: {torch.cuda.get_device_name(0)}")

# Base sentence from which the demo batch is built.
text = "Hello, how are you doing today? This is an example sentence for BERT."

# Repeat the sentence ten times (space-separated) so the first batch entry is
# long, and pair it with a short second sentence to form a batch of two.
long_text = " ".join(text for _ in range(10))
batch_texts = [
    long_text,
    "Another slightly different sentence for the batch.",
]

# Tokenize into PyTorch tensors. padding='max_length' together with
# truncation=True and max_length=256 pads or truncates every sequence to
# 256 tokens, so the tensor shapes do not depend on the input text.
inputs = tokenizer(
    batch_texts,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# Place every input tensor on the same device as the model.
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)
if 'token_type_ids' in inputs:
    token_type_ids = inputs['token_type_ids'].to(device)
else:
    # The tokenizer did not return segment ids; pass None through to the model.
    token_type_ids = None

print("\n--- Input Tensor Information ---")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention Mask shape: {attention_mask.shape}")
if token_type_ids is None:
    print("Token Type IDs are not present (common for single segment tasks).")
else:
    print(f"Token Type IDs shape: {token_type_ids.shape}")

# Directory that receives the trace files written by the profiler.
log_dir = "bert_trace_logs_with_gpu"
os.makedirs(log_dir, exist_ok=True)

print("\n--- Starting Profiling ---")
print(f"Saving trace file to: {os.path.abspath(log_dir)}")

# Always trace CPU activity; request CUDA activity only when a GPU is present,
# so the same cell runs cleanly on CPU-only machines.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)

# Profiler schedule, counted in prof.step() calls:
#   wait   - steps during which the profiler is idle
#   warmup - steps that are traced but whose results are discarded
#   active - steps that are actually recorded
# The loop below runs exactly one full wait/warmup/active cycle (repeat=1),
# so its length is derived from these values instead of being hard-coded.
wait_steps, warmup_steps, active_steps = 1, 2, 5
total_steps = wait_steps + warmup_steps + active_steps

# Profiler options:
#   activities     - which activity kinds to trace (chosen above)
#   schedule       - when recording starts and stops (see above)
#   on_trace_ready - callback run when a cycle finishes; here it writes the
#                    trace into log_dir
#   record_shapes  - record operator input shapes
#   profile_memory - record tensor memory allocations
#   with_stack     - record source/stack information for operators
#
# torch.no_grad() disables autograd for the forward passes: this is pure
# inference, so gradient tracking would only add overhead and extra memory
# to the trace being recorded.
with torch.no_grad(), profile(
    activities=profiler_activities,
    schedule=torch.profiler.schedule(
        wait=wait_steps,
        warmup=warmup_steps,
        active=active_steps,
        repeat=1,
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(log_dir),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    for i in range(total_steps):
        # Label each forward pass with its step number so individual steps
        # can be told apart in the trace viewer.
        with record_function(f"BERT_inference_step_{i}"):
            output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Tell the profiler that one step has finished so the schedule advances.
        prof.step()

print("\n--- Profiling Complete ---")
print("Trace files saved.")

# Inspect the result of the last forward pass. For each output field of
# interest that the model output exposes, print the shape of its tensor.
print("\n--- Example Model Output Shape (last_hidden_state) ---")
for attr_name, label in (
    ("last_hidden_state", "Last hidden state"),
    ("pooler_output", "Pooled output"),
):
    if hasattr(output, attr_name):
        print(f"{label} shape: {getattr(output, attr_name).shape}")

print("\nRemember to open the trace file in chrome://tracing or Perfetto.")

--- Initializing ---
BERT model loaded with 12 layers.
Model is on device: cuda
CUDA is available: True
Current CUDA device name: NVIDIA GeForce RTX 3050 Laptop GPU

--- Input Tensor Information ---
Input IDs shape: torch.Size([2, 256])
Attention Mask shape: torch.Size([2, 256])
Token Type IDs shape: torch.Size([2, 256])

--- Starting Profiling ---
Saving trace file to: C:\Users\vaibh\Downloads\pytorch\bert_trace_logs_with_gpu

--- Profiling Complete ---
Trace files saved.

--- Example Model Output Shape (last_hidden_state) ---
Last hidden state shape: torch.Size([2, 256, 768])
Pooled output shape: torch.Size([2, 768])

Remember to open the trace file in chrome://tracing or Perfetto.
