In [2]:
import torch
from transformers import BertTokenizer, BertModel
from torch.profiler import profile, record_function, ProfilerActivity
import os

print("--- Initializing ---")

# Load the pre-trained tokenizer and model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Evaluation mode disables dropout so repeated forward passes behave
# consistently. It does NOT turn off gradient tracking.
model.eval()

# Query CUDA availability once and reuse the answer below.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

# COMMUNICATION: when `device` is CUDA, the model parameters are copied from
# host memory to GPU memory here; when it is CPU, the model stays where it is.
model.to(device)

print(f"BERT model loaded with {model.config.num_hidden_layers} layers.")
print(f"Model is on device: {device}")
print(f"CUDA is available: {cuda_available}")
if cuda_available:
    # Report the device that is actually current rather than assuming index 0,
    # so the label stays truthful on multi-GPU machines.
    current_index = torch.cuda.current_device()
    print(f"Current CUDA device name: {torch.cuda.get_device_name(current_index)}")

# Base sentence used to build the demo batch.
text = "Hello, how are you doing today? This is an example sentence for BERT."
# Repeat the sentence ten times so the first batch entry is long enough to
# generate noticeable device activity in the trace.
long_text = " ".join(text for _ in range(10))
# Batch of two: the long repeated text plus one short sentence.
batch_texts = [
    long_text,
    "Another slightly different sentence for the batch.",
]

# Tokenize into PyTorch tensors. padding='max_length' together with
# max_length=256 pads each sequence to 256 tokens, and truncation=True cuts
# anything longer down to that size.
inputs = tokenizer(
    batch_texts,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# COMMUNICATION: each .to(device) copies a tensor to the model's device
# (a host-to-GPU transfer when the device is CUDA).
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)
if 'token_type_ids' in inputs:
    token_type_ids = inputs['token_type_ids'].to(device)
else:
    # The tokenizer did not emit segment ids; the model is then called with None.
    token_type_ids = None

print("\n--- Input Tensor Information ---")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Attention Mask shape: {attention_mask.shape}")
if token_type_ids is None:
    print("Token Type IDs are not present (common for single segment tasks).")
else:
    print(f"Token Type IDs shape: {token_type_ids.shape}")

# Directory that receives the trace file(s) written by the trace handler.
log_dir = "bert_trace_logs_with_gpu"
os.makedirs(log_dir, exist_ok=True)

print("\n--- Starting Profiling ---")
print(f"Saving trace file to: {os.path.abspath(log_dir)}")

# Only request CUDA activity when a GPU is actually present, so the script
# also runs cleanly on CPU-only machines.
profiler_activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    profiler_activities.append(ProfilerActivity.CUDA)

# Profiler schedule: skip `wait_steps`, discard `warmup_steps`, then record
# `active_steps`. The loop length is derived from these values so the loop
# and the schedule cannot drift apart.
wait_steps, warmup_steps, active_steps = 1, 2, 5
total_steps = wait_steps + warmup_steps + active_steps

# torch.no_grad() disables autograd for the forward passes: this is pure
# inference, and model.eval() alone does not stop gradient tracking. Without
# it the profile would include graph bookkeeping and retained activations.
with torch.no_grad(), profile(
    activities=profiler_activities,
    schedule=torch.profiler.schedule(
        wait=wait_steps,
        warmup=warmup_steps,
        active=active_steps,
        repeat=1,
    ),
    on_trace_ready=torch.profiler.tensorboard_trace_handler(log_dir),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    for i in range(total_steps):
        # Label each forward pass so individual steps are easy to find in the trace.
        with record_function(f"BERT_inference_step_{i}"):
            output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        prof.step()  # Advance the profiler to the next step of its schedule.

print("\n--- Profiling Complete ---")
print("Trace files saved.")

# Print the aggregated results.
print("\n--- Aggregated Profiler Results (milliseconds) ---")
# key_averages() aggregates the recorded events per unique operation; compute
# it once and reuse it for every table instead of re-aggregating each time.
averaged_events = prof.key_averages()
gpu_present = torch.cuda.is_available()

# (sort key, table header, whether the table is only meaningful with a GPU),
# listed in the order the tables are printed.
table_specs = (
    ("cuda_time_total", "Top CUDA Time Operations", True),
    ("cpu_time_total", "Top CPU Time Operations", False),
    ("self_cuda_memory_usage", "Top CUDA Memory Usage", True),
)
for sort_key, table_header, requires_gpu in table_specs:
    if requires_gpu and not gpu_present:
        # Skip CUDA-sorted tables on CPU-only hosts, where they carry no data.
        continue
    # row_limit keeps each table to its top 20 entries.
    print(averaged_events.table(sort_by=sort_key, row_limit=20, header=table_header))


# Show the shapes of the model outputs from the final profiled step.
print("\n--- Example Model Output Shape (last_hidden_state) ---")
output_fields = (
    ("last_hidden_state", "Last hidden state"),
    ("pooler_output", "Pooled output"),
)
for field_name, field_label in output_fields:
    # getattr(..., None) plus the None check covers both a missing attribute
    # and a field that exists but is None, which would otherwise fail on .shape.
    field_value = getattr(output, field_name, None)
    if field_value is not None:
        print(f"{field_label} shape: {field_value.shape}")

print("\nRemember to open the full trace file in chrome://tracing or Perfetto for detailed timeline analysis.")

--- Initializing ---
BERT model loaded with 12 layers.
Model is on device: cuda
CUDA is available: True
Current CUDA device name: NVIDIA GeForce RTX 3050 Laptop GPU

--- Input Tensor Information ---
Input IDs shape: torch.Size([2, 256])
Attention Mask shape: torch.Size([2, 256])
Token Type IDs shape: torch.Size([2, 256])

--- Starting Profiling ---
Saving trace file to: C:\Users\vaibh\Downloads\pytorch\bert_trace_logs_with_gpu

--- Profiling Complete ---
Trace files saved.

--- Aggregated Profiler Results (milliseconds) ---
Top CUDA Time Operations
-------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self 