In [2]:
pip install transformers

Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.1-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting tqdm>=4.27 (from transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
   ---------------------------------------- 0.0/10.5 MB ? eta -:--:--
   -- ------------------------------------- 0.5/10.5 MB 3.4 MB/s eta 0:00:03
   ----- ---------------------------------- 1.3/10.5 MB 3.5 MB/s eta 0:00:03
   --------- ---

In [3]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.profiler import profile, record_function, ProfilerActivity
import time

# Confirm CUDA is available
assert torch.cuda.is_available(), "CUDA not available"

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").to("cuda")
model.eval()

# Prepare a longer input to increase GPU workload
prompt = "The future of AI is" * 50  # Repeat to make it longer
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
inputs_cuda = {k: v.to("cuda") for k, v in inputs.items()}

# Warm-up to initialize CUDA kernels
with torch.no_grad():
    for _ in range(3):
        _ = model(**inputs_cuda)

# Profile with both CPU and CUDA activities
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
    with_stack=True,
    profile_memory=True
) as prof:
    with record_function("gpt2_inference"):
        _ = model(**inputs_cuda)
        torch.cuda.synchronize()  # Ensure all CUDA ops are completed
        time.sleep(0.1)  # Give profiler time to flush events

# Save trace
prof.export_chrome_trace("gpt2_trace-checkpoint.json")
print("Trace saved to gpt2_trace-checkpoint.json")

Trace saved to gpt2_trace-checkpoint.json


In [5]:
import torch
import torch.nn as nn
from torch.profiler import profile, record_function, ProfilerActivity

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on device: {device}")

# Decoder-only transformer model
class SimpleDecoderOnlyModel(nn.Module):
    def __init__(self, vocab_size=1000, d_model=256, nhead=4, num_layers=2, max_len=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, d_model))
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        seq_len = x.size(1)
        x = self.embedding(x) + self.pos_embedding[:, :seq_len, :]
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(seq_len).to(x.device)
        x = self.transformer_decoder(x, x, tgt_mask=tgt_mask)
        return self.fc_out(x)

# Instantiate model and dummy input
model = SimpleDecoderOnlyModel().to(device)
dummy_input = torch.randint(0, 1000, (1, 32)).to(device)

# Warm-up
_ = model(dummy_input)

# Profile and export trace
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True,
             with_stack=True) as prof:
    with record_function("model_inference"):
        output = model(dummy_input)

# Save trace
prof.export_chrome_trace("decoder_trace.json")
print("Trace saved to decoder_trace.json")

Running on device: cuda
Trace saved to decoder_trace.json


In [6]:
pip HolisticTraceAnalysis

Note: you may need to restart the kernel to use updated packages.


ERROR: unknown command "HolisticTraceAnalysis"



In [7]:
pip install HolisticTraceAnalysis

Collecting HolisticTraceAnalysis
  Using cached holistictraceanalysis-0.5.0-py3-none-any.whl.metadata (1.1 kB)
Collecting plotly>=5.11.0 (from HolisticTraceAnalysis)
  Using cached plotly-6.1.2-py3-none-any.whl.metadata (6.9 kB)
Collecting pydot>=1.3.0 (from HolisticTraceAnalysis)
  Downloading pydot-4.0.1-py3-none-any.whl.metadata (11 kB)
Collecting pytest>=7.4.4 (from HolisticTraceAnalysis)
  Downloading pytest-8.4.1-py3-none-any.whl.metadata (7.7 kB)
Collecting narwhals>=1.15.1 (from plotly>=5.11.0->HolisticTraceAnalysis)
  Downloading narwhals-1.44.0-py3-none-any.whl.metadata (11 kB)
Collecting iniconfig>=1 (from pytest>=7.4.4->HolisticTraceAnalysis)
  Using cached iniconfig-2.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting pluggy<2,>=1.5 (from pytest>=7.4.4->HolisticTraceAnalysis)
  Using cached pluggy-1.6.0-py3-none-any.whl.metadata (4.8 kB)
Using cached holistictraceanalysis-0.5.0-py3-none-any.whl (371 kB)
Using cached plotly-6.1.2-py3-none-any.whl (16.3 MB)
Downloading narwha

In [9]:
from hta.trace_analysis import TraceAnalysis

# Set your trace directory path
trace_dir = r"C:\Users\vaibh\Downloads\pytorch_new\trace_dir"

# Initialize the analyzer
analyzer = TraceAnalysis(trace_dir=trace_dir)

# Example: Get a temporal breakdown of GPU activity
temporal_df = analyzer.get_temporal_breakdown(visualize=True)

# Example: Get GPU kernel breakdown
kernel_type_df, kernel_df = analyzer.get_gpu_kernel_breakdown(visualize=True)

# Example: Get idle time breakdown
idle_df = analyzer.get_idle_time_breakdown(visualize=True)

# Example: Get communication-computation overlap
overlap_df = analyzer.get_comm_comp_overlap(visualize=True)

Parsed C:\Users\vaibh\Downloads\pytorch_new\trace_dir\decoder_trace.json time = 0.03 seconds 
Rounding down ns resolution events due to issue with events overlapping. ts dtype = float64, dur dtype = float64.Please see https://github.com/pytorch/pytorch/pull/122425
Parsed C:\Users\vaibh\Downloads\pytorch_new\trace_dir\decoder_trace.json backend=ParserBackend.JSON in 0.26 seconds; current PID:3604
Overall parsing of C:\Users\vaibh\Downloads\pytorch_new\trace_dir\decoder_trace.json in 0.38 seconds; current PID:3604
leaving parse_multiple_ranks duration=0.38 seconds
leaving parse_traces duration=0.39 seconds
ProfilerStep not found in the trace. The analysis result may not be accurate.



Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`




Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


invalid value encountered in scalar divide

