In [4]:
import torch
import torch.nn as nn
import torch.autograd.profiler as profiler

# Define a simple encoder-only model
class EncoderModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(512, 256)

    def forward(self, x):
        return self.encoder(x)

# Initialize model and input
model = EncoderModel().cuda()
input_tensor = torch.rand(128, 512).cuda()

# Synchronize CUDA before profiling
torch.cuda.synchronize()

# Profile the forward pass with CUDA enabled
with profiler.profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:
    output = model(input_tensor)

# Synchronize CUDA after profiling
torch.cuda.synchronize()

# Print profiling results, sorting by GPU execution time
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

--------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::linear        19.31%      60.800us       100.00%     314.900us     314.900us      38.000us         4.17%     911.000us     911.000us           0 b           0 b     128.00 Kb           0 b             1  
         aten::addmm        59.26%     186.600us        59.26%     186.600us     186.600us     797.000us        87.49%    

  with profiler.profile(with_stack=True, profile_memory=True, use_cuda=True) as prof:


In [6]:
import torch
import torch.nn as nn
import torch.profiler

# Define a simple encoder-only model
class EncoderModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(512, 256)

    def forward(self, x):
        return self.encoder(x)

# Initialize model and input
model = EncoderModel().cuda()
input_tensor = torch.rand(128, 512).cuda()

# Synchronize CUDA before profiling to ensure accurate timing
torch.cuda.synchronize()

# Use torch.profiler to properly track GPU and CPU activities
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    output = model(input_tensor)

# Synchronize CUDA again after profiling
torch.cuda.synchronize()

# Print profiling results, sorted by GPU execution time
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))

--------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
--------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
        aten::linear        13.33%      37.000us       100.00%     277.600us     277.600us      12.000us         2.11%     568.000us     568.000us           0 b           0 b     128.00 Kb           0 b             1  
         aten::addmm        61.67%     171.200us        61.67%     171.200us     171.200us     539.000us        94.89%    

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define a simple neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # First layer
        self.relu = nn.ReLU()  # Activation function
        self.fc2 = nn.Linear(hidden_size, output_size)  # Output layer

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Initialize model, loss function, and optimizer
input_size = 10
hidden_size = 20
output_size = 1
model = SimpleNN(input_size, hidden_size, output_size)

criterion = nn.MSELoss()  # Mean Squared Error loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Generate dummy data
X_train = torch.rand(100, input_size)  # 100 samples, each with 10 features
y_train = torch.rand(100, output_size)  # Target values

# Training loop
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()  # Reset gradients
    outputs = model(X_train)  # Forward pass
    loss = criterion(outputs, y_train)  # Compute loss
    loss.backward()  # Backpropagation
    optimizer.step()  # Update weights

    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

print("Training complete!")

Epoch [10/100], Loss: 0.1186
Epoch [20/100], Loss: 0.0929
Epoch [30/100], Loss: 0.0847
Epoch [40/100], Loss: 0.0818
Epoch [50/100], Loss: 0.0802
Epoch [60/100], Loss: 0.0789
Epoch [70/100], Loss: 0.0774
Epoch [80/100], Loss: 0.0757
Epoch [90/100], Loss: 0.0737
Epoch [100/100], Loss: 0.0708
Training complete!


In [4]:
import torch
from transformers import GPT2Tokenizer, GPT2Model

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2").cuda()
model.eval()

# Prepare input
inputs = tokenizer("Profiling GPT-2 with PyTorch profiler", return_tensors="pt").to("cuda")

# Profile and export trace
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=2),
    on_trace_ready=None,  # We'll export manually
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    for _ in range(4):  # match wait+warmup+active
        with torch.no_grad():
            model(**inputs)
        prof.step()

# Export to Chrome trace format
prof.export_chrome_trace("gpt2_trace.json")