In [2]:
import torch
from torch.profiler import profile, record_function, ProfilerActivity

In [3]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional network: two conv/ReLU/max-pool stages, then three linear layers.

    ``fc1`` takes ``16 * 5 * 5`` input features, which is what the
    convolutional stages yield for 3-channel 32x32 inputs
    (32 -> 28 -> 14 -> 10 -> 5 per spatial side, 16 channels).
    """

    def __init__(self):
        super().__init__()
        # Layers are built in the same order as before so that parameter
        # initialisation draws from the RNG in the same sequence.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run the network on batch ``x`` and return 10 raw output values per sample."""
        # Convolutional stages: conv -> ReLU -> shared 2x2 max-pool.
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))

        # Collapse every dimension except the batch dimension.
        x = x.flatten(start_dim=1)

        # Hidden fully connected layers, each followed by ReLU.
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))

        # The final layer has no activation applied.
        return self.fc3(x)


# Build the model, then move its parameters and buffers to the current CUDA device.
net = Net()
net = net.cuda()

In [4]:
# Random batch of four 3x32x32 inputs, sampled on the CPU and then copied to the GPU.
inputs = torch.randn(4, 3, 32, 32)
inputs = inputs.cuda()

# Profile a single forward pass, recording both CPU and CUDA activity
# together with operator input shapes.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof:
    net(inputs)

# Save the collected events as a Chrome trace file (relative path).
prof.export_chrome_trace("cpu02.3.json")

STAGE:2023-05-09 11:05:49 3851043:3851043 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-09 11:05:51 3851043:3851043 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-09 11:05:51 3851043:3851043 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [7]:
# Profile one forward pass again, this time also recording call stacks.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
    with_stack=True,
) as prof_stack:
    net(inputs)

# Print the averaged events grouped by stack (group_by_stack_n=5), sorted by
# self CUDA time and limited to two rows.
print(
    prof_stack.key_averages(group_by_stack_n=5).table(
        sort_by="self_cuda_time_total",
        row_limit=2,
    )
)

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        58.16%       2.363ms        73.07%       2.969ms       1.484ms      39.000us        29.77%      39.000us      19.500us             2  
void implicit_convolve_sgemm<float, float, 128, 5, 5...         0.00%       0.000us         0.00%       0.000us       0.000us      39.000us        29.77%      39.000us      19.500us             2  
---------

STAGE:2023-05-09 11:07:35 3851043:3851043 ActivityProfilerController.cpp:311] Completed Stage: Warm Up
STAGE:2023-05-09 11:07:35 3851043:3851043 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-09 11:07:35 3851043:3851043 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


In [None]:
# Profile one forward pass with memory tracking enabled. Shapes, stacks and
# memory are all recorded; per the torch.profiler documentation the memory
# timeline export relies on these being switched on.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
    with_stack=True,
    profile_memory=True,
) as prof_mem:
    net(inputs)

# export_memory_timeline picks its output format from the path suffix and
# writes to that path, so it must be a file name, not a directory such as ".".
# ".json" stores the plot points ([times, [sizes by category]]) and, unlike the
# ".html" variant, needs no plotting library.
prof_mem.export_memory_timeline(path="memory_timeline.json")