In [None]:
import torch
from torch import nn
import torch.autograd.profiler as profiler

import numpy as np

In [None]:
import torch

# Random (6, 3) tensor that every splitting call below operates on.
tensor = torch.randn((6, 3))

# chunk: ask for a number of pieces along dim 0 (6 rows / 3 chunks -> 2 rows each).
chunked_tensors = tensor.chunk(3, dim=0)

# split with an int: ask for a piece size along dim 0 (2 rows per piece -> 3 pieces).
split_tensors = tensor.split(2, dim=0)

# split with a list: explicit piece sizes along dim 0; they add up to the 6 rows.
split_tensors_custom = tensor.split([1, 2, 3], dim=0)


In [None]:
split_tensors

(tensor([[ 0.2207, -0.2316, -1.3084],
         [ 0.4861,  1.0719,  0.2642]]),
 tensor([[ 0.2454, -0.4175,  0.0040],
         [ 0.0299, -0.6987, -1.7507]]),
 tensor([[ 0.2908,  1.7810,  0.2629],
         [ 1.9374,  0.3519, -1.6771]]))

In [None]:
chunked_tensors

(tensor([[ 0.2207, -0.2316, -1.3084],
         [ 0.4861,  1.0719,  0.2642]]),
 tensor([[ 0.2454, -0.4175,  0.0040],
         [ 0.0299, -0.6987, -1.7507]]),
 tensor([[ 0.2908,  1.7810,  0.2629],
         [ 1.9374,  0.3519, -1.6771]]))

### `Profile`

##### Example 1

In [None]:
class MyModule(nn.Module):
    """Linear layer that also reports which ``mask`` entries exceed a threshold.

    The two stages of ``forward`` are wrapped in ``profiler.record_function``
    so that each one shows up under its own label in a profiler report.
    """

    def __init__(self, in_features: int, out_features: int, bias: bool = True):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias)

    def forward(self, input, mask):
        """Return ``(out, hi_idx)``.

        ``out`` is ``self.linear(input)``. ``hi_idx`` holds the coordinates of
        every element of ``mask`` that is greater than the mean of ``out``
        summed along axis 1.
        """
        with profiler.record_function("LINEAR PASS"):
            out = self.linear(input)

        with profiler.record_function("MASK INDICES"):
            # Scalar threshold as a Python float.
            summed = out.sum(axis=1)
            threshold = summed.mean().item()
            # The comparison is done in numpy, so the mask is copied to host
            # memory and converted first.
            mask_np = mask.cpu().numpy()
            above = np.argwhere(mask_np > threshold)
            hi_idx = torch.from_numpy(above)

        return out, hi_idx

In [None]:
# Fixtures for Example 1: a 500 -> 10 module, a batch of 128 inputs, and a
# double-precision mask of 500**3 elements (8 bytes each, about 1 GB).
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because later cells refer to it.
model = MyModule(in_features=500, out_features=10)
input = torch.rand((128, 500))
mask = torch.rand(500, 500, 500, dtype=torch.double)

In [None]:
# Keyword arguments that a later cell unpacks into `profiler.profile(...)`.
profile_params = dict(with_stack=True, profile_memory=True)

In [None]:
profile_params

{'with_stack': True, 'profile_memory': True}

In [None]:
def hardcore():
    """Run one forward pass of the module-level ``model`` on ``input`` and ``mask``.

    Both results are discarded and nothing is returned; the function only
    serves as a workload to be profiled.
    """
    _out, _hi_idx = model(input, mask)

Profile the performance of the function `hardcore` using `torch.autograd.profiler`.

In [None]:
import torch.autograd.profiler as profiler

In [None]:
# Profile a single call of `hardcore` with the autograd profiler; the options
# come from `profile_params` and the collected events are kept on `prof`.
with profiler.profile(**profile_params) as prof:
    hardcore()



In [None]:
# prof.key_averages(group_by_stack_n=5).table()

##### Example 2

In [None]:
import torchvision.models as models

In [None]:
import torch

In [None]:
def hardcore():
    """Build a fresh ResNet-18 and return its output for a random batch.

    The model is constructed inside the function, so its construction is part
    of whatever gets measured when this function is profiled.

    NOTE: this definition reuses the name ``hardcore`` and replaces the
    earlier function of the same name.
    """
    net = models.resnet18()
    batch = torch.randn((5, 3, 224, 224))
    output = net(batch)
    return output

Profile the function `hardcore` on CPU

In [None]:
from torch.profiler import profile, ProfilerActivity

In [None]:
# Profile a single call of `hardcore`, collecting CPU activity only; the
# collected events are kept on `prof`.
with profile(activities=[ProfilerActivity.CPU]) as prof:
    hardcore()

In [None]:
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::conv2d         0.01%      56.000us        35.89%     172.144ms       8.607ms            20  
                aten::convolution         0.35%       1.686ms        35.88%     172.088ms       8.604ms            20  
               aten::_convolution         0.04%     209.000us        35.53%     170.402ms       8.520ms            20  
                aten::thnn_conv2d         0.01%      60.000us        35.48%     170.193ms       8.510ms            20  
       aten::_slow_conv2d_forward        35.43%     169.932ms        35.47%     170.133ms       8.507ms            20  
                    aten::normal_       

##### Example 3

In [None]:
# Step counts that a later cell unpacks into `torch.profiler.schedule(...)`.
schedular_params = dict(wait=1, warmup=1, active=2)

In [None]:
# Module-level fixtures for Example 3. Without them this example fails on a
# fresh kernel: `inputs` was only ever a local variable inside the previous
# `hardcore`, and the `model` left over from Example 1 is a `MyModule`, whose
# forward pass requires a second `mask` argument.
model = models.resnet18()
inputs = torch.randn(5, 3, 224, 224)


def hardcore():
    """Run eight forward passes of the module-level ``model`` on ``inputs``.

    The outputs are discarded and nothing is returned; the function only
    serves as a workload to be profiled.
    """
    for _ in range(8):
        model(inputs)

In [None]:
schedular_params

{'wait': 1, 'warmup': 1, 'active': 2}

Profile the `hardcore` function on CPU using a profiler, and set a schedule built from `schedular_params`.

**Hint**: Do not call `hardcore` itself — write its loop inline so that `prof.step()` can be called after each iteration.

In [None]:
import torch
from torch.profiler import profile, ProfilerActivity

In [None]:
# The loop from `hardcore` is written inline so that `prof.step()` can tell the
# scheduler where each iteration ends. One schedule cycle is
# wait (1) + warmup (1) + active (2) = 4 steps, so the 8 iterations that
# `hardcore` performs cover two complete cycles. With fewer than 4 iterations
# the run would end in the middle of the active window.
with profile(
    activities=[ProfilerActivity.CPU],
    schedule=torch.profiler.schedule(**schedular_params),
) as prof:
    for _ in range(8):
        model(inputs)
        prof.step()

In [None]:
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                    ProfilerStep*         4.32%       8.577ms        99.99%     198.590ms      99.295ms             2  
                     aten::conv2d         0.03%      53.000us        74.45%     147.871ms       7.394ms            20  
                aten::convolution         0.12%     243.000us        74.42%     147.818ms       7.391ms            20  
               aten::_convolution         0.09%     171.000us        74.30%     147.575ms       7.379ms            20  
                aten::thnn_conv2d         0.03%      64.000us        74.21%     147.404ms       7.370ms            20  
       aten::_slow_conv2d_forward       

##### Example 4

In [None]:
# Three equal values, so the expected mean is 69.0; `.float()` converts the
# integer tensor to floating point before `mean()` is taken.
xs = torch.tensor([69, 69, 69]).float()

In [None]:
# Mean over the last (and only) dimension of `xs`, computed before any CUDA
# stream has been created.
mean = xs.mean(dim=-1)

In [None]:
# Target device for the stream example: CUDA device index 0.
device = torch.device("cuda:0")

In [None]:
import torch

In [None]:
xs

tensor([69., 69., 69.])

In [None]:
device

device(type='cuda', index=0)

Create a new CUDA stream on `device`, then compute the mean of `xs` using that stream on `device`.

In [None]:
# Dedicated CUDA stream on `device`; it is made the current stream in a later
# cell via `torch.cuda.stream(stream)`.
stream = torch.cuda.Stream(device=device)

In [None]:
with torch.cuda.device(device):
    with torch.cuda.stream(stream):
        # `xs` was created on the CPU, and CPU operations do not run on a CUDA
        # stream. Move it to `device` first so that both the copy and the
        # reduction are issued while `stream` is the current stream.
        mean = xs.to(device).mean(dim=-1)

# Wait for the work queued on `stream` to finish so that `mean` is safe to
# read outside the stream context (for example when it is displayed).
stream.synchronize()

In [None]:
mean

tensor(69.)