# Эффективные модели ML и архитектуры нейросетей

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from tqdm.auto import trange

from torch.profiler import profile, ProfilerActivity, schedule, record_function

  from .autonotebook import tqdm as notebook_tqdm


In [41]:
device = "cuda:2"

In [43]:
a = torch.randn(1024, 1024)
b = torch.randn(1024, 1024)

In [44]:
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    c = a.T @ b

In [45]:
print(prof.key_averages().table(
    sort_by="cuda_time_total",
    row_limit=10
))

-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
            aten::numpy_T         0.02%       3.500us         0.11%      22.110us      22.110us             1  
            aten::permute         0.07%      13.450us         0.09%      18.610us      18.610us             1  
         aten::as_strided         0.03%       5.160us         0.03%       5.160us       5.160us             1  
             aten::matmul         0.04%       8.260us        99.83%      19.824ms      19.824ms             1  
                 aten::mm        99.79%      19.815ms        99.79%      19.816ms      19.816ms             1  
       aten::resolve_conj         0.00%       0.730us         0.00%       0.730us       0.365us         

In [46]:
training_data = datasets.FashionMNIST(
    root="mnist",
    train=True,
    download=True,
    transform=ToTensor(),
)
loader = DataLoader(training_data, batch_size=256)

In [47]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        with record_function("IMPORTANT"):
            x = self.dropout2(x)
        x = self.fc2(x)
        return x

In [48]:
net = Net().to(device)
opt = torch.optim.Adam(net.parameters())
loss_fn = nn.CrossEntropyLoss()

In [49]:
def on_trace_ready(prof):
    prof.export_chrome_trace("trace.json")

`skip_first` + `repeat` * [`wait` (no tracing) + `warmup` (tracing, but not recording) + `active` (recording)]

[Perfetto trace visualizer](https://ui.perfetto.dev)

In [50]:
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(
        skip_first=1,
        wait=1,
        warmup=1,
        active=3,
        repeat=1,
    ),
    record_shapes=True,
    on_trace_ready=on_trace_ready,
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        data = data.to(device)
        label = label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        prof.step()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:02<00:00,  2.44it/s]


## Data loading

In [31]:
training_data = datasets.FashionMNIST(
    root="mnist",
    train=True,
    download=True,
    transform=ToTensor(),
)
loader = DataLoader(training_data, batch_size=256, num_workers=4)

In [32]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

In [33]:
net = Net().to(device)
opt = torch.optim.Adam(net.parameters())
loss_fn = nn.CrossEntropyLoss()

In [34]:
def on_trace_ready(prof):
    prof.export_chrome_trace("trace.json")

In [35]:
with profile(
    activities=[ProfilerActivity.CPU,
                ProfilerActivity.CUDA],
    schedule=schedule(
        skip_first=1,
        wait=1,
        warmup=1,
        active=3,
        repeat=1,
    ),
    record_shapes=True,
    on_trace_ready=on_trace_ready,
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        data = data.to(device)
        label = label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        prof.step()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 16.71it/s]


## Less features for linear

In [36]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(2304, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 4)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        with record_function("LESS_FC"):
          x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

In [38]:
net = Net().to(device)
opt = torch.optim.Adam(net.parameters())
loss_fn = nn.CrossEntropyLoss()

In [39]:
def on_trace_ready(prof):
    prof.export_chrome_trace("trace.json")

In [40]:
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(
        skip_first=1,
        wait=1,
        warmup=1,
        active=3,
        repeat=1,
    ),
    record_shapes=True,
    on_trace_ready=on_trace_ready,
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        data = data.to(device)
        label = label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        prof.step()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 16.77it/s]


## less batch size

In [None]:
training_data = datasets.FashionMNIST(
    root="mnist",
    train=True,
    download=True,
    transform=ToTensor(),
)
loader = DataLoader(training_data, batch_size=32, num_workers=4)

In [None]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(2304, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 4)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        return x

In [None]:
net = Net().to(device)
opt = torch.optim.Adam(net.parameters())
loss_fn = nn.CrossEntropyLoss()

In [None]:
def on_trace_ready(prof):
    prof.export_chrome_trace("trace.json")

In [None]:
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(
        skip_first=1,
        wait=1,
        warmup=1,
        active=3,
        repeat=1,
    ),
    record_shapes=True,
    on_trace_ready=on_trace_ready,
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        data = data.to(device)
        label = label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()

        prof.step()

### Let's try to speed up the model

In [51]:
training_data = datasets.FashionMNIST(
    root="mnist",
    train=True,
    download=True,
    transform=ToTensor(),
)
loader = DataLoader(training_data, batch_size=256, num_workers=4)

In [52]:
class Relu(nn.Module):
    def __init__(self, scale=1e-5):
        super().__init__()
        self.scale = scale

    def forward(self, x):
        y = F.relu(x)
        a = torch.sin(x)
        b = torch.cos(x)
        c = a * b
        d = torch.log1p(torch.abs(c) + 1e-6)
        return y + self.scale * d

In [61]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = nn.Conv2d(1, 64, kernel_size=5, stride=1, padding=2)
        self.bn1   = nn.BatchNorm2d(64)

        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.bn2   = nn.BatchNorm2d(128)

        self.conv3 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
        self.bn3   = nn.BatchNorm2d(128)
        
        self.relu = Relu()
        
        self.adapt_pool = nn.AdaptiveAvgPool2d((7, 7)) 
        
        self.dropout = nn.Dropout(0.3)
        
        self.fc1 = nn.Linear(6272, 256)
        self.fc2 = nn.Linear(256, 10)
    
    def forward(self, x):

        x = self.conv1(x)     
        x = self.bn1(x)
        x = self.relu(x)
        
        x = self.conv2(x)   
        x = self.bn2(x)
        x = self.relu(x)
        
        residual = x       
        x = self.conv3(x)  
        x = self.bn3(x)
        x = self.relu(x)
        x = x + residual     
        
        x = self.adapt_pool(x) 
        x = self.dropout(x)
        x = torch.flatten(x, 1)
        
        x = self.fc1(x) 
        x = self.relu(x)
        x = self.fc2(x)    
        return x

In [62]:
net = Net().to(device)
opt = torch.optim.Adam(net.parameters())
loss_fn = nn.CrossEntropyLoss()

In [63]:
def on_trace_ready(prof):
    prof.export_chrome_trace("trace.json")

In [64]:
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(
        skip_first=1,
        wait=1,
        warmup=1,
        active=3,
        repeat=1,
    ),
    record_shapes=True,
    on_trace_ready=on_trace_ready,
) as prof:
    for _, (data, label) in zip(trange(7), loader):
        data = data.to(device)
        label = label.to(device)
        pred = net(data)
        loss = loss_fn(pred, label)
        opt.zero_grad()
        loss.backward()
        opt.step()
        prof.step()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  8.05it/s]


In [65]:
from model import run

In [69]:
run("flash", device="cuda:3", steps=12, batch_size=1024)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:04<00:00,  2.46it/s]


Saved trace: ./trace_flash.json
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::linear         0.37%       4.150ms         5.74%      64.411ms     100.642us       0.000us         0.00%     531.404ms     830.319us           0 b         