# Семинар 2

## План ноутбука

1. Высокоуровневое API для обучения нейросетей в `PyTorch`
2. Обучение первой нейросети в `PyTorch`

## Высокоуровневое API для обучения нейросетей в `PyTorch`

In [None]:
import torch
import torch.nn as nn

### Создание объекта нейросети

In [None]:
"""
W x + b

in_features -> out_features
"""

net = nn.Sequential(
    nn.Linear(700, 500),
    nn.ReLU(),
    nn.Linear(500, 200),
    nn.ReLU(),
    nn.Linear(200, 10)
)

In [None]:
net

In [None]:
net[0]

In [None]:
x = torch.rand(1, 700)

net(x)

In [None]:
from collections import OrderedDict

net = nn.Sequential(
    OrderedDict(
        [
            ('linear1', nn.Linear(700, 500)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(500, 200)),
            ('relu2', nn.ReLU()),
            ('linear3', nn.Linear(200, 10))
        ]
    )
)

In [None]:
net

In [None]:
net.linear1

In [None]:
net.parameters()

In [None]:
n = next(iter(net.parameters()))
n

In [None]:
list(net.parameters())

In [None]:
type(n)

### Создание нейронки

In [None]:
input_tensor = torch.rand(6, 700)

net(input_tensor).shape

In [None]:
# необходимо отнаследоваться от nn.Module и определить методы __init__ и forward

class CustomTaskNetwork(nn.Module):
    """Fully connected network: input -> hidden -> hidden -> output.

    Both hidden layers are followed by a ReLU; the output layer returns raw
    scores with no activation applied.

    Args:
        in_features: size of the last dimension of the input.
        out_features: size of the last dimension of the output.
        hidden_dim: width of both hidden layers.
    """

    def __init__(self, in_features=700, out_features=10, hidden_dim=500):
        super().__init__()

        self.linear1 = nn.Linear(in_features, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, out_features)

        # ReLU holds no parameters, so a single instance serves every layer
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply linear1 -> ReLU -> linear2 -> ReLU -> linear3 to ``x``."""
        # Every layer is applied exactly once: running linear2 a second time
        # would add a hidden layer that silently shares linear2's weights.
        output = self.activation(self.linear1(x))
        output = self.activation(self.linear2(output))
        output = self.linear3(output)

        return output

In [None]:
net = CustomTaskNetwork()

In [None]:
net(input_tensor)

### Сохранение и восстановление состояния модельки (веса и не только)

In [None]:
torch.save(net.state_dict(), "model.pt")

Восстановим состояние модельки: загрузим сохранённые веса функцией `torch.load` и передадим их в метод `load_state_dict`

In [None]:
weights = torch.load("model.pt")

net.load_state_dict(weights)
net.state_dict()

In [None]:
cuda_device = torch.device("cuda:0")

In [None]:
net.to(cuda_device)

In [None]:
net.linear1.weight.device

In [None]:
net.linear1.weight.device

In [None]:
try:
    print(net(input_tensor).shape)
except RuntimeError as e:
    print(f"Caught Error: {e}")
    print(f"Moving input to the net's device")
    print(net(input_tensor.to(net.linear1.weight.device)).shape)

Перетаскивание модели между девайсами происходит in-place, в отличие от тензоров:

In [None]:
net.cpu()
print(net.linear1.weight.device)
net.cuda()
print(net.linear1.weight.device)

In [None]:
input_tensor.device

In [None]:
input_tensor.cuda()
print(input_tensor.device)
input_tensor.cpu()
print(input_tensor.device)

### Training and evaluation mode

In [None]:
net.train()

In [None]:
net.training

In [None]:
net.eval()

In [None]:
net.training

Если мы хотим проинициализировать сетку какими-то весами из словаря, то важно, чтобы названия и размеры параметров в передаваемом словаре совпадали с названиями и размерами параметров сетки

In [None]:
net.load_state_dict({'linear5.weight': torch.rand((1, 2, 3))})

### Оптимизаторы

In [None]:
# Образно, как работает оптимизатор
# for param in parameters:
#     param = param - param.grad * self.lr

In [None]:
from torch import optim

In [None]:
optim.SGD, optim.Adam

Передача оптимизатору параметров происходит через передачу `net.parameters()`

In [None]:
optimizer = optim.Adam(net.parameters(), betas=(0.9, 0.999), lr=1e-3)
optimizer

У оптимизатора тоже есть состояние, и его тоже можно сохранять:

In [None]:
optimizer.state_dict()

In [None]:
torch.save(optimizer.state_dict(), "optimizer_state.pt")
!ls -la

Можно устанавливать отдельные гиперпараметры оптимизатора для некоторых параметров модели

In [None]:
optimizer = optim.SGD(
    [
        {'params': net.linear1.parameters()},
        {'params': net.linear2.parameters(), 'lr': 1e-3}
    ],
    lr=1e-2,
    momentum=0.9
)

In [None]:
optimizer

### Функции потерь

In [None]:
nn.L1Loss, nn.MSELoss, nn.CrossEntropyLoss, nn.NLLLoss

In [None]:
loss = nn.MSELoss()

In [None]:
loss

In [None]:
x = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5)

output = loss(x, target)

print(output)

output.backward()

In [None]:
x.grad

In [None]:
loss = nn.CrossEntropyLoss()

x = torch.randn(3, 5, requires_grad=True)
y = torch.empty(3, dtype=torch.long).random_(5)

x, y

In [None]:
loss(x, y)

### Датасеты и даталоадеры

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset

In [None]:
n_features = 2
n_objects = 300

torch.manual_seed(0)

In [None]:
w_true = torch.randn(n_features, 1)

X = (torch.rand(n_objects, n_features) - 0.5) * 10
X *= (torch.arange(n_features) * 2 + 1)

Y = X @ w_true
Y += torch.rand_like(Y)

w_true.shape, X.shape, Y.shape

In [None]:
# !pip install -U plotly kaleido

In [None]:
import plotly.express as px

def draw_curve(values, title=""):
    """Show ``values`` as a plotly line chart with the given ``title``.

    Returns whatever ``show()`` of the created figure returns.
    """
    return px.line(y=values, title=title).show()


In [None]:
px.scatter(x=X[:, 0], y=X[:, 1], color=Y.ravel())

In [None]:
w_true

In [None]:
w = torch.rand_like(w_true)

w

Возьмем батч размера 10:

In [None]:
idx = torch.randint(low=0, high=len(X), size=(10,))

print(idx)

X[idx]

In [None]:
X[idx] @ w - Y[idx]

В торче есть удобный интерфейс датасета, который позволяет проводить индексацию и поддерживает трансформации элементов датасета:

In [None]:
dataset = TensorDataset(X, Y)

In [None]:
dataset[7]

In [None]:
X[7], Y[7]

Создадим свой датасет:

In [None]:
# надо отнаследоваться от Dataset и определить методы __init__, __len__ и __getitem__

class CustomDataset(Dataset):
    """Synthetic regression dataset generated once, at construction time.

    ``X`` holds ``n_objects`` rows of ``n_features`` uniform random values
    centred at zero; column ``j`` is additionally scaled by ``2 * j + 1``.
    ``Y`` is ``X @ w_true`` plus noise drawn with ``torch.rand_like``.
    """

    def __init__(self, w_true, n_features, n_objects):
        # Per-column multipliers 1, 3, 5, ...
        column_scale = torch.arange(n_features) * 2 + 1
        centred = (torch.rand(n_objects, n_features) - 0.5) * 10
        self.X = centred * column_scale

        clean_targets = self.X @ w_true
        self.Y = clean_targets + torch.rand_like(clean_targets)

    def __len__(self) -> int:
        """Number of objects, i.e. the size of the first dimension of ``Y``."""
        return self.Y.shape[0]

    def __getitem__(self, item) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(features, target)`` pair selected by ``item``."""
        features = self.X[item]
        target = self.Y[item]
        return features, target

In [None]:
dataset = CustomDataset(w_true, n_features, n_objects)

In [None]:
dataset[7]

In [None]:
dataset.X[7]

Dataloader - "надстройка" над датасетом, позволяет подгружать данные в модель в итеративном формате. Поддерживает поставку данных батчами

In [None]:
from torch.utils.data import DataLoader

In [None]:
loader = DataLoader(dataset, batch_size=16, shuffle=True, drop_last=True)

In [None]:
for x, y in loader:
    print(f"{x=}\t{x.shape=}")
    print(f"{y=}\t{y.shape=}")
    break

## Встроенные датасеты

In [None]:
# !pip install torchvision

In [None]:
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Compose, Lambda

# Training and validation splits of MNIST. Every sample goes through the same
# pipeline: ToTensor() first, then torch.flatten.
dataset = MNIST(
    "./mnist_example",
    train=True,
    transform=Compose([ToTensor(), Lambda(torch.flatten)]),
    download=True,
)
valid_dataset = MNIST(
    "./mnist_example/",
    train=False,
    transform=Compose([ToTensor(), Lambda(torch.flatten)]),
    download=True,
)

In [None]:
dataset[0][0].shape

# Напишем утилиты для обучения 

### Общая структура обучения модели
- модель
- оптимизатор
- датасет $\to$ даталоадер
- лосс-функция
- train loop

In [None]:
# One inline pass over the training data: model, optimizer, loaders and loss
# are created here, then every batch goes through the usual
# zero_grad -> forward -> loss -> backward -> step sequence.

# Use the first CUDA device when one is available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# in_features=784 has to match the flattened sample length
# (presumably 28 * 28 MNIST images — TODO confirm against dataset[0][0].shape)
model = CustomTaskNetwork(in_features=784, out_features=10).to(device)
optimizer = torch.optim.Adam(model.parameters())
train_dataloader = DataLoader(dataset=dataset, batch_size=4096, shuffle=True)
# Created for completeness; this cell does not iterate over it
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=4096, shuffle=False)

loss_fn = nn.CrossEntropyLoss().to(device)

model.train()

for x, y in train_dataloader:
    # Clear the gradients left over from the previous batch
    optimizer.zero_grad()

    # Batches are moved to the model's device right before use
    output = model(x.to(device))

    loss = loss_fn(output, y.to(device))

    loss.backward()

    optimizer.step()

Засунем все в функцию, чтобы вызывать ее каждую эпоху

In [None]:
from tqdm import tqdm

In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Optimizer


def train(model: nn.Module, data_loader: DataLoader, optimizer: Optimizer, loss_fn: nn.Module, device: torch.device):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: network to optimise; it is switched to training mode.
        data_loader: iterable of ``(x, y)`` batches that also supports ``len()``.
        optimizer: optimizer whose ``zero_grad``/``step`` are called once per batch.
        loss_fn: callable mapping ``(output, y)`` to a scalar loss tensor.
        device: device every batch is moved to before the forward pass.

    Returns:
        The sum of the per-batch losses divided by the number of batches,
        as a Python number.

    Raises:
        ValueError: if ``data_loader`` contains no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Happens e.g. with drop_last=True and fewer samples than batch_size;
        # without this guard the final `.item()` would fail on the int 0.
        raise ValueError("data_loader yields no batches; cannot compute a mean loss")

    model.train()

    total_loss = 0

    for x, y in tqdm(data_loader, total=num_batches, desc="[TRAIN]"):
        optimizer.zero_grad()
        x = x.to(device)
        y = y.to(device)

        output = model(x)

        loss = loss_fn(output, y)

        loss.backward()

        # Accumulate the detached tensor; it is converted to a Python number
        # only once, after the loop.
        total_loss += loss.detach()

        optimizer.step()

    return total_loss.item() / num_batches


Создадим функцию для валидации:

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader


@torch.inference_mode()
def evaluate(model: nn.Module, data_loader: DataLoader, loss_fn: nn.Module, device: torch.device):
    """Compute the mean batch loss of ``model`` over ``data_loader`` without training.

    The whole function runs under ``torch.inference_mode()`` and the model is
    switched to evaluation mode.

    Args:
        model: network to evaluate.
        data_loader: iterable of ``(x, y)`` batches that also supports ``len()``.
        loss_fn: callable mapping ``(output, y)`` to a scalar loss tensor.
        device: device every batch is moved to before the forward pass.

    Returns:
        The sum of the per-batch losses divided by the number of batches,
        as a Python number.

    Raises:
        ValueError: if ``data_loader`` contains no batches.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Without this guard the final `.item()` would fail on the int 0
        raise ValueError("data_loader yields no batches; cannot compute a mean loss")

    model.eval()

    total_loss = 0

    for x, y in tqdm(data_loader, desc="[VALIDATION]", total=num_batches):
        output = model(x.to(device))

        loss = loss_fn(output, y.to(device))

        total_loss += loss.detach()

    return total_loss.item() / num_batches


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


sns.set(style='darkgrid')


def plot_stats(
    train_loss: list[float],
    valid_loss: list[float],
    title: str
):
    """Draw the training and validation loss histories on one figure.

    Args:
        train_loss: training loss values, one per plotted point.
        valid_loss: validation loss values, one per plotted point.
        title: prefix of the figure title; ``' loss'`` is appended to it.
    """
    plt.figure(figsize=(16, 8))

    for curve_label, history in (('Train loss', train_loss), ('Valid loss', valid_loss)):
        plt.plot(history, label=curve_label)

    plt.title(title + ' loss')
    plt.xlabel("Epochs")
    plt.ylabel("Loss")

    # The legend is built from the labels passed to plot() above
    plt.legend()

    plt.show()

In [None]:
from IPython.display import clear_output


def fit(model, train_loader, valid_loader, optimizer, loss_fn, num_epochs, title, device):
    """Train ``model`` for ``num_epochs`` epochs, then plot both loss histories.

    Each epoch calls ``train`` on ``train_loader`` followed by ``evaluate`` on
    ``valid_loader``. After the last epoch the cell output is cleared and the
    collected losses are passed to ``plot_stats`` together with ``title``.
    """
    # Index 0 holds a None placeholder, presumably so that the loss after
    # epoch k is drawn at x = k.
    # To record the untrained model's losses there instead, start from:
    # train_loss_history, valid_loss_history = [evaluate(model, train_loader, loss_fn, device)], [evaluate(model, valid_loader, loss_fn, device)]
    train_loss_history = [None]
    valid_loss_history = [None]

    for _ in range(num_epochs):
        train_loss_history.append(train(model, train_loader, optimizer, loss_fn, device))
        valid_loss_history.append(evaluate(model, valid_loader, loss_fn, device))

    clear_output()
    plot_stats(train_loss_history, valid_loss_history, title)

Наша первая функция полноценного обучения готова!

In [None]:
# Full MNIST run: build a fresh model, optimizer, loaders and loss, then hand
# everything to fit() for 10 epochs.
BATCH_SIZE = 2 ** 10

# Use the first CUDA device when one is available, otherwise fall back to CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = CustomTaskNetwork(in_features=784, out_features=10).to(device)
optimizer = torch.optim.Adam(model.parameters())
train_dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True,
                            #   num_workers=8,  # parallel dataloading
                            #   prefetch_factor=4, # number of batches each worker should preload
                              # for more info, see link: https://discuss.pytorch.org/t/when-does-a-pytorch-dataset-worker-pull-new-data/153286/2
                              )
# Validation data is read in a fixed order
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

loss_fn = nn.CrossEntropyLoss().to(device)

# train(), which fit() calls every epoch, switches to training mode as well
model.train()


fit(model=model, train_loader=train_dataloader, valid_loader=valid_dataloader,
    optimizer=optimizer, loss_fn=loss_fn, num_epochs=10, title="MNIST", device=device)

## Обучение первой нейросети в `PyTorch`

In [None]:
class CustomTaskNetwork(nn.Module):
    """Single linear layer mapping ``n_features`` inputs to one output.

    ``n_features`` is not a constructor argument: it is read from the
    enclosing scope when the model is instantiated.
    """

    def __init__(self):
        super().__init__()

        self.linear = nn.Linear(in_features=n_features, out_features=1)

    def forward(self, x):
        """Return the output of the linear layer for ``x``."""
        prediction = self.linear(x)
        return prediction


model = CustomTaskNetwork().to(device=device)

optimizer = optim.Adam(model.parameters(), lr=1e-2)

loss_fn = nn.MSELoss().to(device=device)

In [None]:
from torch.utils.data import random_split


dataset = TensorDataset(X, Y)

train_dataset, valid_dataset = random_split(
    dataset,
    (int(len(dataset) * 0.8), len(dataset) -  int(len(dataset) * 0.8)),
    generator=torch.Generator().manual_seed(300)
)

In [None]:
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=10, shuffle=False)

In [None]:
fit(model, train_loader, valid_loader, optimizer, loss_fn, 20, 'Simple fc', device=device)

### Что будет, если не делать ни разу `zero_grad()`

Обсуждение на форуме - [link](https://discuss.pytorch.org/t/why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch/4903?u=alband)

In [None]:
def calculate_grad_norm(model_parameters):
    """Return the L2 norm of all gradients of ``model_parameters`` taken together.

    Parameters without a gradient, or with ``requires_grad`` disabled, are
    skipped. The result is the square root of the sum of the squared
    per-parameter gradient norms.
    """
    squared_norms = (
        p.grad.detach().norm(2).item() ** 2
        for p in model_parameters
        if p.grad is not None and p.requires_grad
    )
    return sum(squared_norms) ** 0.5

In [None]:
# Train a fresh model for one pass over (X, Y) in batches of 4, recording the
# loss and the gradient norm after every optimizer step.
net = CustomTaskNetwork()
optimizer = optim.Adam(net.parameters(), lr=1e-1)
loss_fn = nn.MSELoss()
dataset = TensorDataset(X, Y)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# One gradient-norm entry per batch
grad_history = []

net.train()
# One loss entry per batch
loss_progress = []

for i, (x, y) in enumerate(loader):
    
    # NOTE(review): the section heading asks what happens when zero_grad() is
    # never called — presumably this line is meant to be commented out to see
    # gradients accumulate across batches; confirm with the author.
    optimizer.zero_grad()

    output = net(x)

    loss = loss_fn(output, y)

    loss.backward()

    optimizer.step()
    loss_progress.append(loss.item())
    # Gradients are still attached to the parameters after step()
    grad_history.append(calculate_grad_norm(net.parameters()))

draw_curve(loss_progress, title="MSE")
draw_curve(grad_history, title="Grad norm")

## BatchNorm и Dropout

In [None]:
dropout = nn.Dropout(p=0.1)

dropout

In [None]:
x = torch.rand(3, 7)

x

In [None]:
x / 0.9

In [None]:
dropout.train()

for _ in range(3):
    print(dropout(x))

In [None]:
dropout.eval()

dropout(x)

In [None]:
batch_norm = nn.BatchNorm1d(num_features=7)

batch_norm

In [None]:
x = torch.rand(3, 7)

x

In [None]:
batch_norm(x)

In [None]:
batch_norm.weight

In [None]:
batch_norm.bias

In [None]:
batch_norm.running_mean

In [None]:
batch_norm.running_var

In [None]:
batch_norm.num_batches_tracked

In [None]:
batch_norm(x)

batch_norm.num_batches_tracked

In [None]:
batch_norm.eval()

batch_norm(x)

In [None]:
x

In [None]:
batch_norm = nn.BatchNorm1d(num_features=7)

batch_norm.eval()

batch_norm(x)

# Бонус
Что делать, если хочется батч больше, но видеокарта не позволяет?


Есть несколько способов, самый простой - увеличить эффективный размер батча (делать аккумуляцию градиента каждые `k` шагов)

In [None]:
# Gradient accumulation: gradients of `grad_accumulation_steps` consecutive
# micro-batches are summed by backward() before a single optimizer step, which
# emulates a batch that is `grad_accumulation_steps` times larger.
grad_accumulation_steps = 1

net = CustomTaskNetwork()
optimizer = optim.Adam(net.parameters(), lr=1e-1)
loss_fn = nn.MSELoss()
dataset = TensorDataset(X, Y)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# One gradient-norm entry per micro-batch (norm of the gradient accumulated so far)
grad_history = []

net.train()
# One (unscaled) loss entry per micro-batch
loss_progress = []


for step, (x, y) in enumerate(loader, 1):
    output = net(x)

    loss = loss_fn(output, y)
    # Scale before backward() so that the accumulated gradient is the mean,
    # not the sum, over the micro-batches of one optimizer step.
    (loss / grad_accumulation_steps).backward()
    grad_history.append(calculate_grad_norm(net.parameters()))

    # Also step on the very last batch: otherwise a trailing group shorter than
    # `grad_accumulation_steps` would never be applied.
    if step % grad_accumulation_steps == 0 or step == len(loader):
        optimizer.step()
        optimizer.zero_grad()

    loss_progress.append(loss.item())

draw_curve(loss_progress, title="MSE")
draw_curve(grad_history, title="Grad norm")