<a href="https://colab.research.google.com/github/vyragosa/Deep-Learning-with-Pytorch/blob/main/Lesson4/Homework4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ViT (Vision Transformer)

## Модель

In [1]:
import torch
from torch import nn

In [2]:
# Simulate a batch of input data.
batch_size = 5   # samples per batch
n_features = 10  # features per sample
n_classes = 3    # target classes

data = torch.randn(batch_size, n_features)
print(data.shape, data, sep='\n')

torch.Size([5, 10])
tensor([[-2.5334, -0.9565, -0.6687, -0.2604, -0.7144,  0.8632, -0.8125, -0.4398,
          0.7745, -0.1242],
        [-0.7363,  0.6791, -0.3438,  0.5119,  0.5442,  0.0076,  0.4123, -0.4552,
         -0.0984, -0.9653],
        [ 0.6041, -0.8996, -0.1496,  0.7702, -0.3284,  3.1291, -0.8884, -0.5011,
         -0.0531,  0.4702],
        [ 0.9984, -1.4029,  0.2687, -0.2648, -0.4529, -0.4318, -0.6493, -0.2855,
          1.6445,  2.0167],
        [-0.1352,  0.2940,  0.1175,  0.2456, -0.8673, -0.4940,  0.8356,  2.2461,
         -1.4129,  1.5931]])


In [3]:
# Define a simple model: one linear layer mapping features to class scores.
model = nn.Linear(in_features=n_features, out_features=n_classes)

In [4]:
# Apply the model to the batch and show the result.
answer = model(data)
print(answer.shape, answer, sep='\n')

torch.Size([5, 3])
tensor([[ 0.9367, -0.2060, -0.0400],
        [ 0.9351,  0.1815,  0.2405],
        [ 0.4786,  0.2207, -1.3682],
        [-1.0627, -0.2106, -0.6474],
        [ 0.0065, -0.5155, -0.4240]], grad_fn=<AddmmBackward0>)


In [5]:
# The same model, written as an nn.Module subclass.
class SimpleNN(nn.Module):
    """Single linear layer mapping ``n_features`` inputs to ``n_classes`` outputs."""

    def __init__(self, n_features, n_classes):
        super().__init__()
        self.lin = nn.Linear(in_features=n_features, out_features=n_classes)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        scores = self.lin(x)
        return scores

In [6]:
# Apply the class-based model to the same data.
model = SimpleNN(n_features=n_features, n_classes=n_classes)

answer = model(data)
print(answer.shape, answer, sep='\n')

torch.Size([5, 3])
tensor([[-0.9749,  0.2739, -0.2506],
        [ 0.1815, -0.0888, -0.0780],
        [-0.7308,  1.3065, -0.7929],
        [ 0.1040,  0.9197, -0.4299],
        [ 0.0714, -1.0181,  1.3724]], grad_fn=<AddmmBackward0>)


In [7]:
!pip install torchsummary
from torchsummary import summary

model = SimpleNN(n_features, n_classes).cuda()

# 5, 10
input_size = (batch_size, n_features)
print(summary(model, input_size))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 5, 3]              33
Total params: 33
Trainable params: 33
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------
None


In [8]:
# The same model expressed as an nn.Sequential container.
model = nn.Sequential(nn.Linear(in_features=n_features, out_features=n_classes))

answer = model(data)
print(answer.shape, answer, sep='\n')

torch.Size([5, 3])
tensor([[ 0.2390, -0.0312,  0.2191],
        [ 0.1500, -0.0242, -0.4466],
        [ 0.9750,  0.0896,  0.6038],
        [-0.1699, -0.6341,  1.0429],
        [-0.2059, -0.2868, -0.4787]], grad_fn=<AddmmBackward0>)


In [9]:
# The same layer stored in an nn.ModuleList.
model = nn.ModuleList([nn.Linear(in_features=n_features, out_features=n_classes)])

# nn.ModuleList is only a container and defines no forward(), so the direct
# call `model(data)` is not used here; the layer is fetched by index instead.
answer = model[0](data)
print(answer.shape, answer, sep='\n')


torch.Size([5, 3])
tensor([[ 1.0925, -0.3466,  0.8294],
        [ 0.2715, -0.3599,  0.6825],
        [-0.2782, -0.0437, -0.1845],
        [ 0.1277,  0.7062, -0.8107],
        [-0.5537,  0.2513,  0.1564]], grad_fn=<AddmmBackward0>)


In [10]:
# Check which layers a module reports through parameters().
class ParametersCheck(nn.Module):
    """Holds the same linear layer in four different ways for comparison."""

    def __init__(self, n_features, n_classes):
        super().__init__()

        def make_layer():
            # Every holder gets its own, independently initialised layer.
            return nn.Linear(n_features, n_classes)

        self.lin = make_layer()
        self.seq = nn.Sequential(make_layer())
        self.module_list = nn.ModuleList([make_layer()])
        # Deliberately a plain Python list: nn.Module does not register layers
        # held this way, so they are absent from parameters().
        self.list_of_layers = [make_layer()]


In [11]:
model = ParametersCheck(n_features, n_classes)

# Print the shape of every parameter the module reports.
for i, param in enumerate(model.parameters()):
    print(f'Параметр #{i + 1}.', f'\t{param.shape}', sep='\n')

Параметр #1.
	torch.Size([3, 10])
Параметр #2.
	torch.Size([3])
Параметр #3.
	torch.Size([3, 10])
Параметр #4.
	torch.Size([3])
Параметр #5.
	torch.Size([3, 10])
Параметр #6.
	torch.Size([3])


## ViT

![alt text](https://drive.google.com/uc?export=view&id=1J5TvycDPs8pzfvlXvtO5MCFBy64yp9Fa)

In [3]:
# Install einops, which the import cell below pulls in (rearrange, reduce, repeat).
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0


In [4]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary

![](https://amaarora.github.io/images/vit-01.png)

## Часть 1. Patch Embedding, CLS Token, Position Encoding

![](https://amaarora.github.io/images/vit-02.png)

In [5]:
# Input image batch laid out as (B, C, H, W).
x = torch.randn(1, 3, 224, 224)
# A 16x16 convolution with stride 16 produces one 768-channel vector per patch.
conv = nn.Conv2d(in_channels=3, out_channels=768, kernel_size=16, stride=16)
# Flatten the 14x14 grid into 196 patches and put the patch axis first.
conv(x).reshape(-1, 196).t().shape

torch.Size([196, 768])

In [6]:
class PatchEmbedding(nn.Module):
    """Split an image into non-overlapping patches and embed each one.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every patch to an ``embed_dim``-dimensional vector.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side ** 2
        self.patch_embeddings = nn.Conv2d(
            in_channels=in_chans,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, image):
        """Map a (B, C, H, W) image batch to (B, N, embed_dim) patch tokens."""
        feature_map = self.patch_embeddings(image)
        # Collapse the spatial grid into one patch axis, then move it before channels.
        tokens = feature_map.flatten(start_dim=2)
        return tokens.transpose(1, 2)

In [7]:
# Smoke-test the patch embedding on one random 224x224 RGB image.
patch_embed = PatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
x = torch.randn(1, 3, 224, 224)
patch_embed(x).shape

torch.Size([1, 196, 768])

![](https://amaarora.github.io/images/vit-03.png)

## Часть 2. Transformer Encoder

![](https://amaarora.github.io/images/ViT.png)

![](https://amaarora.github.io/images/vit-07.png)

In [8]:
class MLP(nn.Module):
    """Two-layer feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.):
        super().__init__()

        # Unspecified (or otherwise falsy) widths fall back to the input width.
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.act = nn.GELU()
        # One dropout module is shared by both dropout positions.
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the feed-forward network to ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

In [9]:
# Smoke-test the MLP on a (1, 197, 768) token sequence.
x = torch.randn(1, 197, 768)
mlp = MLP(in_features=768, hidden_features=3072, out_features=768)
out = mlp(x)
out.shape

torch.Size([1, 197, 768])

In [10]:
class Attention(nn.Module):
    """Multi-head self-attention with a fused query/key/value projection."""

    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., out_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        # Attention scores are multiplied by head_dim ** -0.5 before the softmax.
        self.scale = (dim // num_heads) ** -0.5

        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.out = nn.Linear(dim, dim)
        self.out_drop = nn.Dropout(out_drop)

    def forward(self, x):
        """Apply self-attention to a (B, N, C) tensor and return the same shape."""
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads

        # (B, N, 3*C) -> (3, B, heads, N, head_dim), then split into q, k, v.
        fused = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim)
        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)

        scores = (q @ k.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(scores.softmax(dim=-1))

        # Merge the heads back into (B, N, C) and apply the output projection.
        merged = (weights @ v).transpose(1, 2).reshape(batch, tokens, channels)
        return self.out_drop(self.out(merged))



![](https://amaarora.github.io/images/vit-08.png)

In [20]:
# Scaled dot-product attention, as computed in Attention.forward:
# attn = (q @ k.transpose(-2, -1)) * self.scale
# attn = attn.softmax(dim=-1)

In [11]:
# Smoke-test attention on a (1, 197, 768) token sequence.
x = torch.randn(1, 197, 768)
attention = Attention(dim=768, num_heads=8)
out = attention(x)
out.shape

torch.Size([1, 197, 768])

In [12]:
class Block(nn.Module):
    """Pre-norm Transformer encoder block.

    Applies LayerNorm -> attention and LayerNorm -> MLP, each wrapped in a
    residual connection.

    Args:
        dim: Width of the token embeddings.
        num_heads: Number of attention heads.
        mlp_ratio: MLP hidden width as a multiple of ``dim``.
        drop_rate: Dropout probability applied to the output of each residual
            branch; ``0`` disables it.
    """

    def __init__(self, dim, num_heads=8, mlp_ratio=4, drop_rate=0.):
        super().__init__()

        # Normalization before attention
        self.norm1 = nn.LayerNorm(dim)

        # Attention
        self.attn = Attention(dim,
                              num_heads=num_heads,
                              qkv_bias=False,
                              attn_drop=0.,
                              out_drop=0.)

        # Dropout on the residual branches. This used to construct
        # Transformer(drop_rate), which is the wrong class and raised a
        # TypeError for any drop_rate > 0.
        self.drop_path = nn.Dropout(drop_rate) if drop_rate > 0. else nn.Identity()

        # Normalization before the MLP
        self.norm2 = nn.LayerNorm(dim)

        # MLP
        self.mlp = MLP(in_features=dim, hidden_features=int(dim * mlp_ratio), drop=0.)

    def forward(self, x):
        """Return the block output for ``x``; the input tensor is left untouched."""
        # Out-of-place additions: `x += ...` would mutate the caller's tensor and
        # overwrite the value LayerNorm saved for backward, breaking autograd.

        # Attention
        x = x + self.drop_path(self.attn(self.norm1(x)))

        # MLP
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

In [13]:
# Smoke-test one encoder block. The block itself is called here; calling
# `attention` would only re-test the attention layer.
x = torch.randn(1, 197, 768)
block = Block(768, 8)
out = block(x)
out.shape

torch.Size([1, 197, 768])

В оригинальной реализации теперь используется [DropPath](https://github.com/rwightman/pytorch-image-models/blob/e98c93264cde1657b188f974dc928b9d73303b18/timm/layers/drop.py)

In [14]:
class Transformer(nn.Module):
    """Stack of ``depth`` encoder blocks applied one after another."""

    def __init__(self, depth, dim, num_heads=8, mlp_ratio=4, drop_rate=0.):
        super().__init__()
        layers = []
        for _ in range(depth):
            layers.append(Block(dim, num_heads, mlp_ratio, drop_rate))
        self.blocks = nn.ModuleList(layers)

    def forward(self, x):
        """Pass ``x`` through every block in order and return the result."""
        hidden = x
        for layer in self.blocks:
            hidden = layer(hidden)
        return hidden

In [15]:
# Smoke-test the full 12-block encoder. The encoder itself is called here;
# calling `attention` would only re-test the attention layer.
x = torch.randn(1, 197, 768)
transformer = Transformer(12, 768)
out = transformer(x)
out.shape

torch.Size([1, 197, 768])

![](https://amaarora.github.io/images/vit-06.png)

In [19]:
from torch.nn.modules.normalization import LayerNorm

class ViT(nn.Module):
    """Vision Transformer image classifier.

    Args:
        img_size: Side length of the (square) input images; sizes the position
            embedding, so inputs must produce the matching number of patches.
        patch_size: Side length of each square patch.
        in_chans: Number of input image channels.
        num_classes: Number of output classes; ``0`` (or less) replaces the
            classifier head with an identity.
        embed_dim: Width of the token embeddings.
        depth: Number of encoder blocks.
        num_heads: Number of attention heads per block.
        mlp_ratio: MLP hidden width as a multiple of ``embed_dim``.
        qkv_bias: Accepted for interface compatibility but currently not
            forwarded to the encoder.
        drop_rate: Dropout probability for the embeddings and encoder blocks.
    """
    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000,
                 embed_dim=768, depth=12, num_heads=12, mlp_ratio=4.,
                 qkv_bias=False, drop_rate=0.,):
        super().__init__()

        # Store configuration
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim

        # Patch Embeddings, CLS Token, Position Encoding
        self.patch_embed = PatchEmbedding(img_size=img_size,
                                          patch_size=patch_size,
                                          in_chans=in_chans,
                                          embed_dim=embed_dim)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # One position vector per patch plus one for the CLS token.
        self.pos_embed = nn.Parameter(torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # Transformer Encoder
        self.transformer = Transformer(
            depth=depth,
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            drop_rate=drop_rate,
        )

        # Classifier
        self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        """Return the encoded CLS token, shape (B, embed_dim), for an image batch."""
        B = x.shape[0]
        x = self.patch_embed(x)
        # Prepend one copy of the CLS token to every sequence in the batch.
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.pos_drop(x)
        # Call the module rather than .forward() so registered hooks still run.
        x = self.transformer(x)
        # Only the CLS token (position 0) feeds the classifier.
        return x[:, 0]

    def forward(self, x):
        """Return class logits of shape (B, num_classes) for an image batch."""
        x = self.forward_features(x)
        x = self.head(x)
        return x

In [20]:
x = torch.randn(1, 3, 224, 224)
vit = ViT()
out = vit(x)
out.shape

torch.Size([1, 1000])

# Домашнее задание


1. Выбрать датасет для классификации изображений с размерностью 64x64+ 
2. Обучить ViT на таком датасете.
3. Попробовать поменять размерности и посмотреть, что поменяется при обучении.


Примечание:
- Датасеты можно взять [тут](https://pytorch.org/vision/stable/datasets.html#built-in-datasets) или найти в другом месте.
- Из-за того, что ViT учится медленно, количество примеров в датасете можно ограничить до 1к-5к.

In [21]:
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torchvision.transforms import ToTensor
from tqdm import tqdm

def train(model, loss, optimizer, loader, device='cpu', num_epochs=1):
    """Train ``model`` and return the mean loss of every epoch.

    Args:
        model: Module to train; moved to ``device`` and put in training mode.
        loss: Loss module called as ``loss(output, target)``.
        optimizer: Optimizer built over ``model``'s parameters.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device the model and batches are moved to.
        num_epochs: Number of passes over ``loader``.

    Returns:
        List with one mean batch loss (float) per epoch.
    """
    model.to(device)
    loss.to(device)
    # Enable training-mode behaviour such as dropout.
    model.train()

    loss_history = []

    # No torch.cuda.device(...) context here: it rejects non-CUDA devices such
    # as 'cpu', and the explicit .to(device) calls already place everything.
    for epoch in range(num_epochs):
        epoch_loss = 0.0

        for x, y in tqdm(loader, desc=f'Epoch {epoch+1}'):
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            output = model(x)
            batch_loss = loss(output, y)
            # The loss must stay attached to the graph; detaching it before
            # backward() leaves nothing to differentiate.
            batch_loss.backward()
            optimizer.step()

            # .item() returns a plain float, so no graph is kept alive here.
            epoch_loss += batch_loss.item()

        # Guard against an empty loader.
        epoch_loss /= max(len(loader), 1)
        loss_history.append(epoch_loss)

        print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

    return loss_history

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# ViT's position embedding is sized for one fixed resolution and its head for a
# fixed number of classes. The default ViT() (224x224, 1000 classes) does not
# match CIFAR-10 (32x32, 10 classes), so the images are resized to IMG_SIZE and
# the model is built for that size and class count.
IMG_SIZE = 64
NUM_CLASSES = 10

model = ViT(img_size=IMG_SIZE, num_classes=NUM_CLASSES)
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)

transform = Compose([Resize((IMG_SIZE, IMG_SIZE)), ToTensor()])
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

loss_history = train(model, loss, optimizer, train_loader, device=device, num_epochs=10)

print(f'Final Loss: {loss_history[-1]}')