In [1]:
import torch
import torch.nn as nn

import torchvision
from torch.utils import data # Data의 batch size 설정 및 random하게 섞기 등을 해주는 모듈
import torchsummary

from tqdm import tqdm

from vit_pytorch.deepvit import DeepViT



In [4]:
# Build the DeepViT imported from vit_pytorch.deepvit, configured for
# 256x256 inputs cut into 32x32 patches and a 1000-way classification head.
model = DeepViT(
    image_size=256,
    patch_size=32,
    num_classes=1000,
    dim=1024,
    depth=6,
    heads=16,
    mlp_dim=2048,
    dropout=0.1,
    emb_dropout=0.1,
)

# Optional smoke test (left disabled):
# img = torch.randn(1, 3, 256, 256)
# preds = model(img)  # original note says the result has shape (1, 1000)

In [5]:
# Mini-batch size passed to the DataLoaders in the next cell; the same value
# is also used there as the size of the small `train_test` split.
batch_size = 64

In [6]:
# CIFAR-10 train and test sets, converted to tensors (downloaded into ../DataSets/ if missing).
train_data = torchvision.datasets.CIFAR10(
    "../DataSets/", train=True, transform=torchvision.transforms.ToTensor(), download=True
)
test_data = torchvision.datasets.CIFAR10(
    "../DataSets/", train=False, transform=torchvision.transforms.ToTensor(), download=True
)

# Carve the official training set into three parts: 80% for training, one batch
# worth of samples as a tiny `train_test` subset, and the remainder for validation.
dataset_size = len(train_data)
train_size = int(dataset_size * 0.8)
test_size = int(batch_size)
validation_size = dataset_size - train_size - test_size

# A seeded generator makes the split identical on every Restart & Run All.
train_dataset, val_dataset, train_test = data.random_split(
    train_data,
    [train_size, validation_size, test_size],
    generator=torch.Generator().manual_seed(0),
)

# Training batches are shuffled and kept at a uniform size.
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=6, drop_last=True)
# Evaluation loaders keep every sample (no drop_last) in a fixed order (no shuffle)
# so that metrics cover the whole split and are comparable between runs.
val_loader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=6, drop_last=False)
test_loader = data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=6, drop_last=False)
# The train_test subset holds exactly `batch_size` samples, i.e. a single full batch.
train_test_loader = data.DataLoader(train_test, batch_size=batch_size, shuffle=True, num_workers=2, drop_last=True)


Files already downloaded and verified
Files already downloaded and verified


In [None]:
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from tqdm import tqdm

# import torch
# import torch.nn as nn
from einops.layers.torch import Rearrange

class _Patchify(nn.Module):
    """Cut a (batch, channels, height, width) image into flattened square patches.

    Produces the same layout as the einops pattern
    'b c (h p1) (w p2) -> b (h w) (p1 p2 c)' using only torch operations.
    """

    def __init__(self, patch_size):
        super().__init__()
        self.patch_size = patch_size

    def forward(self, x):
        b, c, height, width = x.shape
        p = self.patch_size
        rows, cols = height // p, width // p
        # (b, c, rows, p1, cols, p2) -> (b, rows, cols, p1, p2, c)
        x = x.reshape(b, c, rows, p, cols, p).permute(0, 2, 4, 3, 5, 1)
        return x.reshape(b, rows * cols, p * p * c)


class _EncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention and an MLP, each wrapped in a residual."""

    def __init__(self, embed_dim, num_heads, mlp_ratio, dropout=0.1):
        super().__init__()
        hidden_dim = int(mlp_ratio * embed_dim)
        self.attn_norm = nn.LayerNorm(embed_dim)
        # batch_first=True: tokens arrive as (batch, tokens, embed_dim). Without it the
        # module would treat the batch axis as the sequence and attend across samples.
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, bias=False, batch_first=True)
        self.mlp_norm = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, embed_dim),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        normed = self.attn_norm(x)
        # MultiheadAttention returns (output, weights); only the output is needed.
        attended, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attended
        return x + self.mlp(self.mlp_norm(x))


class DeepViT(nn.Module):
    """Vision-Transformer style image classifier with mean-pooled patch tokens.

    The image is split into non-overlapping patches, each patch is linearly
    embedded, a learned positional embedding is added, and the tokens pass
    through ``depth`` pre-norm encoder blocks built on ``nn.MultiheadAttention``.
    The normalised tokens are averaged (there is no class token) and fed to a
    two-layer MLP head.

    Note: defining this class in the notebook shadows the ``DeepViT`` imported
    from ``vit_pytorch.deepvit`` in the first cell.

    Args:
        image_size: Side length of the (square) input images; must be divisible
            by ``patch_size``.
        patch_size: Side length of each square patch.
        num_classes: Number of output logits.
        embed_dim: Token embedding width.
        depth: Number of encoder blocks.
        num_heads: Attention heads per block.
        mlp_ratio: Hidden width of each block's MLP as a multiple of ``embed_dim``.

    Input images are expected to have 3 channels (the patch projection is sized
    for ``3 * patch_size ** 2`` features).
    """

    def __init__(self, image_size, patch_size, num_classes, embed_dim, depth, num_heads, mlp_ratio=4):
        super().__init__()

        assert (image_size % patch_size) == 0, 'image size must be divisible by patch size'
        num_patches = (image_size // patch_size) ** 2
        patch_dim = 3 * patch_size ** 2

        self.patch_size = patch_size
        # One learned position vector per patch; no extra slot because no class token is used.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches, embed_dim))
        self.patch_embedding = nn.Sequential(
            _Patchify(patch_size),
            nn.Linear(patch_dim, embed_dim),
        )

        self.transformer = nn.ModuleList([
            _EncoderBlock(embed_dim, num_heads, mlp_ratio) for _ in range(depth)
        ])

        self.layer_norm = nn.LayerNorm(embed_dim)
        self.mlp_head = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.GELU(),
            nn.Linear(embed_dim, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for a batch of images."""
        x = self.patch_embedding(x)
        x = x + self.pos_embedding[:, :x.size(1)]
        for block in self.transformer:
            x = block(x)
        x = self.layer_norm(x)
        # Mean-pool over the patch tokens before classification.
        x = x.mean(dim=1)
        return self.mlp_head(x)


# Hyperparameters
image_size = 32
patch_size = 4
num_classes = 10
embed_dim = 384
num_layers = 12
num_heads = 6
mlp_ratio = 4
batch_size = 64
num_epochs = 10
learning_rate = 0.001

# CIFAR-10 datasets (downloaded into ../DataSets/ if missing), converted to tensors
train_dataset = dset.CIFAR10(
    root='../DataSets/',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = dset.CIFAR10(
    root='../DataSets/',
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)

# Model, placed on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DeepViT(
    image_size=image_size,
    patch_size=patch_size,
    num_classes=num_classes,
    embed_dim=embed_dim,
    depth=num_layers,
    num_heads=num_heads,
    mlp_ratio=mlp_ratio,
).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Data loaders: shuffled for training, fixed order for evaluation
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Train the model, evaluating on the test set after every epoch.
for epoch in range(num_epochs):
    # Running sums over samples; converted to per-sample averages after each pass.
    train_loss = 0.0
    train_acc = 0.0
    test_loss = 0.0
    test_acc = 0.0

    # Training pass over the training set.
    model.train()
    for images, labels in tqdm(train_loader, desc='Train'):
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # `criterion` (nn.CrossEntropyLoss with default reduction) returns the batch
        # mean, so weight it by the batch size; dividing the total by the number of
        # samples below then gives the true per-sample average.
        train_loss += loss.item() * labels.size(0)
        train_acc += (outputs.argmax(dim=1) == labels).sum().item()

    train_loss /= len(train_loader.dataset)
    train_acc /= len(train_loader.dataset)

    # Evaluation pass over the test set (no gradients, dropout disabled).
    model.eval()
    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc='Test'):
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            # Same batch-size weighting as in the training pass.
            test_loss += loss.item() * labels.size(0)
            test_acc += (outputs.argmax(dim=1) == labels).sum().item()

    test_loss /= len(test_loader.dataset)
    test_acc /= len(test_loader.dataset)

    print(f'Epoch {epoch + 1}/{num_epochs}: train_loss={train_loss:.4f}, train_acc={train_acc:.4f}, test_loss={test_loss:.4f}, test_acc={test_acc:.4f}')

print('Training finished')