In [None]:
import tensorflow
from tensorflow.keras.activations import gelu
!pip install tensorflow_addons
import tensorflow_addons as tfa
from typing import List, Tuple

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (611 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/611.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m604.2/611.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.8/611.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7 (from tensorflow_addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow_addons
Successfully installed tensorflow_addons-0.23.0 typeguard-2.13.3



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
 class MultiHeadedAttention(tensorflow.keras.Model):
    def __init__(self, dimension: int, heads: int = 8):
        super(MultiHeadedAttention, self).__init__()
        self.heads = heads
        self.dimension = dimension
        assert dimension // heads
        self.depth = dimension // heads
        self.wq = tensorflow.keras.layers.Dense(dimension)
        self.wk = tensorflow.keras.layers.Dense(dimension)
        self.wv = tensorflow.keras.layers.Dense(dimension)
        self.dense = tensorflow.keras.layers.Dense(dimension)

    def call(self, inputs):
        output = None
        batch_size = tensorflow.shape(inputs)[0]
        q: tensorflow.Tensor = self.wq(inputs)
        k: tensorflow.Tensor = self.wk(inputs)
        v: tensorflow.Tensor = self.wv(inputs)

        def split_heads(x, batch_size):
            x = tensorflow.reshape(x, (batch_size, -1, self.heads, self.depth))
            return tensorflow.transpose(x, perm=[0,2,1,3])

        q = split_heads(q, batch_size)
        k = split_heads(k, batch_size)
        v = split_heads(v, batch_size)

        def scaled_dot_product_attention(q,k,v):
            matmul_qk = tensorflow.matmul(q, k, transpose_b = True)
            dk = tensorflow.cast(tensorflow.shape(k)[-1], tensorflow.float32)
            scaled_attention_logits = matmul_qk / tensorflow.math.sqrt(dk)

            softmax = tensorflow.nn.softmax(scaled_attention_logits, axis=-1)
            scaled_dot_product_attention_output = tensorflow.matmul(softmax, v)
            return scaled_dot_product_attention_output, softmax

        attention_weights, softmax = scaled_dot_product_attention(q, k, v)
        scaled_attention = tensorflow.transpose(attention_weights, perm=[0,2,1,3])
        concat_attention = tensorflow.reshape(scaled_attention, (batch_size, -1, self.dimension))
        output = self.dense(concat_attention)
        return output

In [None]:
class ResidualBlock(tensorflow.keras.Model):
    def __init__(self, residual_function):
        super(ResidualBlock, self).__init__()
        self.residual_function = residual_function

    def call(self, inputs):
        return self.residual_function(inputs) + inputs


In [None]:
class NormalizationBlock(tensorflow.keras.Model):
    def __init__(self, norm_function, epsilon=1e-5):
        super(NormalizationBlock, self).__init__()
        self.norm_function = norm_function
        self.normalize = tensorflow.keras.layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs):
        return self.norm_function(self.normalize(inputs))


In [None]:
class MLPBlock(tensorflow.keras.Model):
    def __init__(self, output_dimension, hidden_dimension):
        super(MLPBlock, self).__init__()
        self.output_dimension = tensorflow.keras.layers.Dense(output_dimension)
        self.hidden_dimension = tensorflow.keras.layers.Dense(hidden_dimension)
        self.dropout1 = tensorflow.keras.layers.Dropout(0.1)
        self.dropout2 = tensorflow.keras.layers.Dropout(0.1)

    def call(self, inputs):
        output = None
        x = self.hidden_dimension(inputs)
        x = gelu(x)
        x = self.dropout1(x)
        x = self.output_dimension(x)
        x = gelu(x)
        output = self.dropout2(x)
        return output


In [None]:
class TransformerEncoder(tensorflow.keras.layers.Layer):
    def __init__(self, dimension, depth, heads, mlp_dimension):
        super(TransformerEncoder, self).__init__()
        layers_ = []
        layers_.append(tensorflow.keras.Input(shape=((CFG.obj_image_size//CFG.patch_size)*(CFG.obj_image_size//CFG.patch_size)+1,dimension)))
        for i in range(depth):
            layers_.append(NormalizationBlock(ResidualBlock(MultiHeadedAttention(dimension, heads))))
            layers_.append(NormalizationBlock(ResidualBlock(MLPBlock(dimension, mlp_dimension))))

        self.layers_ = tensorflow.keras.Sequential(layers_)

    def call(self, inputs):
        return self.layers_(inputs)


In [None]:
class ImageTransformer(tensorflow.keras.Model):
    def __init__(
            self, image_size, patch_size, n_classes, batch_size,
            dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'invalid patch size for image size'

        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension
        self.batch_size = batch_size

        self.positional_embedding = self.add_weight(
            "position_embeddings", shape=[num_patches + 1, dimension],
            initializer=tensorflow.keras.initializers.RandomNormal(), dtype=tensorflow.float32
        )
        self.classification_token = self.add_weight(
            "classification_token", shape=[1, 1, dimension],
            initializer=tensorflow.keras.initializers.RandomNormal(), dtype=tensorflow.float32
        )
        self.heads = heads
        self.depth = depth
        self.mlp_dimension = dimension
        self.n_classes = n_classes
        self.num_patches = num_patches

        self.patch_projection = tensorflow.keras.layers.Dense(dimension)
        self.normalization2 = tensorflow.keras.layers.LayerNormalization(epsilon=1e-6)
        self.MLP = MLPBlock(self.dimension, self.mlp_dimension)
        self.output_classes = tensorflow.keras.layers.Dense(self.n_classes)
        self.transformer = TransformerEncoder(self.dimension, self.depth, self.heads, self.mlp_dimension)
        self.dropout1 = tensorflow.keras.layers.Dropout(0.5)

    def call(self, inputs):
        output = None
        batch_size = tensorflow.shape(inputs)[0]

        ###############################################
        ############ 가장 중요한 부분 ##################
        ###############################################

        # 이미지를 patch_size로 조각낸다.
        patches = tensorflow.image.extract_patches(
            images = inputs,
            sizes = [1, self.patch_size, self.patch_size, 1],
            strides = [1, self.patch_size, self.patch_size, 1],
            rates = [1,1,1,1],
            padding="VALID",
        )

        patch_dims = patches.shape[-1]
        patches = tensorflow.reshape(patches, [batch_size, patches.shape[1]*patches.shape[2], patch_dims])
        x = self.patch_projection(patches)

        cls_pos = tensorflow.broadcast_to(
            self.classification_token, [batch_size, 1, self.dimension]
        )
        x = tensorflow.concat([cls_pos, x], axis=1)
        x = x + self.positional_embedding
        x = self.transformer(x)
        x = self.normalization2(x)
        x = x[:,0,:]
        x_keep = tensorflow.identity(x)
        x = self.dropout1(x)
        output = self.output_classes(x)
        return output


In [None]:
from tensorflow.keras import datasets
# CIFAR10 데이터 다운로드
(train_images, train_labels), (test_images, test_labels) = datasets.cifar10.load_data()
train_images = train_images / 255.
test_images = test_images / 255.

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [None]:
class CFG:
    num_classes = 10
    input_shape = (32, 32, 3)
    learning_rate = 0.001
    weight_decay = 0.0001
    batch_size = 256
    num_epochs = 100
    image_size = 32
    obj_image_size = 32
    patch_size = 4
    num_patches = (image_size // patch_size) ** 2
    projection_dim = 128  # 임베딩 차원을 CIFAR-10에 맞게 조정
    num_heads = 4  # 어텐션 헤드 수를 CIFAR-10에 맞게 조정
    transformer_layers = 6  # 트랜스포머 레이어 수를 CIFAR-10에 맞게 조정

In [None]:
CFG = CFG()
optimizer = tfa.optimizers.AdamW(learning_rate=CFG.learning_rate, weight_decay=CFG.weight_decay)

model_vit = ImageTransformer(
    CFG.image_size, CFG.patch_size, CFG.num_classes, CFG.batch_size,
    CFG.projection_dim, CFG.transformer_layers, CFG.num_heads, CFG.projection_dim
)
model_vit.compile(
    optimizer=optimizer,
    loss=tensorflow.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tensorflow.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]
)
model_vit.fit(x=train_images, y=train_labels, batch_size=CFG.batch_size, epochs=CFG.num_epochs, validation_data=(test_images, test_labels), shuffle=True)
print('==============Training Finished===============')

accuracy = 0
_, accuracy = model_vit.evaluate(test_images, test_labels)

print('Test Accuracy :', accuracy)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torchvision.datasets as datasets

# 데이터 증강 추가
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = DataLoader(trainset, batch_size=256, shuffle=True, num_workers=2)

testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = DataLoader(testset, batch_size=256, shuffle=False, num_workers=2)

class MLPBlock(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super(MLPBlock, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, input_dim)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.gelu(self.fc1(x))
        x = self.dropout(x)
        x = self.gelu(self.fc2(x))
        x = self.dropout(x)
        return x

class MultiHeadedAttention(nn.Module):
    def __init__(self, dimension: int, heads: int = 8):
        super(MultiHeadedAttention, self).__init__()
        self.heads = heads
        self.dimension = dimension
        self.depth = dimension // heads

        self.wq = nn.Linear(dimension, dimension)
        self.wk = nn.Linear(dimension, dimension)
        self.wv = nn.Linear(dimension, dimension)
        self.dense = nn.Linear(dimension, dimension)

    def forward(self, x):
        batch_size = x.shape[0]

        def split_heads(x):
            x = x.view(batch_size, -1, self.heads, self.depth)
            return x.permute(0, 2, 1, 3)

        q = split_heads(self.wq(x))
        k = split_heads(self.wk(x))
        v = split_heads(self.wv(x))

        scaled_attention, _ = self.scaled_dot_product_attention(q, k, v)
        scaled_attention = scaled_attention.permute(0, 2, 1, 3).contiguous()
        concat_attention = scaled_attention.view(batch_size, -1, self.dimension)
        output = self.dense(concat_attention)
        return output

    def scaled_dot_product_attention(self, q, k, v):
        matmul_qk = torch.matmul(q, k.transpose(-2, -1))
        dk = k.shape[-1]
        scaled_attention_logits = matmul_qk / torch.sqrt(torch.tensor(dk, dtype=torch.float32))
        softmax = nn.Softmax(dim=-1)
        attention_weights = softmax(scaled_attention_logits)
        output = torch.matmul(attention_weights, v)
        return output, attention_weights

class TransformerEncoder(nn.Module):
    def __init__(self, dimension, depth, heads, mlp_dimension, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                nn.LayerNorm(dimension),
                MultiHeadedAttention(dimension, heads),
                nn.LayerNorm(dimension),
                MLPBlock(dimension, mlp_dimension, dropout)
            ]))

    def forward(self, x):
        for norm1, attn, norm2, mlp in self.layers:
            x = attn(norm1(x)) + x
            x = mlp(norm2(x)) + x
        return x

class ImageTransformer(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'invalid patch size for image size'

        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension

        self.positional_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dimension))
        self.classification_token = nn.Parameter(torch.randn(1, 1, dimension))

        self.patch_projection = nn.Linear(patch_size * patch_size * channels, dimension)
        self.transformer = TransformerEncoder(dimension, depth, heads, mlp_dimension)
        self.norm = nn.LayerNorm(dimension)
        self.fc = nn.Linear(dimension, num_classes)
        self.dropout = nn.Dropout(0.5)  # 드롭아웃 비율 증가

    def forward(self, x):
        batch_size = x.shape[0]

        patches = x.unfold(2, self.patch_size, self.patch_size).unfold(3, self.patch_size, self.patch_size)
        patches = patches.contiguous().view(batch_size, -1, self.patch_size * self.patch_size * x.shape[1])
        x = self.patch_projection(patches)

        cls_tokens = self.classification_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.positional_embedding

        x = self.transformer(x)
        x = self.norm(x)
        x = self.dropout(x[:, 0])
        x = self.fc(x)
        return x

# 모델 및 하이퍼파라미터 설정
model = ImageTransformer(
    image_size=32, patch_size=4, num_classes=10, dimension=128, depth=6, heads=4, mlp_dimension=256
).to('cuda')

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.0001)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:03<00:00, 43735952.42it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
import torch
import torch.nn.functional as F
import time

# 학습
num_epochs = 100
for epoch in range(num_epochs):
    start_time = time.time()

    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    # 훈련 단계
    for inputs, labels in trainloader:
        inputs, labels = inputs.to('cuda'), labels.to('cuda')

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_loss = running_loss / len(trainloader)
    train_accuracy = 100 * correct / total

    # 검증 단계
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to('cuda'), labels.to('cuda')

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_loss /= len(testloader)
    val_accuracy = 100 * val_correct / val_total

    epoch_time = time.time() - start_time
    remaining_time = epoch_time * (num_epochs - epoch - 1)
    eta = time.strftime("%H:%M:%S", time.gmtime(remaining_time))

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%, ETA: {eta}")


Epoch 1/100, Loss: 2.0029, Accuracy: 26.27%, Val Loss: 1.7053, Val Accuracy: 38.70%, ETA: 00:20:20
Epoch 2/100, Loss: 1.7255, Accuracy: 37.03%, Val Loss: 1.5671, Val Accuracy: 43.71%, ETA: 00:18:46
Epoch 3/100, Loss: 1.6263, Accuracy: 41.04%, Val Loss: 1.5043, Val Accuracy: 44.97%, ETA: 00:18:29
Epoch 4/100, Loss: 1.5714, Accuracy: 43.07%, Val Loss: 1.4557, Val Accuracy: 47.44%, ETA: 00:18:19
Epoch 5/100, Loss: 1.5279, Accuracy: 44.79%, Val Loss: 1.4151, Val Accuracy: 48.57%, ETA: 00:17:50
Epoch 6/100, Loss: 1.4905, Accuracy: 45.96%, Val Loss: 1.3980, Val Accuracy: 49.32%, ETA: 00:17:46
Epoch 7/100, Loss: 1.4641, Accuracy: 47.02%, Val Loss: 1.3685, Val Accuracy: 51.24%, ETA: 00:17:24
Epoch 8/100, Loss: 1.4362, Accuracy: 48.06%, Val Loss: 1.3417, Val Accuracy: 51.53%, ETA: 00:17:41
Epoch 9/100, Loss: 1.4073, Accuracy: 49.29%, Val Loss: 1.3142, Val Accuracy: 52.35%, ETA: 00:17:21
Epoch 10/100, Loss: 1.3851, Accuracy: 50.23%, Val Loss: 1.3227, Val Accuracy: 52.90%, ETA: 00:16:46
Epoch 11/

In [None]:
# 평가
correct = 0
total = 0
model.eval()
with torch.no_grad():
    for inputs, labels in testloader:
        inputs, labels = inputs.to('cuda'), labels.to('cuda')
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Test Accuracy: {100 * correct / total}%')

Test Accuracy: 77.79%


In [None]:
model_path = 'vit_cifar10.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to vit_cifar10.pth


In [None]:
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   12
  On-line CPU(s) list:    0-11
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.20GHz
    CPU family:           6
    Model:                85
    Thread(s) per core:   2
    Core(s) per socket:   6
    Socket(s):            1
    Stepping:             7
    BogoMIPS:             4400.44
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 cl
                          flush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc re
                          p_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3
                           fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand
                           hypervisor lahf_lm abm 3dnowprefetch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
import torchvision.datasets as datasets
import time

# 데이터 증강 및 정규화 설정
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# CIFAR-10 데이터셋 다운로드 및 로더 설정
full_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

# 전체 데이터를 학습, 검증, 테스트 데이터로 분할
train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
train_dataset, val_dataset, _ = random_split(full_dataset, [train_size, val_size, len(full_dataset) - train_size - val_size])

trainloader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
valloader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2)
testloader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:12<00:00, 13122836.87it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
# 모델 정의
class MLPBlock(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super(MLPBlock, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, input_dim)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.gelu(self.fc1(x))
        x = self.dropout(x)
        x = self.gelu(self.fc2(x))
        x = self.dropout(x)
        return x

class MultiHeadedAttention(nn.Module):
    def __init__(self, dimension: int, heads: int = 8):
        super(MultiHeadedAttention, self).__init__()
        self.heads = heads
        self.dimension = dimension
        self.depth = dimension // heads

        self.wq = nn.Linear(dimension, dimension)
        self.wk = nn.Linear(dimension, dimension)
        self.wv = nn.Linear(dimension, dimension)
        self.dense = nn.Linear(dimension, dimension)

    def forward(self, x):
        batch_size = x.shape[0]

        def split_heads(x):
            x = x.view(batch_size, -1, self.heads, self.depth)
            return x.permute(0, 2, 1, 3)

        q = split_heads(self.wq(x))
        k = split_heads(self.wk(x))
        v = split_heads(self.wv(x))

        scaled_attention, _ = self.scaled_dot_product_attention(q, k, v)
        scaled_attention = scaled_attention.permute(0, 2, 1, 3).contiguous()
        concat_attention = scaled_attention.view(batch_size, -1, self.dimension)
        output = self.dense(concat_attention)
        return output

    def scaled_dot_product_attention(self, q, k, v):
        matmul_qk = torch.matmul(q, k.transpose(-2, -1))
        dk = k.shape[-1]
        scaled_attention_logits = matmul_qk / torch.sqrt(torch.tensor(dk, dtype=torch.float32))
        attention_weights = torch.nn.functional.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)
        return output, attention_weights

class TransformerEncoder(nn.Module):
    def __init__(self, dimension, depth, heads, mlp_dimension, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([
            nn.ModuleList([
                nn.LayerNorm(dimension),
                MultiHeadedAttention(dimension, heads),
                nn.LayerNorm(dimension),
                MLPBlock(dimension, mlp_dimension, dropout)
            ])
            for _ in range(depth)
        ])

    def forward(self, x):
        for norm1, attn, norm2, mlp in self.layers:
            x = attn(norm1(x)) + x
            x = mlp(norm2(x)) + x
        return x

class ImageTransformer(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'Invalid patch size for image size'

        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension

        self.positional_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dimension))
        self.classification_token = nn.Parameter(torch.randn(1, 1, dimension))

        self.patch_projection = nn.Linear(patch_size * patch_size * channels, dimension)
        self.transformer = TransformerEncoder(dimension, depth, heads, mlp_dimension)
        self.norm = nn.LayerNorm(dimension)
        self.fc = nn.Linear(dimension, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        batch_size = x.shape[0]

        patches = x.unfold(2, self.patch_size, self.patch_size).unfold(3, self.patch_size, self.patch_size)
        patches = patches.contiguous().view(batch_size, -1, self.patch_size * self.patch_size * x.shape[1])
        x = self.patch_projection(patches)

        cls_tokens = self.classification_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.positional_embedding

        x = self.transformer(x)
        x = self.norm(x)
        x = self.dropout(x[:, 0])
        x = self.fc(x)
        return x

# 모델 및 하이퍼파라미터 설정
model = ImageTransformer(
    image_size=32, patch_size=4, num_classes=10, dimension=128, depth=6, heads=4, mlp_dimension=256
).to('cuda')

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.0001)

In [None]:
# 학습
num_epochs = 100
for epoch in range(num_epochs):
    start_time = time.time()

    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in trainloader:
        inputs, labels = inputs.to('cuda'), labels.to('cuda')

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_loss = running_loss / len(trainloader)
    train_accuracy = 100 * correct / total

    # 검증 단계
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, labels in valloader:
            inputs, labels = inputs.to('cuda'), labels.to('cuda')

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_loss /= len(valloader)
    val_accuracy = 100 * val_correct / val_total

    epoch_time = time.time() - start_time
    remaining_time = epoch_time * (num_epochs - epoch - 1)
    eta = time.strftime("%H:%M:%S", time.gmtime(remaining_time))

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%, ETA: {eta}")


Epoch 1/100, Loss: 2.0683, Accuracy: 23.63%, Val Loss: 1.8194, Val Accuracy: 34.12%, ETA: 00:16:49
Epoch 2/100, Loss: 1.7775, Accuracy: 35.13%, Val Loss: 1.7016, Val Accuracy: 37.36%, ETA: 00:15:21
Epoch 3/100, Loss: 1.6658, Accuracy: 39.20%, Val Loss: 1.6055, Val Accuracy: 40.84%, ETA: 00:15:11
Epoch 4/100, Loss: 1.6077, Accuracy: 41.76%, Val Loss: 1.5572, Val Accuracy: 43.16%, ETA: 00:15:24
Epoch 5/100, Loss: 1.5612, Accuracy: 43.49%, Val Loss: 1.5265, Val Accuracy: 44.90%, ETA: 00:14:49
Epoch 6/100, Loss: 1.5272, Accuracy: 44.73%, Val Loss: 1.4893, Val Accuracy: 45.70%, ETA: 00:14:31
Epoch 7/100, Loss: 1.4909, Accuracy: 46.18%, Val Loss: 1.4664, Val Accuracy: 46.96%, ETA: 00:14:28
Epoch 8/100, Loss: 1.4750, Accuracy: 46.67%, Val Loss: 1.4453, Val Accuracy: 47.20%, ETA: 00:14:34
Epoch 9/100, Loss: 1.4528, Accuracy: 47.77%, Val Loss: 1.4092, Val Accuracy: 48.62%, ETA: 00:14:30
Epoch 10/100, Loss: 1.4187, Accuracy: 49.10%, Val Loss: 1.3900, Val Accuracy: 50.38%, ETA: 00:14:04
Epoch 11/

In [None]:
# 테스트 평가
correct = 0
total = 0
model.eval()
with torch.no_grad():
    for inputs, labels in testloader:
        inputs, labels = inputs.to('cuda'), labels.to('cuda')
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Test Accuracy: {100 * correct / total:.2f}%')


Test Accuracy: 76.26%


In [None]:
model_path = 'vit_cifar10_val.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to vit_cifar10_val.pth


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
import torchvision.datasets as datasets
import time

# 데이터 증강 및 정규화 설정
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# CIFAR-10 데이터셋 다운로드 및 로더 설정
full_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

# 전체 데이터를 학습, 검증, 테스트 데이터로 분할
train_size = int(0.8 * len(full_dataset))
val_size = int(0.1 * len(full_dataset))
train_dataset, val_dataset, _ = random_split(full_dataset, [train_size, val_size, len(full_dataset) - train_size - val_size])

trainloader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
valloader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2)
testloader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:12<00:00, 13295198.26it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
# 모델 정의
class MLPBlock(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super(MLPBlock, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, input_dim)
        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.gelu(self.fc1(x))
        x = self.dropout(x)
        x = self.gelu(self.fc2(x))
        x = self.dropout(x)
        return x

class MultiHeadedAttention(nn.Module):
    def __init__(self, dimension: int, heads: int = 8):
        super(MultiHeadedAttention, self).__init__()
        self.heads = heads
        self.dimension = dimension
        self.depth = dimension // heads

        self.wq = nn.Linear(dimension, dimension)
        self.wk = nn.Linear(dimension, dimension)
        self.wv = nn.Linear(dimension, dimension)
        self.dense = nn.Linear(dimension, dimension)

    def forward(self, x):
        batch_size = x.shape[0]

        def split_heads(x):
            x = x.view(batch_size, -1, self.heads, self.depth)
            return x.permute(0, 2, 1, 3)

        q = split_heads(self.wq(x))
        k = split_heads(self.wk(x))
        v = split_heads(self.wv(x))

        scaled_attention, _ = self.scaled_dot_product_attention(q, k, v)
        scaled_attention = scaled_attention.permute(0, 2, 1, 3).contiguous()
        concat_attention = scaled_attention.view(batch_size, -1, self.dimension)
        output = self.dense(concat_attention)
        return output

    def scaled_dot_product_attention(self, q, k, v):
        matmul_qk = torch.matmul(q, k.transpose(-2, -1))
        dk = k.shape[-1]
        scaled_attention_logits = matmul_qk / torch.sqrt(torch.tensor(dk, dtype=torch.float32))
        attention_weights = torch.nn.functional.softmax(scaled_attention_logits, dim=-1)
        output = torch.matmul(attention_weights, v)
        return output, attention_weights

class TransformerEncoder(nn.Module):
    def __init__(self, dimension, depth, heads, mlp_dimension, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.layers = nn.ModuleList([
            nn.ModuleList([
                nn.LayerNorm(dimension),
                MultiHeadedAttention(dimension, heads),
                nn.LayerNorm(dimension),
                MLPBlock(dimension, mlp_dimension, dropout)
            ])
            for _ in range(depth)
        ])

    def forward(self, x):
        for norm1, attn, norm2, mlp in self.layers:
            x = attn(norm1(x)) + x
            x = mlp(norm2(x)) + x
        return x

class ImageTransformer(nn.Module):
    def __init__(self, image_size, patch_size, num_classes, dimension, depth, heads, mlp_dimension, channels=3):
        super(ImageTransformer, self).__init__()
        assert image_size % patch_size == 0, 'Invalid patch size for image size'

        num_patches = (image_size // patch_size) ** 2
        self.patch_size = patch_size
        self.dimension = dimension

        self.positional_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dimension))
        self.classification_token = nn.Parameter(torch.randn(1, 1, dimension))

        self.patch_projection = nn.Linear(patch_size * patch_size * channels, dimension)
        self.transformer = TransformerEncoder(dimension, depth, heads, mlp_dimension)
        self.norm = nn.LayerNorm(dimension)
        self.fc = nn.Linear(dimension, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        batch_size = x.shape[0]

        patches = x.unfold(2, self.patch_size, self.patch_size).unfold(3, self.patch_size, self.patch_size)
        patches = patches.contiguous().view(batch_size, -1, self.patch_size * self.patch_size * x.shape[1])
        x = self.patch_projection(patches)

        cls_tokens = self.classification_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.positional_embedding

        x = self.transformer(x)
        x = self.norm(x)
        x = self.dropout(x[:, 0])
        x = self.fc(x)
        return x

# 모델 및 하이퍼파라미터 설정
model = ImageTransformer(
    image_size=32, patch_size=8, num_classes=10, dimension=128, depth=6, heads=4, mlp_dimension=256
).to('cuda')

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.0001)

In [None]:
# 학습
num_epochs = 100
for epoch in range(num_epochs):
    start_time = time.time()

    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in trainloader:
        inputs, labels = inputs.to('cuda'), labels.to('cuda')

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_loss = running_loss / len(trainloader)
    train_accuracy = 100 * correct / total

    # 검증 단계
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, labels in valloader:
            inputs, labels = inputs.to('cuda'), labels.to('cuda')

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    val_loss /= len(valloader)
    val_accuracy = 100 * val_correct / val_total

    epoch_time = time.time() - start_time
    remaining_time = epoch_time * (num_epochs - epoch - 1)
    eta = time.strftime("%H:%M:%S", time.gmtime(remaining_time))

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}, Accuracy: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%, ETA: {eta}")


  self.pid = os.fork()


Epoch 1/100, Loss: 2.0547, Accuracy: 24.16%, Val Loss: 1.8163, Val Accuracy: 32.96%, ETA: 00:16:53
Epoch 2/100, Loss: 1.7607, Accuracy: 35.88%, Val Loss: 1.6505, Val Accuracy: 39.08%, ETA: 00:15:25
Epoch 3/100, Loss: 1.6689, Accuracy: 39.32%, Val Loss: 1.6196, Val Accuracy: 39.20%, ETA: 00:15:37
Epoch 4/100, Loss: 1.6131, Accuracy: 41.33%, Val Loss: 1.5540, Val Accuracy: 42.60%, ETA: 00:15:59
Epoch 5/100, Loss: 1.5714, Accuracy: 43.36%, Val Loss: 1.5519, Val Accuracy: 43.88%, ETA: 00:15:39
Epoch 6/100, Loss: 1.5432, Accuracy: 44.34%, Val Loss: 1.5129, Val Accuracy: 44.20%, ETA: 00:15:11
Epoch 7/100, Loss: 1.5196, Accuracy: 45.21%, Val Loss: 1.4553, Val Accuracy: 46.06%, ETA: 00:15:02
Epoch 8/100, Loss: 1.4926, Accuracy: 46.06%, Val Loss: 1.4706, Val Accuracy: 45.84%, ETA: 00:14:52
Epoch 9/100, Loss: 1.4795, Accuracy: 46.58%, Val Loss: 1.4511, Val Accuracy: 46.76%, ETA: 00:14:04
Epoch 10/100, Loss: 1.4616, Accuracy: 47.54%, Val Loss: 1.4288, Val Accuracy: 48.20%, ETA: 00:14:21
Epoch 11/

In [None]:
# 테스트 평가
correct = 0
total = 0
model.eval()
with torch.no_grad():
    for inputs, labels in testloader:
        inputs, labels = inputs.to('cuda'), labels.to('cuda')
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Test Accuracy: {100 * correct / total:.2f}%')


Test Accuracy: 68.17%


In [None]:
model_path = 'vit_cifar10_val_patch8.pth'
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to vit_cifar10_val_patch8.pth


In [None]:
# 학습 곡선 시각화
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(range(1, num_epochs + 1), train_losses, label='Train Loss')
plt.plot(range(1, num_epochs + 1), val_losses, label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(1, num_epochs + 1), train_accuracies, label='Train Accuracy')
plt.plot(range(1, num_epochs + 1), val_accuracies, label='Val Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.show()

NameError: name 'plt' is not defined