# Reference
* https://deep-learning-study.tistory.com/807
* https://towardsdatascience.com/implementing-visualttransformer-in-pytorch-184f9f16f632

In [1]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import drive
drive.mount('content/')

Drive already mounted at content/; to attempt to forcibly remount, call drive.mount("content/", force_remount=True).


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

from torch import optim
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import os
import json
from torchvision import utils


from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary
import numpy as np
import pandas as pd
import time
import copy
import random
from tqdm.notebook import tqdm
import math

# Device configuration
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU가 사용 가능합니다.")
else:
    device = torch.device("cpu")
    print("GPU를 사용할 수 없습니다.")

GPU가 사용 가능합니다.


# 데이터 처리

In [4]:
#datapath = '/content/data'

#if not os.path.exists(datapath):
##  os.mkdir(datapath)

# STL10 data set 다운로드
#train_set = datasets.STL10(datapath,split='train', download = True, transform=transforms.ToTensor())
#val_set = datasets.STL10(datapath, split='test', download=True, transform=transforms.ToTensor())

#print(len(train_set))
#print(len(val_set))

trainpath = '/content/content/MyDrive/etc/aihub-meat-image/Training/'
valpath = '/content/content/MyDrive/etc/aihub-meat-image/Validation/'
train_imagepath = os.path.join(trainpath, '[image]cow_seg_')
val_imagepath = os.path.join(valpath,'[image]cow_seg_')



In [5]:
# JSON 파일이 있는 디렉토리 경로
train_labelpath = []
train_labelpath.append(os.path.join(trainpath, '[label]cow_seg_1'))
train_labelpath.append(os.path.join(trainpath, '[label]cow_seg_2'))
train_labelpath.append(os.path.join(trainpath, '[label]cow_seg_3'))

val_labelpath = []
val_labelpath.append(os.path.join(valpath, '[label]cow_seg_1'))
val_labelpath.append(os.path.join(valpath, '[label]cow_seg_2'))
val_labelpath.append(os.path.join(valpath, '[label]cow_seg_3'))

# 라벨 정보를 저장할 딕셔너리
jsons = []

# JSON 디렉토리 내의 모든 파일에 대해 라벨 정보 추출
for path in train_labelpath:
  for filename in os.listdir(path):
    if filename.endswith(".json"):
      json_path = os.path.join(path, filename)

      # JSON 파일 로드
      with open(json_path) as f:
        json_data = json.load(f)

      # 라벨 정보 추출
      jsons.append(json_data)

train_labels = []
for d in jsons:
  label = [d["label_info"]["image"]["file_name"],
           d["label_info"]["shapes"][0]["label"],
           d["label_info"]["shapes"][0]["grade"],
           d["label_info"]["shapes"][0]["gender"],
          ]
  train_labels.append(label)

# 라벨 정보를 저장할 딕셔너리
jsons = []

# JSON 디렉토리 내의 모든 파일에 대해 라벨 정보 추출
for path in val_labelpath:
  for filename in os.listdir(path):
    if filename.endswith(".json"):
      json_path = os.path.join(path, filename)

      # JSON 파일 로드
      with open(json_path) as f:
        json_data = json.load(f)

      # 라벨 정보 추출
      jsons.append(json_data)

val_labels = []
for d in jsons:
  label = [d["label_info"]["image"]["file_name"],
           d["label_info"]["shapes"][0]["label"],
           d["label_info"]["shapes"][0]["grade"],
           d["label_info"]["shapes"][0]["gender"],
          ]
  val_labels.append(label)

In [6]:
train_label_set = pd.DataFrame(data=train_labels, columns=['file_name','label','grade','gender'])
val_label_set = pd.DataFrame(data=val_labels, columns=['file_name','label','grade','gender'])
print(train_label_set)
print(val_label_set)

def grade_encoding(x):
  if x == '1':
    return 0
  elif x == '2':
    return 1
  elif x== '3':
    return 2
  return 0

one_hot_labels = torch.eye(3)[[0,1,2]]

print(one_hot_labels)

train_label_set['grade_encode'] = train_label_set['grade'].apply(grade_encoding)
val_label_set['grade_encode'] = val_label_set['grade'].apply(grade_encoding)

print(train_label_set)
print(val_label_set)

                             file_name     label grade  gender
0     QC_cow_segmentation_1_003463.jpg    hanwoo     1  female
1     QC_cow_segmentation_1_002225.jpg    hanwoo     1  female
2     QC_cow_segmentation_1_001673.jpg    hanwoo     1  female
3     QC_cow_segmentation_1_001809.jpg    hanwoo     1  female
4     QC_cow_segmentation_1_001262.jpg    hanwoo     1   steer
...                                ...       ...   ...     ...
2995  QC_cow_segmentation_3_011561.jpg  holstein     3   steer
2996  QC_cow_segmentation_3_000960.jpg    hanwoo     3  female
2997  QC_cow_segmentation_3_013209.jpg  holstein     3   steer
2998  QC_cow_segmentation_3_015332.jpg  holstein     3   steer
2999  QC_cow_segmentation_3_002319.jpg    hanwoo     3  female

[3000 rows x 4 columns]
                             file_name     label grade  gender
0     QC_cow_segmentation_1_069605.jpg    hanwoo     1   steer
1     QC_cow_segmentation_1_069588.jpg    hanwoo     1   steer
2     QC_cow_segmentation_1_06

In [7]:

class CustomImageDataset(Dataset):
  def __init__(self, labels, img_dir, transform=None, target_transform=None):
    self.img_dir = img_dir
    self.transform = transform
    self.target_transform = target_transform
    self.img_labels = labels
    #self.train = train
    #self.train_len = int(len(labels) * 0.8)
    #if train == True:
    #  self.img_labels = labels[:self.train_len]
    #else:
    #  self.img_labels = labels[self.train_len:]

  def __len__(self):
    return len(self.img_labels)

  def __getitem__(self, idx):
    img_path = self.img_dir + str(self.img_labels.iloc[idx]['grade'])
    img_path = os.path.join(img_path, self.img_labels.iloc[idx, 0])
    image = Image.open(img_path)
    label = self.img_labels.iloc[idx]['grade_encode']
    if self.transform:
        image = self.transform(image)
    if self.target_transform:
        label = self.target_transform(label)
    return image, label

In [8]:

# 이미지 변형 틀 정의
# tensor 형식 + 224로 resize
transformation = transforms.Compose([
    transforms.Resize([224,224]),
    transforms.ToTensor(),
    ])

# data set에 적용
#train_set.transform = transformation
#val_set.transform = transformation

# data loader 정의
#train_dl = DataLoader(train_set, batch_size=32, shuffle=True)
#val_dl = DataLoader(val_set, batch_size=64, shuffle=True)


In [9]:
train_set = CustomImageDataset(train_label_set, train_imagepath, transform=transformation)
val_set = CustomImageDataset(val_label_set, val_imagepath, transform=transformation)

In [10]:
train_set[0][0].shape

torch.Size([3, 224, 224])

In [11]:
print(train_set[0])
print(val_set[0])

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]]), 0)
(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0.

In [12]:
train_dl = DataLoader(train_set, batch_size=16, shuffle=True)
val_dl = DataLoader(val_set,batch_size=8, shuffle = True)

# ViT 구현

patch embedding

In [13]:
# patch embedding 구현
# 2D 이미지를 일정 크기의 patch로 나눈다음에 이 patch들을 flat 시킴
class PatchEmbedding(nn.Module):
  def __init__(self, in_channels=3, patch_size=16,emb_size=768, img_size=224):
    super().__init__()
    self.patch_size = patch_size

    self.projection = nn.Sequential(
      Rearrange('b c (h s1) (w s2) -> b (h w) (s1 s2 c)', s1=patch_size, s2=patch_size),
      nn.Linear(patch_size * patch_size * in_channels, emb_size)
    )

    # cls token, position embedding 정의
    self.cls_token = nn.Parameter(torch.randn(1,1,emb_size))
    self.positions = nn.Parameter(torch.randn((img_size//patch_size)**2 + 1, emb_size))

  def forward(self,x):
    b = x.shape[0]
    x = self.projection(x)
    cls_tokens = repeat(self.cls_token, '() n e -> b n e',b=b)
    x = torch.cat([cls_tokens, x], dim = 1)
    x += self.positions
    return x

In [14]:
# patch embedding test
x = torch.randn(16, 3, 224, 224)
patch_embedding = PatchEmbedding()
patch_output = patch_embedding(x)
print('[batch, 1+num of patches, emb_size] = ', patch_output.shape)

[batch, 1+num of patches, emb_size] =  torch.Size([16, 197, 768])


multi head attention

In [15]:
# Multi-Head Attention

class MultiHeadAttention(nn.Module):
  def __init__(self, emb_size=768, num_heads=8, dropout=0):
    super().__init__()
    self.emb_size = emb_size
    self.num_heads = num_heads
    self.keys = nn.Linear(emb_size, emb_size)
    self.queries = nn.Linear(emb_size, emb_size)
    self.values = nn.Linear(emb_size, emb_size)
    self.att_drop = nn.Dropout(dropout)
    self.projection = nn.Linear(emb_size, emb_size)

  def forward(self, x, mask=None):
    # key , query, value로 나누기
    # b, 197, 728 -> b, 8, 197, 91
    queries = rearrange(self.queries(x), 'b n (h d) -> b h n d', h=self.num_heads)
    keys = rearrange(self.keys(x), 'b n (h d) -> b h n d', h=self.num_heads)
    values = rearrange(self.values(x), 'b n (h d) -> b h n d', h=self.num_heads)

    # matrix multiplication between queries and keys
    energy = torch.einsum('bhqd, bhkd -> bhqk', queries, keys) # batch, num_head, query_len, key_len

    if mask is not None:
      fill_value = torch.finfo(torch.float32).min
      energy.mask_fill(-mask,fill_value)

    scaling = self.emb_size ** (1/2)
    att = F.softmax(energy, dim=-1) / scaling
    att = self.att_drop(att)

    out = torch.einsum('bhal, bhlv -> bhav', att, values)
    out = rearrange(out, 'b h n d -> b n (h d)')
    out = self.projection(out)
    return out

In [16]:
# MultiHeadAttention test
MHA = MultiHeadAttention()
MHA_output = MHA(patch_output)
print(MHA_output.shape)

torch.Size([16, 197, 768])


residual

In [17]:
# Residual

class ResidualAdd(nn.Module):
  def __init__(self, fn):
    super().__init__()
    self.fn = fn

  def forward(self, x, **kwargs):
    res = x
    x = self.fn(x, **kwargs)
    x += res
    return x

Feed Forward Block

In [18]:
#Feed Foward Block
class FeedForwardBlock(nn.Sequential):
  def __init__(self, emb_size, expansion=4, drop_p=0):
    super().__init__(
        nn.Linear(emb_size,expansion*emb_size),
        nn.GELU(),
        nn.Dropout(drop_p),
        nn.Linear(expansion * emb_size, emb_size),
    )


In [19]:
# test
x = torch.randn(16,1,128)
model = FeedForwardBlock(128)
output = model(x)
print(output.shape)

torch.Size([16, 1, 128])


Encoder

In [20]:
class TransformerEncoderBlock(nn.Sequential):
  def __init__(self, emb_size=768, drop_p=0., forward_expansion=4, forward_drop_p=0., **kwargs):
    super().__init__(
        ResidualAdd(nn.Sequential(
            nn.LayerNorm(emb_size),
            MultiHeadAttention(emb_size, **kwargs),
            nn.Dropout(drop_p)
        )),
        ResidualAdd(nn.Sequential(
            nn.LayerNorm(emb_size),
            FeedForwardBlock(emb_size, **kwargs),
            nn.Dropout(drop_p)
        ))
    )

In [21]:
# TransformerEncoderBlock test
model = TransformerEncoderBlock()
output = model(patch_output)
print(output.shape)

torch.Size([16, 197, 768])


Transformer

In [22]:
class TransformerEncoder(nn.Sequential):
  def __init__(self, depth=12, **kwargs):
    super().__init__(*[TransformerEncoderBlock(**kwargs) for _ in range(depth)])

In [23]:
#TransformerEncoder test
model = TransformerEncoder()
output = model(patch_output)
print(output.shape)

torch.Size([16, 197, 768])


Classification Head

In [24]:
#Classification Head
class ClassificationHead(nn.Sequential):
  def __init__(self, emb_size=768, n_classes=10):
    super().__init__(
        Reduce('b n e -> b e', reduction='mean'),
        nn.LayerNorm(emb_size),
        nn.Linear(emb_size, n_classes)
    )

In [25]:
# Classification Head test
x = torch.randn(16, 1, 768)
model = ClassificationHead()
output = model(x)
print(output.shape)
print(output)

torch.Size([16, 10])
tensor([[-0.1756, -0.3962, -0.7319,  0.9893,  0.5073, -0.8378, -0.0500, -0.1542,
         -0.1830,  0.1590],
        [-0.3236, -0.1310,  0.0840, -0.0656, -0.5988, -0.1810,  0.5672,  0.1301,
          0.9058, -0.3975],
        [-0.2422,  0.5345, -1.0108,  0.5492, -0.5773,  0.6671, -0.0065, -0.1982,
         -0.9336, -0.2522],
        [ 0.0274, -0.9949,  0.6455,  0.5396,  0.2553,  0.3228, -0.4100,  0.3233,
          0.1157, -0.1160],
        [ 0.6084,  0.8529,  0.1225,  0.6400,  1.0559,  0.1767, -0.1677, -1.0577,
         -0.3706,  0.5752],
        [ 0.7302, -0.0735,  0.1970, -0.6718, -0.2570,  0.9854, -0.3303,  0.4793,
         -0.1289,  0.6237],
        [-0.4581,  0.4324,  0.9388, -0.2118, -0.0528,  0.7740,  0.2034, -0.6981,
         -0.3094, -0.6294],
        [-0.7056,  0.3142, -0.6580,  0.8068,  0.3903, -0.0375,  0.6319, -0.3740,
         -0.3815,  0.1366],
        [ 0.5644, -0.2214,  0.2263, -0.0865,  0.2352, -0.5273, -0.9620,  0.1701,
         -0.1206,  0.7289]

ViT

In [26]:
# ViT
class ViT(nn.Sequential):
  def __init__(self, in_channels=3, patch_size=16,emb_size=768,img_size=224,depth=12,n_classes=10, **kwargs):
    super().__init__(
        PatchEmbedding(in_channels, patch_size, emb_size, img_size),
        TransformerEncoder(depth, emb_size=emb_size, **kwargs),
        ClassificationHead(emb_size, n_classes)
    )


In [27]:
# ViT test
hyperparameter = {
    'patch_size': 16,
    'emb_size': 768,
    'img_size':224,
    'depth':12,
    'n_classes':1,
}
x = torch.randn(16,3,224,224).to(device)
model = ViT(n_classes=3).to(device)
output = model(x)
print(output.shape)
print(output)

torch.Size([16, 3])
tensor([[ 1.2018, -0.1067,  0.0368],
        [ 1.3140,  0.0887, -0.2325],
        [ 1.1362, -0.2226, -0.0786],
        [ 1.1904, -0.1666, -0.3068],
        [ 1.2641, -0.0335, -0.1170],
        [ 1.2406, -0.2646, -0.0852],
        [ 1.3557, -0.2209, -0.1937],
        [ 1.1473, -0.1054, -0.0439],
        [ 1.0723, -0.2278,  0.1508],
        [ 1.2304, -0.0989, -0.0146],
        [ 1.3120, -0.1242, -0.1207],
        [ 1.2713, -0.1714, -0.1558],
        [ 1.2936, -0.0825, -0.0918],
        [ 1.2397, -0.1886, -0.2195],
        [ 0.9669, -0.0925,  0.1166],
        [ 1.2676, -0.2016, -0.2853]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


In [28]:
summary(ViT(), (3, 224, 224),device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Rearrange-1             [-1, 196, 768]               0
            Linear-2             [-1, 196, 768]         590,592
    PatchEmbedding-3             [-1, 197, 768]               0
         LayerNorm-4             [-1, 197, 768]           1,536
            Linear-5             [-1, 197, 768]         590,592
            Linear-6             [-1, 197, 768]         590,592
            Linear-7             [-1, 197, 768]         590,592
           Dropout-8          [-1, 8, 197, 197]               0
            Linear-9             [-1, 197, 768]         590,592
MultiHeadAttention-10             [-1, 197, 768]               0
          Dropout-11             [-1, 197, 768]               0
      ResidualAdd-12             [-1, 197, 768]               0
        LayerNorm-13             [-1, 197, 768]           1,536
           Linear-14            [-1, 1

Training

In [29]:
loss_func = nn.CrossEntropyLoss(reduction='sum')
opt = optim.Adam(model.parameters(), lr=0.01)

from torch.optim.lr_scheduler import ReduceLROnPlateau
lr_scheduler = ReduceLROnPlateau(opt, mode='min', factor=0.1, patience=10)

In [30]:
# get current lr
def get_lr(opt):
    for param_group in opt.param_groups:
        return param_group['lr']

In [35]:
# calculate the metric per mini-batch
def metric_batch(output, target):
    pred = output.argmax(1, keepdim=True)
    corrects = pred.eq(target.view_as(pred)).sum().item()
    return corrects

# calculate the loss per mini-batch
def loss_batch(loss_func, output, target, opt=None):
    loss_b = loss_func(output, target)
    metric_b = metric_batch(output, target)

    if opt is not None:
        opt.zero_grad()
        loss_b.backward()
        opt.step()

    return loss_b.item(), metric_b

# calculate the loss per epochs
def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    running_loss = 0.0
    running_metric = 0.0
    len_data = len(dataset_dl.dataset)

    for xb, yb in dataset_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        output = model(xb)
        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)

        running_loss += loss_b

        if metric_b is not None:
            running_metric += metric_b

        if sanity_check is True:
            break

    loss = running_loss / len_data
    metric = running_metric / len_data
    return loss, metric

In [36]:
# function to start training
def train_val(model, params):
    num_epochs=params['num_epochs']
    loss_func=params['loss_func']
    opt=params['optimizer']
    train_dl=params['train_dl']
    val_dl=params['val_dl']
    sanity_check=params['sanity_check']
    lr_scheduler=params['lr_scheduler']
    path2weights=params['path2weights']

    loss_history = {'train': [], 'val': []}
    metric_history = {'train': [], 'val': []}

    best_loss = float('inf')
    best_model_wts = copy.deepcopy(model.state_dict())
    start_time = time.time()

    for epoch in range(num_epochs):
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current lr= {}'.format(epoch, num_epochs-1, current_lr))

        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        loss_history['train'].append(train_loss)
        metric_history['train'].append(train_metric)

        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)
        loss_history['val'].append(val_loss)
        metric_history['val'].append(val_metric)

        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), path2weights)
            print('Copied best model weights!')

        lr_scheduler.step(val_loss)
        if current_lr != get_lr(opt):
            print('Loading best model weights!')
            model.load_state_dict(best_model_wts)

        print('train loss: %.6f, val loss: %.6f, accuracy: %.2f, time: %.4f min' %(train_loss, val_loss, 100*val_metric, (time.time()-start_time)/60))
        print('-'*10)

    model.load_state_dict(best_model_wts)
    return model, loss_history, metric_history

In [39]:
# define the training parameters
params_train = {
    'num_epochs':20,
    'optimizer':opt,
    'loss_func':loss_func,
    'train_dl':train_dl,
    'val_dl':val_dl,
    'sanity_check':False,
    'lr_scheduler':lr_scheduler,
    'path2weights':'./models/weights.pt',
}

# check the directory to save weights.pt
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print('Error')
createFolder('./models')

In [None]:
# Start training
model, loss_hist, metric_hist = train_val(model, params_train)

Epoch 0/19, current lr= 0.01
Copied best model weights!
train loss: 1.101988, val loss: 1.098820, accuracy: 33.33, time: 4.6195 min
----------
Epoch 1/19, current lr= 0.01
Copied best model weights!
train loss: 1.101845, val loss: 1.098670, accuracy: 33.33, time: 9.2295 min
----------
Epoch 2/19, current lr= 0.01
train loss: 1.101305, val loss: 1.099607, accuracy: 33.33, time: 13.8234 min
----------
Epoch 3/19, current lr= 0.01
train loss: 1.102555, val loss: 1.098858, accuracy: 33.33, time: 18.4267 min
----------
Epoch 4/19, current lr= 0.01
train loss: 1.100264, val loss: 1.099193, accuracy: 33.33, time: 23.0078 min
----------
Epoch 5/19, current lr= 0.01
train loss: 1.101719, val loss: 1.099201, accuracy: 33.33, time: 27.6068 min
----------
Epoch 6/19, current lr= 0.01
train loss: 1.101269, val loss: 1.101156, accuracy: 33.33, time: 32.2022 min
----------
Epoch 7/19, current lr= 0.01
train loss: 1.101554, val loss: 1.102144, accuracy: 33.33, time: 36.7947 min
----------
Epoch 8/19, 

KeyboardInterrupt: ignored

In [None]:
num_epochs = params_train['num_epochs']

# Plot train-val loss
plt.title('Train-Val Loss')
plt.plot(range(1, num_epochs+1), loss_hist['train'], label='train')
plt.plot(range(1, num_epochs+1), loss_hist['val'], label='val')
plt.ylabel('Loss')
plt.xlabel('Training Epochs')
plt.legend()
plt.show()

# plot train-val accuracy
plt.title('Train-Val Accuracy')
plt.plot(range(1, num_epochs+1), metric_hist['train'], label='train')
plt.plot(range(1, num_epochs+1), metric_hist['val'], label='val')
plt.ylabel('Accuracy')
plt.xlabel('Training Epochs')
plt.legend()
plt.show()