In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from pprint import pprint
# from konlpy.tag import Mecab
from nltk.tokenize import word_tokenize as en_tokenizer
import sentencepiece as spm
import urllib.request
import csv
import numpy as np
from einops import rearrange, reduce, repeat
from torch.cuda import amp
from tqdm import tqdm
import wandb
import time
import copy
from collections import defaultdict
from sklearn.metrics import mean_squared_error
import joblib
import gc
import os
from icecream import ic   #디버깅을 해주는 library로 파이썬에서 출력됐을 때 어떤 코드에서 출력값이 나온 것인지 알려주는 라이브러리
from sklearn.model_selection import train_test_split
import os



In [2]:
VOCAB_SIZE = 10000
SEQ_LEN = 60


PAD_IDX = 0
BOS_IDX = 2
EOS_IDX = 3



# ENV = 'COLAB'
#ENV = 'KAGGLE'
ENV = 'SYSTEM'

# Option for Mixed Precision
FP16 = True
# FP16 = False

N = 2
HIDDEN_DIM = 256
NUM_HEAD = 8 
INNER_DIM = 512
BATCH_SIZE = 64
LEARNING_RATE = 1e-4
WEIGHT_DECAY = 0


CONFIG = {
    'VOCAB_SIZE': VOCAB_SIZE,
    'SEQ_LEN': SEQ_LEN,
    'N': N,
    'HIDDEN_DIM': HIDDEN_DIM,
    'NUM_HEAD': NUM_HEAD,
    'INNER_DIM': INNER_DIM,
    'BATCH_SIZE': BATCH_SIZE,
    'WEIGHT_DECAY' : WEIGHT_DECAY,
    'LEARNING_RATE' : LEARNING_RATE,
}


if 'device' not in globals():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device}')

Using cpu


### 데이터 수집

In [3]:
DATASET_PATH = './data'
en_train = open(os.path.join(DATASET_PATH, 'bible-all.en.txt'))
en_train_content = en_train.read()
en_train_list = en_train_content.split('\n')

ko_train = open(os.path.join(DATASET_PATH, 'bible-all.kr.txt'))
ko_train_content = ko_train.read()
ko_train_list = ko_train_content.split('\n')
en_train_list[:10]

['Genesis1.1  In the beginning God created the heavens and the earth.',
 'Genesis1.2  Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters.',
 'Genesis1.3  And God said, "Let there be light," and there was light.',
 'Genesis1.4  God saw that the light was good, and He separated the light from the darkness.',
 'Genesis1.5  God called the light "day," and the darkness he called "night." And there was evening, and there was morning--the first day.',
 'Genesis1.6  And God said, "Let there be an expanse between the waters to separate water from water."',
 'Genesis1.7  So God made the expanse and separated the water under the expanse from the water above it. And it was so.',
 'Genesis1.8  God called the expanse "sky." And there was evening, and there was morning--the second day.',
 'Genesis1.9  And God said, "Let the water under the sky be gathered to one place, and let dry ground appear." And it was so.',
 'Genes

In [4]:
data = pd.DataFrame()
data['en_raw'] = en_train_list
data['ko_raw'] = ko_train_list
data = data.reset_index(drop = True)
print(len(data))
data.head()

31104


Unnamed: 0,en_raw,ko_raw
0,Genesis1.1 In the beginning God created the h...,Genesis1.1 태초에 하나님이 천지를 창조하셨다.
1,Genesis1.2 Now the earth was formless and emp...,"Genesis1.2 땅이 혼돈하고 공허하며, 어둠이 깊음 위에 있고, 하나님의 영..."
2,"Genesis1.3 And God said, ""Let there be light,...","Genesis1.3 하나님이 말씀하시기를 ""빛이 생겨라"" 하시니, 빛이 생겼다."
3,"Genesis1.4 God saw that the light was good, a...","Genesis1.4 그 빛이 하나님 보시기에 좋았다. 하나님이 빛과 어둠을 나누셔서,"
4,"Genesis1.5 God called the light ""day,"" and th...","Genesis1.5 빛을 낮이라고 하시고, 어둠을 밤이라고 하셨다. 저녁이 되고 ..."


In [5]:
data['en'] = data['en_raw'].apply(lambda x: x.split(' ')[1:])
data['en'] = data['en'].apply(lambda x: (' ').join(x))
data['ko'] = data['ko_raw'].apply(lambda x: x.split(' ')[1:])
data['ko'] = data['ko'].apply(lambda x: (' ').join(x))

In [6]:
data = data[['en','ko']]
data.head()

Unnamed: 0,en,ko
0,In the beginning God created the heavens and ...,태초에 하나님이 천지를 창조하셨다.
1,"Now the earth was formless and empty, darknes...","땅이 혼돈하고 공허하며, 어둠이 깊음 위에 있고, 하나님의 영은 물 위에 움직이고..."
2,"And God said, ""Let there be light,"" and there...","하나님이 말씀하시기를 ""빛이 생겨라"" 하시니, 빛이 생겼다."
3,"God saw that the light was good, and He separ...","그 빛이 하나님 보시기에 좋았다. 하나님이 빛과 어둠을 나누셔서,"
4,"God called the light ""day,"" and the darkness ...","빛을 낮이라고 하시고, 어둠을 밤이라고 하셨다. 저녁이 되고 아침이 되니, 하루가..."


### 단어 사전 만들기

In [7]:
with open('src.txt', mode = 'w', encoding='utf8') as f:
    f.write('\n'.join(data['en']))
with open('trg.txt', mode= 'w', encoding='utf8') as f:
    f.write('\n'.join(data['ko']))

In [8]:
%%capture context
corpus = "src.txt"
prefix = "src"
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={VOCAB_SIZE}" +
    " --model_type=bpe" +
    " --max_sentence_length=999999" +  # 문장 최대 길이
    " --pad_id=0 --pad_piece=[PAD]" +  # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" +  # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" +  # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" +  # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]");  # 사용자 정의 토큰

In [9]:
%%capture context
corpus = "trg.txt"
prefix = "trg"
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={VOCAB_SIZE}" +
    " --model_type=bpe" +
    " --max_sentence_length=999999" +  # 문장 최대 길이
    " --pad_id=0 --pad_piece=[PAD]" +  # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" +  # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" +  # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" +  # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]");  # 사용자 정의 토큰

- 정수 인코딩

In [10]:
sp_src = spm.SentencePieceProcessor()
sp_src.Load('src.model')


for idx in range(3):
    sentence = data['en'][idx]
    print(sp_src.EncodeAsPieces(sentence))
    print(sp_src.EncodeAsIds(sentence))

['▁In', '▁the', '▁beginning', '▁God', '▁created', '▁the', '▁heavens', '▁and', '▁the', '▁earth', '.']
[502, 10, 2155, 133, 3212, 10, 1354, 19, 10, 458, 9961]
['▁Now', '▁the', '▁earth', '▁was', '▁form', 'less', '▁and', '▁empty', ',', '▁darkness', '▁was', '▁over', '▁the', '▁surface', '▁of', '▁the', '▁deep', ',', '▁and', '▁the', '▁Spirit', '▁of', '▁God', '▁was', '▁ho', 'vering', '▁over', '▁the', '▁waters', '.']
[589, 10, 458, 127, 3464, 636, 19, 3330, 9958, 1451, 127, 268, 10, 6810, 21, 10, 1685, 9958, 19, 10, 837, 21, 133, 127, 386, 8187, 268, 10, 1411, 9961]
['▁And', '▁God', '▁said', ',', '▁"', 'Let', '▁there', '▁be', '▁light', ',"', '▁and', '▁there', '▁was', '▁light', '.']
[288, 133, 150, 9958, 65, 1612, 250, 52, 897, 393, 19, 250, 127, 897, 9961]


In [11]:
def en_encode(tmpstr:str) -> np.array :
    tmpstr = np.array(sp_src.EncodeAsIds(tmpstr))

    # SEQ_LEN보다 길면 짜른다 
    if len(tmpstr) > SEQ_LEN :
        tmpstr = tmpstr[:SEQ_LEN]

    # SEQ_LEN보다 작으면 padding
    else :
        tmpstr = np.pad(tmpstr, (0, SEQ_LEN - len(tmpstr)), 'constant', constant_values = sp_src.pad_id())
    
    return tmpstr

In [12]:
# src_data는 data['en']를 참조한다. (동일 id)
src_data = data['en']

src_list = []

for idx in range(len(src_data)):
    src_list.append(en_encode(src_data[idx]))

src_list[0]

array([ 502,   10, 2155,  133, 3212,   10, 1354,   19,   10,  458, 9961,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

In [13]:
sp_trg = spm.SentencePieceProcessor()
sp_trg.Load('trg.model')


for idx in range(3):
    sentence = data['ko'][idx]
    print(sp_trg.EncodeAsPieces(sentence))
    print(sp_trg.EncodeAsIds(sentence))

['▁태', '초', '에', '▁하나님이', '▁천', '지를', '▁창조', '하셨다', '.']
[561, 9349, 8964, 213, 369, 513, 2208, 883, 8962]
['▁땅이', '▁혼', '돈', '하고', '▁공', '허', '하며', ',', '▁어둠이', '▁깊', '음', '▁위에', '▁있고', ',', '▁하나님의', '▁영', '은', '▁물', '▁위에', '▁움직', '이고', '▁계셨다', '.']
[1226, 1567, 9398, 106, 440, 9291, 455, 8961, 4716, 1114, 9043, 394, 696, 8961, 194, 153, 8978, 119, 394, 5214, 411, 4486, 8962]
['▁하나님이', '▁말씀하시기를', '▁"', '빛이', '▁생겨', '라', '"', '▁하시니', ',', '▁빛이', '▁생', '겼다', '.']
[213, 2045, 32, 7888, 5865, 8983, 8995, 2921, 8961, 3057, 171, 1450, 8962]


In [14]:
def ko_encode(tmpstr: str) -> np.array:
    tmpstr = np.array(sp_trg.EncodeAsIds(tmpstr))
    tmpstr = np.insert(tmpstr, 0, sp_trg.bos_id())

    if len(tmpstr) >= SEQ_LEN:
        # SEQ_LEN -1의 길이로 자른다
        tmpstr = tmpstr[:SEQ_LEN-1]
        # 마지막에 <eos> 토큰을 넣어줌으로써, 길이를 SEQ_LEN으로 맞춘다
        tmpstr = np.pad(tmpstr, (0, 1),
                        'constant', constant_values=sp_trg.eos_id())


    else:
        tmpstr = np.pad(tmpstr, (0, 1),
                        'constant', constant_values=sp_trg.eos_id())
        tmpstr = np.pad(tmpstr, (0, SEQ_LEN - len(tmpstr)),
                        'constant', constant_values=sp_trg.pad_id())

    return tmpstr

In [15]:
# trg_data는 data['ko']를 참조한다. (동일 id)
trg_data = data['ko']

trg_list = []

for idx in range(len(trg_data)):
    trg_list.append(ko_encode(trg_data[idx]))   

trg_list[0]

array([   2,  561, 9349, 8964,  213,  369,  513, 2208,  883, 8962,    3,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

In [16]:
src_train, src_valid, trg_train, trg_valid = train_test_split(src_list, trg_list, test_size=0.2, random_state=42)

In [17]:
class TrainDataset(Dataset):
    def __init__(self, src_data, trg_data):
        super().__init__()

        assert len(src_data) == len(trg_data)

        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)
        
    def __getitem__ (self, idx):
        src = self.src_data[idx]
        trg_input = self.trg_data[idx]
        trg_output = trg_input[1:SEQ_LEN]
        trg_output = np.pad(trg_output, (0,1), 'constant', constant_values =0)
        # (seq_len,)
        return torch.Tensor(src).long(), torch.Tensor(trg_input).long(), torch.Tensor(trg_output).long()

train_dataset = TrainDataset(src_train, trg_train)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle= True, pin_memory=True)

In [18]:
class ValidDataset(Dataset):
    def __init__(self, src_data, trg_data):
        super().__init__()

        assert len(src_data) == len(trg_data)

        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)
        
    def __getitem__ (self, idx):
        src = self.src_data[idx]
        trg_input = self.trg_data[idx]
        trg_output = trg_input[1:SEQ_LEN]
        trg_output = np.pad(trg_output, (0,1), 'constant',constant_values= 0)

        return torch.Tensor(src).long(), torch.Tensor(trg_input).long(), torch.Tensor(trg_output).long()

valid_dataset = ValidDataset(src_valid, trg_valid)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle= False, pin_memory=True)

### Mask Fuction
- padding option : 'PAD_IDX'로 지정된 패딩 토큰을 0으로 바꾸어 패딩 작업을 완료합니다.

- lookahead option : 
   - repeat 함수를 통해 padding된 행렬을 seq_len x seq_len 차원으로 확장시킵니다.
   - ones_like와 tril 메소드를 통해 해당 차원 내에서 하향 삼각행렬을 만듭니다.
   - 이후, 하향 삼각행렬(mask)와 repeate로 만들어진 확장된 패딩 행렬(padding_mask)을 내적하여 최종 행렬을 출력합니다.
       
       => 이는 미래 정보를 예측에 활용되지 않게 하기 위한 transformer의 masking 기법을 적용한 최종 산출물 입니다. 

In [19]:
'''
Mask 행렬을 반환하는 Mask Function
Masking은 QK_T 중 srcK 의 seq_len을 중심으로 한다는 점을 알아두자!!

Input
- Tensor
    shape (bs, srcK seq_len)

Args
- Option
    If option is 'padding', function returns padding mask
    If option is 'lookahead', function returns lookahead mask

Output
- Tensor (option = 'padding' )
    shape (bs, 1, 1, srcK seq_len)


* shape 중 (1, 1) 부분은 broad casting을 위한 것이다.
'''

def makeMask(tensor, option: str) -> torch.Tensor:
    '''
    tensor (bs, seq_len)
    '''
    if option == 'padding':
        tmp = torch.full_like(tensor, fill_value=PAD_IDX).to(device)
        # tmp : (bs,seq_len)
        mask = (tensor != tmp).float()
        # mask : (bs, seq_len)
        mask = rearrange(mask, 'bs seq_len -> bs 1 1 seq_len ')

        # mask(bs, 1, seq_len,seq_len)

        '''
        Example of mask
        tensor([[
         [1., 1., 1., 1., 0., 0., 0., 0.]]])
        '''

    elif option == 'lookahead':
        # srcQ의 seq_len과 srcK의 seq_len이 동일하다고 가정한다
        # tensor : (bs, seq_len)

        padding_mask = makeMask(tensor, 'padding')
        padding_mask = repeat(
            padding_mask, 'bs 1 1 k_len -> bs 1 new k_len', new=padding_mask.shape[3])
        # padding_mask : (bs, 1, seq_len, seq_len)

        '''
        Example of padding_mask
        tensor([[
         [1., 1., 1., 1., 0., 0., 0., 0.]
         [1., 1., 1., 1., 0., 0., 0., 0.]
         [1., 1., 1., 1., 0., 0., 0., 0.]
         [1., 1., 1., 1., 0., 0., 0., 0.]
         [1., 1., 1., 1., 0., 0., 0., 0.]
         [1., 1., 1., 1., 0., 0., 0., 0.]
         [1., 1., 1., 1., 0., 0., 0., 0.]
         [1., 1., 1., 1., 0., 0., 0., 0.]]])
        '''
        mask = torch.ones_like(padding_mask)
        mask = torch.tril(mask)
        '''
        Example of 'mask'
        tensor([[
        [1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]]])
        '''

        mask = mask * padding_mask
        # ic(mask.shape)

        '''
        Example
        tensor([[
         [1., 0., 0., 0., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0., 0., 0., 0.],
         [1., 1., 1., 0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 0., 0., 0., 0.],
         [1., 1., 1., 1., 0., 0., 0., 0.],
         [1., 1., 1., 1., 0., 0., 0., 0.],
         [1., 1., 1., 1., 0., 0., 0., 0.],
         [1., 1., 1., 1., 0., 0., 0., 0.]]])
        '''

    return mask

In [20]:
test = torch.Tensor([[1,2,3,4,5,6,0,0,0,0]])
ic(test.shape)
test1 = makeMask(test, option = 'padding')
test2 = makeMask(test, option = 'lookahead')
ic(test1.shape)
print(" padding  옵션에 대한 출력 \n =>", test1)
print()
ic(test2.shape)
print("lookahead 옵션에 대한 출력 \n =>", test2)

ic| test.shape: torch.Size([1, 10])
ic| test1.shape: torch.Size([1, 1, 1, 10])
ic| test2.shape:

 padding  옵션에 대한 출력 
 => tensor([[[[1., 1., 1., 1., 1., 1., 0., 0., 0., 0.]]]])



 torch.Size([1, 1, 10, 10])


lookahead 옵션에 대한 출력 
 => tensor([[[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.],
          [1., 1., 1., 1., 1., 1., 0., 0., 0., 0.]]]])


### Multihead Attention

- initialize
    - Linear 선형 변환을 진행하여 더 좋은 품질의 데이터로 변환합니다.(해당 코드에서는 차원은 변하지 않습니다.)
        
        => 이는 학습을 통해 계속 update 되어 좋은 성능을 낼 수 있습니다.

    - Dropout을 통해 과적합을 방지합니다.

- forward
    1. rearrange를 통해 multi-head를 가진 학습 데이터로 변형시킵니다.
        - rearrange 함수는 문자열을 통해 차원을 변환시킵니다. (코드에서 '괄호'의 역할은 곱셈입니다.)
        
    2. Masking 후 Query와 key의 행렬을 내적하여 Energy 행렬을 만듭니다.

    3. 이후 softmax 함수를 통해 Energy를 확률로 변환시킨 행렬을 생성합니다.
    
    4. 이를 통해 만들어진 행렬과 Value 행렬을 내적하여 최종 예측된 행렬을 생성합니다.
    
    5. 마지막으로 다시 rearrange 함수를 통해 Concat을 진행합니다.
    
    6. 최종적으로 다시 fc layer를 통해 최종 산출물을 만들어냅니다.


In [21]:

class Multiheadattention(nn.Module):
    def __init__(self, hidden_dim: int, num_head: int):
        super().__init__()

        # embedding_dim, d_model, 512 in paper
        self.hidden_dim = hidden_dim
        # 8 in paper
        self.num_head = num_head
        # head_dim, d_key, d_query, d_value, 64 in paper (= 512 / 8)
        self.head_dim = hidden_dim // num_head
        self.scale = torch.sqrt(torch.FloatTensor()).to(device)

        self.fcQ = nn.Linear(hidden_dim, hidden_dim)
        self.fcK = nn.Linear(hidden_dim, hidden_dim)
        self.fcV = nn.Linear(hidden_dim, hidden_dim)
        self.fcOut = nn.Linear(hidden_dim, hidden_dim)

        self.dropout = nn.Dropout(0.1)


    def forward(self, srcQ, srcK, srcV, mask=None):

        ##### SCALED DOT PRODUCT ATTENTION ######

        # input : (bs, seq_len, hidden_dim)
        Q = self.fcQ(srcQ)
        K = self.fcK(srcK)
        V = self.fcV(srcV)
        
        #rearrange : 문자열을 통해 tensor의 배열을 바꿔주는 함수이다.(띄어쓰기를 통해 구별 / 괄호는 병합과 분리를 뜻한다.)
        Q = rearrange(
            Q, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)
        K_T = rearrange(
            K, 'bs seq_len (num_head head_dim) -> bs num_head head_dim seq_len', num_head=self.num_head)
        V = rearrange(
            V, 'bs seq_len (num_head head_dim) -> bs num_head seq_len head_dim', num_head=self.num_head)
        
        attention_energy = torch.matmul(Q, K_T)
        # attention_energy : (bs, num_head, q_len, k_len)

        if mask is not None :
            '''
            mask.shape
            if padding : (bs, 1, 1, k_len)
            if lookahead : (bs, 1, q_len, k_len)
            '''
            attention_energy = torch.masked_fill(attention_energy, (mask == 0), -1e+4)
            
        attention_energy = torch.softmax(attention_energy, dim = -1)

        result = torch.matmul(self.dropout(attention_energy),V)
        # result (bs, num_head, seq_len, head_dim)

        ##### END OF SCALED DOT PRODUCT ATTENTION ######

        # CONCAT
        result = rearrange(result, 'bs num_head seq_len head_dim -> bs seq_len (num_head head_dim)')
        # result : (bs, seq_len, hidden_dim)

        # LINEAR

        result = self.fcOut(result)

        return result

In [22]:
### Linear 변환 이해 코드
# Linear는 "마지막 차원"에 대해 선형 변환을 적용시킨다.
import torch
import torch.nn as nn

# 예제 입력 텐서
batch_size = 2
seq_len = 3
random_dim = 5
hidden_dim = 4

# (batch_size, seq_len, hidden_dim) 형태의 입력 텐서
input_tensor = torch.randn(batch_size, seq_len, random_dim)

# nn.Linear 레이어 정의
linear_layer = nn.Linear(random_dim, hidden_dim)  # in_features와 out_features가 같음

# 선형 변환 적용
output_tensor = linear_layer(input_tensor)

print(f"Input shape: {input_tensor.shape}")
print(f"Output shape: {output_tensor.shape}")

Input shape: torch.Size([2, 3, 5])
Output shape: torch.Size([2, 3, 4])


In [23]:
# TEST CODE #
bs = 1
seq_len = 10
hidden_dim = 60

src = torch.randint(1, 10000, (bs, seq_len)).to(device)
ic(src.shape)
# src = torch.Tensor([[1,2,3,4,5,6,0,0,0,0]])

padding_mask = makeMask(src, option = 'padding')
ic(padding_mask.shape)
lookahead_mask = makeMask(src, option = 'lookahead')
ic(lookahead_mask.shape)

test_Q = torch.randn((bs, seq_len, hidden_dim)).to(device)
test_K = torch.randn((bs, seq_len, hidden_dim)).to(device)
test_V = torch.randn((bs, seq_len, hidden_dim)).to(device)

ic(test_Q.shape)
ic(test_K.shape)
ic(test_V.shape)
test_layer = Multiheadattention(hidden_dim=hidden_dim, num_head =2)
ic(test_layer(srcQ = test_Q, srcK = test_K, srcV = test_V, mask = padding_mask).shape)

ic| src.shape: torch.Size([1, 10])
ic| padding_mask.shape: torch.Size([1, 1, 1, 10])
ic| lookahead_mask.shape: torch.Size([1, 1, 10, 10])
ic| test_Q.shape: torch.Size([1, 10, 60])
ic| test_K.shape: torch.Size([1, 10, 60])
ic| test_V.shape: torch.Size([1, 10, 60])
ic| test_layer(srcQ = test_Q, srcK = test_K, srcV = test_V, mask = padding_mask).shape: torch.Size([1, 10, 60])


torch.Size([1, 10, 60])

### Positionwise Feedforward Network

In [24]:
class FFN(nn.Module):
    def __init__ (self, hidden_dim, inner_dim):
        super().__init__()

        # 512 in paper 
        self.hidden_dim = hidden_dim
        # 2048 in paper
        self.inner_dim = inner_dim 

        self.fc1 = nn.Linear(hidden_dim, inner_dim)
        self.fc2 = nn.Linear(inner_dim, hidden_dim)
        self.relu = nn.ReLU(inplace=False)
        self.dropout = nn.Dropout(0.1)


        
    def forward(self, input):
        output = input
        output = self.fc1(output)
        output2 = self.relu(output)
        output2 = self.dropout(output)
        output3 = self.fc2(output2)

        return output3

### Transformer 
<img src="transformer.png" alt="transformer architecture" width="800" height = "500">

##### Encoder Architecture
- Encode layer class를 만들어 위에 있는 하나의 Encoder layer를 만듭니다.
- 이후, Encdoer Architecture 코드를 작성합니다.
    - 여기서는 positional encoding 대신 positional embedding을 사용하였습니다.
        
        => 이를 통해 embedding 과정(masking 포함)을 진행하고 dropout까지 적용하였습니다.
    - N개의 layer를 for문을 통해 실행합니다.

In [25]:
class EncoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim
        
        self.multiheadattention = Multiheadattention(hidden_dim, num_head)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)


        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)


    def forward(self, input, mask = None):

        # input : (bs, seq_len, hidden_dim)
        
        # encoder attention
        # uses only padding mask
        output = self.multiheadattention(srcQ= input, srcK = input, srcV = input, mask = mask)
        output = self.dropout1(output)
        output = input + output
        output = self.layerNorm1(output)

        output_ = self.ffn(output)
        output_ = self.dropout2(output_)
        output = output + output_
        output = self.layerNorm2(output)

        # output : (bs, seq_len, hidden_dim)
        return output

In [26]:
class Encoder(nn.Module):
    def __init__ (self, N, hidden_dim, num_head, inner_dim,max_length=100):
        super().__init__()

        # N : number of encoder layer repeated 
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)
        self.enc_layers = nn.ModuleList([EncoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

        self.dropout = nn.Dropout(p=0.1)



    def forward(self, input):
        
        batch_size = input.shape[0]
        seq_len = input.shape[1]
        # input : (bs, seq_len)

        mask = makeMask(input, option='padding')

        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(device)
        #0 부터 seq_len-1 까지의 숫자들을 (1 x seq_len)인 행렬로 만들고 이를 repeat를 통해 batch_size x seq_len 행렬로 바꿉니다.
        # pos: [batch_size, src_len]

        # embedding layer
        output = self.dropout(self.embedding(input) + self.pos_embedding(pos))
        # output : (bs, seq_len, hidden_dim)


        # Positional Embedding
        # output = pos_embed(output)

        # Dropout
        output = self.dropout(output)

        # N encoder layer
        for layer in self.enc_layers:
            output = layer(output, mask)

        # output : (bs, seq_len, hidden_dim)

        return output

##### Decoder Architecutre
- lookahead padding을 통해 미래 정보를 예측에 적용할 수 없게 합니다.
- finalFc를 통해 각 행렬을 단어의 사전으로 선형변환을 시켜줍니다.
- 이후 softmax 함수를 통해 각 토큰에서의 단어 확률을 만들고
- argmax를 통해 각 토큰에서 확률이 가장 큰 단어를 추출합니다. 

In [27]:
class DecoderLayer(nn.Module):
    def __init__(self, hidden_dim, num_head, inner_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.multiheadattention1 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.multiheadattention2 = Multiheadattention(hidden_dim, num_head)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)
        self.ffn = FFN(hidden_dim, inner_dim)
        self.layerNorm3 = nn.LayerNorm(hidden_dim)

        self.dropout1 = nn.Dropout(p=0.1)
        self.dropout2 = nn.Dropout(p=0.1)
        self.dropout3 = nn.Dropout(p=0.1)

    
    def forward(self, input, enc_output, paddingMask, lookaheadMask):
        # input : (bs, seq_len, hidden_dim)
        # enc_output : (bs, seq_len, hidden_dim)

        # first multiheadattention
        output = self.multiheadattention1(input, input, input, lookaheadMask)
        output = self.dropout1(output)
        output = output + input
        output = self.layerNorm1(output)


        # second multiheadattention
        output_ = self.multiheadattention2(output, enc_output, enc_output, paddingMask)
        output_ = self.dropout2(output_)
        output = output_ + output
        output = self.layerNorm2(output)



        # Feedforward Network
        output_ = self.ffn(output)
        output_ = self.dropout3(output_)
        output = output + output_
        output = self.layerNorm3(output)



        return output

In [28]:
class Decoder(nn.Module):
    def __init__ (self, N, hidden_dim, num_head, inner_dim, max_length=100):
        super().__init__()

        # N : number of encoder layer repeated 
        self.N = N
        self.hidden_dim = hidden_dim
        self.num_head = num_head
        self.inner_dim = inner_dim

        self.embedding = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=hidden_dim, padding_idx=0)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)

        self.dec_layers = nn.ModuleList([DecoderLayer(hidden_dim, num_head, inner_dim) for _ in range(N)])

        self.dropout = nn.Dropout(p=0.1)
        
        self.finalFc = nn.Linear(hidden_dim, VOCAB_SIZE)


    def forward(self, input, enc_src, enc_output):

        # input = dec_src : (bs, seq_len)
        # enc_src : (bs, seq_len)
        # enc_output : (bs, seq_len,hidden_dim)
        
        lookaheadMask = makeMask(input, option= 'lookahead')
        paddingMask = makeMask(enc_src, option = 'padding')

        # embedding layer
        output = self.embedding(input)
        # output = (bs, seq_len, hidden_dim)


        # Positional Embedding
        # output = pos_embed(output)

        # Dropout
        output = self.dropout(output)

        # N decoder layer
        for layer in self.dec_layers:
            output = layer(output, enc_output, paddingMask, lookaheadMask)
        # output : (bs, seq_len, hidden_dim)

        logits = self.finalFc(output)
        # logits : (bs, seq_len, VOCAB_SIZE)
        output = torch.softmax(logits, dim = -1)

        output = torch.argmax(output, dim = -1)
        # output : (bs, seq_len), dtype=int64



        return logits, output

### Transformer Model

In [29]:
class Transformer(nn.Module):
    def __init__(self, N = 2, hidden_dim = 256, num_head = 8, inner_dim = 512):
        super().__init__()
        self.encoder = Encoder(N, hidden_dim, num_head, inner_dim)
        self.decoder = Decoder(N, hidden_dim, num_head, inner_dim)

    def forward(self, enc_src, dec_src):
        # enc_src : (bs, seq_len)
        # dec_src : (bs, seq_len)

        # print(f'enc_src : {enc_src.shape}')
        # print(f'dec_src : {dec_src.shape}')

        enc_output = self.encoder(enc_src)
        # enc_output : (bs, seq_len, hidden_dim)
        logits, output = self.decoder(dec_src, enc_src, enc_output)
        # logits = (bs, seq_len, VOCAB_SIZE) 

        return logits, output

## 실제 훈련 과정

### Create Model

In [30]:
model = Transformer(N, HIDDEN_DIM, NUM_HEAD, INNER_DIM).to(device)
ic.disable() #ic 출력을 비활성화 하는 함수입니다.

### Check Model Structure

In [31]:
from torchsummary import summary
test1 = torch.randint(low = 0, high = 1000, size = (SEQ_LEN,))
test2 = torch.randint(low = 0, high = 1000, size = (SEQ_LEN,))
summary(model, [(SEQ_LEN,), (SEQ_LEN,)], dtypes = [torch.int, torch.int])

Layer (type:depth-idx)                        Output Shape              Param #
├─Encoder: 1-1                                [-1, 60, 256]             --
|    └─Embedding: 2-1                         [-1, 60, 256]             2,560,000
|    └─Embedding: 2-2                         [-1, 60, 256]             25,600
|    └─Dropout: 2-3                           [-1, 60, 256]             --
|    └─Dropout: 2-4                           [-1, 60, 256]             --
|    └─ModuleList: 2                          []                        --
|    |    └─EncoderLayer: 3-1                 [-1, 60, 256]             527,104
|    |    └─EncoderLayer: 3-2                 [-1, 60, 256]             527,104
├─Decoder: 1-2                                [-1, 60, 10000]           --
|    └─Embedding: 2-5                         [-1, 60, 256]             2,560,000
|    └─Dropout: 2-6                           [-1, 60, 256]             --
|    └─ModuleList: 2                          []                   

Layer (type:depth-idx)                        Output Shape              Param #
├─Encoder: 1-1                                [-1, 60, 256]             --
|    └─Embedding: 2-1                         [-1, 60, 256]             2,560,000
|    └─Embedding: 2-2                         [-1, 60, 256]             25,600
|    └─Dropout: 2-3                           [-1, 60, 256]             --
|    └─Dropout: 2-4                           [-1, 60, 256]             --
|    └─ModuleList: 2                          []                        --
|    |    └─EncoderLayer: 3-1                 [-1, 60, 256]             527,104
|    |    └─EncoderLayer: 3-2                 [-1, 60, 256]             527,104
├─Decoder: 1-2                                [-1, 60, 10000]           --
|    └─Embedding: 2-5                         [-1, 60, 256]             2,560,000
|    └─Dropout: 2-6                           [-1, 60, 256]             --
|    └─ModuleList: 2                          []                   

### Weight init

In [32]:
for param in model.named_parameters():
    if 'weight' in param[0] and 'layerNorm' not in param[0] :
        torch.nn.init.xavier_uniform_(param[1])

### Optimizer

In [33]:
optimizer = torch.optim.Adam(params = model.parameters(), lr = LEARNING_RATE, weight_decay = WEIGHT_DECAY)

### Loss Function

In [34]:
def criterion(logits: torch.tensor, targets: torch.tensor):
    return nn.CrossEntropyLoss(ignore_index=PAD_IDX)(logits.view(-1,VOCAB_SIZE), targets.view(-1))

### Training Function

In [35]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    # train 모드로 변경
    model.train()

    # for the Mixed Precision
    # Pytorch 예제 : https://pytorch.org/docs/stable/notes/amp_examples.html#amp-examples
    if(FP16):
        scaler = amp.GradScaler()

    dataset_size = 0
    running_loss = 0
    running_accuracy = 0
    accuracy = 0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (src, trg_input, trg_output) in bar:
        src = src.to(device)
        trg_input = trg_input.to(device)
        trg_output = trg_output.to(device)

        batch_size = src.shape[0]

        if(FP16):
            with amp.autocast(enabled=True):
                logits, output = model(enc_src=src, dec_src=trg_input)
                loss = criterion(logits, trg_output)

                # loss를 Scale
                # Scaled Grdients를 계산(call)하기 위해 scaled loss를 backward()
                scaler.scale(loss).backward()
                # scaler.step() first unscales the gradients of the optimizer's assigned params.
                # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
                # otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)

                # Updates the scale for next iteration.
                scaler.update()

        else:
            logits, output = model(enc_src=src, dec_src=trg_input)
            loss = criterion(logits, trg_output)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()

        # logits (bs, seq_len, VOCAB_SIZE)
        # trg_output (bs, seq_len)

        # zero the parameter gradients
        optimizer.zero_grad()

        # change learning rate by Scheduler
        if scheduler is not None:
            scheduler.step()

        # loss.item()은 loss를 Python Float으로 반환
        # loss.item()은 batch data의 average loss이므로, sum of loss를 구하기 위해 batch_size를 곱해준다
        running_loss += loss.item() * batch_size
        running_accuracy = np.mean(
            output.view(-1).detach().cpu().numpy() == trg_output.view(-1).detach().cpu().numpy())

        accuracy += running_accuracy

        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size

        bar.set_postfix(
            Epoch=epoch, Train_Loss=epoch_loss, LR=optimizer.param_groups[0]["lr"], accuracy=accuracy / float(
                step+1)
        )

        # break

    accuracy /= len(dataloader)
    # Garbage Collector
    gc.collect()

    return epoch_loss, accuracy

### Validation Function

In [36]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()

    dataset_size = 0
    running_loss = 0
    accuracy = 0

    bar = tqdm(enumerate(dataloader), total=len(dataloader))

    for step, (src, trg_input, trg_output) in bar:
        src = src.to(device)
        trg_input = trg_input.to(device)
        trg_output = trg_output.to(device)

        batch_size = src.shape[0]

        logits, output = model(enc_src = src, dec_src = trg_input)
        loss = criterion(logits, trg_output)

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        # 실시간으로 정보를 표시하기 위한 epoch loss
        val_loss = running_loss / dataset_size
        running_accuracy = np.mean(output.view(-1).detach().cpu().numpy() == trg_output.view(-1).detach().cpu().numpy())
        
        accuracy += running_accuracy

        bar.set_postfix(
            Epoch=epoch, Valid_Loss=val_loss, LR=optimizer.param_groups[0]["lr"], accuracy = accuracy / float(step + 1)
        )

        # break

    accuracy /= len(dataloader)

    gc.collect()

    return val_loss, accuracy

In [37]:
def run_training(
    model,
    optimizer,
    scheduler,
    device,
    num_epochs,
    train_dataloader,
    valid_dataloader,
    file_prefix="",
    early_stopping=True,
    early_stopping_step=10,
):
    if torch.cuda.is_available():
        print("[INFO] Using GPU:{}\n".format(torch.cuda.get_device_name()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = np.inf
    history = defaultdict(list)
    early_stop_counter = 0

    for epoch in range(1, num_epochs + 1):
        gc.collect()

        train_epoch_loss, train_accuracy = train_one_epoch(
            model,
            optimizer,
            scheduler,
            dataloader=train_dataloader,
            device=device,
            epoch=epoch,
        )

        val_loss, val_accuracy = valid_one_epoch(
            model, valid_dataloader, device=device, epoch=epoch
        )

        # Log metrics
        print(f"Epoch [{epoch}/{num_epochs}], Train Loss: {train_epoch_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Epoch [{epoch}/{num_epochs}], Valid Loss: {val_loss:.4f}, Valid Accuracy: {val_accuracy:.4f}")

        # Update history
        history['Train Loss'].append(train_epoch_loss)
        history['Train Accuracy'].append(train_accuracy)
        history['Valid Loss'].append(val_loss)
        history['Valid Accuracy'].append(val_accuracy)

        # Save best model
        if val_loss <= best_loss:
            early_stop_counter = 0
            print(f"Validation Loss improved ({best_loss:.4f} ---> {val_loss:.4f})")

            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())

            # Save model weights
            model_path = f"{file_prefix}epoch_{epoch}_loss_{best_loss:.4f}.pth"
            torch.save(model.state_dict(), model_path)
            print(f"Model Saved: {model_path}")

        elif early_stopping:
            early_stop_counter += 1
            if early_stop_counter > early_stopping_step:
                break

    end = time.time()
    time_elapsed = end - start
    print(
        "Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600,
            (time_elapsed % 3600) // 60,
            (time_elapsed % 3600) % 60,
        )
    )
    print("Best Loss: {:.4f}".format(best_loss))

    # Load best model weights
    model.load_state_dict(best_model_wts)

    return model, history

### Run training

In [38]:
run_training(
    model=model,
    optimizer=optimizer,
    scheduler=torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=100, eta_min=1e-5),
    device=device,
    num_epochs=50,
    train_dataloader= train_dataloader,
    valid_dataloader = valid_dataloader,
    file_prefix="./model/",
    early_stopping=True,
    early_stopping_step=10,
)


100%|██████████| 389/389 [57:55<00:00,  8.94s/it, Epoch=1, LR=9.73e-5, Train_Loss=7.93, accuracy=0.0275]    
100%|██████████| 98/98 [02:32<00:00,  1.56s/it, Epoch=1, LR=9.73e-5, Valid_Loss=7.48, accuracy=0.0441]


Epoch [1/50], Train Loss: 7.9259, Train Accuracy: 0.0275
Epoch [1/50], Valid Loss: 7.4763, Valid Accuracy: 0.0441
Validation Loss improved (inf ---> 7.4763)
Model Saved: ./model/epoch_1_loss_7.4763.pth


100%|██████████| 389/389 [23:13<00:00,  3.58s/it, Epoch=2, LR=8.97e-5, Train_Loss=7.19, accuracy=0.0648]
100%|██████████| 98/98 [02:58<00:00,  1.82s/it, Epoch=2, LR=8.97e-5, Valid_Loss=6.92, accuracy=0.0669]


Epoch [2/50], Train Loss: 7.1871, Train Accuracy: 0.0648
Epoch [2/50], Valid Loss: 6.9162, Valid Accuracy: 0.0669
Validation Loss improved (7.4763 ---> 6.9162)
Model Saved: ./model/epoch_2_loss_6.9162.pth


100%|██████████| 389/389 [20:20<00:00,  3.14s/it, Epoch=3, LR=7.79e-5, Train_Loss=6.74, accuracy=0.0698]
100%|██████████| 98/98 [02:29<00:00,  1.53s/it, Epoch=3, LR=7.79e-5, Valid_Loss=6.56, accuracy=0.0732]


Epoch [3/50], Train Loss: 6.7404, Train Accuracy: 0.0698
Epoch [3/50], Valid Loss: 6.5592, Valid Accuracy: 0.0732
Validation Loss improved (6.9162 ---> 6.5592)
Model Saved: ./model/epoch_3_loss_6.5592.pth


100%|██████████| 389/389 [23:58<00:00,  3.70s/it, Epoch=4, LR=6.34e-5, Train_Loss=6.43, accuracy=0.0767]
100%|██████████| 98/98 [02:31<00:00,  1.55s/it, Epoch=4, LR=6.34e-5, Valid_Loss=6.29, accuracy=0.0798]


Epoch [4/50], Train Loss: 6.4254, Train Accuracy: 0.0767
Epoch [4/50], Valid Loss: 6.2856, Valid Accuracy: 0.0798
Validation Loss improved (6.5592 ---> 6.2856)
Model Saved: ./model/epoch_4_loss_6.2856.pth


100%|██████████| 389/389 [27:33<00:00,  4.25s/it, Epoch=5, LR=4.8e-5, Train_Loss=6.19, accuracy=0.0818] 
100%|██████████| 98/98 [04:44<00:00,  2.91s/it, Epoch=5, LR=4.8e-5, Valid_Loss=6.09, accuracy=0.0845]


Epoch [5/50], Train Loss: 6.1879, Train Accuracy: 0.0818
Epoch [5/50], Valid Loss: 6.0850, Valid Accuracy: 0.0845
Validation Loss improved (6.2856 ---> 6.0850)
Model Saved: ./model/epoch_5_loss_6.0850.pth


100%|██████████| 389/389 [36:49<00:00,  5.68s/it, Epoch=6, LR=3.33e-5, Train_Loss=6, accuracy=0.0862]   
100%|██████████| 98/98 [02:40<00:00,  1.64s/it, Epoch=6, LR=3.33e-5, Valid_Loss=5.93, accuracy=0.0878]


Epoch [6/50], Train Loss: 6.0042, Train Accuracy: 0.0862
Epoch [6/50], Valid Loss: 5.9284, Valid Accuracy: 0.0878
Validation Loss improved (6.0850 ---> 5.9284)
Model Saved: ./model/epoch_6_loss_5.9284.pth


100%|██████████| 389/389 [24:10<00:00,  3.73s/it, Epoch=7, LR=2.12e-5, Train_Loss=5.85, accuracy=0.0899]
100%|██████████| 98/98 [02:34<00:00,  1.58s/it, Epoch=7, LR=2.12e-5, Valid_Loss=5.79, accuracy=0.091] 


Epoch [7/50], Train Loss: 5.8453, Train Accuracy: 0.0899
Epoch [7/50], Valid Loss: 5.7902, Valid Accuracy: 0.0910
Validation Loss improved (5.9284 ---> 5.7902)
Model Saved: ./model/epoch_7_loss_5.7902.pth


100%|██████████| 389/389 [22:58<00:00,  3.54s/it, Epoch=8, LR=1.32e-5, Train_Loss=5.7, accuracy=0.0934] 
100%|██████████| 98/98 [02:37<00:00,  1.61s/it, Epoch=8, LR=1.32e-5, Valid_Loss=5.67, accuracy=0.0942]


Epoch [8/50], Train Loss: 5.7017, Train Accuracy: 0.0934
Epoch [8/50], Valid Loss: 5.6670, Valid Accuracy: 0.0942
Validation Loss improved (5.7902 ---> 5.6670)
Model Saved: ./model/epoch_8_loss_5.6670.pth


100%|██████████| 389/389 [24:22<00:00,  3.76s/it, Epoch=9, LR=1e-5, Train_Loss=5.57, accuracy=0.0965]   
100%|██████████| 98/98 [03:03<00:00,  1.87s/it, Epoch=9, LR=1e-5, Valid_Loss=5.56, accuracy=0.0967]


Epoch [9/50], Train Loss: 5.5726, Train Accuracy: 0.0965
Epoch [9/50], Valid Loss: 5.5589, Valid Accuracy: 0.0967
Validation Loss improved (5.6670 ---> 5.5589)
Model Saved: ./model/epoch_9_loss_5.5589.pth


100%|██████████| 389/389 [23:00<00:00,  3.55s/it, Epoch=10, LR=1.22e-5, Train_Loss=5.45, accuracy=0.0992]
100%|██████████| 98/98 [01:46<00:00,  1.09s/it, Epoch=10, LR=1.22e-5, Valid_Loss=5.47, accuracy=0.0984]


Epoch [10/50], Train Loss: 5.4524, Train Accuracy: 0.0992
Epoch [10/50], Valid Loss: 5.4675, Valid Accuracy: 0.0984
Validation Loss improved (5.5589 ---> 5.4675)
Model Saved: ./model/epoch_10_loss_5.4675.pth


100%|██████████| 389/389 [15:37<00:00,  2.41s/it, Epoch=11, LR=1.94e-5, Train_Loss=5.34, accuracy=0.102]
100%|██████████| 98/98 [01:20<00:00,  1.22it/s, Epoch=11, LR=1.94e-5, Valid_Loss=5.38, accuracy=0.1]  


Epoch [11/50], Train Loss: 5.3419, Train Accuracy: 0.1016
Epoch [11/50], Valid Loss: 5.3819, Valid Accuracy: 0.1002
Validation Loss improved (5.4675 ---> 5.3819)
Model Saved: ./model/epoch_11_loss_5.3819.pth


100%|██████████| 389/389 [16:03<00:00,  2.48s/it, Epoch=12, LR=3.09e-5, Train_Loss=5.24, accuracy=0.104]
100%|██████████| 98/98 [01:48<00:00,  1.11s/it, Epoch=12, LR=3.09e-5, Valid_Loss=5.3, accuracy=0.102] 


Epoch [12/50], Train Loss: 5.2394, Train Accuracy: 0.1039
Epoch [12/50], Valid Loss: 5.3037, Valid Accuracy: 0.1016
Validation Loss improved (5.3819 ---> 5.3037)
Model Saved: ./model/epoch_12_loss_5.3037.pth


100%|██████████| 389/389 [16:56<00:00,  2.61s/it, Epoch=13, LR=4.52e-5, Train_Loss=5.15, accuracy=0.106]
100%|██████████| 98/98 [01:46<00:00,  1.09s/it, Epoch=13, LR=4.52e-5, Valid_Loss=5.24, accuracy=0.103]


Epoch [13/50], Train Loss: 5.1458, Train Accuracy: 0.1060
Epoch [13/50], Valid Loss: 5.2397, Valid Accuracy: 0.1028
Validation Loss improved (5.3037 ---> 5.2397)
Model Saved: ./model/epoch_13_loss_5.2397.pth


100%|██████████| 389/389 [16:35<00:00,  2.56s/it, Epoch=14, LR=6.06e-5, Train_Loss=5.05, accuracy=0.108]
100%|██████████| 98/98 [01:46<00:00,  1.09s/it, Epoch=14, LR=6.06e-5, Valid_Loss=5.17, accuracy=0.105]


Epoch [14/50], Train Loss: 5.0544, Train Accuracy: 0.1079
Epoch [14/50], Valid Loss: 5.1730, Valid Accuracy: 0.1045
Validation Loss improved (5.2397 ---> 5.1730)
Model Saved: ./model/epoch_14_loss_5.1730.pth


100%|██████████| 389/389 [16:39<00:00,  2.57s/it, Epoch=15, LR=7.54e-5, Train_Loss=4.97, accuracy=0.11] 
100%|██████████| 98/98 [01:49<00:00,  1.12s/it, Epoch=15, LR=7.54e-5, Valid_Loss=5.12, accuracy=0.106]


Epoch [15/50], Train Loss: 4.9695, Train Accuracy: 0.1099
Epoch [15/50], Valid Loss: 5.1167, Valid Accuracy: 0.1056
Validation Loss improved (5.1730 ---> 5.1167)
Model Saved: ./model/epoch_15_loss_5.1167.pth


100%|██████████| 389/389 [16:40<00:00,  2.57s/it, Epoch=16, LR=8.78e-5, Train_Loss=4.89, accuracy=0.112]
100%|██████████| 98/98 [01:51<00:00,  1.13s/it, Epoch=16, LR=8.78e-5, Valid_Loss=5.07, accuracy=0.107]


Epoch [16/50], Train Loss: 4.8882, Train Accuracy: 0.1116
Epoch [16/50], Valid Loss: 5.0675, Valid Accuracy: 0.1068
Validation Loss improved (5.1167 ---> 5.0675)
Model Saved: ./model/epoch_16_loss_5.0675.pth


100%|██████████| 389/389 [16:38<00:00,  2.57s/it, Epoch=17, LR=9.63e-5, Train_Loss=4.81, accuracy=0.113]
100%|██████████| 98/98 [01:45<00:00,  1.08s/it, Epoch=17, LR=9.63e-5, Valid_Loss=5.01, accuracy=0.108]


Epoch [17/50], Train Loss: 4.8112, Train Accuracy: 0.1134
Epoch [17/50], Valid Loss: 5.0092, Valid Accuracy: 0.1082
Validation Loss improved (5.0675 ---> 5.0092)
Model Saved: ./model/epoch_17_loss_5.0092.pth


100%|██████████| 389/389 [15:24<00:00,  2.38s/it, Epoch=18, LR=9.99e-5, Train_Loss=4.74, accuracy=0.115]
100%|██████████| 98/98 [01:19<00:00,  1.24it/s, Epoch=18, LR=9.99e-5, Valid_Loss=4.97, accuracy=0.109]


Epoch [18/50], Train Loss: 4.7404, Train Accuracy: 0.1153
Epoch [18/50], Valid Loss: 4.9660, Valid Accuracy: 0.1092
Validation Loss improved (5.0092 ---> 4.9660)
Model Saved: ./model/epoch_18_loss_4.9660.pth


100%|██████████| 389/389 [15:22<00:00,  2.37s/it, Epoch=19, LR=9.82e-5, Train_Loss=4.67, accuracy=0.117]
100%|██████████| 98/98 [01:43<00:00,  1.06s/it, Epoch=19, LR=9.82e-5, Valid_Loss=4.92, accuracy=0.11] 


Epoch [19/50], Train Loss: 4.6723, Train Accuracy: 0.1169
Epoch [19/50], Valid Loss: 4.9198, Valid Accuracy: 0.1104
Validation Loss improved (4.9660 ---> 4.9198)
Model Saved: ./model/epoch_19_loss_4.9198.pth


100%|██████████| 389/389 [14:12<00:00,  2.19s/it, Epoch=20, LR=9.14e-5, Train_Loss=4.61, accuracy=0.118]
100%|██████████| 98/98 [01:54<00:00,  1.16s/it, Epoch=20, LR=9.14e-5, Valid_Loss=4.88, accuracy=0.112]


Epoch [20/50], Train Loss: 4.6088, Train Accuracy: 0.1184
Epoch [20/50], Valid Loss: 4.8798, Valid Accuracy: 0.1117
Validation Loss improved (4.9198 ---> 4.8798)
Model Saved: ./model/epoch_20_loss_4.8798.pth


100%|██████████| 389/389 [16:38<00:00,  2.57s/it, Epoch=21, LR=8.03e-5, Train_Loss=4.55, accuracy=0.12] 
100%|██████████| 98/98 [01:46<00:00,  1.09s/it, Epoch=21, LR=8.03e-5, Valid_Loss=4.85, accuracy=0.113]


Epoch [21/50], Train Loss: 4.5468, Train Accuracy: 0.1201
Epoch [21/50], Valid Loss: 4.8459, Valid Accuracy: 0.1127
Validation Loss improved (4.8798 ---> 4.8459)
Model Saved: ./model/epoch_21_loss_4.8459.pth


100%|██████████| 389/389 [16:19<00:00,  2.52s/it, Epoch=22, LR=6.62e-5, Train_Loss=4.49, accuracy=0.121]
100%|██████████| 98/98 [01:46<00:00,  1.09s/it, Epoch=22, LR=6.62e-5, Valid_Loss=4.81, accuracy=0.114]


Epoch [22/50], Train Loss: 4.4882, Train Accuracy: 0.1214
Epoch [22/50], Valid Loss: 4.8064, Valid Accuracy: 0.1139
Validation Loss improved (4.8459 ---> 4.8064)
Model Saved: ./model/epoch_22_loss_4.8064.pth


100%|██████████| 389/389 [16:13<00:00,  2.50s/it, Epoch=23, LR=5.08e-5, Train_Loss=4.43, accuracy=0.123]
100%|██████████| 98/98 [01:45<00:00,  1.08s/it, Epoch=23, LR=5.08e-5, Valid_Loss=4.77, accuracy=0.115]


Epoch [23/50], Train Loss: 4.4313, Train Accuracy: 0.1231
Epoch [23/50], Valid Loss: 4.7731, Valid Accuracy: 0.1147
Validation Loss improved (4.8064 ---> 4.7731)
Model Saved: ./model/epoch_23_loss_4.7731.pth


100%|██████████| 389/389 [16:25<00:00,  2.53s/it, Epoch=24, LR=3.58e-5, Train_Loss=4.38, accuracy=0.124]
100%|██████████| 98/98 [01:44<00:00,  1.07s/it, Epoch=24, LR=3.58e-5, Valid_Loss=4.74, accuracy=0.116]


Epoch [24/50], Train Loss: 4.3763, Train Accuracy: 0.1245
Epoch [24/50], Valid Loss: 4.7420, Valid Accuracy: 0.1156
Validation Loss improved (4.7731 ---> 4.7420)
Model Saved: ./model/epoch_24_loss_4.7420.pth


100%|██████████| 389/389 [16:14<00:00,  2.50s/it, Epoch=25, LR=2.32e-5, Train_Loss=4.32, accuracy=0.126]
100%|██████████| 98/98 [01:45<00:00,  1.08s/it, Epoch=25, LR=2.32e-5, Valid_Loss=4.72, accuracy=0.116]


Epoch [25/50], Train Loss: 4.3219, Train Accuracy: 0.1262
Epoch [25/50], Valid Loss: 4.7154, Valid Accuracy: 0.1163
Validation Loss improved (4.7420 ---> 4.7154)
Model Saved: ./model/epoch_25_loss_4.7154.pth


100%|██████████| 389/389 [16:12<00:00,  2.50s/it, Epoch=26, LR=1.43e-5, Train_Loss=4.27, accuracy=0.128]
100%|██████████| 98/98 [01:45<00:00,  1.08s/it, Epoch=26, LR=1.43e-5, Valid_Loss=4.69, accuracy=0.117]


Epoch [26/50], Train Loss: 4.2683, Train Accuracy: 0.1277
Epoch [26/50], Valid Loss: 4.6852, Valid Accuracy: 0.1174
Validation Loss improved (4.7154 ---> 4.6852)
Model Saved: ./model/epoch_26_loss_4.6852.pth


100%|██████████| 389/389 [16:18<00:00,  2.52s/it, Epoch=27, LR=1.02e-5, Train_Loss=4.22, accuracy=0.129]
100%|██████████| 98/98 [01:46<00:00,  1.08s/it, Epoch=27, LR=1.02e-5, Valid_Loss=4.66, accuracy=0.118]


Epoch [27/50], Train Loss: 4.2150, Train Accuracy: 0.1292
Epoch [27/50], Valid Loss: 4.6620, Valid Accuracy: 0.1182
Validation Loss improved (4.6852 ---> 4.6620)
Model Saved: ./model/epoch_27_loss_4.6620.pth


100%|██████████| 389/389 [16:25<00:00,  2.53s/it, Epoch=28, LR=1.14e-5, Train_Loss=4.16, accuracy=0.131]
100%|██████████| 98/98 [01:45<00:00,  1.07s/it, Epoch=28, LR=1.14e-5, Valid_Loss=4.64, accuracy=0.119]


Epoch [28/50], Train Loss: 4.1632, Train Accuracy: 0.1309
Epoch [28/50], Valid Loss: 4.6383, Valid Accuracy: 0.1187
Validation Loss improved (4.6620 ---> 4.6383)
Model Saved: ./model/epoch_28_loss_4.6383.pth


100%|██████████| 389/389 [16:45<00:00,  2.58s/it, Epoch=29, LR=1.78e-5, Train_Loss=4.11, accuracy=0.133]
100%|██████████| 98/98 [01:44<00:00,  1.07s/it, Epoch=29, LR=1.78e-5, Valid_Loss=4.62, accuracy=0.12] 


Epoch [29/50], Train Loss: 4.1128, Train Accuracy: 0.1328
Epoch [29/50], Valid Loss: 4.6163, Valid Accuracy: 0.1198
Validation Loss improved (4.6383 ---> 4.6163)
Model Saved: ./model/epoch_29_loss_4.6163.pth


100%|██████████| 389/389 [17:37<00:00,  2.72s/it, Epoch=30, LR=2.85e-5, Train_Loss=4.06, accuracy=0.134]
100%|██████████| 98/98 [01:45<00:00,  1.07s/it, Epoch=30, LR=2.85e-5, Valid_Loss=4.6, accuracy=0.121] 


Epoch [30/50], Train Loss: 4.0628, Train Accuracy: 0.1342
Epoch [30/50], Valid Loss: 4.5956, Valid Accuracy: 0.1208
Validation Loss improved (4.6163 ---> 4.5956)
Model Saved: ./model/epoch_30_loss_4.5956.pth


100%|██████████| 389/389 [17:18<00:00,  2.67s/it, Epoch=31, LR=4.24e-5, Train_Loss=4.01, accuracy=0.136]
100%|██████████| 98/98 [01:44<00:00,  1.07s/it, Epoch=31, LR=4.24e-5, Valid_Loss=4.58, accuracy=0.121]


Epoch [31/50], Train Loss: 4.0140, Train Accuracy: 0.1359
Epoch [31/50], Valid Loss: 4.5804, Valid Accuracy: 0.1211
Validation Loss improved (4.5956 ---> 4.5804)
Model Saved: ./model/epoch_31_loss_4.5804.pth


100%|██████████| 389/389 [17:00<00:00,  2.62s/it, Epoch=32, LR=5.78e-5, Train_Loss=3.97, accuracy=0.138]
100%|██████████| 98/98 [01:45<00:00,  1.07s/it, Epoch=32, LR=5.78e-5, Valid_Loss=4.56, accuracy=0.122]


Epoch [32/50], Train Loss: 3.9665, Train Accuracy: 0.1375
Epoch [32/50], Valid Loss: 4.5612, Valid Accuracy: 0.1217
Validation Loss improved (4.5804 ---> 4.5612)
Model Saved: ./model/epoch_32_loss_4.5612.pth


100%|██████████| 389/389 [17:29<00:00,  2.70s/it, Epoch=33, LR=7.29e-5, Train_Loss=3.92, accuracy=0.139]
100%|██████████| 98/98 [01:43<00:00,  1.06s/it, Epoch=33, LR=7.29e-5, Valid_Loss=4.55, accuracy=0.123]


Epoch [33/50], Train Loss: 3.9202, Train Accuracy: 0.1392
Epoch [33/50], Valid Loss: 4.5481, Valid Accuracy: 0.1230
Validation Loss improved (4.5612 ---> 4.5481)
Model Saved: ./model/epoch_33_loss_4.5481.pth


100%|██████████| 389/389 [17:18<00:00,  2.67s/it, Epoch=34, LR=8.58e-5, Train_Loss=3.88, accuracy=0.141]
100%|██████████| 98/98 [01:49<00:00,  1.12s/it, Epoch=34, LR=8.58e-5, Valid_Loss=4.53, accuracy=0.123]


Epoch [34/50], Train Loss: 3.8761, Train Accuracy: 0.1409
Epoch [34/50], Valid Loss: 4.5325, Valid Accuracy: 0.1234
Validation Loss improved (4.5481 ---> 4.5325)
Model Saved: ./model/epoch_34_loss_4.5325.pth


100%|██████████| 389/389 [17:26<00:00,  2.69s/it, Epoch=35, LR=9.51e-5, Train_Loss=3.83, accuracy=0.142]
100%|██████████| 98/98 [01:26<00:00,  1.14it/s, Epoch=35, LR=9.51e-5, Valid_Loss=4.52, accuracy=0.124]


Epoch [35/50], Train Loss: 3.8328, Train Accuracy: 0.1423
Epoch [35/50], Valid Loss: 4.5242, Valid Accuracy: 0.1238
Validation Loss improved (4.5325 ---> 4.5242)
Model Saved: ./model/epoch_35_loss_4.5242.pth


100%|██████████| 389/389 [17:02<00:00,  2.63s/it, Epoch=36, LR=9.96e-5, Train_Loss=3.79, accuracy=0.144]
100%|██████████| 98/98 [01:26<00:00,  1.13it/s, Epoch=36, LR=9.96e-5, Valid_Loss=4.5, accuracy=0.124] 


Epoch [36/50], Train Loss: 3.7920, Train Accuracy: 0.1438
Epoch [36/50], Valid Loss: 4.5025, Valid Accuracy: 0.1242
Validation Loss improved (4.5242 ---> 4.5025)
Model Saved: ./model/epoch_36_loss_4.5025.pth


100%|██████████| 389/389 [18:49<00:00,  2.90s/it, Epoch=37, LR=9.89e-5, Train_Loss=3.75, accuracy=0.146]
100%|██████████| 98/98 [01:27<00:00,  1.12it/s, Epoch=37, LR=9.89e-5, Valid_Loss=4.49, accuracy=0.125]


Epoch [37/50], Train Loss: 3.7503, Train Accuracy: 0.1456
Epoch [37/50], Valid Loss: 4.4931, Valid Accuracy: 0.1252
Validation Loss improved (4.5025 ---> 4.4931)
Model Saved: ./model/epoch_37_loss_4.4931.pth


100%|██████████| 389/389 [21:48<00:00,  3.36s/it, Epoch=38, LR=9.3e-5, Train_Loss=3.71, accuracy=0.147] 
100%|██████████| 98/98 [02:15<00:00,  1.38s/it, Epoch=38, LR=9.3e-5, Valid_Loss=4.47, accuracy=0.126]


Epoch [38/50], Train Loss: 3.7095, Train Accuracy: 0.1472
Epoch [38/50], Valid Loss: 4.4739, Valid Accuracy: 0.1262
Validation Loss improved (4.4931 ---> 4.4739)
Model Saved: ./model/epoch_38_loss_4.4739.pth


100%|██████████| 389/389 [18:48<00:00,  2.90s/it, Epoch=39, LR=8.26e-5, Train_Loss=3.67, accuracy=0.149]
100%|██████████| 98/98 [02:40<00:00,  1.64s/it, Epoch=39, LR=8.26e-5, Valid_Loss=4.46, accuracy=0.126]


Epoch [39/50], Train Loss: 3.6722, Train Accuracy: 0.1485
Epoch [39/50], Valid Loss: 4.4576, Valid Accuracy: 0.1263
Validation Loss improved (4.4739 ---> 4.4576)
Model Saved: ./model/epoch_39_loss_4.4576.pth


100%|██████████| 389/389 [24:02<00:00,  3.71s/it, Epoch=40, LR=6.89e-5, Train_Loss=3.63, accuracy=0.15] 
100%|██████████| 98/98 [02:39<00:00,  1.62s/it, Epoch=40, LR=6.89e-5, Valid_Loss=4.45, accuracy=0.127]


Epoch [40/50], Train Loss: 3.6319, Train Accuracy: 0.1500
Epoch [40/50], Valid Loss: 4.4511, Valid Accuracy: 0.1272
Validation Loss improved (4.4576 ---> 4.4511)
Model Saved: ./model/epoch_40_loss_4.4511.pth


100%|██████████| 389/389 [24:31<00:00,  3.78s/it, Epoch=41, LR=5.36e-5, Train_Loss=3.59, accuracy=0.152]
100%|██████████| 98/98 [02:36<00:00,  1.59s/it, Epoch=41, LR=5.36e-5, Valid_Loss=4.44, accuracy=0.128]


Epoch [41/50], Train Loss: 3.5939, Train Accuracy: 0.1519
Epoch [41/50], Valid Loss: 4.4383, Valid Accuracy: 0.1279
Validation Loss improved (4.4511 ---> 4.4383)
Model Saved: ./model/epoch_41_loss_4.4383.pth


100%|██████████| 389/389 [23:26<00:00,  3.61s/it, Epoch=42, LR=3.84e-5, Train_Loss=3.56, accuracy=0.153]
100%|██████████| 98/98 [02:35<00:00,  1.59s/it, Epoch=42, LR=3.84e-5, Valid_Loss=4.43, accuracy=0.128]


Epoch [42/50], Train Loss: 3.5565, Train Accuracy: 0.1534
Epoch [42/50], Valid Loss: 4.4279, Valid Accuracy: 0.1284
Validation Loss improved (4.4383 ---> 4.4279)
Model Saved: ./model/epoch_42_loss_4.4279.pth


100%|██████████| 389/389 [24:16<00:00,  3.74s/it, Epoch=43, LR=2.52e-5, Train_Loss=3.52, accuracy=0.155]
100%|██████████| 98/98 [02:49<00:00,  1.73s/it, Epoch=43, LR=2.52e-5, Valid_Loss=4.42, accuracy=0.128]


Epoch [43/50], Train Loss: 3.5160, Train Accuracy: 0.1552
Epoch [43/50], Valid Loss: 4.4198, Valid Accuracy: 0.1284
Validation Loss improved (4.4279 ---> 4.4198)
Model Saved: ./model/epoch_43_loss_4.4198.pth


100%|██████████| 389/389 [18:36<00:00,  2.87s/it, Epoch=44, LR=1.56e-5, Train_Loss=3.48, accuracy=0.157]
100%|██████████| 98/98 [02:37<00:00,  1.61s/it, Epoch=44, LR=1.56e-5, Valid_Loss=4.42, accuracy=0.129]


Epoch [44/50], Train Loss: 3.4764, Train Accuracy: 0.1570
Epoch [44/50], Valid Loss: 4.4157, Valid Accuracy: 0.1292
Validation Loss improved (4.4198 ---> 4.4157)
Model Saved: ./model/epoch_44_loss_4.4157.pth


100%|██████████| 389/389 [18:09<00:00,  2.80s/it, Epoch=45, LR=1.06e-5, Train_Loss=3.44, accuracy=0.159]
100%|██████████| 98/98 [02:33<00:00,  1.57s/it, Epoch=45, LR=1.06e-5, Valid_Loss=4.41, accuracy=0.13] 


Epoch [45/50], Train Loss: 3.4392, Train Accuracy: 0.1588
Epoch [45/50], Valid Loss: 4.4077, Valid Accuracy: 0.1297
Validation Loss improved (4.4157 ---> 4.4077)
Model Saved: ./model/epoch_45_loss_4.4077.pth


100%|██████████| 389/389 [18:24<00:00,  2.84s/it, Epoch=46, LR=1.08e-5, Train_Loss=3.4, accuracy=0.161] 
100%|██████████| 98/98 [02:47<00:00,  1.71s/it, Epoch=46, LR=1.08e-5, Valid_Loss=4.4, accuracy=0.13]  


Epoch [46/50], Train Loss: 3.3998, Train Accuracy: 0.1605
Epoch [46/50], Valid Loss: 4.4012, Valid Accuracy: 0.1303
Validation Loss improved (4.4077 ---> 4.4012)
Model Saved: ./model/epoch_46_loss_4.4012.pth


100%|██████████| 389/389 [18:39<00:00,  2.88s/it, Epoch=47, LR=1.63e-5, Train_Loss=3.36, accuracy=0.163]
100%|██████████| 98/98 [02:32<00:00,  1.56s/it, Epoch=47, LR=1.63e-5, Valid_Loss=4.4, accuracy=0.13]  


Epoch [47/50], Train Loss: 3.3603, Train Accuracy: 0.1627
Epoch [47/50], Valid Loss: 4.3975, Valid Accuracy: 0.1304
Validation Loss improved (4.4012 ---> 4.3975)
Model Saved: ./model/epoch_47_loss_4.3975.pth


100%|██████████| 389/389 [18:15<00:00,  2.82s/it, Epoch=48, LR=2.63e-5, Train_Loss=3.32, accuracy=0.164]
100%|██████████| 98/98 [02:36<00:00,  1.60s/it, Epoch=48, LR=2.63e-5, Valid_Loss=4.4, accuracy=0.131] 


Epoch [48/50], Train Loss: 3.3202, Train Accuracy: 0.1645
Epoch [48/50], Valid Loss: 4.3969, Valid Accuracy: 0.1307
Validation Loss improved (4.3975 ---> 4.3969)
Model Saved: ./model/epoch_48_loss_4.3969.pth


100%|██████████| 389/389 [18:21<00:00,  2.83s/it, Epoch=49, LR=3.98e-5, Train_Loss=3.28, accuracy=0.166]
100%|██████████| 98/98 [02:33<00:00,  1.57s/it, Epoch=49, LR=3.98e-5, Valid_Loss=4.39, accuracy=0.131]


Epoch [49/50], Train Loss: 3.2831, Train Accuracy: 0.1663
Epoch [49/50], Valid Loss: 4.3939, Valid Accuracy: 0.1310
Validation Loss improved (4.3969 ---> 4.3939)
Model Saved: ./model/epoch_49_loss_4.3939.pth


100%|██████████| 389/389 [18:48<00:00,  2.90s/it, Epoch=50, LR=5.5e-5, Train_Loss=3.25, accuracy=0.168] 
100%|██████████| 98/98 [02:35<00:00,  1.58s/it, Epoch=50, LR=5.5e-5, Valid_Loss=4.39, accuracy=0.131]

Epoch [50/50], Train Loss: 3.2461, Train Accuracy: 0.1681
Epoch [50/50], Valid Loss: 4.3944, Valid Accuracy: 0.1311
Training complete in 18h 26m 49s
Best Loss: 4.3939





(Transformer(
   (encoder): Encoder(
     (embedding): Embedding(10000, 256, padding_idx=0)
     (pos_embedding): Embedding(100, 256)
     (enc_layers): ModuleList(
       (0): EncoderLayer(
         (multiheadattention): Multiheadattention(
           (fcQ): Linear(in_features=256, out_features=256, bias=True)
           (fcK): Linear(in_features=256, out_features=256, bias=True)
           (fcV): Linear(in_features=256, out_features=256, bias=True)
           (fcOut): Linear(in_features=256, out_features=256, bias=True)
           (dropout): Dropout(p=0.1, inplace=False)
         )
         (ffn): FFN(
           (fc1): Linear(in_features=256, out_features=512, bias=True)
           (fc2): Linear(in_features=512, out_features=256, bias=True)
           (relu): ReLU()
           (dropout): Dropout(p=0.1, inplace=False)
         )
         (layerNorm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
         (layerNorm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
   

In [39]:
torch.save(model.state_dict(), 'final.bin')

In [40]:
def predict(src_sentence):
    # Prepare Sample Sentence
    dec_sentence = ''

    enc_src = sp_src.EncodeAsIds(src_sentence)
    dec_src = []
    dec_src = np.insert(dec_src, 0, sp_trg.bos_id())
    # dec_src = ko_encode(dec_sentence)

    enc_src = torch.Tensor(enc_src).view(1, -1).int().to(device)
    dec_src = torch.Tensor(dec_src).view(1, -1).int().to(device)
    # enc_src : (1,seq_len)
    # dec_src : (1,seq_len)

    last_token = None
    last_token_idx = 0

    while(True):

        # dec_src에 dec_output의 last token을 추가합니다
        enc_output = model.encoder(enc_src)
        # enc_output : (1,seq_len, hidden_dim)

        dec_logits, dec_output = model.decoder(
            input=dec_src, enc_src=enc_src, enc_output=enc_output
        )
        # dec_output : (1,seq_len)
        # dec_logits : (1, seq_len, VOCAB_SIZE)

        last_token = dec_output[:, last_token_idx].item()
        last_token = torch.Tensor([last_token]).view(-1, 1).int()

        # last_token : (1, 1)
        dec_src = torch.cat((dec_src, last_token), dim=-1)

        last_token_idx = last_token_idx + 1

        # print(dec_src)
        # print(sp_trg.Decode(dec_src.tolist()))
        # print(last_token.item())
        if last_token.item() is EOS_IDX:
            break

    # ic(dec_src.tolist())
    return sp_trg.Decode(dec_src.tolist())

In [41]:
# Prepare 10 Sample Sentence
indices = np.random.choice(len(data['en']), 10, replace=False)
sentences = data['en'][indices].to_list()
answers = data['ko'][indices].to_list()

for idx in range(len(sentences)):
    sentence = sentences[idx]
    print(f'en = {sentence}')
    print(f'answer = {answers[idx]}')
    print(f'ko = {predict(sentence)}')

en =  All the people saw this and began to mutter, "He has gone to be the guest of a 'sinner.' "
answer =  그런데 사람들이 보고서, 모두 수군거리며 말하기를 "그가 죄인의 집에 묵으려고 들어갔다" 하였다.
ko = ['이 백성은 모두 예수를 모욕하고, 귀신 들린 사람이 예수께 말하였다. "이렇게 서 있습니다."']
en =  ten head of stall-fed cattle, twenty of pasture-fed cattle and a hundred sheep and goats, as well as deer, gazelles, roebucks and choice fowl.
answer =  살진 소 열 마리와 목장 소 스무 마리와 양 백 마리이고, 그 밖에 수사슴과 노루와 암사슴과 살진 새 들이었다.
ko = ['양 떼 가운데서, 염소 털옷과 소 떼이고, 고운 밀가루와 소와 나귀와 염소 털과 소와']
en =  So give your servant a discerning heart to govern your people and to distinguish between right and wrong. For who is able to govern this great people of yours?"
answer =  그러므로 주의 종에게 지혜로운 마음을 주셔서, 주의 백성을 재판하고, 선과 악을 분별할 수 있게 해주시기를 바랍니다. 이렇게 많은 주의 백성을 누가 재판할 수 있겠습니까?"
ko = ['그러므로 주의 종이요, 주의 종이요, 주의 종입니다. 이 백성의 마음을 다하여 주님을 경외하겠습니까?"']
en =  So Samuel told him everything, hiding nothing from him. Then Eli said, "He is the LORD; let him do what is good in his eyes."
answer =  사무엘은 그에게 하나도 숨