### set device and data

In [1]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'using device: {device}')

using device: cuda


In [2]:
def file2list(file):
    lst = []
    for line in open(f'./{file}', 'r', encoding='utf-8-sig'):
      line = line.replace('\n', '')
      lst.append(line)
    return lst

# train, valid, test texts
en_train_lst = file2list('train.en')
en_valid_lst = file2list('val.en')
en_test_lst  = file2list('test.en')
print(f'en data: {len(en_train_lst), len(en_valid_lst), len(en_test_lst)}')

de_train_lst = file2list('train.de')
de_valid_lst = file2list('val.de')
de_test_lst  = file2list('test.de')
print(f'de data: {len(de_train_lst), len(de_valid_lst), len(de_test_lst)}')

en data: (29001, 1015, 1000)
de data: (29001, 1015, 1000)


### build vocabularies

In [3]:
# using spacy tokenizer
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 1.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 1.8 MB/s 
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [4]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# using spacy tokenizer
en_tokenizer = get_tokenizer(tokenizer='spacy', language='en_core_web_sm')
de_tokenizer = get_tokenizer(tokenizer='spacy', language='de_core_news_sm')

# vocabulary for training texts
en_vocab = build_vocab_from_iterator(map(en_tokenizer, [text for text in en_train_lst]),
                                     min_freq=2,
                                     specials=['<unk>','<sos>','<eos>','<pad>'],
                                     special_first=True)
de_vocab = build_vocab_from_iterator(map(de_tokenizer, [text for text in de_train_lst]),
                                     min_freq=2,
                                     specials=['<unk>','<sos>','<eos>','<pad>'],
                                     special_first=True)
print(f'en vocab size: {len(en_vocab)}')
print(f'de vocab size: {len(de_vocab)}')

en vocab size: 6191
de vocab size: 8014


### make preprocessor for encode & decode texts

In [5]:
class Preprocessor:
  unk_token_id = 0
  sos_token_id = 1
  eos_token_id = 2
  pad_token_id = 3

  def __init__(self, src_tokenizer, tgt_tokenizer, src_vocab, tgt_vocab):
    self.src_tokenizer = src_tokenizer
    self.tgt_tokenizer = tgt_tokenizer

    # token -> id
    self.src_token2id = src_vocab.get_stoi()
    self.tgt_token2id = tgt_vocab.get_stoi()
    # id -> token
    self.src_id2token = src_vocab.get_itos()
    self.tgt_id2token = tgt_vocab.get_itos()

  # encode token -> id for source sentence
  def src_encode(self, text):
    if type(text) == list: text = ' '.join(text)
    tokenized = self.src_tokenizer(text)
    encoded   = [self.src_token2id.get(token, self.src_token2id['<unk>']) for token in tokenized]
    return encoded

  # encode token -> id, attach <sos> and <eos> for target sentence
  def tgt_encode(self, text):
    if type(text) == list: text = ' '.join(text)
    tokenized = self.tgt_tokenizer(text)
    encoded   = [self.tgt_token2id['<sos>']] \
    + [self.tgt_token2id.get(token, self.tgt_token2id['<unk>']) for token in tokenized] \
    + [self.tgt_token2id['<eos>']]
    return encoded

  # decode source sentence
  def src_decode(self, ids):
    decoded = list(map(lambda x: self.src_id2token[x], ids))
    return ' '.join(decoded)

  # decode target sentence
  def tgt_decode(self, ids):
    decoded = list(map(lambda x: self.tgt_id2token[x], ids))[1:-1]
    return ' '.join(decoded)

preprocessor = Preprocessor(en_tokenizer, de_tokenizer, en_vocab, de_vocab)

### make custom dataset, data loader

In [6]:
from torch.utils.data.dataset import Dataset

class CustomDataset(Dataset):
  def __init__(self, en_lst, de_lst, preprocessor):
    self.en_lst = en_lst
    self.de_lst = de_lst
    assert len(self.en_lst) == len(self.de_lst)

    self.preprocessor = preprocessor
    self.dataset      = self.make_dataset()

  # make (en, de) sentence pair dataset
  def make_dataset(self):
    dataset = [(self.preprocessor.src_encode(en), self.preprocessor.tgt_encode(de))
               for en, de in zip(self.en_lst, self.de_lst)
               if len(en) > 0 and len(de) > 0]
    return dataset

  def __getitem__(self, idx):
    return self.dataset[idx]

  def __len__(self):
    return len(self.dataset)

train_dataset = CustomDataset(en_train_lst, de_train_lst, preprocessor)
valid_dataset = CustomDataset(en_valid_lst, de_valid_lst, preprocessor)
test_dataset  = CustomDataset(en_test_lst, de_test_lst, preprocessor)
print(f'dataset size: {len(train_dataset), len(valid_dataset), len(test_dataset)}')

dataset size: (29000, 1014, 1000)


In [7]:
import random
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# sequence padding in batch
def collate_fn(batch_samples):
  pad_token_id = preprocessor.pad_token_id
  src_sent = pad_sequence([torch.tensor(src) for src, _ in batch_samples],
                          batch_first=True,
                          padding_value=pad_token_id)
  tgt_sent = pad_sequence([torch.tensor(tgt) for _, tgt in batch_samples],
                          batch_first=True,
                          padding_value=pad_token_id)
  return src_sent.to(device), tgt_sent.to(device)

# composing batch to match similar lengths of sequences
def batch_sampling(sequence_lengths, BATCH_SIZE):
  seq_lens = [(i, seq_len, tgt_len) for i, (seq_len, tgt_len) in enumerate(sequence_lengths)]
  seq_lens = sorted(seq_lens, key=lambda x: x[1])
  seq_lens = [sample[0] for sample in seq_lens]

  sample_indices = [seq_lens[i:i+BATCH_SIZE] for i in range(0, len(seq_lens), BATCH_SIZE)]
  random.shuffle(sample_indices)
  return sample_indices

# make dataloader using sequence padding & composing batch
def make_loader(dataset, BATCH_SIZE):
  sequence_lengths = list(map(lambda x: (len(x[0]), len(x[1])), dataset))
  batch_sampler = batch_sampling(sequence_lengths, BATCH_SIZE)
  loader = DataLoader(dataset,
                      collate_fn=collate_fn,
                      batch_sampler=batch_sampler)
  return loader

BATCH_SIZE = 64
train_loader = make_loader(train_dataset, BATCH_SIZE)
valid_loader = make_loader(valid_dataset, BATCH_SIZE)
test_loader  = make_loader(test_dataset, BATCH_SIZE)
print('check dataset and batch size,')

print(f'train data length       : {len(en_train_lst)}')
print(f'train data loader length: {len(train_loader)}')
print(f'train data loader length: {len(en_train_lst) / BATCH_SIZE}\n')

print(f'valid data length       : {len(en_valid_lst)}')
print(f'valid data loader length: {len(valid_loader)}')
print(f'valid data loader length: {len(en_valid_lst) / BATCH_SIZE}\n')

print(f'test data length       : {len(en_test_lst)}')
print(f'test data loader length: {len(test_loader)}')
print(f'test data loader length: {len(en_test_lst) / BATCH_SIZE}')

check dataset and batch size,
train data length       : 29001
train data loader length: 454
train data loader length: 453.140625

valid data length       : 1015
valid data loader length: 16
valid data loader length: 15.859375

test data length       : 1000
test data loader length: 16
test data loader length: 15.625


### multi-head attention, position-wise feed forward

In [8]:
import torch.nn as nn

class MultiHeadAttentionLayer(nn.Module):
  def __init__(self, hidden_dim, n_heads, dropout_ratio, device):
    super().__init__()
    assert hidden_dim % n_heads == 0
    self.hidden_dim = hidden_dim  
    self.n_heads    = n_heads
    self.head_dim   = hidden_dim // n_heads  # n_heads * hidden_dim = head_dim

    self.fc_query = nn.Linear(hidden_dim, hidden_dim)
    self.fc_key   = nn.Linear(hidden_dim, hidden_dim)
    self.fc_value = nn.Linear(hidden_dim, hidden_dim)
    self.fc_o     = nn.Linear(hidden_dim, hidden_dim)

    self.scale   = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)
    self.dropout = nn.Dropout(dropout_ratio)

  def forward(self, query, key, value, mask=None):
    batch_size = query.shape[0]
    # query: [batch_size, query_len, hidden_dim]
    # key  : [batch_size, key_len, hidden_dim]
    # value: [batch_size, value_len, hidden_dim]

    Q = self.fc_query(query)
    K = self.fc_key(key)
    V = self.fc_value(value)
    # Q: [batch_size, query_len, hidden_dim]
    # K: [batch_size, key_len, hidden_dim]
    # V: [batch_size, value_len, hidden_dim]

    # hidden_dim to n_heads * head_dim
    Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    # Q: [batch_size, n_heads, query_len, head_dim]
    # K: [batch_size, n_heads, key_len, head_dim]
    # V: [batch_size, n_heads, value_len, head_dim]

    # attention energies, (Q*K^T/n)
    energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
    # energy: [batch_size, n_heads, query_len, key_len]

    # masking
    if mask is not None:
      energy = energy.masked_fill(mask==0, -1e10)

    # attention score, softmax(Q*K^T/n)
    attention = torch.softmax(energy, dim=-1)
    # attention: [batch_size, n_heads, query_len, key_len]

    # softmax(Q*K^T/n)*V
    x = torch.matmul(self.dropout(attention), V)
    # x: [batch_size, n_heads, query_len, head_dim]

    x = x.permute(0, 2, 1, 3).contiguous()
    x = x.view(batch_size, -1, self.hidden_dim)
    # x: [batch_size, query_len, hidden_dim]
    x = self.fc_o(x)
    return x, attention


class PositionwiseFeedforwardLayer(nn.Module):
  def __init__(self, hidden_dim, pf_dim, dropout_ratio):
    super().__init__()
    self.fc_1 = nn.Linear(hidden_dim, pf_dim)
    self.fc_2 = nn.Linear(pf_dim, hidden_dim)
    self.dropout = nn.Dropout(dropout_ratio)

  def forward(self, x):
    # x: [batch_size, seq_len, hidden_dim]

    x = self.dropout(torch.relu(self.fc_1(x)))
    # x: [batch_size, seq_len, pf_dim]

    x = self.fc_2(x)
    # x: [batch_size, seq_len, hidden_dim]
    return x

### transformer encoder

In [9]:
class EncoderLayer(nn.Module):
  def __init__(self, hidden_dim, n_heads, pf_dim, dropout_ratio, device):
    super().__init__()
    self.self_attn_layer_norm = nn.LayerNorm(hidden_dim)
    self.ff_layer_norm        = nn.LayerNorm(hidden_dim)

    self.self_attention           = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
    self.positionwise_feedforward = PositionwiseFeedforwardLayer(hidden_dim, pf_dim, dropout_ratio)
    self.dropout = nn.Dropout(dropout_ratio)

  def forward(self, src, src_mask):
    # src     : [batch_size, src_len, hidden_dim]
    # src_mask: [batch_size, src_len]

    # self attention
    _src, _ = self.self_attention(src, src, src, src_mask)

    # residual connection & layer norm
    src = self.self_attn_layer_norm(src + self.dropout(_src))

    # position-wise feed forward
    _src = self.positionwise_feedforward(src)

    # residual connection & layer norm
    src = self.ff_layer_norm(src + self.dropout(_src))
    return src


class Encoder(nn.Module):
  def __init__(self, input_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout_ratio, device, max_length=100):
    super().__init__()
    self.device = device
    self.tok_embedding = nn.Embedding(input_dim, hidden_dim)
    self.pos_embedding = nn.Embedding(max_length, hidden_dim)
    self.layers  = nn.ModuleList([EncoderLayer(hidden_dim, n_heads, pf_dim, dropout_ratio, device)
                                  for _ in range(n_layers)])
    
    self.scale   = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)
    self.dropout = nn.Dropout(dropout_ratio)

  def forward(self, src, src_mask):
    # src     : [batch_size, src_len]
    # src_mask: [batch_size, src_len]
    
    batch_size = src.shape[0]  # number of sentences
    src_len    = src.shape[1]  # longest sentence's length

    pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
    # pos: [batch_size, src_len]

    # token embedding + position embedding
    src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

    # forward for all layers
    for layer in self.layers:
      src = layer(src, src_mask)
    # src: [batch_size, src_len, hidden_dim]
    return src

### transformer decoder

In [10]:
class DecoderLayer(nn.Module):
  def __init__(self, hidden_dim, n_heads, pf_dim, dropout_ratio, device):
    super().__init__()
    self.self_attn_layer_norm  = nn.LayerNorm(hidden_dim)
    self.enc_attn_layer_norm   = nn.LayerNorm(hidden_dim)
    self.ff_layer_norm         = nn.LayerNorm(hidden_dim)

    self.self_attention           = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
    self.encoder_attention        = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
    self.positionwise_feedforward = PositionwiseFeedforwardLayer(hidden_dim, pf_dim, dropout_ratio)
    self.dropout = nn.Dropout(dropout_ratio)

  # attention for encoding's output
  def forward(self, trg, enc_src, trg_mask, src_mask):
    # trg     : [batch_size, trg_len, hidden_dim]
    # enc_src : [batch_size, src_len, hidden_dim]
    # trg_mask: [batch_size, trg_len]
    # src_mask: [batch_size, src_len]

    # self attention
    _trg, _ = self.self_attention(trg, trg, trg, trg_mask)

    # residual connection & layer norm
    trg = self.self_attn_layer_norm(trg + self.dropout(_trg))

    # encoder attention
    _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)

    # residual connection & layer_norm
    trg = self.enc_attn_layer_norm(trg + self.dropout(_trg)) 

    # positionwise feed forward
    _trg = self.positionwise_feedforward(trg)

    # residual connection & layer norm
    trg = self.ff_layer_norm(trg + self.dropout(_trg))

    # trg      : [batch_size, trg_len, hidden_dim]
    # attention: [batch_size, n_heads, trg_len, src_len]
    return trg, attention


class Decoder(nn.Module):
  def __init__(self, output_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout_ratio, device, max_length=100):
    super().__init__()
    self.device = device
    self.tok_embedding = nn.Embedding(output_dim, hidden_dim)
    self.pos_embedding = nn.Embedding(max_length, hidden_dim)
    self.layers = nn.ModuleList([DecoderLayer(hidden_dim, n_heads, pf_dim, dropout_ratio, device)
                                 for _ in range(n_layers)])

    self.fc_out  = nn.Linear(hidden_dim, output_dim)
    self.scale   = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)
    self.dropout = nn.Dropout(dropout_ratio)

  def forward(self, trg, enc_src, trg_mask, src_mask):
    # trg     : [batch_size, trg_len]
    # enc_src : [batch_size, src_len, hidden_dim]
    # trg_mask: [batch_size, trg_len]
    # src_mask: [batch_size, src_len]

    batch_size = trg.shape[0]
    trg_len    = trg.shape[1]

    pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
    # pos: [batch_size, trg_len]

    trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
    # trg: [batch_size, trg_len, hidden_dim]

    for layer in self.layers:
      trg, attention = layer(trg, enc_src, trg_mask, src_mask)
    # trg      : [batch_size, trg_len, hidden_dim]
    # attention: [batch_size, n_heads, trg_len, src_len]

    output = self.fc_out(trg)
    # output: [batch_size, trg_len, output_dim]
    return output, attention

### transformer

In [11]:
class Transformer(nn.Module):
  def __init__(self, encoder, decoder, pad_idx, device):
    super().__init__()
    self.device = device
    self.encoder = encoder
    self.decoder = decoder
    self.src_pad_idx, self.trg_pad_idx = pad_idx, pad_idx

  # masking for <pad> token
  def make_src_mask(self, src):
    # src: [batch_size, src_len]

    src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    # src_mask: [batch_size, 1, 1, src_len]
    return src_mask

  # masking for next tokens in target sentence
  def make_trg_mask(self, trg):
    # trg: [batch_size, trg_len]

    trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
    # trg_pad_mask: [batch_size, 1, 1, trg_len]

    trg_len = trg.shape[1]
    trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
    # trg_sub_mask: [trg_len, trg_len]

    trg_mask = trg_pad_mask & trg_sub_mask
    # trg_mask: [batch_size, 1, trg_len, trg_len]
    return trg_mask

  def forward(self, src, trg):
    # src: [batch_size, src_len]
    # trg: [batch_size, trg_len]

    src_mask = self.make_src_mask(src)
    trg_mask = self.make_trg_mask(trg)
    # src_mask: [batch_size, 1, 1, src_len]
    # trg_mask: [batch_size, 1, trg_len, trg_len]

    enc_src = self.encoder(src, src_mask)
    # enc_src: [batch_size, src_len, hidden_dim]

    output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
    # output   : [batch_size, trg_len, output_dim]
    # attention: [batch_size, n_heads, trg_len, src_len]
    return output, attention

In [12]:
# hyper parameters
INPUT_DIM, OUTPUT_DIM = len(en_vocab), len(de_vocab)
HIDDEN_DIM = 256
LAYERS     = 3
HEADS      = 8
PF_DIM     = 512
DROPOUT    = 0.1

In [13]:
encoder = Encoder(INPUT_DIM, HIDDEN_DIM, LAYERS, HEADS, PF_DIM, DROPOUT, device)
decoder = Decoder(OUTPUT_DIM, HIDDEN_DIM, LAYERS, HEADS, PF_DIM, DROPOUT, device)
model   = Transformer(encoder, decoder, preprocessor.pad_token_id, device)

def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters.\n')

def initialize_weights(model):
  for p in model.parameters():
    if p.dim() > 1: nn.init.xavier_uniform_(p)

model.apply(initialize_weights)
model = model.to(device)

The model has 9,700,942 trainable parameters.



In [14]:
import torch.optim as optim

# using adam optimizer
LR_RATE   = 0.0005
optimizer = torch.optim.Adam(model.parameters(), lr=LR_RATE)

# define loss function, ignore for padding value
loss_fn = nn.CrossEntropyLoss(ignore_index=preprocessor.pad_token_id)

### training

In [15]:
def train(model, iterator, optimizer, loss_fn, clip):
  model.train()
  epoch_loss = 0

  for i, batch in enumerate(iterator):
    src, trg = batch[0], batch[1]
    
    optimizer.zero_grad()  # make gradients zero before backpropagation

    output, _ = model(src, trg[:,:-1])        # ignore for target's <eos>
    # output: [batch_size, trg_len-1, output_dim]
    # trg   : [batch_size, trg_len]

    output_dim = output.shape[-1]
    output = output.contiguous().view(-1, output_dim)
    trg    = trg[:,1:].contiguous().view(-1)  # ignore for target's <sos>
    # output: [batch_size * trg_len-1, output_dim]
    # trg   : [batch_size * trg_len-1]

    loss = loss_fn(output, trg)
    epoch_loss += loss.item()

    loss.backward()                                           # compute gradient
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  # gradient clipping
    optimizer.step()                                          # update parameters
  return epoch_loss / len(iterator)

def evaluate(model, iterator, loss_fn):
  model.eval()
  epoch_loss = 0

  with torch.no_grad():
    for i, batch in enumerate(iterator):
      src, trg = batch[0], batch[1]

      output, _ = model(src, trg[:,:-1])        # ignore for target's <eos>
      # output: [batch_size, trg_len-1, output_dim]
      # trg   : [batch_size, trg_len]

      output_dim = output.shape[-1]
      output = output.contiguous().view(-1, output_dim)
      trg    = trg[:,1:].contiguous().view(-1)  # ignore for target's <sos>
      # output: [batch_size * trg_len-1, output_dim]
      # trg   : [batch_size * trg_len-1]

      loss = loss_fn(output, trg)
      epoch_loss += loss.item()
  return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

In [16]:
import time
import math

EPOCHS = 10
CLIP   = 1
best_valid_loss = float('inf')

for epoch in range(EPOCHS):
  start_time = time.time()

  train_loss = train(model, train_loader, optimizer, loss_fn, CLIP)
  valid_loss = evaluate(model, valid_loader, loss_fn)

  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'transformer_en_to_de.pt')

  print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}')
  print(f'\tValidation Loss: {valid_loss:.3f} | Validation PPL: {math.exp(valid_loss):.3f}')
  print('\n')

Epoch: 01 | Time: 0m 19s
	Train Loss: 3.977 | Train PPL: 53.343
	Validation Loss: 2.898 | Validation PPL: 18.131


Epoch: 02 | Time: 0m 17s
	Train Loss: 2.592 | Train PPL: 13.352
	Validation Loss: 2.208 | Validation PPL: 9.099


Epoch: 03 | Time: 0m 16s
	Train Loss: 2.010 | Train PPL: 7.464
	Validation Loss: 1.879 | Validation PPL: 6.550


Epoch: 04 | Time: 0m 16s
	Train Loss: 1.663 | Train PPL: 5.276
	Validation Loss: 1.705 | Validation PPL: 5.500


Epoch: 05 | Time: 0m 16s
	Train Loss: 1.418 | Train PPL: 4.129
	Validation Loss: 1.599 | Validation PPL: 4.950


Epoch: 06 | Time: 0m 17s
	Train Loss: 1.230 | Train PPL: 3.422
	Validation Loss: 1.564 | Validation PPL: 4.777


Epoch: 07 | Time: 0m 16s
	Train Loss: 1.081 | Train PPL: 2.949
	Validation Loss: 1.549 | Validation PPL: 4.706


Epoch: 08 | Time: 0m 16s
	Train Loss: 0.966 | Train PPL: 2.626
	Validation Loss: 1.593 | Validation PPL: 4.919


Epoch: 09 | Time: 0m 16s
	Train Loss: 0.871 | Train PPL: 2.390
	Validation Loss: 1.609 | Vali

In [17]:
# download saved model
from google.colab import files
files.download('transformer_en_to_de.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### testing

In [18]:
# load saved model
model.load_state_dict(torch.load('./transformer_en_to_de.pt'))

# test
test_loss = evaluate(model, test_loader, loss_fn)
print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):.3f}')

Test Loss: 1.614 | Test PPL: 5.023


In [19]:
# translation using my model
def translate_sentence(sentence, preprocessor, model, device, max_len=50):
  model.eval()

  # make source indices
  src_indices = preprocessor.src_encode(sentence)
  src_indices = [preprocessor.sos_token_id] + src_indices + [preprocessor.eos_token_id]

  # using encoder
  src_tensor = torch.LongTensor(src_indices).unsqueeze(0).to(device)
  src_mask   = model.make_src_mask(src_tensor)
  with torch.no_grad():
    enc_src = model.encoder(src_tensor, src_mask)

  # using decoder, make translated target indices
  trg_indices = [preprocessor.sos_token_id]            # has <sos> token
  for i in range(max_len):
    trg_tensor = torch.LongTensor(trg_indices).unsqueeze(0).to(device)
    trg_mask   = model.make_trg_mask(trg_tensor)
    with torch.no_grad():
      output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

    pred_token = output.argmax(2)[:,-1].item()
    if pred_token == preprocessor.eos_token_id: break  # break when <eos> token
    else: trg_indices.append(pred_token)

  # convert indices to words
  trg_tokens = [preprocessor.tgt_id2token[id] for id in trg_indices]
  return trg_tokens[1:], attention                     # remove <sos> token

In [20]:
for i, batch in enumerate(test_dataset):
  if i == 10:
    src, tgt = batch[0], batch[1]
    break
src = preprocessor.src_decode(src)
tgt = preprocessor.tgt_decode(tgt)
print('original en sentence:')
print(f'\t{src}')
print('original de sentence:')
print(f'\t{tgt}')

translation, _ = translate_sentence(src, preprocessor, model, device)
translation = ' '.join(translation)
print('translated de sentence:')
print(f'\t{translation}')

original en sentence:
	A mother and her young song enjoying a beautiful day outside .
original de sentence:
	Eine Mutter und ihr kleiner Sohn genießen einen schönen Tag im Freien .
translated de sentence:
	Eine Mutter und ihre Mutter genießen einen schönen Tag im Freien .


In [21]:
# for bleu score
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 9.8 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
Successfully installed sentencepiece-0.1.97 torchtext-0.6.0


In [22]:
from torchtext.data.metrics import bleu_score

def show_bleu(data, preprocessor, model, device, max_len=50):
  trgs, pred_trgs = [], []
  for i in range(len(data)):
    # original target
    src, trg = data[i][0], data[i][1]
    src = preprocessor.src_decode(src)
    trg = preprocessor.tgt_decode(trg).split(' ')
    trgs.append([trg])

    # predicted target
    pred_trg, _ = translate_sentence(src, preprocessor, model, device, max_len)
    pred_trg = pred_trg[:-1]
    pred_trgs.append(pred_trg)

    if (i + 1)%100 == 0:
      print(f"[{i+1}/{len(data)}]")
      print(f"정답: {trg}")
      print(f"예측: {pred_trg}\n")

  bleu = bleu_score(pred_trgs, trgs, max_n=4, weights=[0.25, 0.25, 0.25, 0.25])
  print(f'Total BLEU Score = {bleu*100:.2f}')

  bleu1_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1, 0, 0, 0])
  bleu2_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 1, 0, 0])
  bleu3_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 1, 0])
  bleu4_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 0, 1])
  print(f'BLEU1 score = {bleu1_score}') 
  print(f'BLEU2 score = {bleu2_score}') 
  print(f'BLEU3 score = {bleu3_score}') 
  print(f'BLEU4 score = {bleu4_score}') 

show_bleu(test_dataset, preprocessor, model, device)

[100/1000]
정답: ['Ein', 'kleiner', 'Junge', 'im', 'Fußballdress', 'hält', 'die', 'Hände', '<unk>', 'Gesicht', 'und', 'weint', '.']
예측: ['Ein', 'kleiner', 'Junge', 'in', 'Uniform', 'schreit', 'in', 'einen', 'Fußball', 'in', 'seiner', '<unk>']

[200/1000]
정답: ['Ein', 'Mann', 'macht', 'Werbung', 'mit', 'einem', 'riesigen', 'Schild', ',', 'das', 'auf', 'sein', 'Fahrrad', 'gebunden', 'ist', '.']
예측: ['Ein', 'Mann', '<unk>', 'mit', 'einem', 'riesigen', 'Schild', 'auf', 'seinem', 'Fahrrad']

[300/1000]
정답: ['Eine', 'Gruppe', 'junger', 'Menschen', 'trinkt', '<unk>', 'in', 'einem', '<unk>', '<unk>', '.']
예측: ['Eine', 'Gruppe', 'junger', 'Japaner', 'machen', 'in', 'einer', '<unk>']

[400/1000]
정답: ['Ein', 'lächelnder', 'Junge', 'spielt', 'im', 'Laub', 'mit', 'den', 'Enten', '.']
예측: ['Ein', 'lächelnder', 'kleiner', 'Junge', 'spielt', 'im', 'Laub', 'zwischen', 'Reihen', 'von', 'sich']

[500/1000]
정답: ['Eine', 'Frau', 'steht', 'auf', 'einem', 'grünen', 'Feld', ',', 'hält', 'einen', 'weißen', 'Hund'