### set device and data

In [1]:
import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'using device: {DEVICE}')

using device: cuda


In [2]:
def file2list(file):
    lst = []
    for line in open(f'./{file}', 'r', encoding='utf-8-sig'):
      line = line.replace('\n', '')
      lst.append(line)
    return lst

# train, valid, test texts
en_train_lst = file2list('train.en')
en_valid_lst = file2list('val.en')
en_test_lst  = file2list('test.en')
print(f'en data: {len(en_train_lst), len(en_valid_lst), len(en_test_lst)}')

de_train_lst = file2list('train.de')
de_valid_lst = file2list('val.de')
de_test_lst  = file2list('test.de')
print(f'de data: {len(de_train_lst), len(de_valid_lst), len(de_test_lst)}')

en data: (29001, 1015, 1000)
de data: (29001, 1015, 1000)


### build vocabularies

In [3]:
# using spacy tokenizer
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 32.6 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 46.7 MB/s 
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [4]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# using spacy tokenizer
en_tokenizer = get_tokenizer(tokenizer='spacy', language='en_core_web_sm')
de_tokenizer = get_tokenizer(tokenizer='spacy', language='de_core_news_sm')

# vocabulary for training texts
en_vocab = build_vocab_from_iterator(map(en_tokenizer, [text for text in en_train_lst]),
                                     min_freq=2,
                                     specials=['<unk>','<sos>','<eos>','<pad>'],
                                     special_first=True)
de_vocab = build_vocab_from_iterator(map(de_tokenizer, [text for text in de_train_lst]),
                                     min_freq=2,
                                     specials=['<unk>','<sos>','<eos>','<pad>'],
                                     special_first=True)
print(f'en vocab size: {len(en_vocab)}')
print(f'de vocab size: {len(de_vocab)}')

en vocab size: 6191
de vocab size: 8014


### make preprocessor for encode & decode texts

In [5]:
class Preprocessor:
  unk_token_id = 0
  sos_token_id = 1
  eos_token_id = 2
  pad_token_id = 3

  def __init__(self, src_tokenizer, tgt_tokenizer, src_vocab, tgt_vocab):
    self.src_tokenizer = src_tokenizer
    self.tgt_tokenizer = tgt_tokenizer

    # token -> id
    self.src_token2id = src_vocab.get_stoi()
    self.tgt_token2id = tgt_vocab.get_stoi()
    # id -> token
    self.src_id2token = src_vocab.get_itos()
    self.tgt_id2token = tgt_vocab.get_itos()

  # encode token -> id for source sentence
  def src_encode(self, text):
    if type(text) == list: text = ' '.join(text)
    tokenized = self.src_tokenizer(text)
    encoded   = [self.src_token2id.get(token, self.src_token2id['<unk>']) for token in tokenized]
    return encoded

  # encode token -> id, attach <sos> and <eos> for target sentence
  def tgt_encode(self, text):
    if type(text) == list: text = ' '.join(text)
    tokenized = self.tgt_tokenizer(text)
    encoded   = [self.tgt_token2id['<sos>']] \
    + [self.tgt_token2id.get(token, self.tgt_token2id['<unk>']) for token in tokenized] \
    + [self.tgt_token2id['<eos>']]
    return encoded

  # decode source sentence
  def src_decode(self, ids):
    decoded = list(map(lambda x: self.src_id2token[x], ids))
    return ' '.join(decoded)

  # decode target sentence
  def tgt_decode(self, ids):
    decoded = list(map(lambda x: self.tgt_id2token[x], ids))[1:-1]
    return ' '.join(decoded)

preprocessor = Preprocessor(en_tokenizer, de_tokenizer, en_vocab, de_vocab)

### make custom dataset, data loader

In [6]:
from torch.utils.data.dataset import Dataset

class CustomDataset(Dataset):
  def __init__(self, en_lst, de_lst, preprocessor):
    self.en_lst = en_lst
    self.de_lst = de_lst
    assert len(self.en_lst) == len(self.de_lst)

    self.preprocessor = preprocessor
    self.dataset      = self.make_dataset()

  # make (en, de) sentence pair dataset
  def make_dataset(self):
    dataset = [(self.preprocessor.src_encode(en), self.preprocessor.tgt_encode(de))
               for en, de in zip(self.en_lst, self.de_lst)
               if len(en) > 0 and len(de) > 0]
    return dataset

  def __getitem__(self, idx):
    return self.dataset[idx]

  def __len__(self):
    return len(self.dataset)

train_dataset = CustomDataset(en_train_lst, de_train_lst, preprocessor)
valid_dataset = CustomDataset(en_valid_lst, de_valid_lst, preprocessor)
test_dataset  = CustomDataset(en_test_lst, de_test_lst, preprocessor)
print(f'dataset size: {len(train_dataset), len(valid_dataset), len(test_dataset)}')

dataset size: (29000, 1014, 1000)


In [7]:
import random
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# sequence padding in batch
def collate_fn(batch_samples):
  pad_token_id = preprocessor.pad_token_id
  src_sent = pad_sequence([torch.tensor(src) for src, _ in batch_samples],
                          batch_first=True,
                          padding_value=pad_token_id)
  tgt_sent = pad_sequence([torch.tensor(tgt) for _, tgt in batch_samples],
                          batch_first=True,
                          padding_value=pad_token_id)
  return src_sent.to(DEVICE), tgt_sent.to(DEVICE)

# composing batch to match similar lengths of sequences
def batch_sampling(sequence_lengths, BATCH_SIZE):
  seq_lens = [(i, seq_len, tgt_len) for i, (seq_len, tgt_len) in enumerate(sequence_lengths)]
  seq_lens = sorted(seq_lens, key=lambda x: x[1])
  seq_lens = [sample[0] for sample in seq_lens]

  sample_indices = [seq_lens[i:i+BATCH_SIZE] for i in range(0, len(seq_lens), BATCH_SIZE)]
  random.shuffle(sample_indices)
  return sample_indices

# make dataloader using sequence padding & composing batch
def make_loader(dataset, BATCH_SIZE):
  sequence_lengths = list(map(lambda x: (len(x[0]), len(x[1])), dataset))
  batch_sampler = batch_sampling(sequence_lengths, BATCH_SIZE)
  loader = DataLoader(dataset,
                      collate_fn=collate_fn,
                      batch_sampler=batch_sampler)
  return loader

BATCH_SIZE = 64
train_loader = make_loader(train_dataset, BATCH_SIZE)
valid_loader = make_loader(valid_dataset, BATCH_SIZE)
test_loader  = make_loader(test_dataset, BATCH_SIZE)
print('check dataset and batch size,')

print(f'train data length       : {len(en_train_lst)}')
print(f'train data loader length: {len(train_loader)}')
print(f'train data loader length: {len(en_train_lst) / BATCH_SIZE}\n')

print(f'valid data length       : {len(en_valid_lst)}')
print(f'valid data loader length: {len(valid_loader)}')
print(f'valid data loader length: {len(en_valid_lst) / BATCH_SIZE}\n')

print(f'test data length       : {len(en_test_lst)}')
print(f'test data loader length: {len(test_loader)}')
print(f'test data loader length: {len(en_test_lst) / BATCH_SIZE}')

check dataset and batch size,
train data length       : 29001
train data loader length: 454
train data loader length: 453.140625

valid data length       : 1015
valid data loader length: 16
valid data loader length: 15.859375

test data length       : 1000
test data loader length: 16
test data loader length: 15.625


### token embedding & positional encoding

In [8]:
from torch import nn

class TokenEmbedding(nn.Module):
  def __init__(self, VOCAB_SIZE, HIDDEN_DIM):
    super().__init__()
    self.HIDDEN_DIM = HIDDEN_DIM
    self.embedding  = nn.Embedding(VOCAB_SIZE, HIDDEN_DIM)
    self.scale      = torch.sqrt(torch.FloatTensor([HIDDEN_DIM])).to(DEVICE)

  def forward(self, src):
    return self.embedding(src) * self.scale

In [9]:
import math

class PositionalEncoding(nn.Module):
  def __init__(self, HIDDEN_DIM, dropout_ratio=0.1, max_length=5000):
    super().__init__()
    self.dropout = nn.Dropout(dropout_ratio)

    pos = torch.arange(max_length).unsqueeze(1)
    den = torch.exp(torch.arange(0, HIDDEN_DIM, 2) * (-math.log(10000) / HIDDEN_DIM))
    
    pos_embedding = torch.zeros(max_length, 1, HIDDEN_DIM)
    pos_embedding[:, 0, 0::2] = torch.sin(pos * den)
    pos_embedding[:, 0, 1::2] = torch.cos(pos * den)
    self.register_buffer('pos_embedding', pos_embedding)

  def forward(self, token_embedding):
    # x: [seq_len, batch_size, hidden_dim]
    token_embedding += self.pos_embedding[:token_embedding.size(0), :]
    return self.dropout(token_embedding)

### masking

In [10]:
def generate_square_subsequent_mask(size):
  mask = (torch.triu(torch.ones((size, size), device=DEVICE)) == 1).transpose(0, 1)
  mask = mask.float().masked_fill(mask==0, float('-inf'))\
                     .masked_fill(mask==1, float(0.0))
  return mask

def create_mask(src, tgt):
  src_seq_len = src.shape[0]
  src_mask    = torch.zeros((src_seq_len, src_seq_len), device=DEVICE).type(torch.bool)
  
  tgt_seq_len = tgt.shape[0]
  tgt_mask    = generate_square_subsequent_mask(tgt_seq_len)

  pad_idx = 3  # Preprocessor's pad_token_id
  src_padding_mask = (src == pad_idx).transpose(0, 1)
  tgt_padding_mask = (tgt == pad_idx).transpose(0, 1)
  return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

### transformer

In [11]:
from torch.nn import Transformer

class Transformer_(nn.Module):
  def __init__(self, INPUT_DIM, OUTPUT_DIM, HIDDEN_DIM, LAYERS, HEADS, PF_DIM, dropout_ratio):
    super().__init__()
    self.src_token_embedding = TokenEmbedding(INPUT_DIM, HIDDEN_DIM)
    self.tgt_token_embedding = TokenEmbedding(OUTPUT_DIM, HIDDEN_DIM)
    self.positional_encoding = PositionalEncoding(HIDDEN_DIM, dropout_ratio)
  
    self.transformer = Transformer(d_model=HIDDEN_DIM,
                                   nhead=HEADS,
                                   num_encoder_layers=LAYERS,
                                   num_decoder_layers=LAYERS,
                                   dim_feedforward=PF_DIM,
                                   dropout=dropout_ratio)
    self.fc_out = nn.Linear(HIDDEN_DIM, OUTPUT_DIM)

  def forward(self, src, tgt, src_mask, tgt_mask,
              src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
    src_emb = self.src_token_embedding(src)
    tgt_emb = self.tgt_token_embedding(tgt)

    src_emb = self.positional_encoding(src_emb)
    tgt_emb = self.positional_encoding(tgt_emb)

    output = self.transformer(src_emb, tgt_emb,
                              src_mask, tgt_mask,
                              None,  # None for memory_mask
                              src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
    return self.fc_out(output)

  def encode(self, src, src_mask):
    src_emb = self.src_token_embedding(src)
    src_emb = self.positional_encoding(src_emb)
    return self.transformer.encoder(src_emb, src_mask)

  def decode(self, tgt, memory, tgt_mask):
    tgt_emb = self.tgt_token_embedding(tgt)
    tgt_emb = self.positional_encoding(tgt_emb)
    return self.transformer.decoder(tgt_emb, memory, tgt_mask)

In [12]:
# hyper parameters
INPUT_DIM, OUTPUT_DIM = len(en_vocab), len(de_vocab)
HIDDEN_DIM = 512
LAYERS     = 6
HEADS      = 8
PF_DIM     = 2048
DROPOUT    = 0.1

In [13]:
model = Transformer_(INPUT_DIM, OUTPUT_DIM, HIDDEN_DIM, LAYERS, HEADS, PF_DIM, DROPOUT)

def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {count_parameters(model):,} trainable parameters.\n')

def initialize_weights(model):
  for p in model.parameters():
    if p.dim() > 1: nn.init.xavier_uniform_(p)

model.apply(initialize_weights)
model = model.to(DEVICE)

The model has 55,524,686 trainable parameters.



In [14]:
import torch.optim as optim

# using adam optimizer
optimizer = torch.optim.Adam(model.parameters(),
                             lr=0.0001,
                             betas=(0.9, 0.98),
                             eps=1e-9) 

# define loss function, ignore for padding value
loss_fn = nn.CrossEntropyLoss(ignore_index=preprocessor.pad_token_id)

### training

In [15]:
def train(model, iterator, optimizer, loss_fn, clip):
  model.train()
  epoch_loss = 0

  for i, batch in enumerate(iterator):
    src = batch[0].T
    tgt = batch[1].T
    
    optimizer.zero_grad()  # make gradients zero before backpropagation

    src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt[:-1,:])

    output = model(src, tgt[:-1,:],
                   src_mask, tgt_mask,
                   src_padding_mask, tgt_padding_mask, src_padding_mask)
    
    tgt = tgt[1:, :].reshape(-1)  # ignore for target's <sos>
    output = output.reshape(-1, output.shape[-1])
    
    loss = loss_fn(output, tgt)
    epoch_loss += loss.item()

    loss.backward()                                           # compute gradient
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  # gradient clipping
    optimizer.step()                                          # update parameters
  return epoch_loss / len(iterator)

def evaluate(model, iterator, loss_fn):
  model.eval()
  epoch_loss = 0

  with torch.no_grad():
    for i, batch in enumerate(iterator):
      src = batch[0].T
      tgt = batch[1].T

      src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt[:-1,:])

      output = model(src, tgt[:-1, :],
                    src_mask, tgt_mask,
                    src_padding_mask, tgt_padding_mask, src_padding_mask)

      tgt = tgt[1:, :].reshape(-1)  # ignore for target's <sos>
      output = output.reshape(-1, output.shape[-1])

      loss = loss_fn(output, tgt)
      epoch_loss += loss.item()
  return epoch_loss / len(iterator)

def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

In [16]:
import time

EPOCHS = 10
CLIP   = 1
best_valid_loss = float('inf')

for epoch in range(EPOCHS):
  start_time = time.time()

  train_loss = train(model, train_loader, optimizer, loss_fn, CLIP)
  valid_loss = evaluate(model, valid_loader, loss_fn)

  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'transformer_en_to_de.pt')

  print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}')
  print(f'\tValidation Loss: {valid_loss:.3f} | Validation PPL: {math.exp(valid_loss):.3f}')
  print('\n')

Epoch: 01 | Time: 0m 58s
	Train Loss: 4.813 | Train PPL: 123.115
	Validation Loss: 3.824 | Validation PPL: 45.779


Epoch: 02 | Time: 0m 57s
	Train Loss: 3.642 | Train PPL: 38.171
	Validation Loss: 3.313 | Validation PPL: 27.460


Epoch: 03 | Time: 0m 58s
	Train Loss: 3.233 | Train PPL: 25.349
	Validation Loss: 3.006 | Validation PPL: 20.208


Epoch: 04 | Time: 0m 59s
	Train Loss: 2.952 | Train PPL: 19.146
	Validation Loss: 2.804 | Validation PPL: 16.517


Epoch: 05 | Time: 1m 0s
	Train Loss: 2.719 | Train PPL: 15.160
	Validation Loss: 2.600 | Validation PPL: 13.470


Epoch: 06 | Time: 1m 0s
	Train Loss: 2.468 | Train PPL: 11.803
	Validation Loss: 2.387 | Validation PPL: 10.884


Epoch: 07 | Time: 1m 0s
	Train Loss: 2.233 | Train PPL: 9.326
	Validation Loss: 2.168 | Validation PPL: 8.743


Epoch: 08 | Time: 1m 0s
	Train Loss: 1.998 | Train PPL: 7.373
	Validation Loss: 1.975 | Validation PPL: 7.207


Epoch: 09 | Time: 1m 0s
	Train Loss: 1.795 | Train PPL: 6.018
	Validation Loss: 1.832 |

In [17]:
# download saved model
from google.colab import files
files.download('transformer_en_to_de.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### testing

In [18]:
# load saved model
model.load_state_dict(torch.load('./transformer_en_to_de.pt'))

# test
test_loss = evaluate(model, test_loader, loss_fn)
print(f'Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):.3f}')

Test Loss: 1.774 | Test PPL: 5.895


In [19]:
# translation using my model
def translate_sentence(sentence, preprocessor, model, max_len=50):
  model.eval()
  
  # make source indices
  src_indices = preprocessor.src_encode(sentence)
  src_indices = torch.LongTensor(src_indices).view(-1, 1).to(DEVICE)
  # src_indices: [seq_len, 1]
  
  # using encoder
  num_tokens = src_indices.shape[0]
  src_mask   = (torch.zeros(num_tokens, num_tokens)).type(torch.bool).to(DEVICE)
  enc_src    = model.encode(src_indices, src_mask)

  # using decoder, make translated target indices
  tgt_indices = torch.ones(1, 1).fill_(preprocessor.sos_token_id).type(torch.long).to(DEVICE)
  for i in range(max_len):
    enc_src  = enc_src.to(DEVICE)
    tgt_mask = (generate_square_subsequent_mask(tgt_indices.size(0)).type(torch.bool)).to(DEVICE)

    output = model.decode(tgt_indices, enc_src, tgt_mask)
    output = output.transpose(0, 1)
    prob   = model.fc_out(output[:, -1])
    _, next_word = torch.max(prob, dim=1)
    next_word = next_word.item()

    tgt_indices = torch.cat([tgt_indices,
                             torch.ones(1, 1).type_as(src_indices.data).fill_(next_word)], dim=0)
    if next_word == preprocessor.eos_token_id: break
  
  tgt_tokens = [preprocessor.tgt_id2token[id] for id in tgt_indices]
  return tgt_tokens[1:-1]

In [20]:
for i, batch in enumerate(test_dataset):
  if i == 10:
    src, tgt = batch[0], batch[1]
    break
src = preprocessor.src_decode(src)
tgt = preprocessor.tgt_decode(tgt)
print('original en sentence:')
print(f'\t{src}')
print('original de sentence:')
print(f'\t{tgt}')

translation = translate_sentence(src, preprocessor, model)
translation = ' '.join(translation)
print('translated de sentence:')
print(f'\t{translation}')

original en sentence:
	A mother and her young song enjoying a beautiful day outside .
original de sentence:
	Eine Mutter und ihr kleiner Sohn genießen einen schönen Tag im Freien .
translated de sentence:
	Eine Mutter und ihr kleiner Junge genießen einen schönen Tag im Freien .


In [21]:
# for bleu score
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 2.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 43.8 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
Successfully installed sentencepiece-0.1.97 torchtext-0.6.0


In [22]:
from torchtext.data.metrics import bleu_score

def show_bleu(data, preprocessor, model, max_len=50):
  trgs, pred_trgs = [], []
  for i in range(len(data)):
    # original target
    src, trg = data[i][0], data[i][1]
    src = preprocessor.src_decode(src)
    trg = preprocessor.tgt_decode(trg).split(' ')
    trgs.append([trg])

    # predicted target
    pred_trg = translate_sentence(src, preprocessor, model)
    pred_trg = pred_trg[1:-1]
    pred_trgs.append(pred_trg)

    if (i + 1)%100 == 0:
      print(f"[{i+1}/{len(data)}]")
      print(f"정답: {trg}")
      print(f"예측: {pred_trg}\n")

  bleu = bleu_score(pred_trgs, trgs, max_n=4, weights=[0.25, 0.25, 0.25, 0.25])
  print(f'Total BLEU Score = {bleu*100:.2f}')

  bleu1_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1, 0, 0, 0])
  bleu2_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 1, 0, 0])
  bleu3_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 1, 0])
  bleu4_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 0, 1])
  print(f'BLEU1 score = {bleu1_score}') 
  print(f'BLEU2 score = {bleu2_score}') 
  print(f'BLEU3 score = {bleu3_score}') 
  print(f'BLEU4 score = {bleu4_score}') 

show_bleu(test_dataset, preprocessor, model)

[100/1000]
정답: ['Ein', 'kleiner', 'Junge', 'im', 'Fußballdress', 'hält', 'die', 'Hände', '<unk>', 'Gesicht', 'und', 'weint', '.']
예측: ['Junge', 'in', 'einem', 'Fußball', 'hält', 'seine', '<unk>', 'in', 'seiner', '<unk>']

[200/1000]
정답: ['Ein', 'Mann', 'macht', 'Werbung', 'mit', 'einem', 'riesigen', 'Schild', ',', 'das', 'auf', 'sein', 'Fahrrad', 'gebunden', 'ist', '.']
예측: ['Mann', '<unk>', 'mit', 'einem', '<unk>', 'auf', 'seinem', 'Fahrrad', 'ein', '<unk>']

[300/1000]
정답: ['Eine', 'Gruppe', 'junger', 'Menschen', 'trinkt', '<unk>', 'in', 'einem', '<unk>', '<unk>', '.']
예측: ['Gruppe', 'junger', 'Leute', ',', 'die', 'in', 'einer', '<unk>', '<unk>', '<unk>']

[400/1000]
정답: ['Ein', 'lächelnder', 'Junge', 'spielt', 'im', 'Laub', 'mit', 'den', 'Enten', '.']
예측: ['lächelnder', 'Junge', 'spielt', 'in', 'den', 'Wellen', 'zwischen', 'den', 'Wellen']

[500/1000]
정답: ['Eine', 'Frau', 'steht', 'auf', 'einem', 'grünen', 'Feld', ',', 'hält', 'einen', 'weißen', 'Hund', 'und', 'zeigt', 'auf', 'einen