# Building an RNN for Machine Translation
## Initial Data Work

In this file we will read in the data for the Vietnamese and Chinese to Engish corpuses, build a token2id and char2id mapping, vocabularies and data loaders

In [82]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch import optim
import pickle as pkl
import random
import csv
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from random import randint
from gensim.models import KeyedVectors
import sacrebleu

random.seed(123)

PAD_IDX = 0
UNK_IDX = 1
SOS_IDX = 2
EOS_IDX = 3
BATCH_SIZE = 64

## Loading in training, validation and test sets

In [2]:
#Loading in the Vietnamese -> En datasets

train_vi_en = []
with open('../project_data/en-vi/train.tok.en') as inputfile:
    for line in inputfile:
        train_vi_en.append(line.strip().lower().split(' '))

train_vi_vi = []
with open('../project_data/en-vi/train.tok.vi') as inputfile:
    for line in inputfile:
        train_vi_vi.append(line.strip().lower().split(' '))

val_vi_en = []
with open('../project_data/en-vi/dev.tok.en') as inputfile:
    for line in inputfile:
        val_vi_en.append(line.strip().lower().split(' '))

val_vi_vi = []
with open('../project_data/en-vi/dev.tok.vi') as inputfile:
    for line in inputfile:
        val_vi_vi.append(line.strip().lower().split(' '))
        
test_vi_en = []
with open('../project_data/en-vi/test.tok.en') as inputfile:
    for line in inputfile:
        test_vi_en.append(line.strip().lower().split(' '))

test_vi_vi = []
with open('../project_data/en-vi/test.tok.vi') as inputfile:
    for line in inputfile:
        test_vi_vi.append(line.strip().lower().split(' '))

In [3]:
#Loading in the Chinese -> En datasets

train_zh_en = []
with open('../project_data/en-zh/train.tok.en') as inputfile:
    for line in inputfile:
        train_zh_en.append(line.strip().lower().split(' '))

train_zh_zh = []
with open('../project_data/en-zh/train.tok.zh') as inputfile:
    for line in inputfile:
        train_zh_zh.append(line.strip().lower().split(' '))

val_zh_en = []
with open('../project_data/en-zh/dev.tok.en') as inputfile:
    for line in inputfile:
        val_zh_en.append(line.strip().lower().split(' '))

val_zh_zh = []
with open('../project_data/en-zh/dev.tok.zh') as inputfile:
    for line in inputfile:
        val_zh_zh.append(line.strip().lower().split(' '))
        
test_zh_en = []
with open('../project_data/en-zh/test.tok.en') as inputfile:
    for line in inputfile:
        test_zh_en.append(line.strip().lower().split(' '))

test_zh_zh = []
with open('../project_data/en-zh/test.tok.zh') as inputfile:
    for line in inputfile:
        test_zh_zh.append(line.strip().lower().split(' '))

In [4]:
#Sanity Checking
print("Vi -> En | Training Examples: "+str(len(train_vi_en)))
print("Vi -> En | Training Examples: "+str(len(train_vi_vi)), '\n')

print("Vi -> En | Validation Examples: "+str(len(val_vi_en)))
print("Vi -> En | Validation Examples: "+str(len(val_vi_vi)), '\n')

print("Vi -> En | Testing Examples: "+str(len(test_vi_en)))
print("Vi -> En | Testing Examples: "+str(len(test_vi_vi)), '\n')

print("Zh -> En | Training Examples: "+str(len(train_zh_en)))
print("Zh -> En | Training Examples: "+str(len(train_zh_zh)), '\n')

print("Zh -> En | Validation Examples: "+str(len(val_zh_en)))
print("Zh -> En | Validation Examples: "+str(len(val_zh_zh)), '\n')

print("Zh -> En | Testing Examples: "+str(len(test_zh_en)))
print("Zh -> En | Testing Examples: "+str(len(test_zh_zh)), '\n')

Vi -> En | Training Examples: 133317
Vi -> En | Training Examples: 133317 

Vi -> En | Validation Examples: 1268
Vi -> En | Validation Examples: 1268 

Vi -> En | Testing Examples: 1553
Vi -> En | Testing Examples: 1553 

Zh -> En | Training Examples: 213377
Zh -> En | Training Examples: 213377 

Zh -> En | Validation Examples: 1261
Zh -> En | Validation Examples: 1261 

Zh -> En | Testing Examples: 1397
Zh -> En | Testing Examples: 1397 



In [5]:
#preserve original data for evaluation
train_vi_en_orig = train_vi_en
train_vi_vi_orig = train_vi_vi
val_vi_en_orig = val_vi_en
val_vi_vi_orig = val_vi_vi
test_vi_en_orig = test_vi_en
test_vi_vi_orig = test_vi_vi

train_zh_en_orig = train_zh_en
train_zh_zh_orig = train_zh_zh
val_zh_en_orig = val_zh_en
val_zh_zh_orig = val_zh_zh
test_zh_en_orig = test_zh_en
test_zh_zh_orig = test_zh_zh

## Loading in pre-trained fasttext embeddings for the three languages
### Building loaded embeddings, token2id, id2token and ordered words for all languages

In [6]:
# building the three vocabs from pre-trained embeddings
en_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.en.vec')
vi_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.vi.vec')
zh_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.zh.vec')

In [7]:
en_loaded_embeddings = en_embeddings.vectors
vi_loaded_embeddings = vi_embeddings.vectors
zh_loaded_embeddings = zh_embeddings.vectors

In [8]:
#adding PAD AND UNK embeddings
en_loaded_embeddings = np.insert(en_loaded_embeddings, 0, np.zeros(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 0, np.zeros(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 0, np.zeros(300,), axis=0)

en_loaded_embeddings = np.insert(en_loaded_embeddings, 1, np.random.rand(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 1, np.random.rand(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 1, np.random.rand(300,), axis=0)

en_loaded_embeddings = np.insert(en_loaded_embeddings, 2, np.random.rand(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 2, np.random.rand(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 2, np.random.rand(300,), axis=0)

en_loaded_embeddings = np.insert(en_loaded_embeddings, 3, np.random.rand(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 3, np.random.rand(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 3, np.random.rand(300,), axis=0)

In [9]:
print('Words in EN Vocab: ', en_loaded_embeddings.shape[0]-4)
print('Words in VI Vocab: ', vi_loaded_embeddings.shape[0]-4)
print('Words in ZH Vocab: ', zh_loaded_embeddings.shape[0]-4)

Words in EN Vocab:  2519370
Words in VI Vocab:  292168
Words in ZH Vocab:  332647


In [10]:
# building out id2token and token2id for all languages
en_token2id = {j: i for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+en_embeddings.index2word)}
en_id2token = {i: j for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+en_embeddings.index2word)}
vi_token2id = {j: i for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+vi_embeddings.index2word)}
vi_id2token = {i: j for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+vi_embeddings.index2word)}
zh_token2id = {j: i for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+zh_embeddings.index2word)}
zh_id2token = {i: j for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+zh_embeddings.index2word)}

### Encoding our data

In [11]:
VI_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_vi_en+train_vi_vi], 90))+1
ZH_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_zh_en+train_zh_zh], 90))+1

In [12]:
def encoding_tokens(sentence, language, translator):
    if language== 'English':
        token2id = en_token2id
    elif language== 'Vietnamese':
        token2id = vi_token2id
    elif language== 'Chinese':
        token2id = zh_token2id
    tokens = [token2id[token] if token in token2id else UNK_IDX for token in sentence]
    if translator == 'vi':
        max_len = VI_EN_MAX_LENGTH-1
    elif translator == 'zh':
        max_len = ZH_EN_MAX_LENGTH-1
    tokens=tokens[:max_len]
    return tokens

def encoding_dataset(dataset, language, translator):
    data = [encoding_tokens(tokens, language, translator) for tokens in dataset] 
    return data

In [13]:
train_vi_en = encoding_dataset(train_vi_en, 'English', 'vi')
train_vi_vi = encoding_dataset(train_vi_vi, 'Vietnamese', 'vi')
test_vi_en = encoding_dataset(test_vi_en, 'English', 'vi')
test_vi_vi = encoding_dataset(test_vi_vi, 'Vietnamese', 'vi')
val_vi_en = encoding_dataset(val_vi_en, 'English', 'vi')
val_vi_vi = encoding_dataset(val_vi_vi, 'Vietnamese', 'vi')

train_zh_en = encoding_dataset(train_zh_en, 'English', 'zh')
train_zh_zh = encoding_dataset(train_zh_zh, 'Chinese', 'zh')
test_zh_en = encoding_dataset(test_zh_en, 'English', 'zh')
test_zh_zh = encoding_dataset(test_zh_zh, 'Chinese', 'zh')
val_zh_en = encoding_dataset(val_zh_en, 'English', 'zh')
val_zh_zh = encoding_dataset(val_zh_zh, 'Chinese', 'zh')

## Building Data Loaders

In [14]:
class translationDataset(Dataset):
    def __init__(self, data_list, target_list):
        self.data_list=data_list
        self.target_list=target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        data = self.data_list[key][:MAX_SAMPLE_LENGTH]
        label = self.target_list[key][:MAX_SAMPLE_LENGTH]
        return [data, len(data), label, len(label)]

def translation_collate_func(batch):
    data_list = []
    label_list = []
    for datum in batch:
        padded_data = np.pad(np.array(datum[0]+[EOS_IDX]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_data)
        padded_label = np.pad(np.array(datum[2]+[EOS_IDX]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[3])), 
                                mode="constant", constant_values=0)
        label_list.append(padded_label)
    return [torch.from_numpy(np.array(data_list)), torch.from_numpy(np.array(label_list))]

In [15]:
# VI -> EN | dataloaders
MAX_SAMPLE_LENGTH = VI_EN_MAX_LENGTH

vi_en_train_dataset = translationDataset(train_vi_vi, train_vi_en)
vi_en_train_loader = torch.utils.data.DataLoader(dataset=vi_en_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_val_dataset = translationDataset(val_vi_vi, val_vi_en)
vi_en_val_loader = torch.utils.data.DataLoader(dataset=vi_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_test_dataset = translationDataset(test_vi_vi, test_vi_en)
vi_en_test_loader = torch.utils.data.DataLoader(dataset=vi_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

In [16]:
# ZH -> EN | dataloaders
MAX_SAMPLE_LENGTH = ZH_EN_MAX_LENGTH

zh_en_train_dataset = translationDataset(train_zh_zh, train_zh_en)
zh_en_train_loader = torch.utils.data.DataLoader(dataset=zh_en_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

zh_en_val_dataset = translationDataset(val_zh_zh, val_zh_en)
zh_en_val_loader = torch.utils.data.DataLoader(dataset=zh_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

zh_en_test_dataset = translationDataset(test_zh_zh, test_zh_en)
zh_en_test_loader = torch.utils.data.DataLoader(dataset=zh_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

## Building the RNN model

In [17]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, language, drop_rate=0):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.language = language
        if language == 'Vietnamese':
            self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vi_loaded_embeddings), freeze=True)
        elif language == 'Chinese':
            self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(zh_loaded_embeddings), freeze=True)
        self.gru = nn.GRU(300, hidden_size)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, input):
        embedded = self.embedding(input)
        output = self.dropout(embedded)
        output, hidden = self.gru(output)
        return output, hidden


In [18]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, drop_rate=0):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(en_loaded_embeddings), freeze=True)
        self.gru = nn.GRU(hidden_size + 300, hidden_size)
        self.out = nn.Linear(hidden_size, len(en_token2id))
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(p=drop_rate)

    def forward(self, input, hidden, enc_output):
        input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input)).unsqueeze(0)
        embedded_concat = torch.cat((embedded, enc_output), dim=2)
        output, hidden = self.gru(embedded_concat, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

In [75]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, max_length, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.dropout_p = dropout_p
        
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(en_loaded_embeddings), freeze=True)
        self.gru = nn.GRU(self.hidden_size + 300, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)               
        self.dropout = nn.Dropout(p=dropout_p)
        

    def forward(self, input, hidden, encoder_outputs):
        input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input)).unsqueeze(0)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded, hidden), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded, attn_applied[0]), dim=1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    #def initHidden(self):
    #    return torch.zeros(1, 1, self.hidden_size, device=device)

In [20]:
def showPlot(points, string):
    plt.figure()
    fig, ax = plt.subplots()
    plt.plot(points)
    plt.title(string)
    plt.savefig((string+'.png'), dpi=300)

In [21]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [22]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_tensor = input_tensor.transpose(0,1)
    target_tensor = target_tensor.transpose(0,1)
    
    max_length = input_tensor.size(0)
    batch_size = input_tensor.size(1)
    vocab_size = len(en_token2id)
    
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

    loss = 0
    
    encoder_output, encoder_hidden = encoder(
        input_tensor)
    encoder_outputs = encoder_output[0,0]

    #decoder_input = torch.tensor([[SOS_IDX]])
    decoder_input = input_tensor[0,:]
    decoder_hidden = encoder_hidden
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
               decoder_input, decoder_hidden, encoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]
    else:
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
               decoder_input, decoder_hidden, encoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input[di].item() == EOS_IDX:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / max_length

In [23]:
def trainIters(loader, encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    language = encoder.language
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    
    for iter in range(1, n_iters + 1):
        for i, (data, labels) in enumerate(loader):
            input_tensor = data
            target_tensor = labels
    
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                             iter, iter / n_iters * 100, print_loss_avg))

            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
    showPlot(plot_losses, language)
    return plot_losses

In [76]:
#Training Model
hidden_size = 256
encoder1 = EncoderRNN(hidden_size, 'Vietnamese', drop_rate=0.1)
#decoder1 = DecoderRNN(hidden_size, drop_rate=0.1)

#en_num_words = en_loaded_embeddings.shape[0]-4
en_num_words = len(en_token2id)
attn_decoder1 = AttnDecoderRNN(hidden_size, en_num_words, VI_EN_MAX_LENGTH, dropout_p=0.1)

#encoder1.load_state_dict(torch.load("encoder.pth"))
#decoder1.load_state_dict(torch.load("attn_decoder.pth"))

In [77]:
plot_losses=trainIters(vi_en_val_loader, encoder1, attn_decoder1, n_iters=5, print_every=1, plot_every=1)

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 1. Got 300 and 256 in dimension 2 at /Users/soumith/miniconda2/conda-bld/pytorch_1532624435833/work/aten/src/TH/generic/THTensorMath.cpp:3616

## Preparing the Data and Modeling

In [26]:
def indexesFromSentence2(lang, sentence):
    if lang = 'vi':
        return [vi_token2id[word] for word in sentence.split(' ')]
    elif lang = 'en':
        return [en_token2id[word] for word in sentence.split(' ')]
    elif lang = 'zh':
        return [zh_token2id[word] for word in sentence.split(' ')]

def tensorFromSentence2(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_IDX)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

In [27]:
def indexesFromSentence(lang, sentence):
    if lang == 'Vietnamese':
        out = []
        for word in sentence:
            if word in vi_token2id:
                out.append(vi_token2id[word])
            else:
                out.append(UNK_IDX)
    elif lang == 'English':
        out = []
        for word in sentence:
            if word in en_token2id:
                out.append(en_token2id[word])
            else:
                out.append(UNK_IDX)
    elif lang == 'Chinese':
        out = []
        for word in sentence:
            if word in zh_token2id:
                out.append(zh_token2id[word])
            else:
                out.append(UNK_IDX)
    return out

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_IDX)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

In [62]:
def evaluate(encoder, decoder, sentence, input_lang, max_length):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        
        encoder_hidden = torch.zeros(1, 1, encoder.hidden_size)
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size)
        
        for ei in range(input_length): #call encoder token by token
            encoder_output, encoder_hidden = encoder(
                input_tensor[ei])
            encoder_outputs[ei] = encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_IDX]]) 
        decoder_hidden = encoder_hidden

        decoded_words = []
        #decoder_attentions = torch.zeros(max_length, max_length)
        
        print(encoder_outputs)
                
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
               decoder_input, decoder_hidden, encoder_output)
            print(decoder_output)
            print(decoder_hidden)
            topv, topi = decoder_output.topk(1)
            #decoder_attentions[di] = decoder_attention.data
            decoder_input = topi.squeeze().detach()
            if topi.item() == EOS_IDX:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(en_id2token[topi.item()])

            #decoder_input = topi.squeeze().detach()

        return decoded_words#, decoder_attentions[:di + 1]

In [29]:
def evaluateRandomly(encoder, decoder, language, n=10):
    for i in range(n):
        if language == 'Vietnamese':
            index = randint(0, len(train_vi_en_orig))
            sentence1 = train_vi_vi_orig[index] 
            sentence2 = train_vi_en_orig[index]
        elif language == 'Chinese':
            index = randint(0, len(train_zh_en_orig))
            sentence1 = train_zh_zh_orig[index]
            sentence2 = train_zh_en_orig[index]
        
        print('>', sentence1)
        print('=', sentence2)
        output_words, attentions = evaluate(encoder, decoder, sentence1, language, MAX_LENGTH)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [63]:
MAX_LENGTH = VI_EN_MAX_LENGTH
evaluateRandomly(encoder1, decoder1, "Vietnamese")

> ['nhưng', 'tôi', 'đã', 'vẽ', '1100', 'tác_phẩm', 'và', 'rất', 'hiếm', 'các', 'nghệ_sĩ', 'đạt', 'được', 'số_lượng', 'này', '.']
= ['but', 'there', 'are', '1,100', ',', 'and', 'very', 'few', 'artists', 'have', 'drawn', 'so', 'many', 'faces', '.']
tensor([3258])


RuntimeError: input must have 3 dimensions, got 2

In [None]:
#TODO:calculate BLEU score on corpus level

hypotheses = []
targets = []
for i, (data, labels) in enumerate(vi_en_val_loader):
    output, attns = evaluate(encoder, decoder, data[0], language, MAX_LENGTH)
    hypotheses.append(output)
    targets.append(labels)
score = sacrebleu.corpus_bleu(hypotheses, targets)
print(score)