# Building an RNN for Machine Translation
## Initial Data Work

In this file we will read in the data for the Vietnamese and Chinese to Engish corpuses, build a token2id and char2id mapping, vocabularies and data loaders

In [13]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch import optim
import pickle as pkl
import random
import csv
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.models import KeyedVectors
import io
from collections import Counter

random.seed(123)

PAD_IDX = 0
UNK_IDX = 1
SOS_IDX = 2
EOS_IDX = 3
BATCH_SIZE = 64


# Load Pre-trained Word Vectors
def load_embeddings(word2vec, word2id, embedding_dim):
    embeddings = np.zeros((len(word2id), embedding_dim))
    for word, index in word2id.items():
        try:
            embeddings[index] = word2vec[word]

        except KeyError:
            embeddings[index] = np.random.normal(scale=0.6, size=(300,))

    return embeddings


def load_vectors(fname, num_vecs=None):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))

        if num_vecs is None:
            pass
        else:
            if len(data) + 1 > num_vecs:
                break

    return data


#word_vectors = load_vectors('/Volumes/Samsung USB/nlp_project/word_embeds/wiki.en.vec',50000)

#print("Total number of words embedded is {:,d}".format(len(word_vectors)))


def data_dictionary(tokens, vocab_size_limit):
    token_counter = Counter()
    for token in tokens:
        token_counter[token] += 1

    vocab, count = zip(*token_counter.most_common(vocab_size_limit))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(4, 4 + len(vocab))))
    id2token = ['<pad>', '<unk>','<sos>','<eos>'] + id2token
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    token2id['<sos>'] = SOS_IDX
    token2id['<eos>'] = EOS_IDX
    return token2id, id2token


#token2id, id2token = data_dictionary(list(word_vectors.keys()), 50000)

#print("Total number of words in token2id is {:,d}".format(len(token2id)))  # Included UNK and PAD index


def token2index_dataset(tokens_data):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

## Loading in training, validation and test sets

In [2]:
#Loading in the Vietnamese -> En datasets
path= '/Volumes/Samsung USB/nlp_project/iwslt-vi-en-processed/'

train_vi_en = []
with open(path +'train.tok.en') as inputfile:
    for line in inputfile:
        train_vi_en.append(line.strip().lower().split(' '))

train_vi_vi = []
with open(path+ 'train.tok.vi') as inputfile:
    for line in inputfile:
        train_vi_vi.append(line.strip().lower().split(' '))

val_vi_en = []
with open(path + 'dev.tok.en') as inputfile:
    for line in inputfile:
        val_vi_en.append(line.strip().lower().split(' '))

val_vi_vi = []
with open(path +'dev.tok.vi') as inputfile:
    for line in inputfile:
        val_vi_vi.append(line.strip().lower().split(' '))
        
test_vi_en = []
with open(path+'test.tok.en') as inputfile:
    for line in inputfile:
        test_vi_en.append(line.strip().lower().split(' '))

test_vi_vi = []
with open(path+ 'test.tok.vi') as inputfile:
    for line in inputfile:
        test_vi_vi.append(line.strip().lower().split(' '))


In [4]:
#Loading in the Chinese -> En datasets
path= '/Volumes/Samsung USB/nlp_project/iwslt-zh-en-processed/'

train_zh_en = []
with open(path + 'train.tok.en') as inputfile:
    for line in inputfile:
        train_zh_en.append(line.strip().lower().split('\t'))

i=0
train_zh_zh = []
fin = io.open(path +'train.tok.zh', 'r', encoding='utf-8',newline= '\n', errors='ignore')
#n, d = map(str, fin.readline().split())
for line in fin:
    if i % 2 == 0 :
        train_zh_zh.append(line.rstrip().split(' '))
    i+=1        
        
        
val_zh_en = []
with open(path+ 'dev.tok.en') as inputfile:
    for line in inputfile:
        val_zh_en.append(line.strip().lower().split('\t'))

i=0
val_zh_zh = []
fin = io.open(path +'dev.tok.zh', 'r', encoding='utf-8',newline= '\n', errors='ignore')
#n, d = map(str, fin.readline().split())
for line in fin:
    if i % 2 == 0 :
        val_zh_zh.append(line.rstrip().split(' '))
    i+=1
                
        

test_zh_en = []
with open(path +'test.tok.en') as inputfile:
    for line in inputfile:
        test_zh_en.append(line.strip().lower().split('\t'))

i=0
test_zh_zh = []
fin = io.open(path +'test.tok.zh', 'r', encoding='utf-8',newline= '\n', errors='ignore')
#n, d = map(str, fin.readline().split())
for line in fin:
    if i % 2 == 0 :
        test_zh_zh.append(line.rstrip().split(' '))
    i+=1


In [5]:
#Sanity Checking
print("Vi -> En | Training Examples: "+str(len(train_vi_en)))
print("Vi -> En | Training Examples: "+str(len(train_vi_vi)), '\n')

print("Vi -> En | Validation Examples: "+str(len(val_vi_en)))
print("Vi -> En | Validation Examples: "+str(len(val_vi_vi)), '\n')

print("Vi -> En | Testing Examples: "+str(len(test_vi_en)))
print("Vi -> En | Testing Examples: "+str(len(test_vi_vi)), '\n')

print("Zh -> En | Training Examples: "+str(len(train_zh_en)))
print("Zh -> En | Training Examples: "+str(len(train_zh_zh)), '\n')

print("Zh -> En | Validation Examples: "+str(len(val_zh_en)))
print("Zh -> En | Validation Examples: "+str(len(val_zh_zh)), '\n')

print("Zh -> En | Testing Examples: "+str(len(test_zh_en)))
print("Zh -> En | Testing Examples: "+str(len(test_zh_zh)), '\n')

Vi -> En | Training Examples: 133317
Vi -> En | Training Examples: 133317 

Vi -> En | Validation Examples: 1268
Vi -> En | Validation Examples: 1268 

Vi -> En | Testing Examples: 1553
Vi -> En | Testing Examples: 1553 

Zh -> En | Training Examples: 213377
Zh -> En | Training Examples: 213377 

Zh -> En | Validation Examples: 1261
Zh -> En | Validation Examples: 1261 

Zh -> En | Testing Examples: 1397
Zh -> En | Testing Examples: 1397 



## Loading in pre-trained fasttext embeddings for the three languages
### Building loaded embeddings, token2id, id2token and ordered words for all languages

In [14]:
#Load Word Embeddings

path= '/Volumes/Samsung USB/nlp_project/word_embeds/'

en_loaded_embeddings = load_vectors(path +'wiki.en.vec',20)
print("Total number of words embedded is {:,d}".format(len(en_loaded_embeddings)))

vi_loaded_embeddings = load_vectors(path+'wiki.vi.vec',20)
print("Total number of words embedded is {:,d}".format(len(vi_loaded_embeddings)))

zh_loaded_embeddings = load_vectors(path+'wiki.zh.vec',20)
print("Total number of words embedded is {:,d}".format(len(zh_loaded_embeddings)))


#Create token2Id, token2Id
en_token2id, en_id2token = data_dictionary(list(en_loaded_embeddings.keys()), 1000)
vi_token2id, vi_id2token = data_dictionary(list(vi_loaded_embeddings.keys()), 1000)
zh_token2id, zh_id2token = data_dictionary(list(zh_loaded_embeddings.keys()), 1000)


#Create Emedding Matrix
en_loaded_embeddings=load_embeddings(en_loaded_embeddings,en_token2id,300)
vi_loaded_embeddings=load_embeddings(vi_loaded_embeddings,vi_token2id,300)
zh_loaded_embeddings=load_embeddings(zh_loaded_embeddings,zh_token2id,300)



Total number of words embedded is 20
Total number of words embedded is 20
Total number of words embedded is 20


In [15]:
vi_token2id

{'</s>': 4,
 '.': 5,
 ',': 6,
 "'": 7,
 'thể': 8,
 'là': 9,
 'loại': 10,
 'được': 11,
 'một': 12,
 ')': 13,
 '(': 14,
 'năm': 15,
 'của': 16,
 '-': 17,
 'và': 18,
 'trong': 19,
 'loài': 20,
 'có': 21,
 'các': 22,
 'này': 23,
 '<pad>': 0,
 '<unk>': 1,
 '<sos>': 2,
 '<eos>': 3}

In [None]:
# building the three vocabs from pre-trained embeddings
en_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.en.vec')
vi_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.vi.vec')
zh_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.zh.vec')

In [None]:
en_loaded_embeddings = en_embeddings.vectors
vi_loaded_embeddings = vi_embeddings.vectors
zh_loaded_embeddings = zh_embeddings.vectors

In [None]:
#adding PAD AND UNK embeddings
en_loaded_embeddings = np.insert(en_loaded_embeddings, 0, np.zeros(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 0, np.zeros(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 0, np.zeros(300,), axis=0)

en_loaded_embeddings = np.insert(en_loaded_embeddings, 1, np.random.rand(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 1, np.random.rand(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 1, np.random.rand(300,), axis=0)

en_loaded_embeddings = np.insert(en_loaded_embeddings, 2, np.random.rand(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 2, np.random.rand(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 2, np.random.rand(300,), axis=0)

en_loaded_embeddings = np.insert(en_loaded_embeddings, 3, np.random.rand(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 3, np.random.rand(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 3, np.random.rand(300,), axis=0)

In [None]:
# building out id2token and token2id for all languages
en_token2id = {j: i for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+en_embeddings.index2word)}
en_id2token = {i: j for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+en_embeddings.index2word)}
vi_token2id = {j: i for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+vi_embeddings.index2word)}
vi_id2token = {i: j for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+vi_embeddings.index2word)}
zh_token2id = {j: i for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+zh_embeddings.index2word)}
zh_id2token = {i: j for i,j in enumerate(['PAD','UNK','SOS_IDX','EOS_IDX']+zh_embeddings.index2word)}

### Encoding our data

In [49]:
VI_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_vi_en+train_vi_vi], 90))+1
ZH_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_zh_en+train_zh_zh], 90))+1
print(VI_EN_MAX_LENGTH)

39


In [17]:
def encoding_tokens(sentence, language, translator):
    if language== 'English':
        token2id = en_token2id
    elif language== 'Vietnamese':
        token2id = vi_token2id
    elif language== 'Chinese':
        token2id = zh_token2id
    tokens = [token2id[token] if token in token2id else UNK_IDX for token in sentence]
    if translator == 'vi':
        max_len = VI_EN_MAX_LENGTH-1
    elif translator == 'zh':
        max_len = ZH_EN_MAX_LENGTH-1
    tokens=tokens[:max_len]
    return tokens

def encoding_dataset(dataset, language, translator):
    data = [encoding_tokens(tokens, language, translator) for tokens in dataset] 
    return data

In [18]:
train_vi_en = encoding_dataset(train_vi_en, 'English', 'vi')
train_vi_vi = encoding_dataset(train_vi_vi, 'Vietnamese', 'vi')
test_vi_en = encoding_dataset(test_vi_en, 'English', 'vi')
test_vi_vi = encoding_dataset(test_vi_vi, 'Vietnamese', 'vi')
val_vi_en = encoding_dataset(val_vi_en, 'English', 'vi')
val_vi_vi = encoding_dataset(val_vi_vi, 'Vietnamese', 'vi')

train_zh_en = encoding_dataset(train_zh_en, 'English', 'zh')
train_zh_zh = encoding_dataset(train_zh_zh, 'Chinese', 'zh')
test_zh_en = encoding_dataset(test_zh_en, 'English', 'zh')
test_zh_zh = encoding_dataset(test_zh_zh, 'Chinese', 'zh')
val_zh_en = encoding_dataset(val_zh_en, 'English', 'zh')
val_zh_zh = encoding_dataset(val_zh_zh, 'Chinese', 'zh')

In [20]:
train_vi_en[2]

[1,
 1,
 1,
 15,
 1,
 15,
 1,
 1,
 1,
 6,
 1,
 8,
 6,
 1,
 1,
 1,
 1,
 1,
 1,
 6,
 1,
 1,
 1,
 10,
 6,
 1,
 5]

## Building Data Loaders

In [21]:
MAX_SAMPLE_LENGTH = VI_EN_MAX_LENGTH

class translationDataset(Dataset):
    def __init__(self, data_list, target_list):
        self.data_list=data_list
        self.target_list=target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        data = self.data_list[key][:MAX_SAMPLE_LENGTH]
        label = self.target_list[key][:MAX_SAMPLE_LENGTH]
        return [data, len(data), label, len(label)]

def translation_collate_func(batch):
    data_list = []
    label_list = []
    for datum in batch:
        padded_data = np.pad(np.array(datum[0]+[EOS_IDX]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_data)
        padded_label = np.pad(np.array(datum[2]+[EOS_IDX]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[3])), 
                                mode="constant", constant_values=0)
        label_list.append(padded_label)
    return [torch.from_numpy(np.array(data_list)), torch.from_numpy(np.array(label_list))]

In [54]:
# VI -> EN | dataloaders
MAX_SAMPLE_LENGTH = VI_EN_MAX_LENGTH

vi_en_train_dataset = translationDataset(train_vi_vi, train_vi_en)
vi_en_train_loader = torch.utils.data.DataLoader(dataset=vi_en_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_val_dataset = translationDataset(val_vi_vi, val_vi_en)
vi_en_val_loader = torch.utils.data.DataLoader(dataset=vi_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_test_dataset = translationDataset(test_vi_vi, test_vi_en)
vi_en_test_loader = torch.utils.data.DataLoader(dataset=vi_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

In [52]:
# ZH -> EN | dataloaders

zh_en_train_dataset = translationDataset(train_zh_zh, train_zh_en)
zh_en_train_loader = torch.utils.data.DataLoader(dataset=zh_en_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

zh_en_val_dataset = translationDataset(val_zh_zh, val_zh_en)
zh_en_val_loader = torch.utils.data.DataLoader(dataset=zh_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

zh_en_test_dataset = translationDataset(test_zh_zh, test_zh_en)
zh_en_test_loader = torch.utils.data.DataLoader(dataset=zh_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

In [53]:
vectors = list(vi_en_val_loader)
len(vector[0][6])

36

## Building the RNN model

In [24]:
# encoder

class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, language, drop_rate=0):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        if language == 'Vietnamese':
            self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vi_loaded_embeddings), freeze=True)
        elif language == 'Chinese':
            self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(zh_loaded_embeddings), freeze=True)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, input, hidden):
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))
        output, hidden = self.gru(hidden)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [25]:
# decoder

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, drop_rate=0):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(en_loaded_embeddings), freeze=True)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, len(en_loaded_embeddings))
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(p=drop_rate)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)

In [26]:
def showPlot(points, string):
    plt.figure()
    fig, ax = plt.subplots()
    plt.plot(points)
    plt.title(string)
    plt.savefig((string+'.png'), dpi=300)

In [27]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [55]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=36):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0,0]

    decoder_input = torch.tensor([[SOS_IDX]])

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_IDX:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [65]:
encoder_outputs = torch.zeros(36,256)
encoder_outputs


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [56]:
def trainIters(loader, encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        for i, (data, labels) in enumerate(loader):
            input_tensor = data
            target_tensor = labels

            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                             iter, iter / n_iters * 100, print_loss_avg))

            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
    showPlot(plot_losses, language)

In [57]:
#Training Model
hidden_size = 256
encoder1 = EncoderRNN(hidden_size, 'Vietnamese', drop_rate=0.1)
decoder1 = DecoderRNN(hidden_size, drop_rate=0.1)

#encoder1.load_state_dict(torch.load("encoder.pth"))
#decoder1.load_state_dict(torch.load("attn_decoder.pth"))

In [59]:
trainIters(vi_en_val_loader, encoder1, decoder1, n_iters=200, print_every=100)

IndexError: index 36 is out of bounds for dimension 0 with size 36

In [None]:
def evaluate(encoder, decoder, sentence, max_length):
    """
    Function that generate translation.
    First, feed the source sentence into the encoder and obtain the hidden states from encoder.
    Secondly, feed the hidden states into the decoder and unfold the outputs from the decoder.
    Lastly, for each outputs from the decoder, collect the corresponding words in the target language's vocabulary.
    And collect the attention for each output words.
    @param encoder: the encoder network
    @param decoder: the decoder network
    @param sentence: string, a sentence in source language to be translated
    @param max_length: the max # of words that the decoder can return
    @output decoded_words: a list of words in target language
    """    
    # process input sentence
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        # encode the source lanugage
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_IDX]])  # SOS
        # decode the context vector
        decoder_hidden = encoder_hidden # decoder starts from the last encoding sentence
        # output of this function
        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            # for each time step, the decoder network takes two inputs: previous outputs and the previous hidden states
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            
            # hint: print out decoder_output and decoder_attention
            # TODO: add your code here to populate decoded_words and decoder_attentions
            # TODO: do this in 2 ways discussed in class: greedy & beam_search
            
            # END TO DO
            
            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

## Preparing the Data and Modeling

In [None]:
def indexesFromSentence(lang, sentence):
    if lang = 'vi':
        return [vi_token2id[word] for word in sentence.split(' ')]
    elif lang = 'en':
        return [en_token2id[word] for word in sentence.split(' ')]
    elif lang = 'zh':
        return [zh_token2id[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_IDX)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)