In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re  
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import os
from torch.utils.data import Dataset
from sacrebleu import corpus_bleu

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [33]:
path_zh = '/home/ys2542/Neural-Translation-System/iwslt-zh-en-processed/'
path_vi = '/home/ys2542/Neural-Translation-System/iwslt-vi-en-processed/'
ft_home = '/scratch/ys2542/NLP_FASTTEXT/'
model_path = '/scratch/ys2542/model/'

In [4]:
PAD_token = 0
UNK_token = 1
SOS_token = 2
EOS_token = 3

In [5]:
words_to_load = 100000 
EMBED_SIZE = 300

In [6]:
with open(ft_home + 'wiki-news-300d-1M.vec') as f:
    matrix_size = words_to_load + 4
    loaded_embeddings_ft_en = np.zeros((matrix_size, EMBED_SIZE))
    words_ft_en = {'<pad>': PAD_token, '<unk>': UNK_token, '<sos>': SOS_token, '<eos>': EOS_token,}
    idx2words_ft_en = {PAD_token: '<pad>', UNK_token: '<unk>', SOS_token: '<sos>', EOS_token: '<eos>'}
    ordered_words_ft_en = ['<pad>', '<unk>', '<sos>', '<eos>']
    
    loaded_embeddings_ft_en[0,:] = np.zeros(EMBED_SIZE)
    loaded_embeddings_ft_en[1,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_en[2,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_en[3,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    
    for i, line in enumerate(f):
        if i == 0:
            continue
        if i == words_to_load + 1: 
            break
        s = line.split()
        idx = i + 3
        loaded_embeddings_ft_en[idx, :] = np.asarray(s[1:])
        words_ft_en[s[0]] = idx
        idx2words_ft_en[idx] = s[0]
        ordered_words_ft_en.append(s[0])

In [7]:
with open(ft_home + 'cc.vi.300.vec') as f:
    matrix_size = words_to_load + 4
    loaded_embeddings_ft_zh = np.zeros((matrix_size, EMBED_SIZE))
    words_ft_zh = {'<pad>': PAD_token, '<unk>': UNK_token, '<sos>': SOS_token, '<eos>': EOS_token,}
    idx2words_ft_zh = {PAD_token: '<pad>', UNK_token: '<unk>', SOS_token: '<sos>', EOS_token: '<eos>'}
    ordered_words_ft_zh = ['<pad>', '<unk>', '<sos>', '<eos>']
    
    loaded_embeddings_ft_zh[0,:] = np.zeros(EMBED_SIZE)
    loaded_embeddings_ft_zh[1,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_zh[2,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_zh[3,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    
    for i, line in enumerate(f):
        if i == 0:
            continue
        if i == words_to_load + 1: 
            break
        s = line.split()
        idx = i + 3
        loaded_embeddings_ft_zh[idx, :] = np.asarray(s[1:])
        words_ft_zh[s[0]] = idx
        idx2words_ft_zh[idx] = s[0]
        ordered_words_ft_zh.append(s[0])

In [8]:
lines_zh_train = open(path_vi + 'train.tok.vi', encoding = 'utf-8').read().strip().split('\n')
lines_en_train = open(path_vi + 'train.tok.en', encoding = 'utf-8').read().strip().split('\n')

lines_zh_val = open(path_vi + 'dev.tok.vi', encoding = 'utf-8').read().strip().split('\n')
lines_en_val = open(path_vi + 'dev.tok.en', encoding = 'utf-8').read().strip().split('\n')

lines_zh_test = open(path_vi + 'test.tok.vi', encoding = 'utf-8').read().strip().split('\n')
lines_en_test = open(path_vi + 'test.tok.en', encoding = 'utf-8').read().strip().split('\n')

In [9]:
def clean_lines(lines, lang):
    data = []
    for line in lines:
        if line == '':
            line = ' '
        if lang == 'en':
            line = line.replace("&apos;", "").replace("&quot;", "")
        if line[-1] != ' ':
            line = line + ' '
       
        line = '<sos> ' + line + '<eos>'
        data.append(line)
    return data

In [10]:
train_zh = clean_lines(lines_zh_train, 'vi')
train_en = clean_lines(lines_en_train, 'en')

val_zh = clean_lines(lines_zh_val, 'vi')
val_en = clean_lines(lines_en_val, 'en')

test_zh = clean_lines(lines_zh_test, 'vi')
test_en = clean_lines(lines_en_test, 'en')   

In [11]:
def indexesFromSentence(data, lang):
    indexes = []
    for sentence in data:
        index = []
        for token in sentence.split():
            if lang == 'vi':
                try:
                    index.append(words_ft_zh[token])
                except KeyError:
                    index.append(UNK_token)
            elif lang == 'en':
                try:
                    index.append(words_ft_en[token])
                except KeyError:
                    index.append(UNK_token)
        indexes.append(index)
    return indexes

In [12]:
train_zh_indexes = indexesFromSentence(train_zh, 'vi')
train_en_indexes = indexesFromSentence(train_en, 'en')

val_zh_indexes = indexesFromSentence(val_zh, 'vi')
val_en_indexes = indexesFromSentence(val_en, 'en')

test_zh_indexes = indexesFromSentence(test_zh, 'vi')
test_en_indexes = indexesFromSentence(test_en, 'en')

In [13]:
length_zh = []
for line in train_zh_indexes:
        length_zh.append(len(line))
        
length_zh = sorted(length_zh)
MAX_LENGTH_ZH = length_zh[int(len(train_zh_indexes)*0.99)]
print(MAX_LENGTH_ZH)

74


In [14]:
length_en = []
for line in train_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(train_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

39


In [15]:
length_en = []
for line in val_zh_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(val_zh_indexes)*0.99)]
print(MAX_LENGTH_EN)

27


In [16]:
length_en = []
for line in val_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(val_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

25


In [17]:
length_en = []
for line in test_zh_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(test_zh_indexes)*0.99)]
print(MAX_LENGTH_EN)

33


In [18]:
length_en = []
for line in test_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(test_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

36


In [19]:
MAX_LENGTH_ZH = 74
MAX_LENGTH_EN = 39

In [20]:
train_zh_indexes_filtered = []
train_en_indexes_filtered = []
for i in range(len(train_zh_indexes)):
    if len(train_zh_indexes[i]) <= MAX_LENGTH_ZH and len(train_en_indexes[i]) <= MAX_LENGTH_EN:
        train_zh_indexes_filtered.append(train_zh_indexes[i])
        train_en_indexes_filtered.append(train_en_indexes[i])

In [21]:
val_zh_indexes_filtered = []
val_en_indexes_filtered = []
for i in range(len(val_zh_indexes)):
    if len(val_zh_indexes[i]) <= MAX_LENGTH_ZH and len(val_en_indexes[i]) <= MAX_LENGTH_EN:
        val_zh_indexes_filtered.append(val_zh_indexes[i])
        val_en_indexes_filtered.append(val_en_indexes[i])

In [22]:
test_zh_indexes_filtered = []
test_en_indexes_filtered = []
for i in range(len(test_zh_indexes)):
    if len(test_zh_indexes[i]) <= MAX_LENGTH_ZH and len(test_en_indexes[i]) <= MAX_LENGTH_EN:
        test_zh_indexes_filtered.append(test_zh_indexes[i])
        test_en_indexes_filtered.append(test_en_indexes[i])

In [23]:
class VocabDataset(Dataset):
    def __init__(self, data_list1, data_list2):
        
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        
        assert (len(self.data_list1) == len(self.data_list2))

    def __len__(self):
        return len(self.data_list1)
            
    def __getitem__(self, key):        
        return [self.data_list1[key], self.data_list2[key], len(self.data_list1[key]), len(self.data_list2[key])]

In [24]:
def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list1 = []
    data_list2 = []
    length_list1 = []
    length_list2 = []
    
    for datum in batch:
        length_list1.append(datum[2])
        length_list2.append(datum[3])
        
        padded_vec1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_LENGTH_ZH-datum[2])), 
                                mode="constant", constant_values=0)
        padded_vec2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_LENGTH_EN-datum[3])), 
                                mode="constant", constant_values=0)
        
        data_list1.append(padded_vec1[:MAX_LENGTH_ZH])
        data_list2.append(padded_vec2[:MAX_LENGTH_EN])


    return [torch.from_numpy(np.array(data_list1)).cuda(), torch.from_numpy(np.array(data_list2)).cuda(),
                torch.LongTensor(length_list1).cuda(), torch.LongTensor(length_list2).cuda()]

In [25]:
BATCH_SIZE = 32

train_dataset = VocabDataset(train_zh_indexes_filtered, train_en_indexes_filtered)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(val_zh_indexes_filtered, val_en_indexes_filtered)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

test_dataset = VocabDataset(test_zh_indexes_filtered, test_en_indexes_filtered)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=1,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

val_loader2 = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=1,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

In [26]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embed_size=EMBED_SIZE):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft_zh).float(), freeze=True)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [27]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH_ZH, embed_size=EMBED_SIZE):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft_en).float(), freeze=True)
        self.attn = nn.Linear(hidden_size + embed_size, self.max_length)
        self.attn_combine = nn.Linear(hidden_size + embed_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)

        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded, hidden), 2)), dim=2)
        attn_applied = torch.bmm(attn_weights[0].unsqueeze(1), encoder_outputs).squeeze(1)

        output = torch.cat((embedded[0], attn_applied), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = self.softmax(self.out(output[0]))
        return output, hidden, attn_weights
    
    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [28]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    batch_size, input_length = input_tensor.size()
    _, target_length = target_tensor.size()
    
    encoder_hidden = encoder.initHidden(batch_size)
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
    
    loss = 0

    decoder_input = torch.tensor(np.array([[SOS_token]] * batch_size).reshape(1, batch_size), device=device)
    
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for di in range(target_length):
            
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_output)
        
            
            loss += criterion(decoder_output, target_tensor[:,di])
            decoder_input = target_tensor[:,di].unsqueeze(0) 
            
    else:
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_output)
                        
            
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  
            decoder_input = decoder_input.unsqueeze(0)
 
            loss += criterion(decoder_output, target_tensor[:,di])
            

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [29]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [30]:
def trainIters(encoder, decoder, n_iters, print_every=100, plot_every=100, learning_rate=0.001):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  
    plot_loss_total = 0  
    
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        for i, (data1, data2, length1, length2) in enumerate(train_loader):
            input_tensor = data1
            target_tensor = data2
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if i % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                
    torch.save(encoder.state_dict(), model_path + "encoder_rnn_atten_vi.pth")
    torch.save(decoder.state_dict(), model_path + "decoder_rnn_atten_vi.pth")

    return plot_losses

In [31]:
hidden_size = 300
encoder1 = EncoderRNN(hidden_size).to(device)
decoder1 = AttnDecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

trainIters(encoder1, decoder1, 5)

0m 1s (- 0m 4s) (1 20%) 0.1150
1m 19s (- 5m 17s) (1 20%) 3.8997
2m 37s (- 10m 31s) (1 20%) 3.0649
3m 56s (- 15m 47s) (1 20%) 3.0374
5m 15s (- 21m 2s) (1 20%) 2.9286
6m 34s (- 26m 18s) (1 20%) 2.8891
7m 53s (- 31m 32s) (1 20%) 2.9018
9m 11s (- 36m 47s) (1 20%) 2.8604
10m 31s (- 42m 4s) (1 20%) 2.8607
11m 50s (- 47m 20s) (1 20%) 2.8460
13m 9s (- 52m 38s) (1 20%) 2.7882
14m 28s (- 57m 52s) (1 20%) 2.7800
15m 47s (- 63m 9s) (1 20%) 2.8082
17m 6s (- 68m 26s) (1 20%) 2.7821
18m 25s (- 73m 42s) (1 20%) 2.7391
19m 44s (- 78m 57s) (1 20%) 2.6882
21m 3s (- 84m 13s) (1 20%) 2.6658
22m 22s (- 89m 29s) (1 20%) 2.6730
23m 40s (- 94m 43s) (1 20%) 2.6289
25m 0s (- 100m 0s) (1 20%) 2.6840
26m 19s (- 105m 16s) (1 20%) 2.5890
27m 38s (- 110m 32s) (1 20%) 2.6255
28m 57s (- 115m 49s) (1 20%) 2.6366
30m 15s (- 121m 3s) (1 20%) 2.4967
31m 34s (- 126m 19s) (1 20%) 2.5895
32m 53s (- 131m 33s) (1 20%) 2.4800
34m 12s (- 136m 49s) (1 20%) 2.5416
35m 30s (- 142m 3s) (1 20%) 2.4835
36m 49s (- 147m 18s) (1 20%) 2.48

[0.11503860082381812,
 3.899672348804961,
 3.064934205275315,
 3.0373861166147096,
 2.928551674867288,
 2.889070035494291,
 2.901811183049128,
 2.860360891880133,
 2.860694850041315,
 2.8460226772993042,
 2.788154559013171,
 2.7799516198573953,
 2.808169641739284,
 2.7821068220872136,
 2.7391039236997945,
 2.6881531368157807,
 2.6657848945030804,
 2.67299201574081,
 2.628865657708584,
 2.68402099218124,
 2.589001822838416,
 2.625510895557893,
 2.6366046885955026,
 2.4966913233047876,
 2.5894644967103617,
 2.4799841073843156,
 2.541611396593924,
 2.483535381219326,
 2.481041050446339,
 2.5108203966189655,
 2.477260568080805,
 2.534766681377705,
 2.4854383282783714,
 2.474125690949268,
 2.481773545680902,
 2.4486713644174425,
 2.4501093898675377,
 2.4548056519337185,
 0.8366013629619893,
 2.3397327745877776,
 2.390543434925568,
 2.361044951218825,
 2.3931337063129137,
 2.349304383106721,
 2.3631850961538463,
 2.352697652180989,
 2.2910647954696266,
 2.3430681570982315,
 2.369914194742839

In [34]:
hidden_size = 300

encoder2 = EncoderRNN(hidden_size).to(device)
decoder2 = AttnDecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

encoder2.load_state_dict(torch.load(model_path + "encoder_rnn_atten_vi.pth"))
decoder2.load_state_dict(torch.load(model_path + "decoder_rnn_atten_vi.pth"))

In [31]:
def evaluate(loader, encoder, decoder):
    decoded_words_list = []
    decoder_attentions_list = []
    with torch.no_grad():
        for i, (data1, data2, length1, length2) in enumerate(loader):
            input_tensor = data1
            input_length = input_tensor.size()[0]
            
            encoder_hidden = encoder.initHidden(input_length)

            encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
            
            decoder_input = torch.tensor(np.array([[SOS_token]]), device=device)

            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(MAX_LENGTH_ZH, MAX_LENGTH_ZH)
            
            for di in range(MAX_LENGTH_EN):
                decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input.reshape(1,1), decoder_hidden, encoder_output)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1) 
                if topi.item() == EOS_token:
                    decoded_words.append('<eos>')
                    break
                else:
                    decoded_words.append(idx2words_ft_en[topi.item()])

                decoder_input = topi.squeeze().detach()
                decoder_input = decoder_input.unsqueeze(0)
                
            decoded_words_list.append(decoded_words)
            decoder_attentions_list.append(decoder_attentions[:di + 1])
                   
        return decoded_words_list, decoder_attentions_list  

In [35]:
predicted_list, attention_list = evaluate(val_loader2, encoder2, decoder2)

In [36]:
predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(val_en_indexes_filtered)):
    line = ''
    for jjj in val_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for validation dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)



bleu score for validation dataset: 6.8769534635183955


In [37]:
choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])

> And I didn t t t I I I saw that I , very , very .
> I didn t know what it meant , but I could see that my father was very , very happy . 


In [43]:
predicted_list, attention_list = evaluate(test_loader, encoder2, decoder2)

In [44]:
predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(test_en_indexes_filtered)):
    line = ''
    for jjj in test_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for test dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)



bleu score for test dataset: 7.376933713146011


In [45]:
choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])

> After the in the , , , , , , the .
> After three months in a refugee camp , we landed in Melbourne . 


In [None]:
# for iii in range(len(predicted_list_nopad)):
#     if predicted_list_nopad[iii][-5:] != '<eos>':
#         predicted_list_nopad[iii] = predicted_list_nopad[iii] + ' <eos>'

In [None]:
# label_list = []
# for iii in range(len(val_en_indexes_filtered)):
#     line = ''
#     for jjj in val_en_indexes_filtered[iii]:
#         line = line + ' ' + idx2words_ft_en[jjj]
#     label_list.append(line)

In [38]:
#print('bleu score for validation dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)

bleu score for validation dataset: 21.93806547688664


In [39]:
# choice = random.randint(0, len(predicted_list_nopad)-1)
# print(predicted_list_nopad[choice])
# print(label_list[choice])

 <sos> But if we them them it , if they , they they the the the the and and , , they they they they to . <eos>
 <sos> But when none of this is presented to them , if they re not shown how food affects the mind and the body , they blindly eat whatever the hell you put in front of them . <eos>


In [40]:
# predicted_list, attention_list = evaluate(test_loader, encoder2, decoder2)

In [41]:
# predicted_list_nopad = []
# for ii in range(len(predicted_list)):
#     line = ''
#     for jj in predicted_list[ii]:
#         if jj != '<pad>':
#             line = line + ' ' + jj
#     predicted_list_nopad.append(line)

In [42]:
# for iii in range(len(predicted_list_nopad)):
#     if predicted_list_nopad[iii][-5:] != '<eos>':
#         predicted_list_nopad[iii] = predicted_list_nopad[iii] + ' <eos>'

In [43]:
# label_list = []
# for iii in range(len(test_en_indexes_filtered)):
#     line = ''
#     for jjj in test_en_indexes_filtered[iii]:
#         line = line + ' ' + idx2words_ft_en[jjj]
#     label_list.append(line)

In [44]:
#print('bleu score for test dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)

bleu score for test dataset: 22.999052090280383


In [45]:
# choice = random.randint(0, len(predicted_list_nopad)-1)
# print(predicted_list_nopad[choice])
# print(label_list[choice])

 <sos> Let me show you a a picture . <eos>
 <sos> And let me show you a simple example . <eos>
