In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re  
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import os
from torch.utils.data import Dataset
from sacrebleu import corpus_bleu

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
path_zh = '/home/ys2542/Neural-Translation-System/iwslt-zh-en-processed/'
path_vi = '/home/ys2542/Neural-Translation-System/iwslt-vi-en-processed/'
ft_home = '/scratch/ys2542/NLP_FASTTEXT/'
model_path = '/scratch/ys2542/model/'

In [4]:
PAD_token = 0
UNK_token = 1
SOS_token = 2
EOS_token = 3

In [5]:
words_to_load = 100000 
EMBED_SIZE = 300

In [6]:
with open(ft_home + 'wiki-news-300d-1M.vec') as f:
    matrix_size = words_to_load + 4
    loaded_embeddings_ft_en = np.zeros((matrix_size, EMBED_SIZE))
    words_ft_en = {'<pad>': PAD_token, '<unk>': UNK_token, '<sos>': SOS_token, '<eos>': EOS_token,}
    idx2words_ft_en = {PAD_token: '<pad>', UNK_token: '<unk>', SOS_token: '<sos>', EOS_token: '<eos>'}
    ordered_words_ft_en = ['<pad>', '<unk>', '<sos>', '<eos>']
    
    loaded_embeddings_ft_en[0,:] = np.zeros(EMBED_SIZE)
    loaded_embeddings_ft_en[1,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_en[2,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_en[3,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    
    for i, line in enumerate(f):
        if i == 0:
            continue
        if i == words_to_load + 1: 
            break
        s = line.split()
        idx = i + 3
        loaded_embeddings_ft_en[idx, :] = np.asarray(s[1:])
        words_ft_en[s[0]] = idx
        idx2words_ft_en[idx] = s[0]
        ordered_words_ft_en.append(s[0])

In [7]:
with open(ft_home + 'cc.vi.300.vec') as f:
    matrix_size = words_to_load + 4
    loaded_embeddings_ft_zh = np.zeros((matrix_size, EMBED_SIZE))
    words_ft_zh = {'<pad>': PAD_token, '<unk>': UNK_token, '<sos>': SOS_token, '<eos>': EOS_token,}
    idx2words_ft_zh = {PAD_token: '<pad>', UNK_token: '<unk>', SOS_token: '<sos>', EOS_token: '<eos>'}
    ordered_words_ft_zh = ['<pad>', '<unk>', '<sos>', '<eos>']
    
    loaded_embeddings_ft_zh[0,:] = np.zeros(EMBED_SIZE)
    loaded_embeddings_ft_zh[1,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_zh[2,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_zh[3,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    
    for i, line in enumerate(f):
        if i == 0:
            continue
        if i == words_to_load + 1: 
            break
        s = line.split()
        idx = i + 3
        loaded_embeddings_ft_zh[idx, :] = np.asarray(s[1:])
        words_ft_zh[s[0]] = idx
        idx2words_ft_zh[idx] = s[0]
        ordered_words_ft_zh.append(s[0])

In [8]:
lines_zh_train = open(path_vi + 'train.tok.vi', encoding = 'utf-8').read().strip().split('\n')
lines_en_train = open(path_vi + 'train.tok.en', encoding = 'utf-8').read().strip().split('\n')

lines_zh_val = open(path_vi + 'dev.tok.vi', encoding = 'utf-8').read().strip().split('\n')
lines_en_val = open(path_vi + 'dev.tok.en', encoding = 'utf-8').read().strip().split('\n')

lines_zh_test = open(path_vi + 'test.tok.vi', encoding = 'utf-8').read().strip().split('\n')
lines_en_test = open(path_vi + 'test.tok.en', encoding = 'utf-8').read().strip().split('\n')

In [9]:
def clean_lines(lines, lang):
    data = []
    for line in lines:
        if line == '':
            line = ' '
        if lang == 'en':
            line = line.replace("&apos;", "").replace("&quot;", "")
        if line[-1] != ' ':
            line = line + ' '
       
        line = '<sos> ' + line + '<eos>'
        data.append(line)
    return data

In [10]:
train_zh = clean_lines(lines_zh_train, 'vi')
train_en = clean_lines(lines_en_train, 'en')

val_zh = clean_lines(lines_zh_val, 'vi')
val_en = clean_lines(lines_en_val, 'en')

test_zh = clean_lines(lines_zh_test, 'vi')
test_en = clean_lines(lines_en_test, 'en')   

In [11]:
def indexesFromSentence(data, lang):
    indexes = []
    for sentence in data:
        index = []
        for token in sentence.split():
            if lang == 'vi':
                try:
                    index.append(words_ft_zh[token])
                except KeyError:
                    index.append(UNK_token)
            elif lang == 'en':
                try:
                    index.append(words_ft_en[token])
                except KeyError:
                    index.append(UNK_token)
        indexes.append(index)
    return indexes

In [12]:
train_zh_indexes = indexesFromSentence(train_zh, 'vi')
train_en_indexes = indexesFromSentence(train_en, 'en')

val_zh_indexes = indexesFromSentence(val_zh, 'vi')
val_en_indexes = indexesFromSentence(val_en, 'en')

test_zh_indexes = indexesFromSentence(test_zh, 'vi')
test_en_indexes = indexesFromSentence(test_en, 'en')

In [13]:
length_zh = []
for line in train_zh_indexes:
        length_zh.append(len(line))
        
length_zh = sorted(length_zh)
MAX_LENGTH_ZH = length_zh[int(len(train_zh_indexes)*0.99)]
print(MAX_LENGTH_ZH)

74


In [14]:
length_en = []
for line in train_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(train_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

39


In [15]:
length_en = []
for line in val_zh_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(val_zh_indexes)*0.99)]
print(MAX_LENGTH_EN)

27


In [16]:
length_en = []
for line in val_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(val_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

25


In [17]:
length_en = []
for line in test_zh_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(test_zh_indexes)*0.99)]
print(MAX_LENGTH_EN)

33


In [18]:
length_en = []
for line in test_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(test_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

36


In [19]:
MAX_LENGTH_ZH = 74
MAX_LENGTH_EN = 39

In [20]:
train_zh_indexes_filtered = []
train_en_indexes_filtered = []
for i in range(len(train_zh_indexes)):
    if len(train_zh_indexes[i]) <= MAX_LENGTH_ZH and len(train_en_indexes[i]) <= MAX_LENGTH_EN:
        train_zh_indexes_filtered.append(train_zh_indexes[i])
        train_en_indexes_filtered.append(train_en_indexes[i])

In [21]:
val_zh_indexes_filtered = []
val_en_indexes_filtered = []
for i in range(len(val_zh_indexes)):
    if len(val_zh_indexes[i]) <= MAX_LENGTH_ZH and len(val_en_indexes[i]) <= MAX_LENGTH_EN:
        val_zh_indexes_filtered.append(val_zh_indexes[i])
        val_en_indexes_filtered.append(val_en_indexes[i])

In [22]:
test_zh_indexes_filtered = []
test_en_indexes_filtered = []
for i in range(len(test_zh_indexes)):
    if len(test_zh_indexes[i]) <= MAX_LENGTH_ZH and len(test_en_indexes[i]) <= MAX_LENGTH_EN:
        test_zh_indexes_filtered.append(test_zh_indexes[i])
        test_en_indexes_filtered.append(test_en_indexes[i])

In [23]:
class VocabDataset(Dataset):
    def __init__(self, data_list1, data_list2):
        
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        
        assert (len(self.data_list1) == len(self.data_list2))

    def __len__(self):
        return len(self.data_list1)
            
    def __getitem__(self, key):        
        return [self.data_list1[key], self.data_list2[key], len(self.data_list1[key]), len(self.data_list2[key])]

In [24]:
def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list1 = []
    data_list2 = []
    length_list1 = []
    length_list2 = []
    
    for datum in batch:
        length_list1.append(datum[2])
        length_list2.append(datum[3])
        
        padded_vec1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_LENGTH_ZH-datum[2])), 
                                mode="constant", constant_values=0)
        padded_vec2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_LENGTH_EN-datum[3])), 
                                mode="constant", constant_values=0)
        
        data_list1.append(padded_vec1[:MAX_LENGTH_ZH])
        data_list2.append(padded_vec2[:MAX_LENGTH_EN])


    return [torch.from_numpy(np.array(data_list1)).cuda(), torch.from_numpy(np.array(data_list2)).cuda(),
                torch.LongTensor(length_list1).cuda(), torch.LongTensor(length_list2).cuda()]

In [25]:
BATCH_SIZE = 32

train_dataset = VocabDataset(train_zh_indexes_filtered, train_en_indexes_filtered)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(val_zh_indexes_filtered, val_en_indexes_filtered)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

test_dataset = VocabDataset(test_zh_indexes_filtered, test_en_indexes_filtered)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=1,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

val_loader2 = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=1,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

In [26]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embed_size=EMBED_SIZE):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft_zh).float(), freeze=True)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [27]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, embed_size=EMBED_SIZE):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft_en).float(), freeze=True)
        self.gru = nn.GRU(embed_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input)        
        output = F.relu(output)        
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [28]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    batch_size, input_length = input_tensor.size()
    _, target_length = target_tensor.size()
    
    encoder_hidden = encoder.initHidden(batch_size)
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
    
    loss = 0

    decoder_input = torch.tensor(np.array([[SOS_token]] * batch_size).reshape(1, batch_size), device=device)
    
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for di in range(target_length):
            
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        
            
            loss += criterion(decoder_output, target_tensor[:,di])
            decoder_input = target_tensor[:,di].unsqueeze(0) 
            
    else:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
                        
            
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  
            decoder_input = decoder_input.unsqueeze(0)
 
            loss += criterion(decoder_output, target_tensor[:,di])
            

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [29]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [30]:
def trainIters(encoder, decoder, n_iters, print_every=100, plot_every=100, learning_rate=0.001):
    start = time.time()
    
    plot_losses_t = []
    print_loss_total_t = 0  
    plot_loss_total_t = 0  
    
    plot_losses_v = []
    print_loss_total_v = 0  
    plot_loss_total_v = 0 
    
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        for i, (data1, data2, length1, length2) in enumerate(train_loader):
            input_tensor = data1
            target_tensor = data2
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total_t += loss
            plot_loss_total_t += loss

            if i % print_every == 0:
                print_loss_avg = print_loss_total_t / print_every
                print_loss_total_t = 0
                print('Train %s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total_t / plot_every
                plot_losses_t.append(plot_loss_avg)
                plot_loss_total_t = 0
                
        for i, (data1, data2, length1, length2) in enumerate(val_loader):
            input_tensor = data1
            target_tensor = data2
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total_v += loss
            plot_loss_total_v += loss

            if i % print_every == 0:
                print_loss_avg = print_loss_total_v / print_every
                print_loss_total_v = 0
                print('Val %s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total_v / plot_every
                plot_losses_v.append(plot_loss_avg)
                plot_loss_total_v = 0
                
        torch.save(encoder.state_dict(), model_path + "encoder_rnn_vi"+str(hidden_size)+str(iter)+".pth")
        torch.save(decoder.state_dict(), model_path + "decoder_rnn_vi"+str(hidden_size)+str(iter)+".pth")

    return plot_losses_t, plot_losses_v

In [32]:
hidden_size = 300
encoder1 = EncoderRNN(hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

plotloss_t1, plotloss_v1 = trainIters(encoder1, decoder1, 8)

Train 0m 0s (- 0m 1s) (1 12%) 0.1152
Train 0m 23s (- 2m 44s) (1 12%) 4.2711
Train 0m 46s (- 5m 26s) (1 12%) 3.0601
Train 1m 9s (- 8m 6s) (1 12%) 2.9433
Train 1m 32s (- 10m 49s) (1 12%) 2.8984
Train 1m 55s (- 13m 30s) (1 12%) 2.7955
Train 2m 19s (- 16m 13s) (1 12%) 2.7729
Train 2m 42s (- 18m 55s) (1 12%) 2.8186
Train 3m 5s (- 21m 37s) (1 12%) 2.7191
Train 3m 28s (- 24m 20s) (1 12%) 2.7052
Train 3m 51s (- 27m 3s) (1 12%) 2.6997
Train 4m 15s (- 29m 46s) (1 12%) 2.6596
Train 4m 38s (- 32m 29s) (1 12%) 2.6835
Train 5m 1s (- 35m 11s) (1 12%) 2.6197
Train 5m 24s (- 37m 54s) (1 12%) 2.6206
Train 5m 48s (- 40m 36s) (1 12%) 2.6064
Train 6m 11s (- 43m 19s) (1 12%) 2.5944
Train 6m 34s (- 46m 0s) (1 12%) 2.5607
Train 6m 57s (- 48m 43s) (1 12%) 2.6318
Train 7m 20s (- 51m 26s) (1 12%) 2.5954
Train 7m 44s (- 54m 9s) (1 12%) 2.5433
Train 8m 7s (- 56m 51s) (1 12%) 2.4916
Train 8m 30s (- 59m 31s) (1 12%) 2.4823
Train 8m 53s (- 62m 14s) (1 12%) 2.4884
Train 9m 16s (- 64m 56s) (1 12%) 2.5114
Train 9m 39s (

Train 76m 5s (- 25m 21s) (6 75%) 2.0720
Train 76m 28s (- 25m 29s) (6 75%) 2.0964
Train 76m 51s (- 25m 37s) (6 75%) 2.0872
Train 77m 14s (- 25m 44s) (6 75%) 2.0781
Train 77m 37s (- 25m 52s) (6 75%) 2.0175
Train 78m 1s (- 26m 0s) (6 75%) 2.1078
Train 78m 24s (- 26m 8s) (6 75%) 2.0770
Train 78m 47s (- 26m 15s) (6 75%) 2.1405
Train 79m 10s (- 26m 23s) (6 75%) 2.0168
Train 79m 33s (- 26m 31s) (6 75%) 2.0141
Train 79m 57s (- 26m 39s) (6 75%) 2.0717
Train 80m 20s (- 26m 46s) (6 75%) 2.0565
Train 80m 43s (- 26m 54s) (6 75%) 2.0573
Train 81m 6s (- 27m 2s) (6 75%) 2.1475
Train 81m 29s (- 27m 9s) (6 75%) 2.1027
Train 81m 52s (- 27m 17s) (6 75%) 2.0725
Train 82m 16s (- 27m 25s) (6 75%) 2.0610
Train 82m 39s (- 27m 33s) (6 75%) 2.0887
Train 83m 2s (- 27m 40s) (6 75%) 2.0542
Train 83m 25s (- 27m 48s) (6 75%) 2.0959
Train 83m 49s (- 27m 56s) (6 75%) 2.0902
Train 84m 12s (- 28m 4s) (6 75%) 2.0471
Train 84m 35s (- 28m 11s) (6 75%) 2.0896
Train 84m 58s (- 28m 19s) (6 75%) 2.0791
Train 85m 21s (- 28m 27s)

In [37]:
hidden_size = 500
encoder2 = EncoderRNN(hidden_size).to(device)
decoder2 = DecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

plotloss_t2, plotloss_v2 = trainIters(encoder2, decoder2, 8)

Train 0m 0s (- 0m 2s) (1 12%) 0.1152
Train 0m 31s (- 3m 43s) (1 12%) 3.9394
Train 1m 3s (- 7m 24s) (1 12%) 3.0295
Train 1m 35s (- 11m 5s) (1 12%) 2.9601
Train 2m 6s (- 14m 47s) (1 12%) 2.8919
Train 2m 38s (- 18m 28s) (1 12%) 2.8505
Train 3m 9s (- 22m 9s) (1 12%) 2.8239
Train 3m 41s (- 25m 50s) (1 12%) 2.7827
Train 4m 12s (- 29m 30s) (1 12%) 2.6437
Train 4m 44s (- 33m 12s) (1 12%) 2.7586
Train 5m 16s (- 36m 52s) (1 12%) 2.6074
Train 5m 47s (- 40m 34s) (1 12%) 2.7056
Train 6m 19s (- 44m 14s) (1 12%) 2.6324
Train 6m 50s (- 47m 55s) (1 12%) 2.6892
Train 7m 22s (- 51m 36s) (1 12%) 2.5968
Train 7m 54s (- 55m 18s) (1 12%) 2.6748
Train 8m 25s (- 59m 1s) (1 12%) 2.6606
Train 8m 57s (- 62m 42s) (1 12%) 2.5748
Train 9m 28s (- 66m 22s) (1 12%) 2.5242
Train 10m 0s (- 70m 2s) (1 12%) 2.5468
Train 10m 32s (- 73m 44s) (1 12%) 2.5934
Train 11m 3s (- 77m 26s) (1 12%) 2.5389
Train 11m 35s (- 81m 8s) (1 12%) 2.6184
Train 12m 7s (- 84m 49s) (1 12%) 2.5038
Train 12m 38s (- 88m 31s) (1 12%) 2.5652
Train 13m 

Train 102m 54s (- 34m 18s) (6 75%) 2.0337
Train 103m 26s (- 34m 28s) (6 75%) 2.0574
Train 103m 58s (- 34m 39s) (6 75%) 2.0058
Train 104m 29s (- 34m 49s) (6 75%) 2.0270
Train 105m 0s (- 35m 0s) (6 75%) 1.9555
Train 105m 32s (- 35m 10s) (6 75%) 2.0961
Train 106m 4s (- 35m 21s) (6 75%) 2.0446
Train 106m 35s (- 35m 31s) (6 75%) 2.0102
Train 107m 7s (- 35m 42s) (6 75%) 1.9834
Train 107m 38s (- 35m 52s) (6 75%) 2.0136
Train 108m 10s (- 36m 3s) (6 75%) 2.0164
Train 108m 42s (- 36m 14s) (6 75%) 2.1412
Train 109m 13s (- 36m 24s) (6 75%) 2.0226
Train 109m 45s (- 36m 35s) (6 75%) 2.0583
Train 110m 16s (- 36m 45s) (6 75%) 2.0187
Train 110m 48s (- 36m 56s) (6 75%) 2.0442
Train 111m 19s (- 37m 6s) (6 75%) 2.0345
Train 111m 51s (- 37m 17s) (6 75%) 2.1157
Train 112m 22s (- 37m 27s) (6 75%) 2.0506
Train 112m 54s (- 37m 38s) (6 75%) 2.0149
Train 113m 26s (- 37m 48s) (6 75%) 2.0951
Train 113m 57s (- 37m 59s) (6 75%) 2.0859
Train 114m 29s (- 38m 9s) (6 75%) 2.0463
Train 115m 0s (- 38m 20s) (6 75%) 2.1172


In [33]:
hidden_size = 300

encoder3 = EncoderRNN(hidden_size).to(device)
decoder3 = DecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

encoder3.load_state_dict(torch.load(model_path + "encoder_rnn_vi3008.pth"))
decoder3.load_state_dict(torch.load(model_path + "decoder_rnn_vi3008.pth"))

In [32]:
def evaluate(loader, encoder, decoder):
    decoded_words_list = []
    with torch.no_grad():
        for i, (data1, data2, length1, length2) in enumerate(loader):
            input_tensor = data1
            input_length = input_tensor.size()[0]
            
            encoder_hidden = encoder.initHidden(input_length)

            encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
            
            decoder_input = torch.tensor(np.array([[SOS_token]]), device=device)

            decoder_hidden = encoder_hidden

            decoded_words = []
            
            for di in range(MAX_LENGTH_EN):
                decoder_output, decoder_hidden = decoder(decoder_input.reshape(1,1), decoder_hidden)
                topv, topi = decoder_output.data.topk(1) 
                if topi.item() == EOS_token:
                    decoded_words.append('<eos>')
                    break
                else:
                    decoded_words.append(idx2words_ft_en[topi.item()])

                decoder_input = topi.squeeze().detach()
                decoder_input = decoder_input.unsqueeze(0)
            decoded_words_list.append(decoded_words)
        return decoded_words_list  

In [35]:
predicted_list = evaluate(val_loader2, encoder3, decoder3)

In [36]:
predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(val_en_indexes_filtered)):
    line = ''
    for jjj in val_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for validation dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)



bleu score for validation dataset: 6.547676119621894


In [41]:
#predicted_list_nopad = []
#for ii in range(len(predicted_list)):
    #line = ''
    #for jj in predicted_list[ii]:
        #if jj != '<pad>':
            #line = line + ' ' + jj
    #predicted_list_nopad.append(line)

In [42]:
#for iii in range(len(predicted_list_nopad)):
    #if predicted_list_nopad[iii][-5:] != '<eos>':
        #predicted_list_nopad[iii] = predicted_list_nopad[iii] + ' <eos>'

In [43]:
#label_list = []
#for iii in range(len(val_en_indexes_filtered)):
    #line = ''
    #for jjj in val_en_indexes_filtered[iii]:
        #line = line + ' ' + idx2words_ft_en[jjj]
    #label_list.append(line)

In [54]:
#print('bleu score for validation dataset:',corpus_bleu(predicted_list_nopad, [label_list]).score)

In [44]:
choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])

> I was a woman , and I was a in in . . 
> I m a child of 1984 , and I live in the city of Berlin . 


In [46]:
predicted_list = evaluate(test_loader, encoder3, decoder3)

In [47]:
predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(test_en_indexes_filtered)):
    line = ''
    for jjj in test_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for test dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)



bleu score for test dataset: 4.875237288827215


In [47]:
#predicted_list_nopad = []
#for ii in range(len(predicted_list)):
    #line = ''
    #for jj in predicted_list[ii]:
        #if jj != '<pad>':
            #line = line + ' ' + jj
    #predicted_list_nopad.append(line)

In [48]:
#for iii in range(len(predicted_list_nopad)):
    #if predicted_list_nopad[iii][-5:] != '<eos>':
        #predicted_list_nopad[iii] = predicted_list_nopad[iii] + ' <eos>'

In [49]:
#label_list = []
#for iii in range(len(test_en_indexes_filtered)):
    #line = ''
    #for jjj in test_en_indexes_filtered[iii]:
        #line = line + ' ' + idx2words_ft_en[jjj]
    #label_list.append(line)

In [53]:
#print('bleu score for test dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)

In [52]:
choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])

> You can a your your and and you you you and and you . .
> You can put a knob in between and now you ve made a little dimmer . 


In [55]:
hidden_size = 500

encoder4 = EncoderRNN(hidden_size).to(device)
decoder4 = DecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

encoder4.load_state_dict(torch.load(model_path + "encoder_rnn_vi5008.pth"))
decoder4.load_state_dict(torch.load(model_path + "decoder_rnn_vi5008.pth"))

In [56]:
predicted_list = evaluate(val_loader2, encoder4, decoder4)

In [57]:
predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(val_en_indexes_filtered)):
    line = ''
    for jjj in val_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for validation dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)



bleu score for validation dataset: 8.158234021900313


In [58]:
choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])

> In India , there are a few people , and and , , , , , , , , , , , , ,
> And there are only <unk> young black and Latino men in New York , so for them , it s not a question of , Will I get stopped ? 


In [59]:
predicted_list = evaluate(test_loader, encoder4, decoder4)

In [60]:
predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(test_en_indexes_filtered)):
    line = ''
    for jjj in test_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for test dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)



bleu score for test dataset: 5.092250995456274


In [61]:
choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])

> I think I will think more more more . 
> I think I might have actually heard more hands . 
