In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re  
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import os
from torch.utils.data import Dataset
from sacrebleu import corpus_bleu

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
path_zh = '/home/ys2542/Neural-Translation-System/iwslt-zh-en-processed/'
path_vi = '/home/ys2542/Neural-Translation-System/iwslt-vi-en-processed/'
ft_home = '/scratch/ys2542/NLP_FASTTEXT/'
model_path = '/scratch/ys2542/model/'

In [4]:
PAD_token = 0
UNK_token = 1
SOS_token = 2
EOS_token = 3

In [5]:
words_to_load = 100000 
EMBED_SIZE = 300

In [6]:
with open(ft_home + 'wiki-news-300d-1M.vec') as f:
    matrix_size = words_to_load + 4
    loaded_embeddings_ft_en = np.zeros((matrix_size, EMBED_SIZE))
    words_ft_en = {'<pad>': PAD_token, '<unk>': UNK_token, '<sos>': SOS_token, '<eos>': EOS_token,}
    idx2words_ft_en = {PAD_token: '<pad>', UNK_token: '<unk>', SOS_token: '<sos>', EOS_token: '<eos>'}
    ordered_words_ft_en = ['<pad>', '<unk>', '<sos>', '<eos>']
    
    loaded_embeddings_ft_en[0,:] = np.zeros(EMBED_SIZE)
    loaded_embeddings_ft_en[1,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_en[2,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_en[3,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    
    for i, line in enumerate(f):
        if i == 0:
            continue
        if i == words_to_load + 1: 
            break
        s = line.split()
        idx = i + 3
        loaded_embeddings_ft_en[idx, :] = np.asarray(s[1:])
        words_ft_en[s[0]] = idx
        idx2words_ft_en[idx] = s[0]
        ordered_words_ft_en.append(s[0])

In [7]:
with open(ft_home + 'cc.zh.300.vec') as f:
    matrix_size = words_to_load + 4
    loaded_embeddings_ft_zh = np.zeros((matrix_size, EMBED_SIZE))
    words_ft_zh = {'<pad>': PAD_token, '<unk>': UNK_token, '<sos>': SOS_token, '<eos>': EOS_token,}
    idx2words_ft_zh = {PAD_token: '<pad>', UNK_token: '<unk>', SOS_token: '<sos>', EOS_token: '<eos>'}
    ordered_words_ft_zh = ['<pad>', '<unk>', '<sos>', '<eos>']
    
    loaded_embeddings_ft_zh[0,:] = np.zeros(EMBED_SIZE)
    loaded_embeddings_ft_zh[1,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_zh[2,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    loaded_embeddings_ft_zh[3,:] = np.random.uniform(-1.0, 1.0, EMBED_SIZE)
    
    for i, line in enumerate(f):
        if i == 0:
            continue
        if i == words_to_load + 1: 
            break
        s = line.split()
        idx = i + 3
        loaded_embeddings_ft_zh[idx, :] = np.asarray(s[1:])
        words_ft_zh[s[0]] = idx
        idx2words_ft_zh[idx] = s[0]
        ordered_words_ft_zh.append(s[0])

In [8]:
lines_zh_train = open(path_zh + 'train.tok.zh', encoding = 'utf-8').read().strip().split('\n')
lines_en_train = open(path_zh + 'train.tok.en', encoding = 'utf-8').read().strip().split('\n')

lines_zh_val = open(path_zh + 'dev.tok.zh', encoding = 'utf-8').read().strip().split('\n')
lines_en_val = open(path_zh + 'dev.tok.en', encoding = 'utf-8').read().strip().split('\n')

lines_zh_test = open(path_zh + 'test.tok.zh', encoding = 'utf-8').read().strip().split('\n')
lines_en_test = open(path_zh + 'test.tok.en', encoding = 'utf-8').read().strip().split('\n')

In [9]:
def clean_lines(lines, lang):
    data = []
    for line in lines:
        if line == '':
            line = ' '
        if lang == 'en':
            line = line.replace("&apos;", "").replace("&quot;", "")
        if line[-1] != ' ':
            line = line + ' '
       
        line = '<sos> ' + line + '<eos>'
        data.append(line)
    return data

In [10]:
train_zh = clean_lines(lines_zh_train, 'zh')
train_en = clean_lines(lines_en_train, 'en')

val_zh = clean_lines(lines_zh_val, 'zh')
val_en = clean_lines(lines_en_val, 'en')

test_zh = clean_lines(lines_zh_test, 'zh')
test_en = clean_lines(lines_en_test, 'en')   

In [11]:
def indexesFromSentence(data, lang):
    indexes = []
    for sentence in data:
        index = []
        for token in sentence.split():
            if lang == 'zh':
                try:
                    index.append(words_ft_zh[token])
                except KeyError:
                    index.append(UNK_token)
            elif lang == 'en':
                try:
                    index.append(words_ft_en[token])
                except KeyError:
                    index.append(UNK_token)
        indexes.append(index)
    return indexes

In [12]:
train_zh_indexes = indexesFromSentence(train_zh, 'zh')
train_en_indexes = indexesFromSentence(train_en, 'en')

val_zh_indexes = indexesFromSentence(val_zh, 'zh')
val_en_indexes = indexesFromSentence(val_en, 'en')

test_zh_indexes = indexesFromSentence(test_zh, 'zh')
test_en_indexes = indexesFromSentence(test_en, 'en')

In [13]:
length_zh = []
for line in train_zh_indexes:
        length_zh.append(len(line))
        
length_zh = sorted(length_zh)
MAX_LENGTH_ZH = length_zh[int(len(train_zh_indexes)*0.99)]
print(MAX_LENGTH_ZH)

69


In [14]:
length_en = []
for line in train_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(train_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

21


In [15]:
length_en = []
for line in val_zh_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(val_zh_indexes)*0.99)]
print(MAX_LENGTH_EN)

42


In [16]:
length_en = []
for line in val_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(val_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

40


In [17]:
length_en = []
for line in test_zh_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(test_zh_indexes)*0.99)]
print(MAX_LENGTH_EN)

11


In [18]:
length_en = []
for line in test_en_indexes:
        length_en.append(len(line))
        
length_zh = sorted(length_en)
MAX_LENGTH_EN = length_en[int(len(test_en_indexes)*0.99)]
print(MAX_LENGTH_EN)

12


In [19]:
MAX_LENGTH_ZH = 69
MAX_LENGTH_EN = 40

In [20]:
train_zh_indexes_filtered = []
train_en_indexes_filtered = []
for i in range(len(train_zh_indexes)):
    if len(train_zh_indexes[i]) <= MAX_LENGTH_ZH and len(train_en_indexes[i]) <= MAX_LENGTH_EN:
        train_zh_indexes_filtered.append(train_zh_indexes[i])
        train_en_indexes_filtered.append(train_en_indexes[i])

In [21]:
val_zh_indexes_filtered = []
val_en_indexes_filtered = []
for i in range(len(val_zh_indexes)):
    if len(val_zh_indexes[i]) <= MAX_LENGTH_ZH and len(val_en_indexes[i]) <= MAX_LENGTH_EN:
        val_zh_indexes_filtered.append(val_zh_indexes[i])
        val_en_indexes_filtered.append(val_en_indexes[i])

In [22]:
test_zh_indexes_filtered = []
test_en_indexes_filtered = []
for i in range(len(test_zh_indexes)):
    if len(test_zh_indexes[i]) <= MAX_LENGTH_ZH and len(test_en_indexes[i]) <= MAX_LENGTH_EN:
        test_zh_indexes_filtered.append(test_zh_indexes[i])
        test_en_indexes_filtered.append(test_en_indexes[i])

In [23]:
class VocabDataset(Dataset):
    def __init__(self, data_list1, data_list2):
        
        self.data_list1 = data_list1
        self.data_list2 = data_list2
        
        assert (len(self.data_list1) == len(self.data_list2))

    def __len__(self):
        return len(self.data_list1)
            
    def __getitem__(self, key):        
        return [self.data_list1[key], self.data_list2[key], len(self.data_list1[key]), len(self.data_list2[key])]

In [24]:
def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list1 = []
    data_list2 = []
    length_list1 = []
    length_list2 = []
    
    for datum in batch:
        length_list1.append(datum[2])
        length_list2.append(datum[3])
        
        padded_vec1 = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_LENGTH_ZH-datum[2])), 
                                mode="constant", constant_values=0)
        padded_vec2 = np.pad(np.array(datum[1]), 
                                pad_width=((0,MAX_LENGTH_EN-datum[3])), 
                                mode="constant", constant_values=0)
        
        data_list1.append(padded_vec1[:MAX_LENGTH_ZH])
        data_list2.append(padded_vec2[:MAX_LENGTH_EN])


    return [torch.from_numpy(np.array(data_list1)).cuda(), torch.from_numpy(np.array(data_list2)).cuda(),
                torch.LongTensor(length_list1).cuda(), torch.LongTensor(length_list2).cuda()]

In [25]:
BATCH_SIZE = 32

train_dataset = VocabDataset(train_zh_indexes_filtered, train_en_indexes_filtered)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = VocabDataset(val_zh_indexes_filtered, val_en_indexes_filtered)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

test_dataset = VocabDataset(test_zh_indexes_filtered, test_en_indexes_filtered)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=1,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

val_loader2 = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=1,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

In [26]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embed_size=EMBED_SIZE):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft_zh).float(), freeze=True)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)

    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [27]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, embed_size=EMBED_SIZE):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(loaded_embeddings_ft_en).float(), freeze=True)
        self.gru = nn.GRU(embed_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input)        
        output = F.relu(output)        
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

In [28]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    batch_size, input_length = input_tensor.size()
    _, target_length = target_tensor.size()
    
    encoder_hidden = encoder.initHidden(batch_size)
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
    
    loss = 0

    decoder_input = torch.tensor(np.array([[SOS_token]] * batch_size).reshape(1, batch_size), device=device)
    
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    if use_teacher_forcing:
        for di in range(target_length):
            
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
        
            
            loss += criterion(decoder_output, target_tensor[:,di])
            decoder_input = target_tensor[:,di].unsqueeze(0) 
            
    else:
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(
                decoder_input, decoder_hidden)
                        
            
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  
            decoder_input = decoder_input.unsqueeze(0)
 
            loss += criterion(decoder_output, target_tensor[:,di])
            

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [29]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [30]:
def trainIters(encoder, decoder, n_iters, print_every=100, plot_every=100, learning_rate=0.001):
    start = time.time()
    
    plot_losses_t = []
    print_loss_total_t = 0  
    plot_loss_total_t = 0  
    
    plot_losses_v = []
    print_loss_total_v = 0  
    plot_loss_total_v = 0 
    
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        for i, (data1, data2, length1, length2) in enumerate(train_loader):
            input_tensor = data1
            target_tensor = data2
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total_t += loss
            plot_loss_total_t += loss

            if i % print_every == 0:
                print_loss_avg = print_loss_total_t / print_every
                print_loss_total_t = 0
                print('Train %s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total_t / plot_every
                plot_losses_t.append(plot_loss_avg)
                plot_loss_total_t = 0
                
        for i, (data1, data2, length1, length2) in enumerate(val_loader):
            input_tensor = data1
            target_tensor = data2
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total_v += loss
            plot_loss_total_v += loss

            if i % print_every == 0:
                print_loss_avg = print_loss_total_v / print_every
                print_loss_total_v = 0
                print('Val %s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total_v / plot_every
                plot_losses_v.append(plot_loss_avg)
                plot_loss_total_v = 0
                
        torch.save(encoder.state_dict(), model_path + "encoder_rnn"+str(hidden_size)+str(iter)+".pth")
        torch.save(decoder.state_dict(), model_path + "decoder_rnn"+str(hidden_size)+str(iter)+".pth")

    return plot_losses_t, plot_losses_v

In [36]:
hidden_size = 300
encoder1 = EncoderRNN(hidden_size).to(device)
decoder1 = DecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

plotloss_t1, plotloss_v1 = trainIters(encoder1, decoder1, 8)

Train 0m 0s (- 0m 2s) (1 12%) 0.1152
Train 0m 23s (- 2m 47s) (1 12%) 4.1645
Train 0m 47s (- 5m 31s) (1 12%) 3.0805
Train 1m 10s (- 8m 15s) (1 12%) 3.0577
Train 1m 34s (- 10m 59s) (1 12%) 2.9110
Train 1m 57s (- 13m 45s) (1 12%) 2.8814
Train 2m 21s (- 16m 29s) (1 12%) 2.8007
Train 2m 44s (- 19m 14s) (1 12%) 2.7399
Train 3m 8s (- 21m 59s) (1 12%) 2.7419
Train 3m 32s (- 24m 44s) (1 12%) 2.6945
Train 3m 55s (- 27m 29s) (1 12%) 2.6910
Train 4m 19s (- 30m 14s) (1 12%) 2.6832
Train 4m 42s (- 32m 59s) (1 12%) 2.6682
Train 5m 6s (- 35m 45s) (1 12%) 2.6939
Train 5m 30s (- 38m 31s) (1 12%) 2.7058
Train 5m 53s (- 41m 16s) (1 12%) 2.6407
Train 6m 17s (- 44m 1s) (1 12%) 2.6068
Train 6m 41s (- 46m 47s) (1 12%) 2.6212
Train 7m 4s (- 49m 33s) (1 12%) 2.5993
Train 7m 28s (- 52m 18s) (1 12%) 2.5270
Train 7m 51s (- 55m 2s) (1 12%) 2.5155
Train 8m 15s (- 57m 48s) (1 12%) 2.5658
Train 8m 38s (- 60m 32s) (1 12%) 2.4874
Train 9m 2s (- 63m 17s) (1 12%) 2.5728
Train 9m 26s (- 66m 2s) (1 12%) 2.5148
Train 9m 49s 

Train 77m 11s (- 77m 11s) (4 50%) 2.0061
Train 77m 35s (- 77m 35s) (4 50%) 2.0553
Train 77m 58s (- 77m 58s) (4 50%) 2.0877
Train 78m 22s (- 78m 22s) (4 50%) 2.0159
Train 78m 45s (- 78m 45s) (4 50%) 1.9751
Train 79m 9s (- 79m 9s) (4 50%) 2.0879
Train 79m 33s (- 79m 33s) (4 50%) 2.0783
Train 79m 56s (- 79m 56s) (4 50%) 2.0105
Train 80m 20s (- 80m 20s) (4 50%) 2.0449
Train 80m 43s (- 80m 43s) (4 50%) 1.9733
Train 81m 7s (- 81m 7s) (4 50%) 2.1212
Train 81m 31s (- 81m 31s) (4 50%) 2.1075
Train 81m 54s (- 81m 54s) (4 50%) 2.0083
Train 82m 18s (- 82m 18s) (4 50%) 2.0472
Train 82m 42s (- 82m 42s) (4 50%) 2.1188
Train 83m 5s (- 83m 5s) (4 50%) 2.0813
Train 83m 29s (- 83m 29s) (4 50%) 2.0989
Train 83m 53s (- 83m 53s) (4 50%) 2.0296
Train 84m 16s (- 84m 16s) (4 50%) 2.0695
Train 84m 40s (- 84m 40s) (4 50%) 2.0571
Train 85m 3s (- 85m 3s) (4 50%) 2.0400
Train 85m 27s (- 85m 27s) (4 50%) 2.0776
Train 85m 51s (- 85m 51s) (4 50%) 2.0540
Train 86m 14s (- 86m 14s) (4 50%) 2.0614
Train 86m 38s (- 86m 38s

Train 153m 40s (- 21m 57s) (7 87%) 1.9350
Train 154m 4s (- 22m 0s) (7 87%) 1.9187
Train 154m 28s (- 22m 4s) (7 87%) 1.8831
Train 154m 52s (- 22m 7s) (7 87%) 1.9611
Train 155m 15s (- 22m 10s) (7 87%) 1.8745
Train 155m 39s (- 22m 14s) (7 87%) 1.8598
Train 156m 3s (- 22m 17s) (7 87%) 1.9812
Train 156m 27s (- 22m 21s) (7 87%) 1.9965
Train 156m 50s (- 22m 24s) (7 87%) 1.9167
Train 157m 14s (- 22m 27s) (7 87%) 1.9191
Train 157m 38s (- 22m 31s) (7 87%) 1.9325
Train 158m 2s (- 22m 34s) (7 87%) 1.9726
Train 158m 26s (- 22m 38s) (7 87%) 1.8684
Train 158m 50s (- 22m 41s) (7 87%) 1.9682
Train 159m 13s (- 22m 44s) (7 87%) 1.8859
Train 159m 37s (- 22m 48s) (7 87%) 1.9781
Train 160m 1s (- 22m 51s) (7 87%) 1.9053
Train 160m 25s (- 22m 55s) (7 87%) 1.9375
Train 160m 49s (- 22m 58s) (7 87%) 1.9706
Train 161m 12s (- 23m 1s) (7 87%) 1.9506
Train 161m 36s (- 23m 5s) (7 87%) 1.8982
Train 162m 0s (- 23m 8s) (7 87%) 2.0102
Train 162m 24s (- 23m 12s) (7 87%) 1.9674
Train 162m 47s (- 23m 15s) (7 87%) 1.8127
Tra

In [37]:
hidden_size = 500
encoder2 = EncoderRNN(hidden_size).to(device)
decoder2 = DecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

plotloss_t2, plotloss_v2 = trainIters(encoder2, decoder2, 8)

Train 0m 0s (- 0m 2s) (1 12%) 0.1152
Train 0m 32s (- 3m 50s) (1 12%) 3.8326
Train 1m 5s (- 7m 38s) (1 12%) 3.1646
Train 1m 38s (- 11m 27s) (1 12%) 3.0939
Train 2m 10s (- 15m 16s) (1 12%) 2.9944
Train 2m 43s (- 19m 4s) (1 12%) 2.9968
Train 3m 16s (- 22m 53s) (1 12%) 2.8880
Train 3m 48s (- 26m 42s) (1 12%) 2.8758
Train 4m 21s (- 30m 32s) (1 12%) 2.7768
Train 4m 54s (- 34m 20s) (1 12%) 2.6888
Train 5m 27s (- 38m 10s) (1 12%) 2.7078
Train 5m 59s (- 41m 59s) (1 12%) 2.6730
Train 6m 32s (- 45m 46s) (1 12%) 2.6266
Train 7m 5s (- 49m 36s) (1 12%) 2.6702
Train 7m 37s (- 53m 25s) (1 12%) 2.6743
Train 8m 10s (- 57m 15s) (1 12%) 2.6176
Train 8m 43s (- 61m 2s) (1 12%) 2.5297
Train 9m 15s (- 64m 51s) (1 12%) 2.5838
Train 9m 48s (- 68m 42s) (1 12%) 2.6313
Train 10m 21s (- 72m 31s) (1 12%) 2.5707
Train 10m 54s (- 76m 21s) (1 12%) 2.5861
Train 11m 27s (- 80m 9s) (1 12%) 2.4847
Train 11m 59s (- 83m 58s) (1 12%) 2.5022
Train 12m 32s (- 87m 46s) (1 12%) 2.4826
Train 13m 5s (- 91m 35s) (1 12%) 2.4960
Train

Train 104m 40s (- 104m 40s) (4 50%) 2.0620
Train 105m 12s (- 105m 12s) (4 50%) 1.9694
Train 105m 45s (- 105m 45s) (4 50%) 1.9470
Train 106m 17s (- 106m 17s) (4 50%) 1.9949
Train 106m 49s (- 106m 49s) (4 50%) 1.9926
Train 107m 21s (- 107m 21s) (4 50%) 2.0671
Train 107m 54s (- 107m 54s) (4 50%) 1.9304
Train 108m 26s (- 108m 26s) (4 50%) 2.0938
Train 108m 58s (- 108m 58s) (4 50%) 2.0257
Train 109m 31s (- 109m 31s) (4 50%) 2.0039
Train 110m 3s (- 110m 3s) (4 50%) 2.0173
Train 110m 35s (- 110m 35s) (4 50%) 2.0363
Train 111m 8s (- 111m 8s) (4 50%) 2.0476
Train 111m 40s (- 111m 40s) (4 50%) 2.0954
Train 112m 13s (- 112m 13s) (4 50%) 2.0327
Train 112m 45s (- 112m 45s) (4 50%) 2.0231
Train 113m 17s (- 113m 17s) (4 50%) 2.0051
Train 113m 49s (- 113m 49s) (4 50%) 1.9303
Train 114m 22s (- 114m 22s) (4 50%) 2.0366
Train 114m 54s (- 114m 54s) (4 50%) 2.0439
Train 115m 26s (- 115m 26s) (4 50%) 2.0047
Train 115m 59s (- 115m 59s) (4 50%) 2.1280
Train 116m 31s (- 116m 31s) (4 50%) 2.0368
Train 117m 4s (

Train 207m 49s (- 29m 41s) (7 87%) 1.9455
Train 208m 22s (- 29m 46s) (7 87%) 1.9143
Train 208m 54s (- 29m 50s) (7 87%) 1.9548
Train 209m 26s (- 29m 55s) (7 87%) 1.8691
Train 209m 59s (- 29m 59s) (7 87%) 1.9505
Train 210m 31s (- 30m 4s) (7 87%) 1.9395
Train 211m 4s (- 30m 9s) (7 87%) 1.8710
Train 211m 36s (- 30m 13s) (7 87%) 1.9303
Train 212m 8s (- 30m 18s) (7 87%) 1.9222
Train 212m 41s (- 30m 23s) (7 87%) 1.9061
Train 213m 13s (- 30m 27s) (7 87%) 1.8539
Train 213m 45s (- 30m 32s) (7 87%) 1.9514
Train 214m 18s (- 30m 36s) (7 87%) 1.9776
Train 214m 50s (- 30m 41s) (7 87%) 1.8090
Train 215m 23s (- 30m 46s) (7 87%) 1.9295
Train 215m 55s (- 30m 50s) (7 87%) 1.8688
Train 216m 27s (- 30m 55s) (7 87%) 1.9834
Train 217m 0s (- 31m 0s) (7 87%) 1.9137
Train 217m 32s (- 31m 4s) (7 87%) 1.9559
Train 218m 5s (- 31m 9s) (7 87%) 1.9796
Train 218m 37s (- 31m 13s) (7 87%) 1.8479
Train 219m 9s (- 31m 18s) (7 87%) 1.8537
Train 219m 41s (- 31m 23s) (7 87%) 1.8348
Train 220m 14s (- 31m 27s) (7 87%) 1.8703
Tr

In [31]:
hidden_size = 300

encoder3 = EncoderRNN(hidden_size).to(device)
decoder3 = DecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

encoder3.load_state_dict(torch.load(model_path + "encoder_rnn3008.pth"))
decoder3.load_state_dict(torch.load(model_path + "decoder_rnn3008.pth"))



In [32]:
def evaluate(loader, encoder, decoder):
    decoded_words_list = []
    with torch.no_grad():
        for i, (data1, data2, length1, length2) in enumerate(loader):
            input_tensor = data1
            input_length = input_tensor.size()[0]
            
            encoder_hidden = encoder.initHidden(input_length)

            encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
            
            decoder_input = torch.tensor(np.array([[SOS_token]]), device=device)

            decoder_hidden = encoder_hidden

            decoded_words = []
            
            for di in range(MAX_LENGTH_EN):
                decoder_output, decoder_hidden = decoder(decoder_input.reshape(1,1), decoder_hidden)
                topv, topi = decoder_output.data.topk(1) 
                if topi.item() == EOS_token:
                    decoded_words.append('<eos>')
                    break
                else:
                    decoded_words.append(idx2words_ft_en[topi.item()])

                decoder_input = topi.squeeze().detach()
                decoder_input = decoder_input.unsqueeze(0)
            decoded_words_list.append(decoded_words)
        return decoded_words_list  

In [33]:
predicted_list = evaluate(val_loader2, encoder3, decoder3)

In [34]:
predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(val_en_indexes_filtered)):
    line = ''
    for jjj in val_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for validation dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)



bleu score for validation dataset: 6.15811860985386


In [35]:
choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])

> To order , you can change the soil . 
> To change the community , you have to change the composition of the soil . 


In [36]:
predicted_list = evaluate(test_loader, encoder3, decoder3)

In [37]:
predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(test_en_indexes_filtered)):
    line = ''
    for jjj in test_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for test dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)



bleu score for test dataset: 4.5332872379845055


In [44]:
choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])

> It s the new new of of .
> And try that again and do that for another generation . 


In [46]:
hidden_size = 500

encoder3 = EncoderRNN(hidden_size).to(device)
decoder3 = DecoderRNN(hidden_size, len(ordered_words_ft_zh)).to(device)

encoder3.load_state_dict(torch.load(model_path + "encoder_rnn5008.pth"))
decoder3.load_state_dict(torch.load(model_path + "decoder_rnn5008.pth"))

In [47]:
predicted_list = evaluate(val_loader2, encoder3, decoder3)

predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(val_en_indexes_filtered)):
    line = ''
    for jjj in val_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for validation dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)


choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])



bleu score for validation dataset: 6.710148859009111
> The Grounds Grounds is to be in the forest in the
> So Green Grounds has gone on to plant maybe 20 gardens . 


In [48]:
predicted_list = evaluate(test_loader, encoder3, decoder3)

predicted_list_nopad = []
for ii in range(len(predicted_list)):
    line = ''
    for jj in predicted_list[ii]:
        if jj != '<pad>':
            line = line + ' ' + jj
    predicted_list_nopad.append(line)

for iii in range(len(predicted_list_nopad)):
    if predicted_list_nopad[iii][-5:] == '<eos>':
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:-5]
    else:
        predicted_list_nopad[iii] = predicted_list_nopad[iii][5:]

label_list = []
for iii in range(len(test_en_indexes_filtered)):
    line = ''
    for jjj in test_en_indexes_filtered[iii]:
        line = line + ' ' + idx2words_ft_en[jjj]
    label_list.append(line[5:-5])

print('bleu score for test dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)

choice = random.randint(0, len(predicted_list_nopad)-1)
print(predicted_list_nopad[choice])
print(label_list[choice])



bleu score for test dataset: 4.273861517547
> Who can could bin Laden ? The <unk> <unk> ? 
> Who could have predicted Double Rainbow or Rebecca Black or <unk> Cat ? 


In [42]:
#predicted_list_nopad = []
#for ii in range(len(predicted_list)):
    #line = ''
    #for jj in predicted_list[ii]:
        #if jj != '<pad>':
            #line = line + ' ' + jj
    #predicted_list_nopad.append(line)

In [43]:
#for iii in range(len(predicted_list_nopad)):
    #if predicted_list_nopad[iii][-5:] != '<eos>':
        #predicted_list_nopad[iii] = predicted_list_nopad[iii] + ' <eos>'

In [44]:
# label_list = []
# for iii in range(len(val_en_indexes_filtered)):
#     line = ''
#     for jjj in val_en_indexes_filtered[iii]:
#         line = line + ' ' + idx2words_ft_en[jjj]
#     label_list.append(line)

In [49]:
#print('bleu score for validation dataset:',corpus_bleu(predicted_list_nopad, [label_list]).score)

In [50]:
# choice = random.randint(0, len(predicted_list_nopad)-1)
# print(predicted_list_nopad[choice])
# print(label_list[choice])

In [47]:
#predicted_list = evaluate(test_loader, encoder2, decoder2)

In [48]:
# predicted_list_nopad = []
# for ii in range(len(predicted_list)):
#     line = ''
#     for jj in predicted_list[ii]:
#         if jj != '<pad>':
#             line = line + ' ' + jj
#     predicted_list_nopad.append(line)

In [49]:
# for iii in range(len(predicted_list_nopad)):
#     if predicted_list_nopad[iii][-5:] != '<eos>':
#         predicted_list_nopad[iii] = predicted_list_nopad[iii] + ' <eos>'

In [50]:
# label_list = []
# for iii in range(len(test_en_indexes_filtered)):
#     line = ''
#     for jjj in test_en_indexes_filtered[iii]:
#         line = line + ' ' + idx2words_ft_en[jjj]
#     label_list.append(line)

In [52]:
#print('bleu score for test dataset:', corpus_bleu(predicted_list_nopad, [label_list]).score)

In [51]:
# choice = random.randint(0, len(predicted_list_nopad)-1)
# print(predicted_list_nopad[choice])
# print(label_list[choice])