In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pdb
from tqdm import tqdm
import pandas as pd
import pickle
import io
import os
import sacrebleu
from sacrebleu import raw_corpus_bleu, corpus_bleu

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
PAD_idx = 0
SOS_idx = 1
EOS_idx = 2
UNK_idx= 3
batch_size = 64
MAX_SENTENCE_LENGTH = 50

In [3]:
class language(object):
    def __init__(self, name, i2t, t2i, embedding_matrix, train, test, val):
        self.name = name
        self.idx2token = i2t
        self.token2idx = t2i
        self.embedding_mat = embedding_matrix
        self.train_idx = train
        self.test_idx = test
        self.val_idx = val
        
# dataset = pickle.load(open("../data/zh1.1w-en6k.p", 'rb'))
dataset = pickle.load(open("../data/vi1.1w-en6k.p", 'rb'))


In [4]:
class LanguageDataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, source_lan, translate_lan):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.source_lan = source_lan
        self.translate_lan = translate_lan
        
        assert (len(self.source_lan) == len(self.translate_lan))

    def __len__(self):
        return len(self.source_lan)

    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        source_lan_idx = self.source_lan[key][:MAX_SENTENCE_LENGTH-1]
        translation_lan_idx = self.translate_lan[key][:MAX_SENTENCE_LENGTH-1]
        source_lan_idx.append(EOS_idx)
        translation_lan_idx.append(EOS_idx)
        
        return [source_lan_idx, translation_lan_idx, len(source_lan_idx), len(translation_lan_idx)]

In [5]:
def vocab_collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    source_list = []
    translate_list = []
    length_list = []
    
    # padding
    for datum in batch:

        length_list.append(datum[2])
        s_padded_vec = np.pad(np.array(datum[0]),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=PAD_idx)
        source_list.append(s_padded_vec)
        t_padded_vec = np.pad(np.array(datum[1]),
                                pad_width=((0,MAX_SENTENCE_LENGTH-datum[3])),
                                mode="constant", constant_values=PAD_idx)
        translate_list.append(t_padded_vec)
        
#     ind_dec_order = np.argsort(length_list)[::-1]
#     source_list = np.array(source_list)[ind_dec_order]
#     length_list = np.array(length_list)[ind_dec_order]
#     translate_list = np.array(translate_list)[ind_dec_order]
    
    if torch.cuda.is_available and torch.has_cudnn:
        return [torch.from_numpy(np.array(source_list)).cuda(),torch.from_numpy(np.array(translate_list)).cuda()]
    else:
        return [torch.from_numpy(np.array(source_list)),torch.from_numpy(np.array(translate_list))]

In [6]:
train_dataset = LanguageDataset(dataset['src'].train_idx, dataset['tgt'].train_idx)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           collate_fn=vocab_collate_func,
                                           shuffle=True)

val_dataset = LanguageDataset(dataset['src'].val_idx, dataset['tgt'].val_idx)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=batch_size,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

test_dataset = LanguageDataset(dataset['src'].test_idx, dataset['tgt'].test_idx)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                           batch_size=batch_size,
                                           collate_fn=vocab_collate_func,
                                           shuffle=False)

In [7]:
class DotProductAttention(nn.Module):
    def __init__(self, sqrt_dim, attn_dropout=0.1):
        super().__init__()
        self.sqrt_dim = sqrt_dim
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, query, k, v, mask=None):
#         Dot product in batch
        attn = torch.bmm(query, k.transpose(1, 2))
#         Rescaling
        attn = attn / self.sqrt_dim  #scale to make sure the gradient could be sufficiently bp
#         Masking
        if mask is not None:
            attn = attn.masked_fill(mask, -np.inf)
#         Normalize attention
        attn = self.softmax(attn)
# ------------ Notice dropout was added -------------
        attn = self.dropout(attn)
    
        output = torch.bmm(attn, v)

        return output, attn

In [8]:
# class EncoderCNN(nn.Module):
#     def __init__(self, emb_dim, hid_dim=64):
#         super(EncoderCNN, self).__init__()
# #         self.hid_dim = hid_dim
#         self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(dataset['src'].embedding_mat), freeze=False)
#         self.dropout = nn.Dropout(p=0.5)
# #         self.gru = nn.GRU(emb_dim, hid_dim, batch_first=True, bidirectional=True)
#         self.conv1 = nn.Conv1d(emb_dim, 8*hid_dim, 7, 1, padding=3)
#         self.bn1   = nn.BatchNorm1d(8*hid_dim)
#         self.conv2 = nn.Conv1d(8*hid_dim, 8*hid_dim, 7, 1, padding=3)
#         self.bn2   = nn.BatchNorm1d(8*hid_dim)
#         self.conv3 = nn.Conv1d(8*hid_dim, 8*hid_dim, 7, 2, padding=3)
#         self.bn3   = nn.BatchNorm1d(8*hid_dim)
#         self.conv4 = nn.Conv1d(8*hid_dim, 8*hid_dim, 7, 1, padding=3)
#         self.bn4   = nn.BatchNorm1d(8*hid_dim)
#         self.conv5 = nn.Conv1d(8*hid_dim, 8*hid_dim, 7, 1, padding=3)
#         self.bn5   = nn.BatchNorm1d(8*hid_dim)
        

#     def forward(self, inputs):
#         encode_batch_size, length = inputs.size()
#         embedded = self.dropout(self.embedding(inputs).float()) # the size -1 is inferred from other dimensions
# #         pdb.set_trace()
#         embedded = embedded.transpose(2,1)
#         low_out = self.bn1(F.relu(self.conv1(embedded)))
#         hidden = self.bn2(F.relu(self.conv2(low_out))+low_out)
#         hidden = self.bn3(F.relu(self.conv3(hidden)))
#         hidden = self.bn4(F.relu(self.conv4(hidden))+hidden)
#         hidden = self.bn5(self.conv5(hidden)+hidden)
#         hidden = torch.mean(hidden, dim=-1, keepdim=True)
# #         pdb.set_trace()
#         return low_out.transpose(2,1), hidden.permute(2,0,1)

In [9]:
class EncoderCNN(nn.Module):
    def __init__(self, emb_dim, hid_dim=64):
        super(EncoderCNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(dataset['src'].embedding_mat), freeze=False)
        self.dropout = nn.Dropout(p=0.5)
#         self.conv_pre = nn.Conv1d(emb_dim, 2*hid_dim, 7, 1, padding=3)
        self.conv1 = nn.Conv1d(emb_dim, 2*hid_dim, 7, 1, padding=3)
        self.conv2 = nn.Conv1d(2*hid_dim, 2*hid_dim, 5, 2, padding=2)
        self.conv3 = nn.Conv1d(2*hid_dim, 2*hid_dim, 3, 2, padding=0)
        self.conv4 = nn.Conv1d(2*hid_dim, 2*hid_dim, 4, 2, padding=0)
        self.conv5 = nn.Conv1d(2*hid_dim, 2*hid_dim, 5, 1, padding=0)
        

    def forward(self, inputs):
        encode_batch_size, length = inputs.size()
        embedded = self.dropout(self.embedding(inputs).float())
        embedded = embedded.transpose(2,1)
        low_out = self.dropout(F.relu(self.conv1(embedded)))
        hidden = self.dropout(F.relu(self.conv2(low_out)))
        hidden = self.dropout(F.relu(self.conv3(hidden)))
        hidden = self.dropout(F.relu(self.conv4(hidden)))
        hidden = self.conv5(hidden) 
        return low_out.transpose(2,1), hidden.permute(2,0,1)

In [10]:
class DecoderRNN(nn.Module):
    def __init__(self, emb_dim, hid_dim, output_dim, dropout_rate=0.3):
        super(DecoderRNN, self).__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        
        self.dropout = nn.Dropout(p=dropout_rate)
        
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(dataset['tgt'].embedding_mat), freeze=False)
        
        self.mapping = nn.Linear(self.emb_dim+self.hid_dim, self.hid_dim)
#         self.attn_combine = nn.Linear(self.hid_dim + self.emb_dim, self.emb_dim)
        self.dot_product_att = DotProductAttention(sqrt_dim=emb_dim)
        
        self.gru = nn.GRU(emb_dim+hid_dim, hid_dim)
        self.out = nn.Linear(hid_dim, output_dim)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, hidden, encoder_outputs):
        
        emb = self.dropout(self.embedding(inputs).float())
        
        attn_keys = self.mapping(torch.cat((emb, hidden), dim=2))
        
#         attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        
        output, attention = self.dot_product_att(attn_keys[0].unsqueeze(0).transpose(0,1), encoder_outputs, encoder_outputs)

#         output = torch.cat((emb[0], attn_applied.squeeze(1)), 1)
#         output = self.attn_combine(output).unsqueeze(0)
#         output = F.relu(output)
        
#         pdb.set_trace()
        output, hidden = self.gru(torch.cat((emb, output.transpose(0,1)), dim=2), hidden)
        output = self.softmax(self.out(output[0]))
        
        return output, hidden, attention


In [11]:
teacher_forcing_ratio = 1
#input_tensor: list of sentence tensor
def train(source, translate, encoder, decoder, encoder_optimizer, decoder_optimizer,
          criterion):
    encoder.train()
    decoder.train()
    
    cur_batch_size, input_length = source.size()
    cur_batch_size, target_length = translate.size()
    
    
#     encoder_hidden = encoder.init_hidden(cur_batch_size)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    encoder_output, encoder_hidden = encoder(source)

    decoder_input = torch.tensor(np.array([[SOS_idx]]*cur_batch_size).reshape(1,cur_batch_size),device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for i in range(target_length):
        
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_output)
            
            loss += criterion(decoder_output, translate[:,i])
            decoder_input = translate[:,i].unsqueeze(0)  # Teacher forcing
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_attention= decoder(decoder_input, decoder_hidden, encoder_output)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            
            #decoder_input [1, batch size] 
            decoder_input = decoder_input.unsqueeze(0)
 
            loss += criterion(decoder_output, translate[:,i])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [12]:
def compute_corpus_bleu(output_words, target_words):
    output_sentence = [' '.join(parser(words)) for words in output_words]
    target_sentence = [' '.join(parser(words)) for words in target_words]
    
#     ref_sent_list = [[i.split()] for i in target_sentence]
#     candidate_sent_list = [i.split() for i in output_sentence]
    ref_sent_list = [[i for i in target_sentence]]
    candidate_sent_list = [i for i in output_sentence]
#     pdb.set_trace()
#     score = corpus_bleu(ref_sent_list, candidate_sent_list, smoothing_function=chencherry.method1)
    score = raw_corpus_bleu(candidate_sent_list, ref_sent_list)
    
    return score

In [13]:
def evaluate2(encoder, decoder, loader):
    encoder.eval()
    decoder.eval()
    input_words = []
    target_words = []
    decoded_words = []
    num_count = 0
    num_count = 0
    for i, (source, translate) in enumerate(loader):
        
        if i > 20:
            break
        
        cur_batch_size = translate.size()[0]
            
#         with torch.no_grad():
#             encoder_hidden = encoder.init_hidden(cur_batch_size)
        
        input_tensor = source
        target_tensor = translate
        target_length = target_tensor.size()[1]
        
        encoder_output, encoder_hidden = encoder(input_tensor)

        decoder_input = torch.tensor(np.array([[SOS_idx]]*cur_batch_size).reshape(1,cur_batch_size),device=device)
        decoder_hidden = encoder_hidden
        
        for i in range(cur_batch_size):
            decoded_words.append([])
            input_words.append([])
            target_words.append([])
        
        for i in range(cur_batch_size):
            for ii in range(input_tensor.size()[1]):
                if dataset['src'].idx2token[input_tensor.cpu().numpy()[i,ii]] != '<PAD>':
                    input_words[num_count].append(dataset['src'].idx2token[input_tensor.cpu().numpy()[i,ii]])
            num_count += 1
        num_count -= cur_batch_size
        for i in range(cur_batch_size):
            for ii in range(target_tensor.size()[1]):
                if dataset['tgt'].idx2token[target_tensor.cpu().numpy()[i,ii]] != '<PAD>':
                    target_words[num_count].append(dataset['tgt'].idx2token[target_tensor.cpu().numpy()[i,ii]])
            num_count += 1
        num_count -= cur_batch_size      
        
        cur_len = np.zeros(cur_batch_size, dtype=int)
        #pdb.set_trace()
        for i in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_output)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            decoder_input = decoder_input.unsqueeze(0)
            
            topi = topi.squeeze().cpu().numpy()
            if cur_len[0] == 0:
                for i in range(len(topi)):
                    decoded_words[num_count+i].append(dataset['tgt'].idx2token[topi[i]])
                    cur_len[i] += 1
            
            else:
                for i in range(len(topi)):
                    if decoded_words[num_count+i][cur_len[i]-1] == '<EOS>':
                        continue
                    decoded_words[num_count+i].append(dataset['tgt'].idx2token[topi[i]])
                    cur_len[i] += 1
        num_count += cur_batch_size
    pre_list = []
    for pre_sentenc in decoded_words:
        pre_list.append("".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in pre_sentenc]).strip())

    true_list = []
    for true_sentenc in target_words:
        true_list.append("".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in true_sentenc]).strip())
    
    true_list2 = []
    true_list2.append(true_list)        
    score = sacrebleu.corpus_bleu(pre_list, true_list2)
    print('bleu score: ', score.score)
    return decoded_words,input_words, target_words, score

In [14]:
def trainIters(encoder, decoder, n_iters, print_every=800, plot_every=200, learning_rate=0.001):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    
    best_score = 0
    best_past = 0
    
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss(ignore_index = 0)
    count_iter = 0
    for cur_iter in range(1, n_iters + 1):
        for i, (source, translate) in tqdm(enumerate(train_loader) ,total=len(train_loader)):
            
            loss = train(source, translate, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss
            count_iter += 1
            
            if count_iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
#             if count_iter % print_every == 0:
        
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('%d %d%% %.4f' % (cur_iter, cur_iter / n_iters * 100, print_loss_avg))

        print('Train: ')
        _,_,_, score = evaluate2(encoder, decoder, train_loader)
        with open('out4.txt', 'a') as file:
            file.write('small train: ' + str(score.score) + '\n')

        print('validation: ')
        _,_,_, score = evaluate2(encoder, decoder, val_loader)
        with open('out4.txt', 'a') as file:
            file.write('validation: ' + str(score.score) + '\n')
            
        if score.score > best_score:
            best_past = 0
            best_score = score.score
        else:
#             learning_rate /= 5
#             encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
#             decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
            best_past += 1
        
            

        encoder_chkpt = encoder.state_dict()
        decoder_chkpt = decoder.state_dict()
        chkpt = {
            'encoder': encoder_chkpt,
            'decoder': decoder_chkpt,
            'epoch': cur_iter
        }
                
        model_name = './chkpt/conv_att_vi'+'_accu_{accu:3.3f}.chkpt'.format(accu=score.score)
        torch.save(chkpt, model_name)
        
        if best_past > 4:
            break
                

In [15]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    plt.show()
    
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [16]:
ENC_EMB_DIM = 300
VI_EMB_DIM = 300
CONV_HID_DIM = 256
HID_DIM = 256
OUTPUT_DIM = len(dataset['tgt'].token2idx)


encoder = EncoderCNN(ENC_EMB_DIM, CONV_HID_DIM).to(device)
decoder = DecoderRNN(VI_EMB_DIM, HID_DIM*2, OUTPUT_DIM).to(device)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# chkpt = torch.load("./chkpt/conv_att_ch_accu_1.517.chkpt")

# encoder.load_state_dict(chkpt['encoder'])
# decoder.load_state_dict(chkpt['decoder'])

# print ('success in loading from pretrained')

trainIters(encoder, decoder, 60)


100%|██████████| 2084/2084 [07:17<00:00,  5.14it/s]


1 1% 11.7820
Train: 
bleu score:  15.061024883166462
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  14.484976631267727


100%|██████████| 2084/2084 [07:14<00:00,  5.07it/s]


2 3% 10.2187
Train: 
bleu score:  14.964027880293228
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  14.975729718783354


100%|██████████| 2084/2084 [07:25<00:00,  5.12it/s]


3 5% 9.3464
Train: 
bleu score:  15.279716279084202
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  15.28007109179704


100%|██████████| 2084/2084 [07:25<00:00,  4.61it/s]


4 6% 8.6386
Train: 
bleu score:  18.2834048082564
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  17.358035164411092


100%|██████████| 2084/2084 [07:25<00:00,  5.11it/s]


5 8% 8.0855
Train: 
bleu score:  18.815586942490448
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  18.62723702770037


100%|██████████| 2084/2084 [07:23<00:00,  5.57it/s]


6 10% 7.6406
Train: 
bleu score:  20.292531304812524
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  18.589302181504713


100%|██████████| 2084/2084 [07:25<00:00,  5.05it/s]


7 11% 7.3054
Train: 
bleu score:  21.568634457597483
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  20.856311578269317


100%|██████████| 2084/2084 [07:25<00:00,  5.22it/s]


8 13% 7.0438
Train: 
bleu score:  21.912543629438762
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  20.65989065390471


100%|██████████| 2084/2084 [07:25<00:00,  5.05it/s]


9 15% 6.8221
Train: 
bleu score:  22.218984544484336
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  21.199546294905456


100%|██████████| 2084/2084 [07:24<00:00,  4.90it/s]


10 16% 6.6555
Train: 
bleu score:  22.199219093573458
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  21.20398097970649


100%|██████████| 2084/2084 [07:25<00:00,  5.06it/s]


11 18% 6.5162
Train: 
bleu score:  22.30245746331308
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  19.490866051570908


100%|██████████| 2084/2084 [07:25<00:00,  5.18it/s]


12 20% 6.4093
Train: 
bleu score:  23.16324400766306
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  21.56431746643927


100%|██████████| 2084/2084 [07:24<00:00,  5.23it/s]


13 21% 6.2990
Train: 
bleu score:  23.744560218111644
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  21.918387306956316


100%|██████████| 2084/2084 [07:24<00:00,  5.22it/s]


14 23% 6.2179
Train: 
bleu score:  23.91132490913671
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  21.442226090034325


100%|██████████| 2084/2084 [07:20<00:00,  5.07it/s]


15 25% 6.1374
Train: 
bleu score:  22.077023313827304
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  20.044509318203808


100%|██████████| 2084/2084 [07:13<00:00,  5.93it/s]


16 26% 6.0744
Train: 
bleu score:  24.3412892091887
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  22.134169119469593


100%|██████████| 2084/2084 [07:14<00:00,  5.98it/s]


17 28% 6.0300
Train: 
bleu score:  24.273377866253952
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  22.356843599657328


100%|██████████| 2084/2084 [07:24<00:00,  5.16it/s]


18 30% 5.9797
Train: 
bleu score:  24.277146694191035
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  21.770227644617098


100%|██████████| 2084/2084 [07:24<00:00,  5.20it/s]


19 31% 5.9563
Train: 
bleu score:  23.787021702600327
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  22.0688955777237


100%|██████████| 2084/2084 [07:23<00:00,  5.30it/s]


20 33% 5.9348
Train: 
bleu score:  24.319494072904437
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  22.257669722422996


100%|██████████| 2084/2084 [07:25<00:00,  5.09it/s]


21 35% 5.8758
Train: 
bleu score:  24.823406367177814
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  22.46881395788667


100%|██████████| 2084/2084 [07:24<00:00,  5.25it/s]


22 36% 5.8500
Train: 
bleu score:  24.722329598745947
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  21.673493508130996


100%|██████████| 2084/2084 [07:13<00:00,  5.70it/s]


23 38% 5.9140
Train: 
bleu score:  23.841404071880856
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  22.79512083834368


100%|██████████| 2084/2084 [07:12<00:00,  5.59it/s]


24 40% 5.8190
Train: 
bleu score:  25.09616921951
validation: 


  0%|          | 0/2084 [00:00<?, ?it/s]

bleu score:  22.682886286549852


 14%|█▍        | 293/2084 [01:02<06:42,  4.45it/s]

KeyboardInterrupt: 

### Evaluate best ch-en model

In [22]:
chkpt = torch.load("./chkpt/conv_att_ch_accu_18.380.chkpt")
dataset = pickle.load(open("../data/zh1.1w-en6k.p", 'rb'))
encoder.load_state_dict(chkpt['encoder'])
decoder.load_state_dict(chkpt['decoder'])

print ('success in loading from pretrained')
_ = evaluate2(encoder, decoder, test_loader)

success in loading from pretrained
bleu score:  20.56176219575077


### Evaluate the best VI-EN model
- noticed the validation score from the name is smaller since it was evaluated with raw score

In [19]:
dataset = pickle.load(open("../data/vi1.1w-en6k.p", 'rb'))

chkpt = torch.load("./chkpt/conv_att_vi_accu_22.795.chkpt")

encoder.load_state_dict(chkpt['encoder'])
decoder.load_state_dict(chkpt['decoder'])

print ('success in loading from pretrained')
_ = evaluate2(encoder, decoder, test_loader)

success in loading from pretrained
bleu score:  23.817502880117285
