In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pickle
import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import pdb

import matplotlib.pyplot as plt
plt.switch_backend('agg')
%matplotlib inline

from model_architectures import Encoder_RNN, Decoder_RNN
from data_prep import prepareData, tensorsFromPair
from inference import generate_translation
from misc import timeSince, load_cpickle_gc

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!export CUDA_LAUNCH_BLOCKING=1

#### Constants

In [3]:
BATCH_SIZE = 32
SOS_token = 0
EOS_token = 1
MAX_LENGTH = 30
teacher_forcing_ratio = 0.5

#### Data prep - train and val

In [None]:
input_lang, target_lang, train_pairs = prepareData(
    input_file = 'iwslt-vi-en-processed/train.vi', 
    target_file = 'iwslt-vi-en-processed/train.en', 
    input_lang = 'vi', 
    target_lang = 'en')

pickle.dump(train_pairs, open("train_vi_en_pairs", "wb"))
pickle.dump(input_lang, open("train_vi_lang", "wb"))
pickle.dump(target_lang, open("train_en_lang", "wb"))

_, _, val_pairs = prepareData(
    input_file = 'iwslt-vi-en-processed/dev.vi', 
    target_file = 'iwslt-vi-en-processed/dev.en',
    input_lang = 'vi', 
    target_lang = 'en')

pickle.dump(val_pairs, open("val_vi_en_pairs", "wb"))

In [None]:
train_idx_pairs = []
for x in train_pairs:
    indexed = list(tensorsFromPair(x, input_lang, target_lang))
    train_idx_pairs.append(indexed)

pickle.dump(train_idx_pairs, open("train_vi_en_idx_pairs", "wb"))

### Laboratory

In [4]:
import numpy as np
import torch
from torch.utils.data import Dataset

class LanguagePairDataset(Dataset):
    
    def __init__(self, sent_pairs): 
        # this is a list of sentences 
        self.sent_pairs_list = sent_pairs

    def __len__(self):
        return len(self.sent_pairs_list)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        sent1 = self.sent_pairs_list[key][0][:MAX_LENGTH]
        sent2 = self.sent_pairs_list[key][1][:MAX_LENGTH]
        return [sent1, sent2, len(sent1), len(sent2)]

def language_pair_dataset_collate_function(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    sent1_list = []
    sent1_length_list = []
    sent2_list = []
    sent2_length_list = []
    # padding
    for datum in batch:
        padded_vec_1 = np.pad(np.array(datum[0]).T.squeeze(), pad_width=((0,MAX_LENGTH-len(datum[0]))), 
                                mode="constant", constant_values=0)
        padded_vec_2 = np.pad(np.array(datum[1]).T.squeeze(), pad_width=((0,MAX_LENGTH-len(datum[1]))), 
                                mode="constant", constant_values=0)
        sent1_list.append(padded_vec_1)
        sent2_list.append(padded_vec_2)
        sent1_length_list.append(len(datum[0]))
        sent2_length_list.append(len(datum[1]))
    print(np.array(sent1_list).shape)
    return [torch.from_numpy(np.array(sent1_list)), torch.LongTensor(sent1_length_list), 
            torch.from_numpy(np.array(sent2_list)), torch.LongTensor(sent2_length_list)]

In [5]:
train_idx_pairs = load_cpickle_gc("train_vi_en_idx_pairs")
train_dataset = LanguagePairDataset(train_idx_pairs)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=BATCH_SIZE, 
                                           collate_fn=language_pair_dataset_collate_function,
                                           #shuffle=True
                                          )

In [None]:
for i, (sent1_list, sent1_lengths, sent2_list, sent2_lengths) in enumerate(train_loader):
    print(i)
    print(sent1_list.size())
    print(sent2_list.size())
    
    print(sent1_lengths.size())
    print(sent2_lengths.size())
    break
    
    

In [6]:
class Encoder_Batch_RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder_Batch_RNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        
    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

    def forward(self, sents, sent_lengths):
        '''
            sents is a tensor with the shape (batch_size, padded_length )
        '''
        batch_size = sents.size()[0]
        sent_lengths = list(sent_lengths)
        
        descending_lengths = [x for x, _ in sorted(zip(sent_lengths, range(len(sent_lengths))), reverse=True)]
        descending_indices = [x for _, x in sorted(zip(sent_lengths, range(len(sent_lengths))), reverse=True)]
        descending_lengths = np.array(descending_lengths)
        
        descending_sents = torch.index_select(sents, 0, torch.tensor(descending_indices).to(device))
        
        # get embedding
        embed = self.embedding(descending_sents)
        # pack padded sequence
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, descending_lengths, batch_first=True)
        
        # fprop though RNN
        self.hidden = self.init_hidden(batch_size)
        rnn_out, self.hidden = self.gru(embed, self.hidden)
        
        # change the order back
        change_it_back = [x for _, x in sorted(zip(descending_indices, range(len(descending_indices))))]
        self.hidden = torch.index_select(self.hidden, 1, torch.LongTensor(change_it_back).to(device)) 
        
        # **TODO**: What is rnn_out?
        return rnn_out, self.hidden

In [7]:
class Decoder_Batch_RNN(nn.Module):
    def __init__(self, output_size, hidden_size):
        super(Decoder_Batch_RNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

    def forward(self, sents, sent_lengths, hidden):
        
        batch_size = sents.size()[0]
        sent_lengths = list(sent_lengths)
        
        descending_lengths = [x for x, _ in sorted(zip(sent_lengths, range(len(sent_lengths))), reverse=True)]
        descending_indices = [x for _, x in sorted(zip(sent_lengths, range(len(sent_lengths))), reverse=True)]
        descending_lengths = np.array(descending_lengths)
        
        descending_sents = torch.index_select(sents, 0, torch.tensor(descending_indices).to(device))
        
        # get embedding
        embed = self.embedding(descending_sents)
        # pack padded sequence
        embed = torch.nn.utils.rnn.pack_padded_sequence(embed, descending_lengths, batch_first=True)
        
        # fprop though RNN
        self.hidden = hidden
        rnn_out, self.hidden = self.gru(embed, self.hidden)
        
        change_it_back = [x for _, x in sorted(zip(descending_indices, range(len(descending_indices))))]
        self.hidden = torch.index_select(self.hidden, 1, torch.LongTensor(change_it_back).to(device))
        rnn_out, _ = torch.nn.utils.rnn.pad_packed_sequence(rnn_out, batch_first=True)
        # rnn_out is batch_size x 28 x 256
                
        final_hidden = self.hidden
        final_hidden = final_hidden.view(final_hidden.size(1), final_hidden.size(0), -1)
        first_hidden = hidden
        first_hidden = first_hidden.view(first_hidden.size(1), first_hidden.size(0), -1)
        
        rnn_out = torch.cat((first_hidden, rnn_out, final_hidden), 1)
        
        
#         rnn_out = rnn_out.view(-1, rnn_out.size(2))
        
        output = self.softmax(self.out(rnn_out))
        # now output is the size 28 by 31257 (vocab size)
        return output, hidden

#### Training

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # example of input_tensor: [2, 43, 23, 9, 19, 4]. Indexed on our vocabulary. 
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    
     # iterate GRU over words --> final hidden state is representation of source sentence. 
    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
a = torch.randn(3, 4)
a[0][0]

The whole training process looks like this:

-  Start a timer
-  Initialize optimizers and criterion
-  Create set of training pairs
-  Start empty losses array for plotting

In [30]:
def trainIters(encoder, decoder, n_iters, n_epochs, lang1, lang2, print_every=1000, plot_every=100, learning_rate=0.001):
    """
    lang1 is the Lang object for language 1 
    Lang2 is the Lang object for language 2
    """
    pairs = load_cpickle_gc("train_"+lang1.name+"_"+lang2.name+"_pairs")
    validation_pairs = load_cpickle_gc("val_"+lang1.name+"_"+lang2.name+"_pairs")
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    encoder_optimizer = torch.optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = torch.optim.SGD(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.NLLLoss()
    
    for epoch in range(n_epochs):
        for step, (sent1s, sent1_lengths, sent2s, sent2_lengths) in enumerate(train_loader):
            encoder.train()
            sent1_batch, sent2_batch = sent1s.to(device), sent2s.to(device) 
            sent1_length_batch, sent2_length_batch = sent1_lengths.to(device), sent2_lengths.to(device)
            
            encoder_optimizer.zero_grad()
            outputs, encoder_hidden = encoder(sent1_batch, sent1_length_batch)
            
            encoder_hidden_batch = encoder_hidden
            decoder_hidden = encoder_hidden_batch
            
            decoder_input = torch.tensor([[SOS_token]], device=device)
            use_teacher_forcing = True
            
            loss = 0
            outputs, decoder_hidden = decoder(sent2_batch, sent2_length_batch, decoder_hidden)
            count = 0
            for i in range(len(sent2_batch)):
                l = sent2_length_batch[i]
                for j in range(l):
                    o = outputs[i][j].view(1, -1)
                    s = torch.tensor([sent2_batch[i][j]])
                    loss += criterion(o, s)
                    count += 1
                    
            
            print_loss_total += loss
            plot_loss_total += loss  
            
            if  (step+1) % print_every == 0:
                print_loss_avg = print_loss_total / (count*print_every)
                print_loss_total = 0
                print('TRAIN SCORE %s (%d %d%%) %.4f' % (timeSince(start, step / n_epochs),
                                             step, step / n_epochs * 100, print_loss_avg))
#                 val_loss = test_model(encoder, decoder, search, validation_pairs, lang1, max_length=MAX_LENGTH)
                # returns bleu score
#                 print("VALIDATION BLEU SCORE: "+str(val_loss))

            if step % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0

    showPlot(plot_losses)

In [31]:
hidden_size = 256
input_lang = load_cpickle_gc("train_vi_lang")
target_lang = load_cpickle_gc("train_en_lang")
encoder1 = Encoder_Batch_RNN(input_lang.n_words, hidden_size).to(device)
decoder1 = Decoder_Batch_RNN(target_lang.n_words, hidden_size).to(device)

num_iters = 10000

args = {
    'n_iters': 10000,
    'n_epochs': 4,
    'learning_rate': 0.001,
    'encoder': encoder1,
    'decoder': decoder1,
    'lang1': input_lang, 
    'lang2': target_lang,
    'print_every': 10
}

trainIters(**args)

(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
TRAIN SCORE 0m 6s (- -1m 56s) (9 225%) 3.1460
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
TRAIN SCORE 0m 13s (- -1m 49s) (19 475%) 3.5343
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)
(32, 30)


KeyboardInterrupt: 

In [10]:
# torch.save(encoder1.state_dict(), "encoder1_40000")
# torch.save(decoder1.state_dict(), "decoder1_40000")

pairs = load_cpickle_gc("train_"+input_lang.name+"_"+target_lang.name+"_pairs")

In [36]:
# hidden_size = 256
# encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
# encoder1.load_state_dict(torch.load("encoder1_40000", map_location='cpu'))
# decoder1 = DecoderRNN(target_lang.n_words, hidden_size).to(device)
# decoder1.load_state_dict(torch.load("decoder1_40000", map_location='cpu'))
encoder1.eval()
decoder1.eval()
evaluateRandomly(pairs, encoder1, decoder1, n = 1, strategy='greedy', k = 8)

> cam on ong a toi ay hom nay .
= thank you for coming here .


RuntimeError: Expected hidden size (1, 1, 256), got (1, 10, 256)

In [25]:
def evaluateRandomly(pairs, encoder, decoder, n=10, strategy="greedy", k = None, max_length = MAX_LENGTH):
    """
    Randomly select a sentence from the input dataset and try to produce its translation.
    """    
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = generate_translation(encoder, decoder, pair[0], search=strategy, k = k, max_length = max_length)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [34]:
def indexesFromSentence(lang, sentence):
    words = sentence.split(' ')
    indices = []
    for word in words:
        if lang.word2index.get(word) is not None:
            indices.append(lang.word2index[word])
        else:
            indices.append(1) # UNK_INDEX
    return indices

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def greedy_search(decoder, decoder_input, hidden, max_length):
    translation = []
    for i in range(max_length):
        next_word_softmax, hidden = decoder(decoder_input, hidden)
        best_idx = torch.max(next_word_softmax, 1)[1].squeeze().item()

        # convert idx to word
        best_word = target_lang.index2word[best_idx]
        translation.append(best_word)
        decoder_input = torch.tensor([[best_idx]], device=device)
        
        if best_word == 'EOS':
            break
    return translation


def beam_search(decoder, decoder_input, hidden, max_length, k):
    
    candidates = [(decoder_input, 0, hidden)]
    potential_candidates = []
    completed_translations = []

    # put a cap on the length of generated sentences
    for m in range(max_length):
        for c in candidates:
            # unpack the tuple
            c_sequence = c[0]
            c_score = c[1]
            c_hidden = c[2]
            # EOS token
            if c_sequence[-1] == 1:
                completed_translations.append((c_sequence, c_score))
                k = k - 1
            else:
                next_word_probs, hidden = decoder(c_sequence[-1], c_hidden)
                # in the worst-case, one sequence will have the highest k probabilities
                # so to save computation, only grab the k highest_probability from each candidate sequence
                top_probs, top_idx = torch.topk(next_word_probs, k)
                for i in range(len(top_probs[0])):
                    word = torch.from_numpy(np.array([top_idx[0][i]]).reshape(1, 1)).to(device)
                    new_score = c_score + top_probs[0][i]
                    potential_candidates.append((torch.cat((c_sequence, word)).to(device), new_score, hidden))

        candidates = sorted(potential_candidates, key= lambda x: x[1], reverse=True)[0:k] 
        potential_candidates = []

    completed = completed_translations + candidates
    completed = sorted(completed, key= lambda x: x[1], reverse=True)[0] 
    final_translation = []
    for x in completed[0]:
        final_translation.append(target_lang.index2word[x.squeeze().item()])
    return final_translation

def generate_translation(encoder, decoder, sentence, max_length, search="greedy", k = None):
    """ 
    @param max_length: the max # of words that the decoder can return
    @returns decoded_words: a list of words in target language
    """    
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        
        # encode the source sentence
        encoder_output, encoder_hidden = encoder(input_tensor, [input_length])

        # start decoding
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        decoder_hidden = encoder_hidden
        decoded_words = []
        
        if search == 'greedy':
            decoded_words = greedy_search(decoder, decoder_input, decoder_hidden, max_length)
        elif search == 'beam':
            if k == None:
                k = 2
            decoded_words = beam_search(decoder, decoder_input, decoder_hidden, max_length, k)  

        return decoded_words

In [None]:
def evaluate(encoder, decoder, sentence, search="greedy", max_length=MAX_LENGTH):
    """
    Function that generate translation.
    First, feed the source sentence into the encoder and obtain the hidden states from encoder.
    Secondly, feed the hidden states into the decoder and unfold the outputs from the decoder.
    Lastly, for each outputs from the decoder, collect the corresponding words in the target language's vocabulary.
    And collect the attention for each output words.
    @param encoder: the encoder network
    @param decoder: the decoder network
    @param sentence: string, a sentence in source language to be translated
    @param max_length: the max # of words that the decoder can return
    @output decoded_words: a list of words in target language
    @output decoder_attentions: a list of vector, each of which sums up to 1.0
    """    
    # process input sentence
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        # encode the source lanugage
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
        # decode the context vector
        decoder_hidden = encoder_hidden # decoder starts from the last encoding sentence
        # output of this function
        decoder_attentions = torch.zeros(max_length, max_length)
        
        if search == 'greedy':
            decoded_words = greedy_search(decoder, decoder_input, decoder_hidden, max_length)
        elif search == 'beam':
            decoded_words = beam_search(decoder, decoder_input, decoder_hidden, max_length)  
        return decoded_words

In [None]:
import sacrebleu
def calculate_bleu(predictions, labels):
	"""
	Only pass a list of strings 
	"""
	# tthis is ony with n_gram = 4

	bleu = sacrebleu.raw_corpus_bleu(predictions, [labels], .01).score
	return bleu

In [None]:
model_e = EncoderRNN(5551, 256)
model_e.load_state_dict(torch.load("encoder1_40000", map_location='cpu'))
model_e.eval()

In [None]:
model_d = DecoderRNN(19344, 256)
model_d.load_state_dict(torch.load("decoder1_40000", map_location='cpu'))
model_d.eval()

In [None]:
MAX_LENGTH = 100
def test_model(encoder, decoder, search, test_pairs, lang1, max_length=MAX_LENGTH):
    # for test, you only need the lang1 words to be tokenized,
    # lang2 words is the true labels
    encoder_inputs = [pair[0] for pair in test_pairs]
    true_labels = [pair[1] for pair in test_pairs]
    translated_predictions = []
    for i in range(len(encoder_inputs)): 
        if i% 100== 0:
            print(i)
        e_input = encoder_inputs[i]
        decoded_words = generate_translation(encoder, decoder, e_input, max_length=MAX_LENGTH)
        translated_predictions.append(" ".join(decoded_words))
    return calculate_bleu(translated_predictions, true_labels)
    

TODO: 
    
Yikes, teh decoder isn't' preforming very well. 

In [None]:
test_pairs = pickle.load(open("preprocessed_data_no_elmo/iwslt-vi-eng/preprocessed_no_indices_pairs_test", "rb"))

In [None]:
input_lang = pickle.load(open("preprocessed_data_no_elmo/iwslt-vi-eng/preprocessed_no_elmo_vilang", "rb"))

In [None]:
test_model(model_e, model_d, "greedy", test_pairs, input_lang )