# Building an RNN for Machine Translation
## Initial Data Work

In this file we will read in the data for the Vietnamese and Chinese to Engish corpuses, build a token2id and char2id mapping, vocabularies and data loaders

### Building an RNN for Machine Translation
Initial Data Work

In [1]:
#Import Modules
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch import optim
import pickle as pkl
import random
import csv
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.models import KeyedVectors
import io
from collections import Counter
import sacrebleu

random.seed(123)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#Global Variables
PAD_IDX = 0
UNK_IDX = 1
SOS_IDX = 2
EOS_IDX = 3
BATCH_SIZE = 64

## Loading in pre-trained fasttext embeddings for the three languages
### Building loaded embeddings, token2id, id2token and ordered words for all languages

In [2]:
# Load Pre-trained Word Vectors
def load_embeddings(word2vec, word2id, embedding_dim):
    embeddings = np.zeros((len(word2id), embedding_dim))
    for word, index in word2id.items():
        try:
            embeddings[index] = word2vec[word]

        except KeyError:
            embeddings[index] = np.random.normal(scale=0.6, size=(300,))

    return embeddings


def load_vectors(fname, num_vecs=None):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))

        if num_vecs is None:
            pass
        else:
            if len(data) + 1 > num_vecs:
                break

    return data


def data_dictionary(tokens, vocab_size_limit):
    token_counter = Counter()
    for token in tokens:
        token_counter[token] += 1

    vocab, count = zip(*token_counter.most_common(vocab_size_limit))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(4, 4 + len(vocab))))
    id2token = ['<pad>', '<unk>','<sos>','<eos>'] + id2token
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    token2id['<sos>'] = SOS_IDX
    token2id['<eos>'] = EOS_IDX
    return token2id, id2token

#Load Word Embeddings

path= '../pretrained_embeddings/'

en_loaded_embeddings = load_vectors(path +'wiki.en.vec',400000)
print("Total number of words embedded is {:,d}".format(len(en_loaded_embeddings)))

vi_loaded_embeddings = load_vectors(path+'wiki.vi.vec',400000)
print("Total number of words embedded is {:,d}".format(len(vi_loaded_embeddings)))

zh_loaded_embeddings = load_vectors(path+'wiki.zh.vec',400000)
print("Total number of words embedded is {:,d}".format(len(zh_loaded_embeddings)))


#Create token2Id, token2Id
en_token2id, en_id2token = data_dictionary(list(en_loaded_embeddings.keys()), 400000)
vi_token2id, vi_id2token = data_dictionary(list(vi_loaded_embeddings.keys()), 400000)
zh_token2id, zh_id2token = data_dictionary(list(zh_loaded_embeddings.keys()), 400000)


#Create Emedding Matrix
embedding_size = 300
en_loaded_embeddings=load_embeddings(en_loaded_embeddings,en_token2id,embedding_size)
vi_loaded_embeddings=load_embeddings(vi_loaded_embeddings,vi_token2id,embedding_size)
zh_loaded_embeddings=load_embeddings(zh_loaded_embeddings,zh_token2id,embedding_size)

Total number of words embedded is 400,000
Total number of words embedded is 292,168
Total number of words embedded is 332,647


## Loading in training, validation and test sets

In [3]:
#Loading in the Vietnamese -> En datasets
path1= '../project_data/en-vi/'

train_vi_en = []
with open(path1 +'train.tok.en') as inputfile:
    for line in inputfile:
        train_vi_en.append(line.strip().lower().split(' '))

train_vi_vi = []
with open(path1 + 'train.tok.vi') as inputfile:
    for line in inputfile:
        train_vi_vi.append(line.strip().lower().split(' '))

val_vi_en = []
with open(path1 + 'dev.tok.en') as inputfile:
    for line in inputfile:
        val_vi_en.append(line.strip().lower().split(' '))

val_vi_vi = []
with open(path1 +'dev.tok.vi') as inputfile:
    for line in inputfile:
        val_vi_vi.append(line.strip().lower().split(' '))
        
test_vi_en = []
with open(path1 +'test.tok.en') as inputfile:
    for line in inputfile:
        test_vi_en.append(line.strip().lower().split(' '))

test_vi_vi = []
with open(path1 + 'test.tok.vi') as inputfile:
    for line in inputfile:
        test_vi_vi.append(line.strip().lower().split(' '))
        
        
        
#Loading in the Chinese -> En datasets
path1= '../project_data/en-zh/'

train_zh_en = []
with open(path1 +'train.tok.en') as inputfile:
    for line in inputfile:
        train_zh_en.append(line.strip().lower().split(' '))

train_zh_zh = []
with open(path1 + 'train.tok.zh') as inputfile:
    for line in inputfile:
        train_zh_zh.append(line.strip().lower().split(' '))

val_zh_en = []
with open(path1 + 'dev.tok.en') as inputfile:
    for line in inputfile:
        val_zh_en.append(line.strip().lower().split(' '))

val_zh_zh = []
with open(path1 +'dev.tok.zh') as inputfile:
    for line in inputfile:
        val_zh_zh.append(line.strip().lower().split(' '))
        
test_zh_en = []
with open(path1 +'test.tok.en') as inputfile:
    for line in inputfile:
        test_zh_en.append(line.strip().lower().split(' '))

test_zh_zh = []
with open(path1 + 'test.tok.zh') as inputfile:
    for line in inputfile:
        test_zh_zh.append(line.strip().lower().split(' '))
    
    
#Sanity Checking
print("Vi -> En | Training Examples: "+str(len(train_vi_en)))
print("Vi -> En | Training Examples: "+str(len(train_vi_vi)), '\n')

print("Vi -> En | Validation Examples: "+str(len(val_vi_en)))
print("Vi -> En | Validation Examples: "+str(len(val_vi_vi)), '\n')

print("Vi -> En | Testing Examples: "+str(len(test_vi_en)))
print("Vi -> En | Testing Examples: "+str(len(test_vi_vi)), '\n')

print("Zh -> En | Training Examples: "+str(len(train_zh_en)))
print("Zh -> En | Training Examples: "+str(len(train_zh_zh)), '\n')

print("Zh -> En | Validation Examples: "+str(len(val_zh_en)))
print("Zh -> En | Validation Examples: "+str(len(val_zh_zh)), '\n')

print("Zh -> En | Testing Examples: "+str(len(test_zh_en)))
print("Zh -> En | Testing Examples: "+str(len(test_zh_zh)), '\n')

Vi -> En | Training Examples: 133317
Vi -> En | Training Examples: 133317 

Vi -> En | Validation Examples: 1268
Vi -> En | Validation Examples: 1268 

Vi -> En | Testing Examples: 1553
Vi -> En | Testing Examples: 1553 

Zh -> En | Training Examples: 213377
Zh -> En | Training Examples: 213377 

Zh -> En | Validation Examples: 1261
Zh -> En | Validation Examples: 1261 

Zh -> En | Testing Examples: 1397
Zh -> En | Testing Examples: 1397 



In [4]:
#preserve original data for evaluation
train_vi_en_orig = train_vi_en
train_vi_vi_orig = train_vi_vi
val_vi_en_orig = val_vi_en
val_vi_vi_orig = val_vi_vi
test_vi_en_orig = test_vi_en
test_vi_vi_orig = test_vi_vi

train_zh_en_orig = train_zh_en
train_zh_zh_orig = train_zh_zh
val_zh_en_orig = val_zh_en
val_zh_zh_orig = val_zh_zh
test_zh_en_orig = test_zh_en
test_zh_zh_orig = test_zh_zh

### Encoding our data

In [5]:
VI_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_vi_en+train_vi_vi], 90))+1
ZH_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_zh_en+train_zh_zh], 90))+1

In [6]:
def encoding_tokens(sentence, language, translator):
    if language== 'English':
        token2id = en_token2id
    elif language== 'Vietnamese':
        token2id = vi_token2id
    elif language== 'Chinese':
        token2id = zh_token2id
    tokens = [token2id[token] if token in token2id else UNK_IDX for token in sentence]
    if translator == 'vi':
        max_len = VI_EN_MAX_LENGTH-1
    elif translator == 'zh':
        max_len = ZH_EN_MAX_LENGTH-1
    tokens=tokens[:max_len]
    return tokens

def encoding_dataset(dataset, language, translator):
    data = [encoding_tokens(tokens, language, translator) for tokens in dataset] 
    return data

In [7]:
train_vi_en = encoding_dataset(train_vi_en, 'English', 'vi')
train_vi_vi = encoding_dataset(train_vi_vi, 'Vietnamese', 'vi')
test_vi_en = encoding_dataset(test_vi_en, 'English', 'vi')
test_vi_vi = encoding_dataset(test_vi_vi, 'Vietnamese', 'vi')
val_vi_en = encoding_dataset(val_vi_en, 'English', 'vi')
val_vi_vi = encoding_dataset(val_vi_vi, 'Vietnamese', 'vi')

train_zh_en = encoding_dataset(train_zh_en, 'English', 'zh')
train_zh_zh = encoding_dataset(train_zh_zh, 'Chinese', 'zh')
test_zh_en = encoding_dataset(test_zh_en, 'English', 'zh')
test_zh_zh = encoding_dataset(test_zh_zh, 'Chinese', 'zh')
val_zh_en = encoding_dataset(val_zh_en, 'English', 'zh')
val_zh_zh = encoding_dataset(val_zh_zh, 'Chinese', 'zh')

## Building Data Loaders

In [8]:
class translationDataset(Dataset):
    def __init__(self, data_list, target_list):
        self.data_list=data_list
        self.target_list=target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        data = self.data_list[key][:MAX_SAMPLE_LENGTH]
        label = self.target_list[key][:MAX_SAMPLE_LENGTH]
        return [data, len(data), label, len(label)]

def translation_collate_func(batch):
    data_list = []
    label_list = []
    for datum in batch:
        padded_data = np.pad(np.array(datum[0]+[EOS_IDX]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_data)
        padded_label = np.pad(np.array(datum[2]+[EOS_IDX]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[3])), 
                                mode="constant", constant_values=0)
        label_list.append(padded_label)
    return [torch.from_numpy(np.array(data_list)), torch.from_numpy(np.array(label_list))]

In [9]:
# VI -> EN | dataloaders
MAX_SAMPLE_LENGTH = VI_EN_MAX_LENGTH

vi_en_train_dataset = translationDataset(train_vi_vi, train_vi_en)
vi_en_train_loader = torch.utils.data.DataLoader(dataset=vi_en_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_val_dataset = translationDataset(val_vi_vi, val_vi_en)
vi_en_val_loader = torch.utils.data.DataLoader(dataset=vi_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_test_dataset = translationDataset(test_vi_vi, test_vi_en)
vi_en_test_loader = torch.utils.data.DataLoader(dataset=vi_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

In [10]:
# ZH -> EN | dataloaders
MAX_SAMPLE_LENGTH = ZH_EN_MAX_LENGTH

zh_en_train_dataset = translationDataset(train_zh_zh, train_zh_en)
zh_en_train_loader = torch.utils.data.DataLoader(dataset=zh_en_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

zh_en_val_dataset = translationDataset(val_zh_zh, val_zh_en)
zh_en_val_loader = torch.utils.data.DataLoader(dataset=zh_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

zh_en_test_dataset = translationDataset(test_zh_zh, test_zh_en)
zh_en_test_loader = torch.utils.data.DataLoader(dataset=zh_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

## Building the RNN model

In [11]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [12]:
def showPlot(points, string):
    plt.figure()
    fig, ax = plt.subplots()
    plt.plot(points)
    plt.title(string)
    plt.savefig((string+'.png'), dpi=300)

In [13]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, language, drop_rate=0):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.language = language
        if language == 'Vietnamese':
             self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vi_loaded_embeddings), freeze=True)
        elif language == 'Chinese':
            self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vi_loaded_embeddings), freeze=True)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = self.dropout(embedded)
        output, hidden = self.gru(hidden)
        
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
 

In [78]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, embedding_size, hidden_size, output_size, max_length, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.max_length = max_length
        self.embedding_size = embedding_size
        self.dropout_p = dropout_p
        
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(en_loaded_embeddings), freeze=True)
        self.out = nn.Linear(self.hidden_size, self.output_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        #self.attn = nn.Linear(self.hidden_size, hidden_size)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size) 
        self.gru = nn.GRU(self.hidden_size + self.embedding_size, self.hidden_size, dropout=dropout_p)
        self.dropout = nn.Dropout(self.dropout_p)
        

    def forward(self, input, hidden, encoder_outputs):
        # Get the embedding of the current input
        embedded = self.embedding(input).view(1, 1, -1) # S=1 x B x N
        embedded = self.dropout(embedded)
        print("embedded size", embedded.size())
        print("hidden size", hidden.size())
        print("encoder outputs", encoder_outputs.size())
        # Get weights from dot product
        attn_weights = torch.bmm(hidden, encoder_outputs.view(1, -1, self.hidden_size).transpose(1,2))

        # Softmax to normalize
        attn_weights = F.softmax(attn_weights)
        print("attn weights", attn_weights.size())
        
        # apply to encoder outputs to get weighted average
        context = attn_weights.bmm(encoder_outputs.view(1, -1, self.hidden_size)) 
        print("context", context.size())
        
        rnn_input = torch.cat((embedded, context), 2)
        print("rnn input", rnn_input.size())
        print("hidden size", hidden.size())
        output, hidden = self.gru(rnn_input, hidden)
        print("output", output.size())
        #output = output.squeeze(0) 
        #output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        output = F.log_softmax(self.out(output[0]), dim=1)
 
        print("output", output.size())
        print("hidden", hidden.size())
        print("attn", attn_weights.size())
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [79]:
def trainAttn(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    encoder_hidden = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    input_tensor = input_variable#.transpose(0,1)
    target_tensor = target_variable#.transpose(0,1)
    
    max_length = input_tensor.size(0)
    #batch_size = input_tensor.size(1)
    #vocab_size = len(en_token2id)
    
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]
   # encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
   # encoder_outputs = encoder_output[0,0]

    decoder_input = torch.tensor(SOS_IDX, device=device)
    #decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    # use_teacher_forcing = True

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            print("target size",target_tensor[di].size())
            decoder_output, decoder_hidden, hidden_weights = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            print(target_tensor[di].size())
            decoder_output, decoder_hidden, hidden_weights = decoder(
                decoder_input, decoder_hidden, encoder_outputs)

            topv, topi = decoder_output.data.topk(1)
            decoder_input = torch.cat(topi) 
            decoder_input = topi.squeeze().detach()
            decoder_input = Variable(torch.LongTensor([[ni]]))
            #decoder_input = decoder_input.cuda() if use_cuda else decoder_input

            loss += criterion(decoder_output, target_tensor[di])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / batch_size

def trainAttnIters(loader, encoder, decoder, n_iters, max_length, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        for i, (data, labels) in enumerate(loader):
            input_tensor = data
            target_tensor = labels

            loss = trainAttn(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                             iter, iter / n_iters * 100, print_loss_avg))

            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
    showPlot(plot_losses, 'Vietnamese') 

In [80]:
#Training Model
teacher_forcing_ratio = 1
hidden_size = 256
embedding_size = 300
encoder1 = EncoderRNN(embedding_size, hidden_size, 'Vietnamese').to(device)
attn_decoder1 = AttnDecoderRNN(embedding_size, hidden_size, len(en_token2id), VI_EN_MAX_LENGTH, dropout_p=0.1)
trainAttnIters(vi_en_val_loader, encoder1, attn_decoder1, n_iters=10, max_length=VI_EN_MAX_LENGTH, print_every=1,plot_every=1)

target size torch.Size([45])
embedded size torch.Size([1, 1, 300])
hidden size torch.Size([1, 1, 256])
encoder outputs torch.Size([64, 256])


TypeError: forward() takes 2 positional arguments but 3 were given

In [18]:
def indexesFromSentence(lang, sentence):
    if lang == 'Vietnamese':
        out = []
        for word in sentence:
            if word in vi_token2id:
                out.append(vi_token2id[word])
            else:
                out.append(UNK_IDX)
    elif lang == 'English':
        out = []
        for word in sentence:
            if word in en_token2id:
                out.append(en_token2id[word])
            else:
                out.append(UNK_IDX)
    elif lang == 'Chinese':
        out = []
        for word in sentence:
            if word in zh_token2id:
                out.append(zh_token2id[word])
            else:
                out.append(UNK_IDX)
    return out

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_IDX)
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

In [19]:
def evaluate(encoder, decoder, sentence, input_lang):
    with torch.no_grad():   
        
        #for i, (data, labels) in enumerate(vi_en_train_loader):
        #    input_tensor = data
        
       
        input_tensor = tensorFromSentence(input_lang, sentence)
        max_length = input_tensor.size(0)
        input_tensor = input_tensor.transpose(0,1)
        input_length = input_tensor.size()[0]
        #print(input_length)
        
        encoder_hidden = encoder.initHidden()
        #encoder_outputs = torch.zeros(max_length, encoder.hidden_size)
        #encoder_outputs = torch.zeros(max_length, batch_size, encoder.hidden_size)

        #for ei in range(input_length):
        #    encoder_output, encoder_hidden = encoder(
        #        input_tensor)
        #    encoder_outputs[ei] = encoder_output[0,0]
            
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
        encoder_output, encoder_hidden = encoder(input_tensor)
        encoder_outputs = encoder_output[0,0]

        #decoder_input = torch.tensor([[SOS_IDX]] * batch_size)
        decoder_input = input_tensor[0,:]
        decoder_hidden = encoder_hidden

        decoded_words = []
        #decoder_attentions = torch.zeros(max_length, max_length)
                
 #       for di in range(max_length):
 #           decoder_output, decoder_hidden = decoder(
  #             decoder_input, decoder_hidden, encoder_hidden)
          #  print(decoder_output)
          #  print(decoder_hidden)
  #          topv, topi = decoder_output.topk(1, di)
            #decoder_attentions[di] = decoder_attention.data
  #          decoder_input = topi.squeeze().detach()
  #          print(decoder_input[di].item())
  #          if decoder_input[di].item()== EOS_IDX:
  #              decoded_words.append('<EOS>')
  #              break
  #          else:
  #              decoded_words.append(en_id2token[decoder_input[di].item()])
  #          print(decoded_words)
  #          decoder_input = decoder_input[di]


#add beam search, using for loops, validation or test time
        for di in range(max_length):
            decoder_output, decoder_hidden = decoder(
               decoder_input, decoder_hidden, encoder_hidden)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            if decoder_input[di].item() == EOS_IDX:
                break
            else:
                decoded_words.append(en_id2token[decoder_input[di].item()])
                
        return decoded_words#, decoder_attentions[:di + 1]

In [20]:
from random import randint
def evaluateRandomly(encoder, decoder, language, n=10):
    for i in range(n):
        if language == 'Vietnamese':
            index = randint(0, len(train_vi_vi_orig))
            sentence1 = train_vi_vi_orig[index] 
            sentence2 = train_vi_en_orig[index]
        elif language == 'Chinese':
            index = randint(0, len(train_zh_zh_orig))
            sentence1 = train_zh_zh_orig[index]
            sentence2 = train_zh_en_orig[index]
        
        print('>', sentence1)
        print('=', sentence2)
        output_words = evaluate(encoder, decoder, sentence1, language)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [21]:
MAX_LENGTH = VI_EN_MAX_LENGTH
evaluateRandomly(encoder1, decoder1, "Vietnamese")

> ['"', 'con_người_ta', 'là', 'một', 'công_việc', 'còn', 'đang', 'dang_dở', 'lại', 'lầm', 'tưởng', 'rằng', 'mình', 'đã', 'xong_xuôi', '.', '"', 'dan', 'gilbert', 'chia_sẻ', 'công_trình', 'nghiên_cứu', 'gần_đây', 'của_ông', 'về', 'một', 'hiện_tượng', 'được', 'gọi', 'là', '"', 'đoạn', 'kết', 'của', 'lịch_sử', 'ảo_tưởng', ',', '"', 'ở', 'đó', 'chúng_ta', 'tưởng', 'rằng', 'con_người', 'hiện_tại', 'của', 'chúng_ta', 'sẽ', 'cứ', 'thế', 'này', 'mãi_mãi', '.', 'kết', 'qủa', 'nghiên_cứu', 'chỉ', 'ra', 'rằng', 'chẳng', 'phải', 'thế', 'đâu', '.']
= ['&quot;', 'human', 'beings', 'are', 'works', 'in', 'progress', 'that', 'mistakenly', 'think', 'they', '&apos;re', 'finished', '.', '&quot;', 'dan', 'gilbert', 'shares', 'recent', 'research', 'on', 'a', 'phenomenon', 'he', 'calls', 'the', '&quot;', 'end', 'of', 'history', 'illusion', ',', '&quot;', 'where', 'we', 'somehow', 'imagine', 'that', 'the', 'person', 'we', 'are', 'right', 'now', 'is', 'the', 'person', 'we', '&apos;ll', 'be', 'for', 'the', 'res

In [None]:
def showAttention(input_sentence, output_words, attentions):
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    output_words, attentions = evaluate(
        encoder1, attn_decoder1, input_sentence)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions)

In [None]:
index = randint(0, len(train_vi_vi_orig))
sentence = train_vi_vi_orig[index] 
evaluateAndShowAttention(sentence)

In [None]:
#calculate BLEU score on corpus level
def BLEU(encoder, decoder, language):
    hypotheses = ""
    targets = ""
    if language == 'Vietnamese':
        inputs = val_vi_vi_orig
        labels = val_vi_en_orig
    else:
        inputs = val_zh_zh_orig
        labels = val_zh_zh_orig
        
    for sentence in inputs:
        output = evaluate(encoder, decoder, sentence, language)
        for word in output:
            hypotheses = hypotheses + " " + word

    #targets
    for sentence in labels:
        for word in sentence:
            #replace infrequent words with <unk>
            if word in en_id2token:
                targets = targets + " " + word
            else:
                targets = targets + " " + '<unk>'

    # hypotheses = hypotheses + (' '.join(train_vi_vi_orig[i])) + " "
    # targets = targets + (' '.join(train_vi_en_orig[i])) + " "
    score = sacrebleu.corpus_bleu(hypotheses, targets)
    print(score)

In [None]:
BLEU(encoder1, decoder1, 'Vietnamese')

In [None]:
####OLD
 def forwardo(self, input, hidden, encoder_outputs):
       # input.unsqueeze(0)
        #embedded = self.dropout(self.embedding(input)).unsqueeze(0)
        print(input)
        print(hidden[0].size())
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)#.unsqueeze(0)
        print(embedded[0].size())
        print(torch.cat((embedded[0], hidden[0]), 1).size())
        print(self.max_length)
        print(encoder_outputs.size())
        
       # attn_weights = F.softmax(
       #     self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        caten = torch.cat((embedded[0], hidden[0]), 1)
#        line = self.attn(caten)
#        attn_weights = F.softmax(
#            line, self.max_length)
#        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
#                                 encoder_outputs.unsqueeze(0))

#        output = torch.cat((embedded[0], attn_applied[0]), 1)
#        output = self.attn_combine(output).unsqueeze(0)
        
       # Calculate attention weights and apply to encoder outputs
       # attn_weights = self.attn(hidden.squeeze(0), encoder_outputs)
       # context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
        
        # Combine embedded input word and attended context, run through RNN
#        rnn_input = torch.cat((word_embedded, context), 2)
#        output = F.relu(output)
#        output, hidden = self.gru(output, hidden)

#        output = F.log_softmax(self.out(output[0]), dim=1)
#        return output, hidden, attn_weights

        # Calculate attention weights and apply to encoder outputs
        attn_weights = F.softmax(
            caten, dim=1)
        print(attn_weights.size())
        attn_applied = attn_weights.view(1, -1).mm(encoder_outputs.view(1, -1))
        
        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat((embedded[0], attn_applied[0]), 1)
        output, hidden = self.gru(rnn_input, hidden)
        
        # Final output layer
        output = output.squeeze(0) # B x N
        output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        
        # Return final output, hidden state, and attention weights (for visualization)
        
        
        
        print(attn_weights.size())
        print(embedded.squeeze(0).size())
        print(hidden.squeeze(0).size())
        print(embedded.view(1, 1, -1).size())
        print(torch.cat((embedded, hidden), dim=1).size())
        print(F.softmax(
            torch.cat((embedded[0], hidden[0]), dim=1), dim=1).size())
        print(F.softmax(self.attn(torch.cat((embedded.squeeze(0), hidden.squeeze(0)), 1))).size())
        attn_weights =  F.softmax(self.attn(torch.cat((embedded.squeeze(0), hidden.squeeze(0)), 1)))
        print(attn_weights.view(1, 1, -1).size())
        print(encoder_outputs.view(1, 1, -1).size())
        
        
        
        
        
        
         #attn_weights = self.attn(hidden[-1], encoder_outputs)
        attn_weights = F.tanh(hidden, encoder_outputs)
        print(attn_weights.size())
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
        
        #attn_weights = F.softmax(
        #    torch.cat((embedded[0], hidden[0]), dim=1), dim=1)
        # Calculate attention weights and apply to encoder outputs
        #attn_weights = self.attn(torch.cat((embedded.squeeze(0), hidden.squeeze(0)), 1))
        #attn_weights = self.attn(hidden.squeeze(0), encoder_outputs)
        score = hidden.bmm(encoder_outputs)
        print(score.size())
        caten = torch.cat((embedded[0], hidden[0]), 1)
        print(caten[-1].unsqueeze(0).size())
        print(caten[-1].unsqueeze(1).size())
        #attn = self.attn(caten)
        attn_weights = F.softmax(caten, dim=1)
        attn_weights = self.attn(attn_weights)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))
        #attn_weights = self.attn(hidden[-1], encoder_outputs[-1])
        #attn_weights = torch.cat((encoder_outputs.view(1, -1), attn_weights), dim=1)
        attn_weights = F.softmax(attn_weights)
        print(attn_weights.view(1, 1, -1).size())
        print(encoder_outputs.view(1, 1,-1).size())
        
        context = torch.bmm(attn_weights.view(1, 1, -1),
                                 attn_weights.view(1, 1, -1))
        
        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat((embedded, context), 2)
        output, hidden = self.gru(rnn_input, hidden)
        
        # Get context vector
        
        # Final output layer
        output = output.squeeze(0) # B x N
        output = F.log_softmax(self.out(torch.cat((output, context), 1)))
        
        
        
        
        
                # Get weights from dot product
        attn_weights = torch.bmm(hidden, encoder_outputs.view(1, -1, self.hidden_size).transpose(1,2))
        print(hidden.size())
        print(attn_weights.size())
        # Softmax to normalize
        context = F.softmax(attn_weights)
        print(context.size())
        # Calculate attention from current RNN state and all encoder outputs;
        # apply to encoder outputs to get weighted average

        #context = self.attn(attn_weights, encoder_outputs)
        
        output = torch.cat((embedded, context), 2)
        #output = self.attn_combine(output).unsqueeze(0)
        print(output.size())
        output = F.relu(output)
        
        # Return final output, hidden state, and attention weights (for visualization)
        return output, hidden, attn_weights
    

In [22]:
print(VI_EN_MAX_LENGTH)

39
