# Building an RNN for Machine Translation
## Initial Data Work

In this file we will read in the data for the Vietnamese and Chinese to Engish corpuses, build a token2id and char2id mapping, vocabularies and data loaders

###Building an RNN for Machine Translation
Initial Data Work

In [1]:
#Import Modules
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch import optim
import pickle as pkl
import random
import csv
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.models import KeyedVectors
import io
from collections import Counter

random.seed(123)

#Global Variables
PAD_IDX = 0
UNK_IDX = 1
SOS_IDX = 2
EOS_IDX = 3
BATCH_SIZE = 64

## Loading in pre-trained fasttext embeddings for the three languages
### Building loaded embeddings, token2id, id2token and ordered words for all languages

In [2]:
# Load Pre-trained Word Vectors
def load_embeddings(word2vec, word2id, embedding_dim):
    embeddings = np.zeros((len(word2id), embedding_dim))
    for word, index in word2id.items():
        try:
            embeddings[index] = word2vec[word]

        except KeyError:
            embeddings[index] = np.random.normal(scale=0.6, size=(300,))

    return embeddings


def load_vectors(fname, num_vecs=None):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = list(map(float, tokens[1:]))

        if num_vecs is None:
            pass
        else:
            if len(data) + 1 > num_vecs:
                break

    return data


def data_dictionary(tokens, vocab_size_limit):
    token_counter = Counter()
    for token in tokens:
        token_counter[token] += 1

    vocab, count = zip(*token_counter.most_common(vocab_size_limit))
    id2token = list(vocab)
    token2id = dict(zip(vocab, range(4, 4 + len(vocab))))
    id2token = ['<pad>', '<unk>','<sos>','<eos>'] + id2token
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    token2id['<sos>'] = SOS_IDX
    token2id['<eos>'] = EOS_IDX
    return token2id, id2token

#Load Word Embeddings

path= '/Volumes/Samsung USB/nlp_project/word_embeds/'

en_loaded_embeddings = load_vectors(path +'wiki.en.vec',1000)
print("Total number of words embedded is {:,d}".format(len(en_loaded_embeddings)))

vi_loaded_embeddings = load_vectors(path+'wiki.vi.vec',1000)
print("Total number of words embedded is {:,d}".format(len(vi_loaded_embeddings)))

zh_loaded_embeddings = load_vectors(path+'wiki.zh.vec',1000)
print("Total number of words embedded is {:,d}".format(len(zh_loaded_embeddings)))


#Create token2Id, token2Id
en_token2id, en_id2token = data_dictionary(list(en_loaded_embeddings.keys()), 1000)
vi_token2id, vi_id2token = data_dictionary(list(vi_loaded_embeddings.keys()), 1000)
zh_token2id, zh_id2token = data_dictionary(list(zh_loaded_embeddings.keys()), 1000)


#Create Emedding Matrix
en_loaded_embeddings=load_embeddings(en_loaded_embeddings,en_token2id,300)
vi_loaded_embeddings=load_embeddings(vi_loaded_embeddings,vi_token2id,300)
zh_loaded_embeddings=load_embeddings(zh_loaded_embeddings,zh_token2id,300)






Total number of words embedded is 1,000
Total number of words embedded is 1,000
Total number of words embedded is 1,000


## Loading in training, validation and test sets

In [3]:
#Loading in the Vietnamese -> En datasets
path1= '/Volumes/Samsung USB/nlp_project/iwslt-vi-en-processed/'

train_vi_en = []
with open(path1 +'train.tok.en') as inputfile:
    for line in inputfile:
        train_vi_en.append(line.strip().lower().split(' '))

train_vi_vi = []
with open(path1 + 'train.tok.vi') as inputfile:
    for line in inputfile:
        train_vi_vi.append(line.strip().lower().split(' '))

val_vi_en = []
with open(path1 + 'dev.tok.en') as inputfile:
    for line in inputfile:
        val_vi_en.append(line.strip().lower().split(' '))

val_vi_vi = []
with open(path1 +'dev.tok.vi') as inputfile:
    for line in inputfile:
        val_vi_vi.append(line.strip().lower().split(' '))
        
test_vi_en = []
with open(path1 +'test.tok.en') as inputfile:
    for line in inputfile:
        test_vi_en.append(line.strip().lower().split(' '))

test_vi_vi = []
with open(path1 + 'test.tok.vi') as inputfile:
    for line in inputfile:
        test_vi_vi.append(line.strip().lower().split(' '))
        
        
        
#Loading in the Chinese -> En datasets
path2= '/Volumes/Samsung USB/nlp_project/iwslt-zh-en-processed/'

train_zh_en = []
with open(path2 + 'train.tok.en') as inputfile:
    for line in inputfile:
        train_zh_en.append(line.strip().lower().split('\t'))

i=0
train_zh_zh = []
fin = io.open(path2 +'train.tok.zh', 'r', encoding='utf-8',newline= '\n', errors='ignore')
#n, d = map(str, fin.readline().split())
for line in fin:
    if i % 2 == 0 :
        train_zh_zh.append(line.rstrip().split(' '))
    i+=1        
        
        
val_zh_en = []
with open(path2 + 'dev.tok.en') as inputfile:
    for line in inputfile:
        val_zh_en.append(line.strip().lower().split('\t'))

i=0
val_zh_zh = []
fin = io.open(path2 +'dev.tok.zh', 'r', encoding='utf-8',newline= '\n', errors='ignore')
#n, d = map(str, fin.readline().split())
for line in fin:
    if i % 2 == 0 :
        val_zh_zh.append(line.rstrip().split(' '))
    i+=1
                
        

test_zh_en = []
with open(path2 +'test.tok.en') as inputfile:
    for line in inputfile:
        test_zh_en.append(line.strip().lower().split('\t'))

i=0
test_zh_zh = []
fin = io.open(path2 +'test.tok.zh', 'r', encoding='utf-8',newline= '\n', errors='ignore')
#n, d = map(str, fin.readline().split())
for line in fin:
    if i % 2 == 0 :
        test_zh_zh.append(line.rstrip().split(' '))
    i+=1
    
    
#Sanity Checking
print("Vi -> En | Training Examples: "+str(len(train_vi_en)))
print("Vi -> En | Training Examples: "+str(len(train_vi_vi)), '\n')

print("Vi -> En | Validation Examples: "+str(len(val_vi_en)))
print("Vi -> En | Validation Examples: "+str(len(val_vi_vi)), '\n')

print("Vi -> En | Testing Examples: "+str(len(test_vi_en)))
print("Vi -> En | Testing Examples: "+str(len(test_vi_vi)), '\n')

print("Zh -> En | Training Examples: "+str(len(train_zh_en)))
print("Zh -> En | Training Examples: "+str(len(train_zh_zh)), '\n')

print("Zh -> En | Validation Examples: "+str(len(val_zh_en)))
print("Zh -> En | Validation Examples: "+str(len(val_zh_zh)), '\n')

print("Zh -> En | Testing Examples: "+str(len(test_zh_en)))
print("Zh -> En | Testing Examples: "+str(len(test_zh_zh)), '\n')

Vi -> En | Training Examples: 133317
Vi -> En | Training Examples: 133317 

Vi -> En | Validation Examples: 1268
Vi -> En | Validation Examples: 1268 

Vi -> En | Testing Examples: 1553
Vi -> En | Testing Examples: 1553 

Zh -> En | Training Examples: 213377
Zh -> En | Training Examples: 213377 

Zh -> En | Validation Examples: 1261
Zh -> En | Validation Examples: 1261 

Zh -> En | Testing Examples: 1397
Zh -> En | Testing Examples: 1397 



### Encoding our data

In [4]:
VI_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_vi_en+train_vi_vi], 90))+1
ZH_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_zh_en+train_zh_zh], 90))+1

In [5]:
def encoding_tokens(sentence, language, translator):
    if language== 'English':
        token2id = en_token2id
    elif language== 'Vietnamese':
        token2id = vi_token2id
    elif language== 'Chinese':
        token2id = zh_token2id
    tokens = [token2id[token] if token in token2id else UNK_IDX for token in sentence]
    if translator == 'vi':
        max_len = VI_EN_MAX_LENGTH-1
    elif translator == 'zh':
        max_len = ZH_EN_MAX_LENGTH-1
    tokens=tokens[:max_len]
    return tokens

def encoding_dataset(dataset, language, translator):
    data = [encoding_tokens(tokens, language, translator) for tokens in dataset] 
    return data

In [6]:
train_vi_en = encoding_dataset(train_vi_en, 'English', 'vi')
train_vi_vi = encoding_dataset(train_vi_vi, 'Vietnamese', 'vi')
test_vi_en = encoding_dataset(test_vi_en, 'English', 'vi')
test_vi_vi = encoding_dataset(test_vi_vi, 'Vietnamese', 'vi')
val_vi_en = encoding_dataset(val_vi_en, 'English', 'vi')
val_vi_vi = encoding_dataset(val_vi_vi, 'Vietnamese', 'vi')

train_zh_en = encoding_dataset(train_zh_en, 'English', 'zh')
train_zh_zh = encoding_dataset(train_zh_zh, 'Chinese', 'zh')
test_zh_en = encoding_dataset(test_zh_en, 'English', 'zh')
test_zh_zh = encoding_dataset(test_zh_zh, 'Chinese', 'zh')
val_zh_en = encoding_dataset(val_zh_en, 'English', 'zh')
val_zh_zh = encoding_dataset(val_zh_zh, 'Chinese', 'zh')

## Building Data Loaders

In [7]:
class translationDataset(Dataset):
    def __init__(self, data_list, target_list):
        self.data_list=data_list
        self.target_list=target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        data = self.data_list[key][:MAX_SAMPLE_LENGTH]
        label = self.target_list[key][:MAX_SAMPLE_LENGTH]
        return [data, len(data), label, len(label)]

def translation_collate_func(batch):
    data_list = []
    label_list = []
    for datum in batch:
        padded_data = np.pad(np.array(datum[0]+[EOS_IDX]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_data)
        padded_label = np.pad(np.array(datum[2]+[EOS_IDX]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[3])), 
                                mode="constant", constant_values=0)
        label_list.append(padded_label)
    return [torch.from_numpy(np.array(data_list)), torch.from_numpy(np.array(label_list))]

In [8]:
# VI -> EN | dataloaders
MAX_SAMPLE_LENGTH = VI_EN_MAX_LENGTH

vi_en_train_dataset = translationDataset(train_vi_vi, train_vi_en)
vi_en_train_loader = torch.utils.data.DataLoader(dataset=vi_en_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_val_dataset = translationDataset(val_vi_vi, val_vi_en)
vi_en_val_loader = torch.utils.data.DataLoader(dataset=vi_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_test_dataset = translationDataset(test_vi_vi, test_vi_en)
vi_en_test_loader = torch.utils.data.DataLoader(dataset=vi_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

In [9]:
# ZH -> EN | dataloaders
MAX_SAMPLE_LENGTH = ZH_EN_MAX_LENGTH

zh_en_train_dataset = translationDataset(train_zh_zh, train_zh_en)
zh_en_train_loader = torch.utils.data.DataLoader(dataset=zh_en_train_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

zh_en_val_dataset = translationDataset(val_zh_zh, val_zh_en)
zh_en_val_loader = torch.utils.data.DataLoader(dataset=zh_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

zh_en_test_dataset = translationDataset(test_zh_zh, test_zh_en)
zh_en_test_loader = torch.utils.data.DataLoader(dataset=zh_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

## Building the RNN model

In [10]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [11]:
def showPlot(points, string):
    plt.figure()
    fig, ax = plt.subplots()
    plt.plot(points)
    plt.title(string)
    plt.savefig((string+'.png'), dpi=300)

In [12]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size,hidden_size, language, drop_rate=0):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.dropout = nn.Dropout(drop_rate)
        
        if language == 'Vietnamese':
             self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vi_loaded_embeddings), freeze=True)
                
        elif language == 'Chinese':
            self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(vi_loaded_embeddings), freeze=True)
            

    def forward(self, input, batch_size, hidden):
        embedded = self.dropout(self.embedding(input).view(1, batch_size, self.hidden_size))
        output, hidden = self.gru(hidden)
        return output, hidden
    
    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size)
    
class DecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size, drop_rate=0):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(en_loaded_embeddings), freeze=True)
        
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, batch_size, hidden):
        output = self.embedding(input).view(1, batch_size, self.hidden_size)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size)
    
def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length):
    
    batch_size = input_variable.size()[0]
    encoder_hidden = encoder.initHidden(batch_size)

    input_variable = input_variable.transpose(0, 1)
    target_variable = target_variable.transpose(0, 1)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    encoder_outputs = torch.zeros(max_length, batch_size, encoder.hidden_size)
    #encoder_outputs = encoder_outputs.cuda() if use_cuda else encoder_outputs

    loss = 0
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_variable[ei], batch_size, encoder_hidden)
        encoder_outputs[ei] = encoder_output[0]

    decoder_input = torch.LongTensor([SOS_IDX] * batch_size)
    #decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    # use_teacher_forcing = True

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden= decoder(
                decoder_input, batch_size, decoder_hidden)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]  # Teacher forcing
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden= decoder(
                decoder_input, batch_size, decoder_hidden)

            topv, topi = decoder_output.data.topk(1)
            #decoder_input = torch.cat(topi) 
            decoder_input = topi.squeeze().detach()
            # decoder_input = Variable(torch.LongTensor([[ni]]))
            #decoder_input = decoder_input.cuda() if use_cuda else decoder_input

            loss += criterion(decoder_output, target_variable[di])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.data[0] / target_length

def trainIters(loader, encoder, decoder, n_iters, max_length, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        for i, (data, labels) in enumerate(loader):
            input_tensor = data
            target_tensor = labels

            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion, max_length)
            print_loss_total += loss
            plot_loss_total += loss

            if iter % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                             iter, iter / n_iters * 100, print_loss_avg))

            if iter % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
    showPlot(plot_losses, 'Vietnamese')
    
    

    
    
    
    

    



In [None]:
#Training Model
teacher_forcing_ratio = 0.5
hidden_size = 300 #a-z+SOS+EOS+PAD
batch_size = 128
encoder1 = EncoderRNN(len(vi_token2id),hidden_size, 'Vietnamese', drop_rate = 0.1)
decoder1 = DecoderRNN(hidden_size,len(en_token2id))
trainIters(vi_en_val_loader, encoder1, decoder1, n_iters=200, max_length=VI_EN_MAX_LENGTH, print_every=10,plot_every=50)



1m 33s (- 29m 35s) (10 5%) 64.6798
1m 34s (- 29m 46s) (10 5%) 0.2904
1m 34s (- 29m 57s) (10 5%) 0.3110
1m 35s (- 30m 7s) (10 5%) 0.2195
1m 35s (- 30m 16s) (10 5%) 0.3240
1m 36s (- 30m 26s) (10 5%) 0.2372
1m 36s (- 30m 38s) (10 5%) 0.2381
1m 37s (- 30m 50s) (10 5%) 0.3007
1m 37s (- 31m 0s) (10 5%) 0.2593
1m 38s (- 31m 10s) (10 5%) 0.2633
1m 38s (- 31m 19s) (10 5%) 0.2708
1m 39s (- 31m 29s) (10 5%) 0.2495
1m 39s (- 31m 39s) (10 5%) 0.3302
1m 40s (- 31m 49s) (10 5%) 0.2806
1m 41s (- 31m 59s) (10 5%) 0.3053
1m 41s (- 32m 9s) (10 5%) 0.2820
1m 42s (- 32m 18s) (10 5%) 0.2504
1m 42s (- 32m 28s) (10 5%) 0.2683
1m 43s (- 32m 38s) (10 5%) 0.2421
1m 43s (- 32m 47s) (10 5%) 0.2216
3m 24s (- 30m 36s) (20 10%) 47.1524
3m 24s (- 30m 43s) (20 10%) 0.2595
3m 25s (- 30m 50s) (20 10%) 0.2663
3m 26s (- 30m 57s) (20 10%) 0.2863
3m 27s (- 31m 4s) (20 10%) 0.2292
3m 27s (- 31m 11s) (20 10%) 0.2773
3m 28s (- 31m 17s) (20 10%) 0.2320
3m 29s (- 31m 25s) (20 10%) 0.2275
3m 30s (- 31m 32s) (20 10%) 0.2948
3m 31s 