# Project Work
## Initial Data Work

In this file we read in the Vietnamese-to-English and Chinese-to-English corpora, then build the token2id and char2id mappings, the vocabularies, and the data loaders.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
import pickle as pkl
import random
import csv
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.models import KeyedVectors

# Seed every random number generator this notebook draws from, not just the
# stdlib one: NumPy supplies the random UNK embedding rows and PyTorch drives
# the shuffling inside the DataLoaders.
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

PAD_IDX = 0      # id of the 'PAD' token (first entry of every vocabulary)
UNK_IDX = 1      # id of the 'UNK' token (second entry of every vocabulary)
BATCH_SIZE = 32  # number of sentence pairs per DataLoader batch

## Loading in training, validation and test sets

In [2]:
#Loading in the Vietnamese -> En datasets

def read_tokenized_corpus(path):
    """Read a pre-tokenized corpus file into a list of token lists.

    Every line of the file is treated as one sentence: surrounding whitespace
    is stripped, the text is lower-cased, and the result is split on single
    spaces.

    The file is decoded explicitly as UTF-8 instead of the platform default,
    so the result does not depend on the machine's locale.
    NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.

    Args:
        path: location of the tokenized text file.

    Returns:
        A list with one list of string tokens per line of the file.
    """
    with open(path, encoding='utf-8') as inputfile:
        return [line.strip().lower().split(' ') for line in inputfile]


# Each split is stored as two parallel lists: *_vi_en holds the English side
# and *_vi_vi the Vietnamese side of the same sentence pairs.
train_vi_en = read_tokenized_corpus('../project_data/en-vi/train.tok.en')
train_vi_vi = read_tokenized_corpus('../project_data/en-vi/train.tok.vi')

val_vi_en = read_tokenized_corpus('../project_data/en-vi/dev.tok.en')
val_vi_vi = read_tokenized_corpus('../project_data/en-vi/dev.tok.vi')

test_vi_en = read_tokenized_corpus('../project_data/en-vi/test.tok.en')
test_vi_vi = read_tokenized_corpus('../project_data/en-vi/test.tok.vi')

In [3]:
#Loading in the Chinese -> En datasets

# Every line is one sentence: stripped, lower-cased and split on single spaces.
# Files are decoded explicitly as UTF-8 rather than with the platform default,
# so the Chinese text loads the same way on every machine.
# NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
# *_zh_en holds the English side and *_zh_zh the Chinese side of each split.

with open('../project_data/en-zh/train.tok.en', encoding='utf-8') as inputfile:
    train_zh_en = [line.strip().lower().split(' ') for line in inputfile]

with open('../project_data/en-zh/train.tok.zh', encoding='utf-8') as inputfile:
    train_zh_zh = [line.strip().lower().split(' ') for line in inputfile]

with open('../project_data/en-zh/dev.tok.en', encoding='utf-8') as inputfile:
    val_zh_en = [line.strip().lower().split(' ') for line in inputfile]

with open('../project_data/en-zh/dev.tok.zh', encoding='utf-8') as inputfile:
    val_zh_zh = [line.strip().lower().split(' ') for line in inputfile]

with open('../project_data/en-zh/test.tok.en', encoding='utf-8') as inputfile:
    test_zh_en = [line.strip().lower().split(' ') for line in inputfile]

with open('../project_data/en-zh/test.tok.zh', encoding='utf-8') as inputfile:
    test_zh_zh = [line.strip().lower().split(' ') for line in inputfile]

In [4]:
#Sanity Checking
# One row per split: (label, English-side sentences, source-language sentences).
# The two counts printed for a split should match, since the lists are the two
# sides of the same parallel corpus.
_split_report = [
    ("Vi -> En | Training Examples: ", train_vi_en, train_vi_vi),
    ("Vi -> En | Validation Examples: ", val_vi_en, val_vi_vi),
    ("Vi -> En | Testing Examples: ", test_vi_en, test_vi_vi),
    ("Zh -> En | Training Examples: ", train_zh_en, train_zh_zh),
    ("Zh -> En | Validation Examples: ", val_zh_en, val_zh_zh),
    ("Zh -> En | Testing Examples: ", test_zh_en, test_zh_zh),
]

for label, english_side, source_side in _split_report:
    print(label + str(len(english_side)))
    # The extra '\n' argument leaves a blank line between splits.
    print(label + str(len(source_side)), '\n')

Vi -> En | Training Examples: 133317
Vi -> En | Training Examples: 133317 

Vi -> En | Validation Examples: 1268
Vi -> En | Validation Examples: 1268 

Vi -> En | Testing Examples: 1553
Vi -> En | Testing Examples: 1553 

Zh -> En | Training Examples: 213377
Zh -> En | Training Examples: 213377 

Zh -> En | Validation Examples: 1261
Zh -> En | Validation Examples: 1261 

Zh -> En | Testing Examples: 1397
Zh -> En | Testing Examples: 1397 



## Loading in pre-trained fasttext embeddings for the three languages
### Building loaded embeddings, token2id, id2token and ordered words for all languages

In [5]:
# Load the pre-trained word vectors for English, Vietnamese and Chinese
# (word2vec text format), in that order.
en_embeddings, vi_embeddings, zh_embeddings = [
    KeyedVectors.load_word2vec_format(vec_path)
    for vec_path in (
        '../pretrained_embeddings/wiki.en.vec',
        '../pretrained_embeddings/wiki.vi.vec',
        '../pretrained_embeddings/wiki.zh.vec',
    )
]

In [6]:
# Pull the raw embedding matrix (the `.vectors` attribute) out of each loaded model.
en_loaded_embeddings, vi_loaded_embeddings, zh_loaded_embeddings = (
    keyed_vectors.vectors
    for keyed_vectors in (en_embeddings, vi_embeddings, zh_embeddings)
)

In [7]:
#adding PAD AND UNK embeddings

def add_special_token_rows(vectors, rng):
    """Return a copy of `vectors` with a PAD row and an UNK row prepended.

    Row 0 is all zeros (PAD) and row 1 is drawn uniformly from [0, 1) (UNK);
    the original rows follow unchanged, shifted down by two.

    Args:
        vectors: 2-D embedding matrix, one row per word.
        rng: a `np.random.RandomState` used to draw the UNK row.

    Returns:
        A new array with two more rows than `vectors` and the same dtype.
    """
    dim = vectors.shape[1]  # read the width from the data instead of hard-coding 300
    pad_row = np.zeros((1, dim), dtype=vectors.dtype)
    # Cast to the matrix dtype so stacking does not upcast the whole matrix.
    unk_row = rng.rand(1, dim).astype(vectors.dtype)
    # A single vstack copies the matrix once.
    return np.vstack([pad_row, unk_row, vectors])


# Dedicated, seeded generator: the UNK rows are identical on every run and do
# not depend on whatever else has consumed the global NumPy random state.
_unk_rng = np.random.RandomState(123)

# Always rebuild from the untouched `.vectors` of each model, so re-running this
# cell cannot stack additional PAD/UNK rows on top of earlier ones.
en_loaded_embeddings = add_special_token_rows(en_embeddings.vectors, _unk_rng)
vi_loaded_embeddings = add_special_token_rows(vi_embeddings.vectors, _unk_rng)
zh_loaded_embeddings = add_special_token_rows(zh_embeddings.vectors, _unk_rng)

In [8]:
# building out id2token and token2id for all languages
def _vocab_maps(words):
    """Build the two lookup tables for a vocabulary.

    The vocabulary is 'PAD', 'UNK' and then `words`, in that order, so 'PAD'
    gets id 0, 'UNK' gets id 1 and the first entry of `words` gets id 2.

    Args:
        words: list of tokens, in embedding-row order.

    Returns:
        (token2id, id2token): token -> id and id -> token dictionaries.
    """
    vocab = ['PAD', 'UNK'] + words
    token2id = {token: idx for idx, token in enumerate(vocab)}
    id2token = dict(enumerate(vocab))
    return token2id, id2token


en_token2id, en_id2token = _vocab_maps(en_embeddings.index2word)
vi_token2id, vi_id2token = _vocab_maps(vi_embeddings.index2word)
zh_token2id, zh_id2token = _vocab_maps(zh_embeddings.index2word)

In [36]:
# Maximum sentence length per language pair: the 90th percentile of sentence
# lengths (in tokens) over both sides of the training corpus pooled together.
# np.percentile returns a NumPy float, which is rejected as a slice bound and
# as a pad width, so the value is cast to int (rounding down) once here.
VI_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_vi_en+train_vi_vi], 90))
ZH_EN_MAX_LENGTH = int(np.percentile([len(sentence) for sentence in train_zh_en+train_zh_zh], 90))

## Building Data Loaders

In [37]:
class translationDataset(Dataset):
    """Parallel-sentence dataset yielding [source, source length, target] items.

    Args:
        data_list: list of source sentences, each a sequence of token ids.
        target_list: list of target sentences, aligned index-by-index with
            `data_list`.
        max_length: maximum number of source tokens kept per sentence. When
            omitted, the module-level MAX_SAMPLE_LENGTH is read once, at
            construction time.

    NOTE(review): the collate function below converts sequences to int64, so
    the sentences must already be integer ids; raw string tokens will fail
    there. Confirm tokens are mapped through token2id before batching.
    """
    def __init__(self, data_list, target_list, max_length=None):
        self.data_list=data_list
        self.target_list=target_list
        assert (len(self.data_list) == len(self.target_list))
        # Snapshot the limit now: reading the global on every access would let a
        # later reassignment of MAX_SAMPLE_LENGTH silently change this dataset.
        # int() because a float (e.g. from np.percentile) cannot be a slice bound.
        self.max_length = int(MAX_SAMPLE_LENGTH if max_length is None else max_length)

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        """Return [truncated source, its length, untruncated target] for `key`."""
        token_idx = self.data_list[key][:self.max_length]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]


def _pad_or_clip(sequence, width):
    """Return `sequence` as an int64 vector of exactly `width` entries.

    Longer sequences are cut at `width`; shorter ones are right-padded with 0,
    the padding id (PAD_IDX in this notebook).
    """
    row = np.zeros(width, dtype=np.int64)
    clipped = sequence[:width]
    row[:len(clipped)] = clipped
    return row


def translation_collate_func(batch, max_length=None):
    """Collate dataset items into fixed-width source and target tensors.

    Args:
        batch: list of [source, source length, target] items as produced by
            translationDataset.
        max_length: width of the returned tensors. When omitted, the current
            module-level MAX_SAMPLE_LENGTH is used; bind this argument (e.g.
            with functools.partial) to pin a loader to its own limit.

    Returns:
        [sources, lengths, targets] where sources and targets are int64
        tensors of shape (len(batch), width) and lengths is a LongTensor of
        the source lengths, capped at width.

    Each sequence is padded according to its OWN length. Padding the target by
    the source length produced ragged rows (or a negative pad width) whenever
    the two sides differed in length. Targets longer than the width are
    clipped so that every row fits the tensor.
    """
    width = int(MAX_SAMPLE_LENGTH if max_length is None else max_length)

    data_list = [_pad_or_clip(datum[0], width) for datum in batch]
    label_list = [_pad_or_clip(datum[2], width) for datum in batch]
    # Cap the reported length too, in case a source was clipped above.
    length_list = [min(datum[1], width) for datum in batch]
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.from_numpy(np.array(label_list))]

In [38]:
# VI -> EN | dataloaders
# translationDataset and translation_collate_func look this name up as a global.
MAX_SAMPLE_LENGTH = VI_EN_MAX_LENGTH

# Settings shared by the three VI -> EN loaders; only `shuffle` differs per split.
_vi_en_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=translation_collate_func)

# Source side is Vietnamese, target side is English.
vi_en_train_dataset = translationDataset(train_vi_vi, train_vi_en)
vi_en_train_loader = torch.utils.data.DataLoader(
    vi_en_train_dataset, shuffle=True, **_vi_en_loader_kwargs
)

vi_en_val_dataset = translationDataset(val_vi_vi, val_vi_en)
vi_en_val_loader = torch.utils.data.DataLoader(
    vi_en_val_dataset, shuffle=True, **_vi_en_loader_kwargs
)

# The test loader keeps the corpus order (no shuffling).
vi_en_test_dataset = translationDataset(test_vi_vi, test_vi_en)
vi_en_test_loader = torch.utils.data.DataLoader(
    vi_en_test_dataset, shuffle=False, **_vi_en_loader_kwargs
)

In [39]:
# ZH -> EN | dataloaders
# translationDataset and translation_collate_func look this name up as a global.
MAX_SAMPLE_LENGTH = ZH_EN_MAX_LENGTH

# Settings shared by the three ZH -> EN loaders; only `shuffle` differs per split.
_zh_en_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=translation_collate_func)

# Source side is Chinese, target side is English.
zh_en_train_dataset = translationDataset(train_zh_zh, train_zh_en)
zh_en_train_loader = torch.utils.data.DataLoader(
    zh_en_train_dataset, shuffle=True, **_zh_en_loader_kwargs
)

zh_en_val_dataset = translationDataset(val_zh_zh, val_zh_en)
zh_en_val_loader = torch.utils.data.DataLoader(
    zh_en_val_dataset, shuffle=True, **_zh_en_loader_kwargs
)

# The test loader keeps the corpus order (no shuffling).
zh_en_test_dataset = translationDataset(test_zh_zh, test_zh_en)
zh_en_test_loader = torch.utils.data.DataLoader(
    zh_en_test_dataset, shuffle=False, **_zh_en_loader_kwargs
)