# Project Work
## Initial Data Work

In this file we will read in the data for the Vietnamese and Chinese to Engish corpuses, build a token2id and char2id mapping, vocabularies and data loaders

In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle as pkl
import random
import pdb
import csv
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim.models import KeyedVectors

random.seed(123)

PAD_IDX = 0
UNK_IDX = 1
BATCH_SIZE = 32

## Loading in training, validation and test sets

In [7]:
#Loading in the Vietnamese -> En datasets

train_vi_en = pd.read_table('../project_data/en-vi/train.tok.en', header=None)
train_vi_vi = pd.read_table('../project_data/en-vi/train.tok.vi', header=None)
val_vi_en = pd.read_table('../project_data/en-vi/dev.tok.en', header=None)
val_vi_vi = pd.read_table('../project_data/en-vi/dev.tok.vi', header=None)
test_vi_en = pd.read_table('../project_data/en-vi/test.tok.en', header=None)
test_vi_vi = pd.read_table('../project_data/en-vi/test.tok.vi', header=None)

In [9]:
#Loading in the Chinese -> En datasets

train_zh_en = pd.read_table('../project_data/en-zh/train.tok.en', header=None)
train_zh_zh = pd.read_table('../project_data/en-zh/train.tok.zh', header=None)
val_zh_en = pd.read_table('../project_data/en-zh/dev.tok.en', header=None)
val_zh_zh = pd.read_table('../project_data/en-zh/dev.tok.zh', header=None)
test_zh_en = pd.read_table('../project_data/en-zh/test.tok.en', header=None)
test_zh_zh = pd.read_table('../project_data/en-zh/test.tok.zh', header=None)

In [12]:
print("Vi -> En | Training Examples: "+str(len(train_vi_en)))
print("Vi -> En | Validation Examples: "+str(len(val_vi_vi)))
print("Vi -> En | Testing Examples: "+str(len(test_vi_en)))
print("Cz -> En | Training Examples: "+str(len(train_zh_en)))
print("Cz -> En | Validation Examples: "+str(len(val_zh_en)))
print("Cz -> En | Testing Examples: "+str(len(test_zh_en)))

Vi -> En | Training Examples: 133168
Vi -> En | Validation Examples: 1266
Vi -> En | Testing Examples: 1553
Cz -> En | Training Examples: 212943
Cz -> En | Validation Examples: 1261
Cz -> En | Testing Examples: 1397


## Loading in pre-trained fasttext embeddings for the three languages
### Building loaded embeddings, token2id, id2token and ordered words for all languages

In [189]:
# building the three vocabs from pre-trained embeddings
en_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.en.vec')
vi_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.vi.vec')
zh_embeddings = KeyedVectors.load_word2vec_format('../pretrained_embeddings/wiki.zh.vec')

In [190]:
en_loaded_embeddings = en_embeddings.vectors
vi_loaded_embeddings = vi_embeddings.vectors
zh_loaded_embeddings = zh_embeddings.vectors

In [191]:
#adding PAD AND UNK embeddings
en_loaded_embeddings = np.insert(en_loaded_embeddings, 0, np.zeros(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 0, np.zeros(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 0, np.zeros(300,), axis=0)

en_loaded_embeddings = np.insert(en_loaded_embeddings, 1, np.random.rand(300,), axis=0)
vi_loaded_embeddings = np.insert(vi_loaded_embeddings, 1, np.random.rand(300,), axis=0)
zh_loaded_embeddings = np.insert(zh_loaded_embeddings, 1, np.random.rand(300,), axis=0)

In [202]:
# building out id2token and token2id for all languages
en_token2id = {j: i for i,j in enumerate(['PAD','UNK']+en_embeddings.index2word)}
en_id2token = {i: j for i,j in enumerate(['PAD','UNK']+en_embeddings.index2word)}
vi_token2id = {j: i for i,j in enumerate(['PAD','UNK']+vi_embeddings.index2word)}
vi_id2token = {i: j for i,j in enumerate(['PAD','UNK']+vi_embeddings.index2word)}
zh_token2id = {j: i for i,j in enumerate(['PAD','UNK']+zh_embeddings.index2word)}
zh_id2token = {i: j for i,j in enumerate(['PAD','UNK']+zh_embeddings.index2word)}

In [210]:
VI_EN_MAX_LENGTH = max([len(word) for word in list(train_vi_en[0])+list(train_vi_vi[0])])
ZH_EN_MAX_LENGTH = max([len(word) for word in list(train_zh_en[0])+list(train_zh_zh[0])])

## Building Data Loaders

In [212]:
class translationDataset(Dataset):
    def __init__(self, data_list, target_list):
        self.data_list=data_list
        self.target_list=target_list
        assert (len(self.data_list) == len(self.target_list))

    def __len__(self):
        return len(self.target_list)

    def __getitem__(self, key):
        token_idx = self.data_list[key][:MAX_SAMPLE_LENGTH]
        label = self.target_list[key]
        return [token_idx, len(token_idx), label]

def translation_collate_func(batch):
    data_list = []
    label_list = []
    length_list = []
    for datum in batch:
        padded_vec = np.pad(np.array(datum[2]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        label_list.append(padded_vec)
        length_list.append(datum[1])
    # padding
    for datum in batch:
        padded_vec = np.pad(np.array(datum[0]), 
                                pad_width=((0,MAX_SAMPLE_LENGTH-datum[1])), 
                                mode="constant", constant_values=0)
        data_list.append(padded_vec)
    return [torch.from_numpy(np.array(data_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [213]:
# VI -> EN | dataloaders
MAX_SAMPLE_LENGTH = VI_EN_MAX_LENGTH

vi_en_train_dataset = translationDataset(train_vi_vi[0], train_vi_en[0])
vi_en_train_loader = torch.utils.data.DataLoader(dataset=vi_en_train_dataset, 
                                           abatch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_val_dataset = translationDataset(val_vi_vi[0], val_vi_en[0])
vi_en_val_loader = torch.utils.data.DataLoader(dataset=vi_en_val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=True)

vi_en_test_dataset = translationDataset(test_vi_vi[0], test_vi_en[0])
vi_en_test_loader = torch.utils.data.DataLoader(dataset=vi_en_test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=translation_collate_func,
                                           shuffle=False)

AssertionError: 