In [1]:
import io
from io import open

import numpy as np
import os
import json

import unicodedata
import string
import re
import pickle as pkl
import gzip

In [2]:
# Sentence-boundary markers wrapped around every tokenized line.
sos_tag = '<sos>'
eos_tag = '<eos>'

# The data and embeddings folders are siblings of the working directory
# (i.e. they live in its parent folder). os.path chooses the separator for the
# current platform, so there is no need to sniff the cwd for backslashes.
# The empty last component keeps the trailing separator, because the path
# templates elsewhere in this notebook concatenate file names directly.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', '')
emb_dir = os.path.join(os.path.dirname(os.getcwd()), 'embeddings', '')

In [3]:
def load_vectors(fname, count):
    """Read at most ``count`` word vectors from a text-format vector file.

    The first line must hold exactly two integers; it is parsed (so a
    malformed header still raises ``ValueError``) and otherwise ignored.
    Every following line is expected to look like ``<token> <float> <float> ...``
    with single spaces as separators.

    Args:
        fname: Path of the vector file. It is decoded as UTF-8 and
            undecodable bytes are dropped.
        count: Number of data lines to read. Reading stops when the line index
            equals ``count``; a value that is never reached (for example
            ``None``) reads the whole file.

    Returns:
        dict mapping each token to its vector as a list of floats, in file
        order. If a token occurs more than once the later vector wins.

    Raises:
        ValueError: if the header or a vector component cannot be parsed.
    """
    data = {}
    # The context manager closes the handle even on the early `break` or when
    # a parse error propagates.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header line: two integers, validated but not used afterwards.
        _header_first, _header_second = map(int, fin.readline().split())
        for i, line in enumerate(fin):
            if i == count:
                break
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

def save_zipped_pickle(obj, filename, protocol=-1):
    """Pickle ``obj`` and write it gzip-compressed to ``filename``.

    ``protocol`` is passed to the pickler unchanged; the default ``-1`` asks
    pickle for the highest protocol it supports. An existing file is
    overwritten.
    """
    with gzip.open(filename, 'wb') as sink:
        pkl.Pickler(sink, protocol).dump(obj)

def load_zipped_pickle(filename):
    """Return the object stored in the gzip-compressed pickle at ``filename``.

    Only the first pickled object in the file is read.
    """
    # NOTE: unpickling can execute arbitrary code; only open files that this
    # project produced itself.
    with gzip.open(filename, 'rb') as source:
        return pkl.Unpickler(source).load()

# Tokenize Vietnamese

In [5]:
# The pre-trained Vietnamese vectors are looked up two directory levels above
# the working directory. os.path picks the separator for the current platform,
# so the cwd no longer has to be sniffed for backslashes.
vi_emb_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "cc.vi.300.vec")

# Only the first 100000 entries of the file are kept.
vi_emb = load_vectors(vi_emb_path, 100000)
vi_emb_mat = np.array(list(vi_emb.values()))

# Statistics of the pre-trained values, computed once up front instead of
# being re-evaluated over the whole matrix for every special-token row.
vi_emb_dim = vi_emb_mat.shape[1]
vi_emb_mean = vi_emb_mat.mean()
vi_emb_std = vi_emb_mat.std()

# Prepend one row per special token (ids are assigned further down):
#   row 0     <pad>                -> all zeros
#   rows 1-3  <unk>, <sos>, <eos>  -> uniform noise within one standard
#                                      deviation of the pre-trained mean
# The noise comes from NumPy's global RNG, so these rows differ between runs
# unless a seed is set beforehand.
vi_emb_mat = np.concatenate([
    np.zeros((1, vi_emb_dim)),
    np.random.uniform(vi_emb_mean - vi_emb_std, vi_emb_mean + vi_emb_std, (3, vi_emb_dim)),
    vi_emb_mat,
])

# Uncomment to also cache the raw word -> vector dictionary.
#save_zipped_pickle(vi_emb, emb_dir + "vi_embeddings_100K.p")
save_zipped_pickle(vi_emb_mat, emb_dir + "vi_embeddings_matrix_100K.p")

# Pre-trained words start at id 4; ids 0-3 are the special tokens whose rows
# were prepended above.
vi_id2word = dict(enumerate(vi_emb, start=4))
vi_id2word[0] = "<pad>"
vi_id2word[1] = "<unk>"
vi_id2word[2] = "<sos>"
vi_id2word[3] = "<eos>"

# Every embedding row must have exactly one id.
assert vi_emb_mat.shape[0] == len(vi_id2word)

vi_word2id = {word: idx for idx, word in vi_id2word.items()}

save_zipped_pickle(vi_id2word, emb_dir + "id2word_vi_dic.p")

save_zipped_pickle(vi_word2id, emb_dir + "word2id_vi_dic.p")


In [6]:
def tokenizeVietnamese(vi_path):
    """Tokenize a Vietnamese text file and map every token to a vocabulary id.

    Each line is stripped, lower-cased, has ``_`` turned into a space and
    double quotes removed, is wrapped in ``sos_tag`` / ``eos_tag`` and finally
    split on single spaces (empty pieces are dropped). Tokens that are not in
    ``vi_word2id`` receive the id of ``'<unk>'``.

    Uses the notebook-level names ``sos_tag``, ``eos_tag`` and ``vi_word2id``.

    Args:
        vi_path: Path of a UTF-8 text file with one sentence per line.

    Returns:
        ``(tokens, tokens_num)``: a list of token lists and the parallel list
        of id lists, one entry per input line.
    """
    tokens = []
    with open(vi_path, encoding='utf-8') as vi:
        for raw_line in vi:
            cleaned = raw_line.strip().lower().replace("_", " ").replace("\"", "")
            wrapped = sos_tag + " " + cleaned + " " + eos_tag
            tokens.append([piece for piece in wrapped.split(" ") if piece != ""])

    # Look each token up; out-of-vocabulary tokens share the '<unk>' id.
    tokens_num = [
        [vi_word2id[piece] if piece in vi_word2id else vi_word2id['<unk>'] for piece in sentence]
        for sentence in tokens
    ]

    return tokens, tokens_num

In [7]:
# Tokenize each vi-en split on the Vietnamese side and cache both the string
# tokens and their vocabulary ids as gzip-compressed pickles.
for split in ("train", "test", "dev"):
    source_path = "{}iwslt-vi-en/{}.tok.vi".format(data_dir, split)
    tokens, tokens_num = tokenizeVietnamese(source_path)

    save_zipped_pickle(tokens, "{}vi-en-tokens/{}_vi_tok.p".format(data_dir, split))
    save_zipped_pickle(tokens_num, "{}vi-en-tokens/{}_vi_tok_num.p".format(data_dir, split))


# Tokenize English

In [8]:
# The pre-trained English vectors are looked up two directory levels above
# the working directory. os.path picks the separator for the current platform,
# so the cwd no longer has to be sniffed for backslashes.
en_emb_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "wiki-news-300d-1M.vec")

# Only the first 100000 entries of the file are kept.
en_emb = load_vectors(en_emb_path, 100000)
en_emb_mat = np.array(list(en_emb.values()))

# Statistics of the pre-trained values, computed once up front instead of
# being re-evaluated over the whole matrix for every special-token row.
en_emb_dim = en_emb_mat.shape[1]
en_emb_mean = en_emb_mat.mean()
en_emb_std = en_emb_mat.std()

# Prepend one row per special token (ids are assigned further down):
#   row 0     <pad>                -> all zeros
#   rows 1-3  <unk>, <sos>, <eos>  -> uniform noise within one standard
#                                      deviation of the pre-trained mean
# The noise comes from NumPy's global RNG, so these rows differ between runs
# unless a seed is set beforehand.
en_emb_mat = np.concatenate([
    np.zeros((1, en_emb_dim)),
    np.random.uniform(en_emb_mean - en_emb_std, en_emb_mean + en_emb_std, (3, en_emb_dim)),
    en_emb_mat,
])

# Uncomment to also cache the raw word -> vector dictionary.
#save_zipped_pickle(en_emb, emb_dir + "en_embeddings_100K.p")
save_zipped_pickle(en_emb_mat, emb_dir + "en_embeddings_matrix_100K.p")

# Pre-trained words start at id 4; ids 0-3 are the special tokens whose rows
# were prepended above.
en_id2word = dict(enumerate(en_emb, start=4))
en_id2word[0] = "<pad>"
en_id2word[1] = "<unk>"
en_id2word[2] = "<sos>"
en_id2word[3] = "<eos>"

# Every embedding row must have exactly one id.
assert en_emb_mat.shape[0] == len(en_id2word)

en_word2id = {word: idx for idx, word in en_id2word.items()}

save_zipped_pickle(en_id2word, emb_dir + "id2word_en_dic.p")
save_zipped_pickle(en_word2id, emb_dir + "word2id_en_dic.p")


In [9]:
def englishClean(x):
    """Normalize one English line before it is split into tokens.

    Steps, in order:
      1. Decompose to NFD and drop every character of Unicode category ``Mn``
         (combining marks), which removes accents.
      2. Delete the ``&apos;`` and ``&quot;`` entities.
      3. Surround each ``.``, ``!`` and ``?`` with spaces.

    NOTE(review): the two space-padded ``&quot;`` patterns run after the bare
    ``&quot;`` removal, so they can only match text that this removal itself
    produced. They are kept to preserve behavior exactly; confirm whether a
    different order was intended.
    """
    decomposed = unicodedata.normalize('NFD', x)
    x = ''.join(filter(lambda ch: unicodedata.category(ch) != 'Mn', decomposed))

    substitutions = (
        (r"&apos;", r""),
        (r"&quot;", r""),
        (r" &quot;", r""),
        (r" &quot; ", r""),
        (r"([.!?])", r" \1 "),
    )
    for pattern, replacement in substitutions:
        x = re.sub(pattern, replacement, x)
    return x


def tokenizeEnglish(en_path):
    """Tokenize an English text file and map every token to a vocabulary id.

    Each line is lower-cased, stripped and passed through ``englishClean``,
    then wrapped in ``sos_tag`` / ``eos_tag`` and split on single spaces
    (empty pieces are dropped). Tokens that are not in ``en_word2id`` receive
    the id of ``'<unk>'``.

    Uses the notebook-level names ``sos_tag``, ``eos_tag``, ``englishClean``
    and ``en_word2id``.

    Args:
        en_path: Path of a UTF-8 text file with one sentence per line.

    Returns:
        ``(tokens, tokens_num)``: a list of token lists and the parallel list
        of id lists, one entry per input line.
    """
    def split_line(raw_line):
        # Wrap the cleaned sentence in the boundary tags, then split it.
        wrapped = sos_tag + " " + englishClean(raw_line.lower().strip()) + " " + eos_tag
        return [piece for piece in wrapped.split(" ") if piece != ""]

    with open(en_path, encoding='utf-8') as en:
        tokens = [split_line(raw_line) for raw_line in en]

    tokens_num = []
    for sentence in tokens:
        ids = []
        for piece in sentence:
            # Out-of-vocabulary tokens share the '<unk>' id.
            ids.append(en_word2id[piece] if piece in en_word2id else en_word2id['<unk>'])
        tokens_num.append(ids)

    return tokens, tokens_num
    

In [10]:
# Tokenize the English side of each vi-en split and cache both the string
# tokens and their vocabulary ids as gzip-compressed pickles.
for split in ("train", "test", "dev"):
    source_path = "{}iwslt-vi-en/{}.tok.en".format(data_dir, split)
    tokens, tokens_num = tokenizeEnglish(source_path)

    save_zipped_pickle(tokens, "{}vi-en-tokens/{}_en_tok.p".format(data_dir, split))
    save_zipped_pickle(tokens_num, "{}vi-en-tokens/{}_en_tok_num.p".format(data_dir, split))


In [11]:
# Tokenize the English side of each zh-en split and cache both the string
# tokens and their vocabulary ids as gzip-compressed pickles.
for split in ("train", "test", "dev"):
    source_path = "{}iwslt-zh-en/{}.tok.en".format(data_dir, split)
    tokens, tokens_num = tokenizeEnglish(source_path)

    save_zipped_pickle(tokens, "{}zh-en-tokens/{}_en_tok.p".format(data_dir, split))
    save_zipped_pickle(tokens_num, "{}zh-en-tokens/{}_en_tok_num.p".format(data_dir, split))


In [14]:
# Ad-hoc run on the vi-en English training split only. The result replaces the
# notebook-level `tokens` / `tokens_num`; nothing is written to disk here.
tokens, tokens_num = tokenizeEnglish(
    "{}iwslt-vi-en/{}.tok.en".format(data_dir, 'train')
)

# Tokenize Chinese

In [17]:
# The pre-trained Chinese vectors are looked up two directory levels above
# the working directory. os.path picks the separator for the current platform,
# so the cwd no longer has to be sniffed for backslashes.
zh_emb_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "cc.zh.300.vec")

# Only the first 100000 entries of the file are kept.
zh_emb = load_vectors(zh_emb_path, 100000)
zh_emb_mat = np.array(list(zh_emb.values()))

# Statistics of the pre-trained values, computed once up front instead of
# being re-evaluated over the whole matrix for every special-token row.
zh_emb_dim = zh_emb_mat.shape[1]
zh_emb_mean = zh_emb_mat.mean()
zh_emb_std = zh_emb_mat.std()

# Prepend one row per special token (ids are assigned further down):
#   row 0     <pad>                -> all zeros
#   rows 1-3  <unk>, <sos>, <eos>  -> uniform noise within one standard
#                                      deviation of the pre-trained mean
# The noise comes from NumPy's global RNG, so these rows differ between runs
# unless a seed is set beforehand.
zh_emb_mat = np.concatenate([
    np.zeros((1, zh_emb_dim)),
    np.random.uniform(zh_emb_mean - zh_emb_std, zh_emb_mean + zh_emb_std, (3, zh_emb_dim)),
    zh_emb_mat,
])

# Uncomment to also cache the raw word -> vector dictionary.
#save_zipped_pickle(zh_emb, emb_dir + "zh_embeddings_100K.p")
save_zipped_pickle(zh_emb_mat, emb_dir + "zh_embeddings_matrix_100K.p")

# Pre-trained words start at id 4; ids 0-3 are the special tokens whose rows
# were prepended above.
zh_id2word = dict(enumerate(zh_emb, start=4))
zh_id2word[0] = "<pad>"
zh_id2word[1] = "<unk>"
# Fix: these two entries used to be written into vi_id2word, which left the
# Chinese vocabulary without <sos>/<eos>, so both tags were looked up as <unk>.
zh_id2word[2] = "<sos>"
zh_id2word[3] = "<eos>"

# Every embedding row must have exactly one id.
assert zh_emb_mat.shape[0] == len(zh_id2word)

zh_word2id = {word: idx for idx, word in zh_id2word.items()}

save_zipped_pickle(zh_id2word, emb_dir + "id2word_zh_dic.p")
save_zipped_pickle(zh_word2id, emb_dir + "word2id_zh_dic.p")

In [18]:
def tokenizeChinese(zh_path):
    """Tokenize a Chinese text file and map every token to a vocabulary id.

    Each line is stripped, lower-cased, has ``_`` turned into a space and
    double quotes removed, is wrapped in ``sos_tag`` / ``eos_tag`` and finally
    split on single spaces (empty pieces are dropped), so the input is
    expected to be space-separated already. Tokens that are not in
    ``zh_word2id`` receive the id of ``'<unk>'``.

    Uses the notebook-level names ``sos_tag``, ``eos_tag`` and ``zh_word2id``.

    Args:
        zh_path: Path of a UTF-8 text file with one sentence per line.

    Returns:
        ``(tokens, tokens_num)``: a list of token lists and the parallel list
        of id lists, one entry per input line.
    """
    tokens = []
    tokens_num = []

    with open(zh_path, encoding='utf-8') as zh:
        wrapped_lines = [
            sos_tag + " " + raw_line.strip().lower().replace("_", " ").replace("\"", "") + " " + eos_tag
            for raw_line in zh
        ]

    for wrapped in wrapped_lines:
        tokens.append([piece for piece in wrapped.split(" ") if piece != ""])

    for sentence in tokens:
        # Out-of-vocabulary tokens share the '<unk>' id.
        tokens_num.append([
            zh_word2id[piece] if piece in zh_word2id else zh_word2id['<unk>']
            for piece in sentence
        ])

    return tokens, tokens_num

In [23]:
# Tokenize each zh-en split on the Chinese side and cache both the string
# tokens and their vocabulary ids as gzip-compressed pickles.
for split in ("train", "test", "dev"):
    source_path = "{}iwslt-zh-en/{}.tok.zh".format(data_dir, split)
    tokens, tokens_num = tokenizeChinese(source_path)

    save_zipped_pickle(tokens, "{}zh-en-tokens/{}_zh_tok.p".format(data_dir, split))
    save_zipped_pickle(tokens_num, "{}zh-en-tokens/{}_zh_tok_num.p".format(data_dir, split))
