In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import numpy as np
import pdb
import pandas as pd
from tqdm import tqdm
import io
import pickle
import os
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.optim import lr_scheduler

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
# Reserved vocabulary ids. They follow the order in which get_vocabulary
# places the special tokens: '<PAD>', '<SOS>', '<EOS>', '<UNK>'.
PAD_idx, SOS_idx, EOS_idx, UNK_idx = 0, 1, 2, 3

# Batch size and maximum sentence length (not read by the preprocessing
# cells shown here; presumably consumed by later training code).
batch_size = 256
MAX_SENTENCE_LENGTH = 70

In [3]:
# Root data directory; load_pickle prepends it to every cache file name.
data_prefix = '../data/'
# Pretrained word-vector files for the target (en) and source (vi) languages.
tgt_pretrained_path = data_prefix + 'cc.en.300.vec'
src_pretrained_path = data_prefix + 'cc.vi.300.vec'

## Read data

In [4]:
# Read the tokenised English (target) side of every split, one sentence per line.
# The context managers close each file as soon as it has been read instead of
# leaving the handle open until garbage collection.
with open('../data/iwslt-vi-en/train.tok.en', encoding='utf-8') as tgt_file:
    train_tgt = tgt_file.read().strip().split('\n')
with open('../data/iwslt-vi-en/dev.tok.en', encoding='utf-8') as tgt_file:
    val_tgt = tgt_file.read().strip().split('\n')
with open('../data/iwslt-vi-en/test.tok.en', encoding='utf-8') as tgt_file:
    test_tgt = tgt_file.read().strip().split('\n')
train_tgt[:5]

['Rachel Pike : The science behind a climate headline',
 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .',
 'I &apos;d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper .',
 'Headlines that look like this when they have to do with climate change , and headlines that look like this when they have to do with air quality or smog .',
 'They are both two branches of the same field of atmospheric science .']

In [5]:
# Read the tokenised Vietnamese (source) side of every split, one sentence per line.
# The context managers close each file as soon as it has been read instead of
# leaving the handle open until garbage collection.
with open('../data/iwslt-vi-en/train.tok.vi', encoding='utf-8') as src_file:
    train_src = src_file.read().strip().split('\n')
with open('../data/iwslt-vi-en/dev.tok.vi', encoding='utf-8') as src_file:
    val_src = src_file.read().strip().split('\n')
with open('../data/iwslt-vi-en/test.tok.vi', encoding='utf-8') as src_file:
    test_src = src_file.read().strip().split('\n')
train_src[:5]

['Khoa_học đằng_sau một tiêu_đề về khí_hậu',
 'Trong 4 phút , chuyên_gia hoá_học khí_quyển Rachel Pike giới_thiệu sơ_lược về những nỗ_lực khoa_học miệt_mài đằng_sau những tiêu_đề táo_bạo về biến_đổi khí_hậu , cùng với đoàn nghiên_cứu của mình - - hàng ngàn người đã cống_hiến cho dự_án này - - một chuyến bay mạo_hiểm qua rừng_già để tìm_kiếm thông_tin về một phân_tử then_chốt .',
 'Tôi muốn cho các bạn biết về sự to_lớn của những nỗ_lực khoa_học đã góp_phần làm_nên các dòng tít bạn thường thấy trên báo .',
 'Có những dòng trông như thế_này khi bàn về biến_đổi khí_hậu , và như thế_này khi nói về chất_lượng không_khí hay khói bụi .',
 'Cả hai đều là một nhánh của cùng một lĩnh_vực trong ngành khoa_học khí_quyển .']

## Data statistics

In [6]:
# Report the number of sentences in each split, counted on the English (target) lists.
print ('Length of train data:', len(train_tgt))
print ('Length of val data:', len(val_tgt))
print ('Length of test data:', len(test_tgt))

Length of train data: 133317
Length of val data: 1268
Length of test data: 1553


In [7]:
def to_dataframe(en, ch, len_raio=0.95):
    """
    Pair up parallel samples in a DataFrame and pick a length cut-off.
    param en: list of English samples (stored in column 'en')
    param ch: list of samples in the other language (stored in column 'vi')
    param len_raio: quantile, expected in [0, 1], at which the cut-off is read
    return:
            (df, cutoff): df has the columns 'en', 'en_len', 'vi', 'vi_len';
            cutoff is the larger of the two per-language lengths found at the
            len_raio quantile, or 0 when there are no samples
    note: lengths are len(sample); for str samples this is a character count,
          not a token count.
    """
    en_len_list = [len(sample) for sample in en]
    ch_len_list = [len(sample) for sample in ch]
    df = pd.DataFrame({'en': en,
                       'en_len': en_len_list,
                       'vi': ch,
                       'vi_len': ch_len_list
                      })

    def length_at_ratio(lengths):
        # An empty input has no length to report.
        if not lengths:
            return 0
        # Clamp the position so len_raio == 1.0 selects the maximum instead of
        # indexing one past the end of the sorted list.
        position = int(len_raio * len(lengths))
        position = max(0, min(position, len(lengths) - 1))
        return sorted(lengths)[position]

    return df, max(length_at_ratio(en_len_list), length_at_ratio(ch_len_list))

## Data Preprocessing 



In [8]:
# Get the vocabulary based on the training data
from collections import Counter

def get_vocabulary(data, VOCABULARY_SIZE_en=10000, VOCABULARY_SIZE_zh=10000, min_time=None):
    """
    Build the id <-> token tables of both languages from a sentence DataFrame.
    param data: DataFrame whose 'en' and 'vi' columns hold whitespace-separated sentences
    param VOCABULARY_SIZE_en: number of most frequent 'en' tokens to keep
    param VOCABULARY_SIZE_zh: number of most frequent 'vi' tokens to keep
    param min_time: accepted but never read by this function
    return:
            en_id2token, en_token2id, ch_id2token, ch_token2id
            (ids 0-3 are '<PAD>', '<SOS>', '<EOS>', '<UNK>' in both tables)
    """
    special_tokens = ['<PAD>','<SOS>','<EOS>','<UNK>']

    # Count tokens row by row, in first-occurrence order.
    en_token_counter, ch_token_counter = Counter(), Counter()
    for _, row in data.iterrows():
        en_token_counter.update(row['en'].split())
        ch_token_counter.update(row['vi'].split())
    print ("Number of en words: {}, vi words: {}".format(len(en_token_counter), len(ch_token_counter)))

    en_word, en_count = zip(*en_token_counter.most_common(VOCABULARY_SIZE_en))
    ch_word, ch_count = zip(*ch_token_counter.most_common(VOCABULARY_SIZE_zh))
    # Frequency of the rarest token that still made it into each vocabulary.
    print (en_count[-1], ch_count[-1])

    def build_tables(words):
        # Special tokens first, then the kept words in frequency order.
        id2token = special_tokens + list(words)
        token2id = dict(zip(id2token, np.arange(len(id2token))))
        return id2token, token2id

    en_id2token, en_token2id = build_tables(en_word)
    ch_id2token, ch_token2id = build_tables(ch_word)
    return en_id2token, en_token2id, ch_id2token, ch_token2id

In [9]:
def load_vectors(fname):
    """
    load the pretrained word embeddings
    param fname: the path to the word embedding text file; its first line holds
                 two integers, every following line is "<word> <v1> <v2> ..."
    return:
            a dictionary of the {word: embedding}, each embedding a list of floats
    raise: ValueError if the header line is not two integers or a vector
           component is not a number
    """
    # The context manager closes the file even when parsing fails part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header values are not needed afterwards; parsing them validates
        # the format and moves the cursor to the first vector line.
        _count, _dim = map(int, fin.readline().split())
        data = {}
        for line in tqdm(fin):
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data

In [10]:
def load_pickle(data_path, callback, *callback_args):
    """
    Create huge file with the callback function if not exist, otherwise load directly
    param data_path: file name relative to data_prefix; loaded if it exists,
                     otherwise the place where the created data is stored
    param callback: the function to generate the data if not exist
    param callback_args: the arguments for the callback
    return:
            the data, either loaded or created by callback
    note: the cache is keyed by the file name only. When the file exists the
          callback and its arguments are ignored, so delete the file after
          changing anything the cached data depends on.
    """

    data_path = data_prefix+data_path
    if os.path.isfile(data_path):
        print ('File exists, load from path...')
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that were produced locally by this notebook.
        with open(data_path, 'rb') as cache_file:
            data = pickle.load(cache_file)
    else:
        print ('File not exists, creating...')
        data = callback(*callback_args)
        # Closing the handle explicitly guarantees the cache is flushed to disk.
        with open(data_path, 'wb') as cache_file:
            pickle.dump(data, cache_file)
    return data

In [11]:
def get_embedding_weights(whole_vec, id2token):
    """
    get the embeddings based on the word, create the embedding matrix
    param whole_vec: the dictionary of pretrained embeddings {word: vector}
    param id2token: the whole vocabulary, indexed by token id
    return:
            (weight, mask): weight is a (len(id2token), dim) array holding one
            row per token; mask is a length-len(id2token) array with 1 on the
            rows that fell back to the 'UNK' vector and 0 elsewhere
    raise: ValueError if whole_vec is empty; KeyError if a fallback is needed
           and whole_vec has no 'UNK' entry
    """
    # Take the width from 'sky' when it is present (the previous behaviour),
    # otherwise from any entry, so tables without that word still work.
    if 'sky' in whole_vec:
        dim = len(whole_vec['sky'])
    elif whole_vec:
        dim = len(next(iter(whole_vec.values())))
    else:
        raise ValueError("whole_vec is empty; cannot infer the embedding dimension")

    weight = np.zeros((len(id2token), dim))
    mask = np.zeros((len(id2token)))
    # Row 0 is skipped on purpose and therefore stays all-zero.
    for idx, word in enumerate(id2token[1:], start=1):
        if word in whole_vec:
            weight[idx] = np.array(whole_vec[word])
        elif idx <= 3:
            # Rows 1-3 ('<SOS>', '<EOS>', '<UNK>' in the vocabularies built by
            # get_vocabulary) get a random vector. It is drawn at the table's
            # width; a hard-coded 300 failed for any other dimension.
            weight[idx] = np.random.randn(dim)
        else:
            weight[idx] = np.array(whole_vec['UNK'])
            mask[idx] = 1
            print ("Out of vocabulary word: ", word)
    return weight, mask

In [12]:
#Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    """Return s in NFD form with every combining mark (category 'Mn') dropped."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """
    Lowercase s, strip accents, and keep only ASCII letters and . ! ?
    param s: the raw sentence
    return:
            the normalised sentence: '.', '!' and '?' are split off as their own
            tokens, every other run of non-letters becomes a single space, and
            the result carries no leading or trailing whitespace
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    # NOTE(review): HTML entities are not unescaped first, so "&apos;d" ends up
    # as the two tokens "apos d".
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # The replacements above can leave a space at either end (e.g. when the
    # sentence starts with a digit or a quote), so trim once more.
    return s.strip()

def normalizeZh(s):
    """Trim s and collapse every run of whitespace into a single space."""
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    return re.sub(r'\s+', ' ', s.strip())

def normalizeVi(s):
    """
    Lowercase s and remove digits and a fixed set of punctuation characters.
    param s: the raw sentence
    return:
            the normalised sentence with whitespace runs collapsed to a single
            space and no leading or trailing whitespace ('.', '!', '?' and '_'
            are left untouched)
    """
    s = s.lower()
    s = re.sub(r"[-()\"#/@;:<>{}`+=~|,]", "", s)
    s = re.sub(r"[0-9]", "", s)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    s = re.sub(r'\s+', ' ', s)
    # Trim last: removing a leading digit or trailing punctuation would
    # otherwise leave a space at the edge of the sentence.
    return s.strip()

In [13]:
def token_to_index(sentence, token2id):
    """
    Map every sentence to the list of its token ids.
    param sentence: iterable of sentence strings
    param token2id: dictionary {token: id}
    return:
            one list of ids per sentence, in input order; tokens missing from
            token2id are mapped to UNK_idx, and a sentence without any token
            yields an empty list
    """
    indicies_data = []
    for s in sentence:
        # split() without an argument matches the tokenisation used in
        # get_vocabulary and never yields '' tokens. split(' ') produced one
        # for every leading, trailing or doubled space, and each of those was
        # encoded as a spurious UNK.
        index_list = [token2id.get(token, UNK_idx) for token in s.split()]
        indicies_data.append(index_list)
    return indicies_data

In [14]:
# Normalise the English (target) side of every split.
normalize_train_tgt = list(map(normalizeString, train_tgt))
normalize_val_tgt = list(map(normalizeString, val_tgt))
normalize_test_tgt = list(map(normalizeString, test_tgt))

In [15]:
# Preview the first five normalised English training sentences.
normalize_train_tgt[:5]

['rachel pike the science behind a climate headline',
 'in minutes atmospheric chemist rachel pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change with her team one of thousands who contributed taking a risky flight over the rainforest in pursuit of data on a key molecule .',
 'i apos d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper .',
 'headlines that look like this when they have to do with climate change and headlines that look like this when they have to do with air quality or smog .',
 'they are both two branches of the same field of atmospheric science .']

In [16]:
# Normalise the Vietnamese (source) side of every split.
normalize_train_src = list(map(normalizeVi, train_src))
normalize_val_src = list(map(normalizeVi, val_src))
normalize_test_src = list(map(normalizeVi, test_src))

In [17]:
# Preview the first normalised Vietnamese training sentence.
normalize_train_src[0]

'khoa_học đằng_sau một tiêu_đề về khí_hậu'

In [18]:
# Build one DataFrame per split. to_dataframe also returns a length cut-off;
# only the training split's value is kept (pad_len), the others are discarded.
# NOTE(review): to_dataframe measures len() of the sentence strings, so pad_len
# is a character count rather than a token count -- confirm that is intended.
train_df, pad_len = to_dataframe(normalize_train_tgt, normalize_train_src)
val_df, _ = to_dataframe(normalize_val_tgt, normalize_val_src)
test_df, _ = to_dataframe(normalize_test_tgt, normalize_test_src)

In [19]:
# Keep the 6600 most frequent English and the 11000 most frequent Vietnamese
# training tokens (plus the four special tokens).
# NOTE(review): min_time=30 has no effect -- get_vocabulary never reads it.
tgt_id2token, tgt_token2id, src_id2token, src_token2id = get_vocabulary(train_df,6600,11000, min_time=30)

Number of en words: 41267, vi words: 33913
19 6


In [20]:
# Encode the English (target) sentences of every split as lists of token ids,
# using the vocabulary built from the training data.
train_tgt_indicies = token_to_index(normalize_train_tgt, tgt_token2id)
val_tgt_indicies = token_to_index(normalize_val_tgt, tgt_token2id)
test_tgt_indicies = token_to_index(normalize_test_tgt, tgt_token2id)

In [21]:
# Encode the Vietnamese (source) sentences of every split as lists of token ids,
# using the vocabulary built from the training data.
train_src_indicies = token_to_index(normalize_train_src, src_token2id)
val_src_indicies = token_to_index(normalize_val_src, src_token2id)
test_src_indicies = token_to_index(normalize_test_src, src_token2id)

In [22]:
tgt_pretrain_emb, zh_pretrain_emb = [], []
# The raw English vectors are only needed to build the embedding matrix, so
# load them just when the cached matrix is missing. Passing the empty list
# unconditionally made get_embedding_weights fail on any machine that did not
# already have the cache file.
if not os.path.isfile(data_prefix + 'eng_embedding6.6k_vi.p'):
    print ('-'*100)
    tgt_pretrain_emb = load_pickle('en_pretrain_emb.p', load_vectors, tgt_pretrained_path)
print ('-'*100)
tgt_embedding, _ = load_pickle('eng_embedding6.6k_vi.p', get_embedding_weights, tgt_pretrain_emb, tgt_id2token)

----------------------------------------------------------------------------------------------------
File exists, load from path...


In [23]:
# Load the Vietnamese pretrained vectors (cached as a pickle after the first
# parse), then build or load the source embedding matrix. The mask returned
# by get_embedding_weights is discarded.
# NOTE: the raw vectors are loaded on every run, even when the cached matrix
# already exists.
print ('-'*100)
src_pretrain_emb = load_pickle('vi_pretrain_emb.p', load_vectors, src_pretrained_path)
print ('-'*100)
src_embedding, _ = load_pickle('vi_embedding1.1w.p', get_embedding_weights, src_pretrain_emb, src_id2token)

1106it [00:00, 11059.11it/s]

----------------------------------------------------------------------------------------------------
File not exists, creating...


2000000it [02:08, 15565.33it/s]


----------------------------------------------------------------------------------------------------
File not exists, creating...
Out of vocabulary word:  chúng_ta
Out of vocabulary word:  có_thể
Out of vocabulary word:  chúng_tôi
Out of vocabulary word:  tất_cả
Out of vocabulary word:  thế_giới
Out of vocabulary word:  thực_sự
Out of vocabulary word:  bắt_đầu
Out of vocabulary word:  vấn_đề
Out of vocabulary word:  thay_đổi
Out of vocabulary word:  sử_dụng
Out of vocabulary word:  bởi_vì
Out of vocabulary word:  thế_nào
Out of vocabulary word:  không_thể
Out of vocabulary word:  bây_giờ
Out of vocabulary word:  đầu_tiên
Out of vocabulary word:  quan_trọng
Out of vocabulary word:  con_người
Out of vocabulary word:  thời_gian
Out of vocabulary word:  phát_triển
Out of vocabulary word:  nghiên_cứu
Out of vocabulary word:  làm_việc
Out of vocabulary word:  có_một
Out of vocabulary word:  tại_sao
Out of vocabulary word:  hệ_thống
Out of vocabulary word:  câu_chuyện
Out of vocabulary word: 

Out of vocabulary word:  luyện_tập
Out of vocabulary word:  niềm_vui
Out of vocabulary word:  hàng_xóm
Out of vocabulary word:  đám_đông
Out of vocabulary word:  nhưng_mà
Out of vocabulary word:  tiên_phong
Out of vocabulary word:  ưu_tiên
Out of vocabulary word:  sinh_sản
Out of vocabulary word:  của_riêng
Out of vocabulary word:  chờ_đợi
Out of vocabulary word:  nhắc_nhở
Out of vocabulary word:  giám_đốc
Out of vocabulary word:  con_chuột
Out of vocabulary word:  xe_tải
Out of vocabulary word:  ghi_nhớ
Out of vocabulary word:  sao_hoả
Out of vocabulary word:  làm_bạn
Out of vocabulary word:  sự_việc
Out of vocabulary word:  vệ_sinh
Out of vocabulary word:  nhân_tố
Out of vocabulary word:  tị_nạn
Out of vocabulary word:  ranh_giới
Out of vocabulary word:  luật_lệ
Out of vocabulary word:  tập_thể
Out of vocabulary word:  giác_quan
Out of vocabulary word:  chấm_dứt
Out of vocabulary word:  định_hướng
Out of vocabulary word:  tiến_trình
Out of vocabulary word:  trích_dẫn
Out of vocabular

Out of vocabulary word:  tổng_kết
Out of vocabulary word:  phương_cách
Out of vocabulary word:  phòng_bệnh
Out of vocabulary word:  đồng_tiền
Out of vocabulary word:  vị_thành_niên
Out of vocabulary word:  có_lý
Out of vocabulary word:  đồng_hành
Out of vocabulary word:  chối_bỏ
Out of vocabulary word:  sức_sống
Out of vocabulary word:  lấy_làm
Out of vocabulary word:  bao_trùm
Out of vocabulary word:  đồng_bằng
Out of vocabulary word:  thiểu_số
Out of vocabulary word:  trớ_trêu
Out of vocabulary word:  bí_quyết
Out of vocabulary word:  giảm_dần
Out of vocabulary word:  tưởng_chừng
Out of vocabulary word:  mặt_phẳng
Out of vocabulary word:  phân_giải
Out of vocabulary word:  hân_hoan
Out of vocabulary word:  hướng_ngoại
Out of vocabulary word:  lý_thú
Out of vocabulary word:  háo_hức
Out of vocabulary word:  cố_ý
Out of vocabulary word:  săn_bắn
Out of vocabulary word:  chọc_trời
Out of vocabulary word:  khốn_khổ
Out of vocabulary word:  hình_trụ
Out of vocabulary word:  hạt_phấn
Out o

Out of vocabulary word:  thiên_chúa_giáo
Out of vocabulary word:  tay_vịn
Out of vocabulary word:  đấu_giá
Out of vocabulary word:  chần_chừ
Out of vocabulary word:  ngọn_đèn
Out of vocabulary word:  di_căn
Out of vocabulary word:  nhiễm_sắc_thể
Out of vocabulary word:  hôm_trước
Out of vocabulary word:  bịt_mắt
Out of vocabulary word:  bão_hoà
Out of vocabulary word:  tinh_hoàn
Out of vocabulary word:  chất_đốt
Out of vocabulary word:  trữ_lượng
Out of vocabulary word:  bồi_thường
Out of vocabulary word:  đền_bù
Out of vocabulary word:  tiền_công
Out of vocabulary word:  kì_thi
Out of vocabulary word:  thực_dân
Out of vocabulary word:  máy_phát_điện
Out of vocabulary word:  khả_dĩ
Out of vocabulary word:  thần_tiên
Out of vocabulary word:  huck
Out of vocabulary word:  tin_học
Out of vocabulary word:  tan_nát
Out of vocabulary word:  clonie
Out of vocabulary word:  buồng_lái
Out of vocabulary word:  cphi
Out of vocabulary word:  học_kỳ
Out of vocabulary word:  rửa_tiền
Out of vocabula

In [24]:
class language(object):
    """
    Plain container bundling everything stored for one language.

    Attributes set by the constructor:
        name          -- the name passed in
        idx2token     -- i2t, the id -> token table
        token2idx     -- t2i, the token -> id table
        embedding_mat -- embedding_matrix
        train_idx     -- train, the index-encoded training data
        test_idx      -- test, the index-encoded test data
        val_idx       -- val, the index-encoded validation data
    """

    def __init__(self, name, i2t, t2i, embedding_matrix, train, test, val):
        # Vocabulary-related fields.
        self.name, self.idx2token, self.token2idx = name, i2t, t2i
        self.embedding_mat = embedding_matrix
        # Index-encoded splits (note the argument order: train, test, val).
        self.train_idx, self.test_idx, self.val_idx = train, test, val
        
# Bundle the vocabulary, embedding matrix and index-encoded splits per language.
vi = language('Vietnam', src_id2token, src_token2id, src_embedding, train_src_indicies, test_src_indicies, val_src_indicies)
en = language('English', tgt_id2token, tgt_token2id, tgt_embedding, train_tgt_indicies, test_tgt_indicies, val_tgt_indicies)

data = {
        'src': vi,
        'tgt': en
       }

# Write the bundle through a context manager so the file is flushed and closed
# as soon as the dump finishes.
with open(data_prefix+'vi1.1w-en6k.p', 'wb') as bundle_file:
    pickle.dump(data, bundle_file)