In [1]:
# coding=utf-8
import sys
sys.path.append('..')

import os
import io
import random
import copy 
import re
import html

from time import time 
from multiprocessing import Pool
from collections import Counter

from functools import partial

from tqdm import tqdm_notebook
import pythainlp
from pythainlp.util import *
from pythainlp.tokenize import word_tokenize
from pythainlp.ulmfit import *

# subword-nmt
from subword_nmt import learn_bpe as learner
from subword_nmt import apply_bpe as subword_tokenizer

import fairseq 
from datetime import timedelta
from tqdm import tqdm, tqdm_notebook
from pythainlp.tokenize import DEFAULT_DICT_TRIE

from pythainlp.corpus import thai_words

print(pythainlp.__version__)
# assert pythainlp.__version__ == '2.1'

import tensorflow as tf
import tensorflow_hub as hub
import numpy as np


2.0.7


In [9]:
# install BPEmb (BPE embeddings)

!pip install --q bpemb emoji subword-nmt fairseq tensorflow_hub sentencepiece tf_sentencepiece

In [2]:
# # USE

# g = tf.Graph()
# with g.as_default():
#     text_input = tf.placeholder(dtype=tf.string, shape=[None])
#     embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/1")
#     embedded_text = embed(text_input)
#     init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
# g.finalize()

# # def compute_similarity(src, tgt):
# #     """
# #         Calculate sentence similarity based on Google Universal Sentence Encoder (Multilingual Large)
# #     """

# # # Initialize session.
# # session = tf.Session(graph=g)
# # session.run(init_op)

# # # Compute embeddings.
# # en_result = session.run(embedded_text, feed_dict={text_input: english_sentences})
# # it_result = session.run(embedded_text, feed_dict={text_input: italian_sentences})
# # ja_result = session.run(embedded_text, feed_dict={text_input: japanese_sentences})

# # # Compute similarity matrix. Higher score indicates greater similarity.
# # similarity_matrix_it = np.inner(en_result, it_result)
# # similarity_matrix_ja = np.inner(en_result, ja_result)

In [3]:
from bpemb import BPEmb

bpemb_pretrained ={
    'th': {
        '25000': BPEmb(lang="th", vs=25000)
    },
    'en': {
        '25000': BPEmb(lang="en", vs=25000)
    }
}


In [4]:
with open('../data/opensubtitle_v2018/OpenSubtitles.en-th.en','r', encoding='utf-8') as f:
    en = f.read().split('\n')
len(en),en[:3]


(3281534,
 ['Slave in the Magic Mirror, come from the farthest space.',
  'Through wind and darkness, I summon thee.',
  'Speak!'])

In [5]:
with open('../data/opensubtitle_v2018/OpenSubtitles.en-th.th','r', encoding='utf-8') as f:
    th = f.read().split('\n')
    
len(th),th[:3]

(3281534,
 ['ทาสในกระจกวิเศษ, มาจากพื้นที่ที่ไกลที่สุด',
  'ผ่านลมและความมืดฉันเรียกเจ้า',
  'พูด!'])

## 1. Preprocess Opensubtitles_v2018

In [6]:
from mt_opus.preprocess import (
    SentenceLengthLessThanOrEqualToOne,
    SentenceContainsUnknownSymbol,
    SentenceContainsAdSymbol,
    ThaiSentenceContainsNoThaiCharacters,
    ThaiSentenceContainsNoThaiCharactersPattern,

    UnescapeString,
    RemoveUnwantedSymbols,
    RemoveUnwantedPattern,
    ReplaceDashInSentence,
    RemoveHashtagInSentence,
    RemoveFullStopInThaiSentence,
    NormalizeThaiVowel,
    
    SentencePairFoundRepeatedText,
)

In [7]:
# to filter out a sentence pair
filtering_rules = [
    SentenceLengthLessThanOrEqualToOne,
    SentenceContainsAdSymbol,
    SentenceContainsUnknownSymbol,
    ThaiSentenceContainsNoThaiCharacters,
    ThaiSentenceContainsNoThaiCharactersPattern,
]

cleaning_rules = [
    UnescapeString,
    RemoveUnwantedSymbols,
    RemoveUnwantedPattern,
    ReplaceDashInSentence,
    RemoveHashtagInSentence,
    RemoveFullStopInThaiSentence,
    NormalizeThaiVowel,
]

filtering_sentence_pair_rules = [
    SentencePairFoundRepeatedText
]

In [8]:
# this could be parallelized
def filter_sentence(sentence, lang, rules=filtering_rules):
    """
        Return True if a sentence match filtering pattern
    """
    for rule in rules:
        rule_obj = rule()
        if rule_obj.test(sentence, lang=lang):
            return True
    return False

def filter_sentences(sentences, lang, rules=filtering_rules):
    """
        Returns a list of Boolean value, if such element is True, it means filter that sentence pair.
        Otherwise, keep that sentnence pair.
    """
    filtering_indices = []
    p = Pool() # use all available cores
    t = time()

    _filter_sentence = partial(filter_sentence, lang=lang)
    filtering_indices = p.map(_filter_sentence, sentences)
    
    p.close()
    p.join() # call Pool.join() to wait for the worker processes to terminate.

    filtering_indices_np = np.array(filtering_indices)
    number_of_filtered_out = np.sum(filtering_indices_np)
    
    print('Time taken: {} s'.format(time() -t))
    print('# sentences ({}) before filtered out'.format(lang), len(sentences))
    print('# sentences ({})filtered out'.format(lang), number_of_filtered_out)
    print('# sentences ({})after filtered out'.format(lang), len(sentences) - number_of_filtered_out)

    return filtering_indices_np, number_of_filtered_out


def filter_sentence_pair(sentence_pairs, rules=filtering_sentence_pair_rules):
    """
        Return True if a sentence match filtering sentence pair pattern
    """
    for rule in rules:
        rule_obj = rule()
        if rule_obj.test(sentence_pairs):
            return True
    return False

def filter_sentence_pairs(sentence_pairs, rules=filtering_sentence_pair_rules):
    """
        Returns a list of Boolean value, if such element is True, it means filter that sentence pair.
        Otherwise, keep that sentnence pair.
    """
    filtering_indices = []
    p = Pool() # use all available cores
    t = time()

    _filter_sentence_pair = partial(filter_sentence_pair)
    filtering_indices = p.map(_filter_sentence_pair, sentence_pairs)
    
    p.close()
    p.join() # call Pool.join() to wait for the worker processes to terminate.

    filtering_indices_np = np.array(filtering_indices)
    number_of_filtered_out = np.sum(filtering_indices_np)
    
    print('Time taken: {} s'.format(time() -t))
    print('# sentences before filtered out', len(sentence_pairs))
    print('# sentences filtered out', number_of_filtered_out)
    print('# sentences after filtered out', len(sentence_pairs) - number_of_filtered_out)

    return filtering_indices_np, number_of_filtered_out


def clean_sentence(sentence, lang, rules=cleaning_rules):
    for rule in rules: 
        rule_obj = rule()
        if rule_obj.test(sentence, lang=lang):
            sentence = rule_obj.replace(sentence, lang=lang)
    return sentence

def clean_sentences(sentences, lang, rules=cleaning_rules):
    """
        Clean the sentence with the specified text cleaning rules
        Return a list of cleaned sentences
    """
    p = Pool() # use all available cores
    t = time()
    
    _clean_sentence = partial(clean_sentence, lang=lang)
    cleaned_sentences = p.map(_clean_sentence, sentences)
    
    p.close()
    p.join() # call Pool.join() to wait for the worker processes to terminate.

    print('Time taken: {} s'.format(time() -t))
    
    return cleaned_sentences


### 1.1 Filter by each language 

In [9]:
filtering_indices_th, _ = filter_sentences(th, lang='th')

Time taken: 20.27482318878174 s
# sentences (th) before filtered out 3281534
# sentences (th)filtered out 77301
# sentences (th)after filtered out 3204233


In [10]:
filtering_indices_en, _ = filter_sentences(en, lang='en')

Time taken: 6.393211841583252 s
# sentences (en) before filtered out 3281534
# sentences (en)filtered out 4453
# sentences (en)after filtered out 3277081


In [12]:
filtering_indices_th_en = filtering_indices_th | filtering_indices_en

print('# filtering_indices_th_en', sum(filtering_indices_th_en))

filtered_th = [th[i] for i, filtered_out in enumerate(filtering_indices_th_en) if not filtered_out]
filtered_en = [en[i] for i, filtered_out in enumerate(filtering_indices_th_en) if not filtered_out]

# filtering_indices_th_en 79519


In [13]:
print('# filtered_th', len(filtered_th))
print('# filtered_en', len(filtered_en))

# filtered_th 3202015
# filtered_en 3202015


In [135]:
3281534 - 78930

3202604

### 1.2 Filter by pair of source and target language

In [14]:

th_en_tuples = [(filtered_th[i], filtered_en[i]) for i in range(0, len(filtered_th))]
filtering_pairs_indices_th_en, _ = filter_sentence_pairs(th_en_tuples)

filtered_pairs_th = [filtered_th[i] for i, filtered_out in enumerate(filtering_pairs_indices_th_en) if not filtered_out]
filtered_pairs_en = [filtered_en[i] for i, filtered_out in enumerate(filtering_pairs_indices_th_en) if not filtered_out]




Time taken: 3.5652689933776855 s
# sentences before filtered out 3202015
# sentences filtered out 574
# sentences after filtered out 3201441


In [15]:
# th_en_tuples

In [16]:
print('# filtered_pairs_th', len(filtered_pairs_th))
print('# filtered_pairs_en', len(filtered_pairs_en))

# filtered_pairs_th 3201441
# filtered_pairs_en 3201441


In [17]:
print('โช'.encode('utf-8'))

b'\xe0\xb9\x82\xc2\x99\xe0\xb8\x8a'


In [21]:
c = Counter()
for i in range(len(filtered_th)):
    
    if re.search('^โช', filtered_th[i]):
        c['found'] += 1
        if c['found'] <= 100:
            print(i)
            print(filtered_th[i])
            print(filtered_en[i])
            print()

c

Counter()

In [30]:
cleaned_th = clean_sentences(filtered_pairs_th, lang="th")

Time taken: 67.09233093261719 s


In [23]:
cleaned_en = clean_sentences(filtered_pairs_en, lang="en")

Time taken: 27.437071800231934 s


In [31]:
print('cleaned_th', len(cleaned_th))

cleaned_th 3201441


In [32]:
with open('./sent.th', 'w') as f:
    for sent in cleaned_th:
        f.write(sent + '\n')



In [33]:
with open('./sent.en', 'w') as f:
    for sent in cleaned_en:
        f.write(sent + '\n')


## Explore cleaned sentences

In [27]:
c = Counter()
for i, sent in enumerate(cleaned_th):
    if ';' in sent:
        c['found'] += 1
        if c['found'] <= 5:
            print(sent,'<>', cleaned_en[i])
            print()
c

แสงดาว, ดาวสดใส ดาวแรกที่ฉันเห็นคืนนี้; <> Star light, star bright, first star I see tonight;

มันจะเปิดพื้นที่ป่าไม้นี้ การพัฒนาซึ่งจะเป็นประโยชน์ต่อเศรษฐกิจ; <> and it would drive a harpoon right into the heart of the Communist concentration.

"จุดประสงค์ของเรื่องราวคือ to inflame ราคะ; ทั้งหมดจะถูกยินยอม <> "The Great Hall will be adequately heated

เรื่องราวของฉันจะสนใจท่าน... ...แด่ของเราโดยเฉพาะประธานาธิบดี; <> Enough. I am eager to hear Signora Maggi's voice

ไม่คิดว่าน้ำตาของคุณ .ยับยั้ง my desire ; ที่เขาทำฉันไม่มีความเมตตามากกว่า <> Go on!



Counter({'found': 282})

In [35]:
c = Counter()
print('len th', len(th))
print('len filtered_th', len(filtered_th))
print('len filtered_pairs_th', len(filtered_pairs_th))
print('len cleaned_th', len(cleaned_th))
print('')
for i, sent in enumerate(cleaned_th):
    if 'โช I' in sent:
        c['found'] += 1
        if c['found'] <= 5:
            print(filtered_th[i])
            print(filtered_pairs_th[i])
            print(cleaned_th[i])
            print(cleaned_en[i])
            print()
print(c)

len th 3281534
len filtered_th 3202015
len filtered_pairs_th 3201441
len cleaned_th 3201441

Counter()


In [113]:
print(len(filtered_pairs_th))
print(len(cleaned_th))

3202027
3202027


In [37]:
# important
c = Counter()
for i, sent in enumerate(cleaned_th):
    if cleaned_en[i] in sent:
        c['found'] += 1
        if c['found'] <= 10:
            print(sent,'<>', cleaned_en[i])
            print()
c

Ask my sisters - แม่! <> Ask my sisters

นายทำสิ่งที่นายต้องทำ ลิงค์ <> 

ฉันเสียใจจริงๆ ที่ต้องบอกว่านายไม่ใช่ปัญหาอีกต่อไปแล้ว <> 

ทำไมมาข้างหลังล่ะ <> 

มีคนเกาะอยู่สองคน <> 

นิ่งไว้ <> 

ไปเร็ว เร็วๆๆ <> 

วิสท์เลอร์ตกไปแล้ว <> 

Missile Command - โมริโมโต้บอกว่า <> Missile Command

โทษทีนะ เวลานี้ไม่เหมาะ <> 



Counter({'found': 201})

In [102]:
# important
c = Counter()
for i, sent in enumerate(filtered_pairs_th):
    if 'โช I ' in sent:
        c['found'] += 1
        if c['found'] <= 10:
            print(sent,'<>', filtered_pairs_en[i])
            print()
c

Counter()

In [60]:

def tokenize_worker(sentence, lang, trie):
    
    _tokenizer_newmm = partial(pythainlp.tokenize.word_tokenize, engine='newmm',
                               keep_whitespace=False,
                              custom_dict=(trie if trie != None else DEFAULT_DICT_TRIE))
    return ' '.join(_tokenizer_newmm(sentence))
  
def tokenize_handler(sentences, lang, trie=None):
    toks = []
    p = Pool(6)
    t = time()
    _tokenize_worker = partial(tokenize_worker, lang=lang, trie=trie)
    toks = p.map(_tokenize_worker, sentences)
    
    p.close()
    p.join() # call Pool.join() to wait for the worker processes to terminate.

    print('{} s'.format(time() -t))

    return toks
  

In [61]:
def write_spaced_tokens_to_file(data, folder_name, filename):
    with open('/root/mt-opus/data/{}/{}'.format(folder_name, filename),'w') as f:
        for item in data:
            f.write(item + '\n')
            
            

In [62]:
t = time()
print('sentence filtering (th)')
indices_to_filter_out_th = sentences_filter(th, lang='th')
print(len(indices_to_filter_out_th))

print('sentence filtering (en)')
indices_to_filter_out_en = sentences_filter(en, lang='en')

print(len(indices_to_filter_out_en))

indices_to_filter_out = indices_to_filter_out_th + indices_to_filter_out_en
indices_to_filter_out = set(indices_to_filter_out)


sentence filtering (th)


HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))


102619
sentence filtering (en)


HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))


4560


In [63]:
print(len(indices_to_filter_out))


93721


In [64]:
# Clean Sentence

print('clean sentence (th)')
filtered_th = [clean_sentence(x, lang='th') for i, x in tqdm_notebook(enumerate(th), total=len(th)) if i not in indices_to_filter_out]
print('clean sentence (en)')
filtered_en = [clean_sentence(x, lang='en') for i, x in tqdm_notebook(enumerate(en), total=len(en)) if i not in indices_to_filter_out]

print('{} seconds'.format(time() -t))


clean sentence (th)


HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))


clean sentence (en)


HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))


228.31016874313354 seconds


In [28]:
print(len(th))
# print(len(filtered_th))

3281534


In [32]:
# explore


counter = Counter()

for idx, sent in enumerate(th):
    
    if re.search('#',sent):
        counter['th found'] += 1
        if counter['th found'] < 10:
            print(en[idx])
            print(sent)
   
counter

##Have forgot ##
{\cHFFFFFF}## ลืม ##
## Will be ##
{\cHFFFFFF}## จะ ##
##And for bonny ##
{\cHFFFFFF}## และสำหรับ Bonny ##
##Annie Laurie ##
{\cHFFFFFF}## แอนนี่ลอรี่ ##
##I would lay ##
{\cHFFFFFF}## ฉันจะวาง ##
# Me doon #
{\cHFFFFFF}# ฉัน Doon #
#And dee ##
{\cHFFFFFF}#And Dee ##
Writing's on the wall
# Writing's on the wall #
Very superstitious
# Very superstitious #


Counter({'th found': 18512})

In [22]:
# counter = Counter()

# for idx, sent in enumerate(th):
    
#     if re.search('เเ',sent):
#         counter['th found'] += 1
#         if counter['th found'] < 10:
#             print(en[idx])
#             print(sent)
   
# counter

In [23]:
# counter = Counter()

# for idx, sent in enumerate(th):
    
#     if re.search(r'โช [A-z]', sent):
#         counter['th found'] += 1
#         if counter['th found'] < 10:
#             print(en[idx])
#             print(sent)
#             print()
 

 
# counter

In [24]:
# counter = Counter()

# for idx, sent in enumerate(filtered_th):
    
#     if re.search('/ N',sent):
#         counter['found;'] += 1
#         if counter['found'] < 100:
#             print(filtered_en[idx])
#             print(sent)
#             print()

  
# counter

In [25]:
# counter = Counter()

# for idx, sent in enumerate(filtered_th):
    
#     if re.search("เเ", sent):
#         counter['th found ? '] += 1
#         if counter['th found ? '] < 30:
#             print(filtered_en[idx])
#             print(sent)
#             print()
 
 
# counter

In [279]:
sentence_counter = {
    'th': Counter(),
    'en': Counter()
}
for i in range(len(filtered_th)):
    for lang in ['th', 'en']:
        if lang == 'th':
            sentence_counter[lang][filtered_th[i]] += 1
        if lang == 'en':
            sentence_counter[lang][filtered_en[i]] += 1

In [280]:
sentence_counter['th'].most_common(10)

[('ใช่', 10609),
 ('ไม่', 6691),
 ('โอเค', 6606),
 ('ขอบคุณ', 4760),
 ('เฮ้', 3365),
 ('อะไรนะ?', 3246),
 ('ครับ', 2709),
 ('อะไรนะ', 2664),
 ('อะไร?', 2601),
 ('ไม่!', 2097)]

In [281]:
sentence_counter['en'].most_common(10)

[('What?', 13268),
 ('Yeah.', 12373),
 ('No.', 10795),
 ('Okay.', 7995),
 ('Yes.', 6653),
 ('Thank you.', 6488),
 ('Hey.', 4278),
 ('No!', 3772),
 ("I don't know.", 3650),
 ('Why?', 3565)]

In [282]:
print_only = 100
count = 0
for i in range(len(filtered_th)):
    if filtered_th[i] == 'โซ' and count < print_only:
        print(filtered_th[i], '|', filtered_en[i])
        count += 1

โซ | Zo
โซ | Zoe
โซ | Zo?
โซ | Zo?
โซ | Zo?
โซ | Zo?


In [290]:
print_only = 10
count = 0
for i in range(len(filtered_th)):
    if re.search(r'\.$', filtered_th[i]):
        if count < print_only:
            print(filtered_th[i])
            print(filtered_en[i])
            print()
        count += 1
print('count', count)

กรุณารอ.
Wait, please.

สวัสดี.
Hello there.

นั่นดีกว่า.
That's better.

สมเด็จพระราชินี.
The Queen.

ไปได้.
Now, go.

ใช่.
Yes.

มาสิ ชายไก่ ติดตามฉัน.
Come on, hen... Men. Follow me.

เงียบ.
Quiet.

อย่าปล่อยให้เขา หยุดเขา.
Don't let him.Stop him.

แค่นั้นแหละ.
That's it.

count 168365


In [120]:
toks = {
    'th': {
        'sentencepiece': [],
        'newmm':[]
    },
    'en': {
        'sentencepiece': [],
        'newmm':[]
    }
}

## 1a Segment texts into tokens with `newmm`

In [106]:
toks['th']['newmm'] = tokenize_handler(filtered_th, lang='th')
toks['en']['newmm'] = tokenize_handler(filtered_en, lang='en')


38.81156301498413 s
39.77449178695679 s


In [107]:
toks['th']['newmm'][0:10], toks['en']['newmm'][0:10]

(['ทาส ใน กระจก วิเศษ , มาจาก พื้นที่ ที่ ไกล ที่สุด',
  'ผ่าน ลม และ ความมืด ฉัน เรียก เจ้า',
  'พูด !',
  'ให้ ฉัน เห็น พระพักตร์ ของ พระองค์',
  'สิ่ง ที่ เจ้า จะ รู้ ว่า สมเด็จ พระราชินี ของ ฉัน ได้ อย่างไร',
  'กระจก วิเศษ บน ผนัง ผู้ ที่ เป็น สังขาร หนึ่ง ทั้งหมด หรือไม่',
  'ที่ มีชื่อเสียง เป็น ความงาม ของ เจ้า พระ บาท สมเด็จ พระเจ้าอยู่หัว',
  'แต่ ถือเป็น แม่บ้าน ที่ น่ารัก ที่ ฉัน เห็น',
  'ยาจก ไม่ สามารถ ซ่อน พระคุณ อ่อนโยน ของ เธอ',
  'อนิจจา เธอ มี ความเป็นธรรม มากขึ้น กว่า เจ้า'],
 ['Slave in the Magic Mirror , come from the farthest space .',
  'Through wind and darkness , I summon thee .',
  'Speak !',
  'Let me see thy face .',
  'What wouldst thou know , my Queen ?',
  'Magic Mirror on the wall , who is the fairest one of all ?',
  'Famed is thy beauty , Majesty .',
  'But hold , a lovely maid I see .',
  'Rags cannot hide her gentle grace .',
  'Alas , she is more fair than thee .'])

## 1b Segment texts into BPE tokens with SentencePiece (BPEmb)


In [108]:
def encode_bpe(sentences, lang, n_vocab=25000):
    """Return a list of bpe tokens give a list of sentences"""
    segmented_sentences = []
    for sentence in tqdm_notebook(sentences, total=len(sentences)):
#         print(sentence)
        bpe_tokens = bpemb_pretrained[lang]['{}'.format(n_vocab)].encode(sentence)
        segmented_sentences.append(' '.join(bpe_tokens))
        
    return segmented_sentences

### 1.1 Thai language

In [109]:
toks['th']['sentencepiece'] = encode_bpe(filtered_th, 'th', 25000)

print(toks['th']['sentencepiece'][0:10])

HBox(children=(IntProgress(value=0, max=3202751), HTML(value='')))


['▁ท าส ใน กระจก วิเศษ , ▁มาจาก พื้นที่ ที่ ไกล ที่สุด', '▁ผ่าน ลม และความ มืด ฉัน เรียก เจ้า', '▁พูด !', '▁ให้ ฉัน เห็น พระพักตร์ ของ ▁พระองค์', '▁สิ่งที่ เจ้า จะ รู้ว่า สมเด็จพระราชินี ▁ของ ฉัน ได้อย่างไร', '▁กระจ ก วิเศษ บน ผนัง ▁ผู้ ที่เป็น สัง ขาร หนึ่ง ทั้งหมด ▁หรือไม่', '▁ที่มีชื่อเสียง เป็น ความงาม ของ ▁เจ้า พระบาทสมเด็จพระ เจ้าอยู่หัว', '▁แต่ ถือเป็น แม่ บ้าน ที่น ่ารัก ที่ ฉัน ▁เห็น', '▁ยา จก ไม่สามารถ ซ่อน พระคุณ ▁อ่อน โยน ของเธอ', '▁อน ิจ จา เธอ มีความเป็น ธรรม ▁มาก ขึ้น กว่า เจ้า']


### 1.2 English language

In [110]:
toks['en']['sentencepiece']  = encode_bpe(filtered_en, 'en', 25000)
print(toks['en']['sentencepiece'][0:10])

HBox(children=(IntProgress(value=0, max=3202751), HTML(value='')))


['▁slave ▁in ▁the ▁magic ▁mirror , ▁come ▁from ▁the ▁fart hest ▁space .', '▁through ▁wind ▁and ▁darkness , ▁i ▁summon ▁the e .', '▁speak !', '▁let ▁me ▁see ▁thy ▁face .', '▁what ▁would st ▁thou ▁know , ▁my ▁queen ?', '▁magic ▁mirror ▁on ▁the ▁wall , ▁who ▁is ▁the ▁fa ire st ▁one ▁of ▁all ?', '▁famed ▁is ▁thy ▁beauty , ▁majesty .', '▁but ▁hold , ▁a ▁lov ely ▁maid ▁i ▁see .', '▁ra gs ▁cannot ▁hide ▁her ▁gentle ▁grace .', '▁al as , ▁she ▁is ▁more ▁fair ▁than ▁the e .']


## 2. Split train-valid-test 

In [111]:
#train-valid-test split 80/10/10

n = len(toks['th']['newmm'])

print('N = ',n)
idx = list(range(n))

random.seed(1234) # Set SEED
random.shuffle(idx)

train_idx, valid_idx, test_idx = idx[:int(n*0.8)], idx[int(n*0.8):int(n*0.9)], idx[int(n*0.9):]

dataset_split = {}
dataset_split['train'] = train_idx
dataset_split['valid'] = valid_idx
dataset_split['test'] = test_idx


len(train_idx),len(valid_idx),len(test_idx)



N =  3202751


(2562200, 320275, 320276)

In [112]:
dataset = {
    'train': {
        'en': {
            'sentencepiece': [],
            'newmm':[]
        },
        'th': {
             'sentencepiece': [],
            'newmm':[]
        }
    },
    'valid': {
        'en': {
            'sentencepiece': [],
            'newmm':[]
        },
        'th': {
             'sentencepiece': [],
            'newmm':[]
        }
    },
    'test': {
        'en': {
            'sentencepiece': [],
            'newmm':[]
        },
        'th': {
             'sentencepiece': [],
            'newmm':[]
        }
    }
}

for split_name in ['train', 'valid', 'test']:
    for lang in ['th', 'en']:
        for tok_type in ['sentencepiece', 'newmm']:

            dataset[split_name][lang][tok_type] = [toks[lang][tok_type][i] for i in dataset_split[split_name]] 


In [113]:
print(dataset['train']['th']['newmm'][0:2],'\n')
print(dataset['train']['en']['newmm'][0:2],'\n')
print(dataset['train']['th']['sentencepiece'][0:2],'\n')
print(dataset['train']['en']['sentencepiece'][0:2],'\n')

['เบค กี้ เธอ ทำท่า แปลก ๆ เมื่อกี้ ใน ห้อง', 'อยู่ กับ เธอ แอน นา จะ นำทาง คุณ ผม จะ กลับ ไป'] 

['Becky , um , you were acting particularly strange in there just now .', "Stay with her so Anna can guide you . I ' m going back ."] 

['▁เบ ค กี้ ▁เธอ ทํา ท่า แปลก ๆ ▁เมื่อ กี้ ▁ในห้อง', '▁ อยู่กับ เธอ ▁แอนนา จะนํา ทาง คุณ ▁ผม จะ กลับไป'] 

['▁bec ky , ▁um , ▁you ▁were ▁acting ▁particularly ▁strange ▁in ▁there ▁just ▁now .', "▁stay ▁with ▁her ▁so ▁anna ▁can ▁guide ▁you . ▁i ' m ▁going ▁back ."] 



In [119]:
# Counting number of tokens for train, valid, test
counter = Counter( )
for dataset_type in ['train', 'valid', 'test']:
    for th_sent_toks in dataset[dataset_type]['th']['newmm']:
        counter['th_{}_n_toks'.format(dataset_type)] += len(th_sent_toks)
    for en_sent_toks in dataset[dataset_type]['en']['newmm']:
        counter['en_{}_n_toks'.format(dataset_type)] += len(en_sent_toks)

print(counter) 

Counter({'en_train_n_toks': 92383739, 'th_train_n_toks': 86683223, 'en_valid_n_toks': 11536351, 'en_test_n_toks': 11535798, 'th_test_n_toks': 10833242, 'th_valid_n_toks': 10826042})


In [124]:

for tok_type_src in ['sentencepiece', 'newmm']:
    for tok_type_tgt in ['sentencepiece', 'newmm']:
        langs = ['th', 'en']
        for lang in langs:
            src_lang = lang
            tgt_lang = 'en' if lang =='th' else 'th'
            FOLDER_NAME = "opensubtitles_tok/{}-{}/{}-{}".format(tok_type_src, tok_type_tgt, src_lang, tgt_lang )
            FOLDER_NAME_BIN = "opensubtitles_bin/{}-{}/{}-{}".format(tok_type_src, tok_type_tgt, src_lang, tgt_lang)
           
            
            # Create directories
            print('create directories: ')
            print('dir: ../data/{}'.format(FOLDER_NAME))
            print('dir: ../data/{}'.format(FOLDER_NAME_BIN))

            !mkdir -p ../data/{FOLDER_NAME}
            !mkdir -p ../data/{FOLDER_NAME_BIN}

            for split_name in ['train', 'valid', 'test']:
                
                write_spaced_tokens_to_file(dataset[split_name][src_lang][tok_type_src],
                                            FOLDER_NAME, '{}.{}'.format(split_name, src_lang))
                
                write_spaced_tokens_to_file(dataset[split_name][tgt_lang][tok_type_tgt],
                                            FOLDER_NAME, '{}.{}'.format(split_name, tgt_lang))


create directories: 
dir: ../data/opensubtitles_tok/sentencepiece-sentencepiece/th-en
dir: ../data/opensubtitles_bin/sentencepiece-sentencepiece/th-en
create directories: 
dir: ../data/opensubtitles_tok/sentencepiece-sentencepiece/en-th
dir: ../data/opensubtitles_bin/sentencepiece-sentencepiece/en-th
create directories: 
dir: ../data/opensubtitles_tok/sentencepiece-newmm/th-en
dir: ../data/opensubtitles_bin/sentencepiece-newmm/th-en
create directories: 
dir: ../data/opensubtitles_tok/sentencepiece-newmm/en-th
dir: ../data/opensubtitles_bin/sentencepiece-newmm/en-th
create directories: 
dir: ../data/opensubtitles_tok/newmm-sentencepiece/th-en
dir: ../data/opensubtitles_bin/newmm-sentencepiece/th-en
create directories: 
dir: ../data/opensubtitles_tok/newmm-sentencepiece/en-th
dir: ../data/opensubtitles_bin/newmm-sentencepiece/en-th
create directories: 
dir: ../data/opensubtitles_tok/newmm-newmm/th-en
dir: ../data/opensubtitles_bin/newmm-newmm/th-en
create directories: 
dir: ../data/opens

In [125]:
!head ../data/opensubtitles_tok/newmm-sentencepiece/th-en/train.en


▁bec ky , ▁um , ▁you ▁were ▁acting ▁particularly ▁strange ▁in ▁there ▁just ▁now .
▁stay ▁with ▁her ▁so ▁anna ▁can ▁guide ▁you . ▁i ' m ▁going ▁back .
▁look .
▁oh , ▁no , ▁it ' s ▁the ▁other ▁way ▁around , ▁dr . ▁lewis .
▁sort ▁of .
▁bart ender , ▁something ▁really ▁strong , ▁please .
▁yes , ▁obviously .
▁la ' s ▁so ▁nice .
▁i ' m ▁going ▁to ▁fix ▁it .
▁i ▁get ▁b ored .


In [126]:
!head ../data/opensubtitles_tok/newmm-sentencepiece/th-en/train.th

เบค กี้ เธอ ทำท่า แปลก ๆ เมื่อกี้ ใน ห้อง
อยู่ กับ เธอ แอน นา จะ นำทาง คุณ ผม จะ กลับ ไป
ฟัง นะ
พอดี เลย ดร. ลี วิ ส
แบบ ว่า
เอ่อ บาร์ เท็น เด อร ์ ขอ อะไร ที่
ก็ ใช่ ห น่ะ สิ
แอลเอ สวย เนอะ
ฉัน กำลังจะ แก้ ไขมัน
ฉัน เบื่อ ละ
