In [1]:
# coding=utf-8
import sys
sys.path.append('..')

import os
import io
import random
import copy 
import re
import html

from time import time 
from multiprocessing import Pool
from collections import Counter

from functools import partial

from tqdm import tqdm_notebook
import pythainlp
from pythainlp.util import *
from pythainlp.tokenize import word_tokenize
from pythainlp.ulmfit import *

# subword-nmt
from subword_nmt import learn_bpe as learner
from subword_nmt import apply_bpe as subword_tokenizer

import fairseq 
from datetime import timedelta
from tqdm import tqdm, tqdm_notebook
from pythainlp.tokenize import DEFAULT_DICT_TRIE

from pythainlp.corpus import thai_words

print(pythainlp.__version__)
# assert pythainlp.__version__ == '2.1'

import tensorflow as tf
import tensorflow_hub as hub
import numpy as np


2.0.7


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# install BPEmb (BPE embeddings)

!pip install --q bpemb



In [3]:
# # USE

# g = tf.Graph()
# with g.as_default():
#     text_input = tf.placeholder(dtype=tf.string, shape=[None])
#     embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/1")
#     embedded_text = embed(text_input)
#     init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
# g.finalize()

# # def compute_similarity(src, tgt):
# #     """
# #         Calculate sentence similarity based on Google Universal Sentence Encoder (Multilingual Large)
# #     """

# # # Initialize session.
# # session = tf.Session(graph=g)
# # session.run(init_op)

# # # Compute embeddings.
# # en_result = session.run(embedded_text, feed_dict={text_input: english_sentences})
# # it_result = session.run(embedded_text, feed_dict={text_input: italian_sentences})
# # ja_result = session.run(embedded_text, feed_dict={text_input: japanese_sentences})

# # # Compute similarity matrix. Higher score indicates greater similarity.
# # similarity_matrix_it = np.inner(en_result, it_result)
# # similarity_matrix_ja = np.inner(en_result, ja_result)

In [4]:
from bpemb import BPEmb

bpemb_pretrained ={
    'th': {
        '25000': BPEmb(lang="th", vs=25000)
    },
    'en': {
        '25000': BPEmb(lang="en", vs=25000)
    }
}


In [5]:
with open('../data/opensubtitle_v2018/OpenSubtitles.en-th.en','r', encoding='utf-8') as f:
    en = f.read().split('\n')
len(en),en[:3]


(3281534,
 ['Slave in the Magic Mirror, come from the farthest space.',
  'Through wind and darkness, I summon thee.',
  'Speak!'])

In [6]:
with open('../data/opensubtitle_v2018/OpenSubtitles.en-th.th','r', encoding='utf-8') as f:
    th = f.read().split('\n')
    
len(th),th[:3]

(3281534,
 ['ทาสในกระจกวิเศษ, มาจากพื้นที่ที่ไกลที่สุด',
  'ผ่านลมและความมืดฉันเรียกเจ้า',
  'พูด!'])

In [7]:
class BaseRule:    
    def test(self, sentence, lang):
        pass

class ReplaceRule(BaseRule):   
    
    def __init__(self):
        super().__init__()
        
    def replace(self, sentence, lang):
        pass

class UnescapeString(ReplaceRule):
    def __init__(self):
        super().__init__()
        pass
        
    def test(self, sentence, lang):
        return True

    def replace(self, sentence, lang):
        sentence = sentence.replace('\\"', '"')
        return unescape_string(sentence)

class ReplaceSymbolOccurenceRule(ReplaceRule):
    
    def __init__(self):
        super().__init__()
        self.token_list = ['', '', '​', '', '', '', '', '', 'โ', '​',
                           '♪', '{\ cHFFFFFF }', '\cHFFFFFF', '§', 'font color = "# 808080 "',
                          ' ##', '{\\cHFFFFFF}', '## ', ' ## ', '-# ', '# ', ' #', '\ N', '\ NI', ',8203;', '8203;', '#8203; ',
                          '#8203;', '\\i1}', "\\\\\\\\", "\\\\\\", "\\\\", '\\ ', '\\', '{}']
        
    def test(self, sentence, lang):
        for token in self.token_list:
            if token in sentence:
                return True
        return False

    def replace(self, sentence, lang):
        for token in self.token_list:
            sentence = sentence.replace(token, '')
        return sentence

class ReplaceHashtagInSentenceRule(ReplaceRule):
    
    def __init__(self):
        super().__init__()
        pass
        
    def test(self, sentence, lang):
        if re.match(r"[[^#]|[^##]]", sentence):
            return True
        if re.search(r"#$", sentence):
            return True
        return False

    def replace(self, sentence, lang):
        sentence = re.sub(r"^#+", '', sentence).lstrip()
        sentence = re.sub(r"#$", '', sentence).rstrip()

        return sentence
    

class NormalizeThaiVowel(ReplaceRule):
    
    def __init__(self):
        super().__init__()
        pass
        
    def test(self, sentence, lang):
        if 'เเ' in sentence and lang == 'th':
            return True
        if 'ใใ' in sentence and lang == 'th':
            return True
        return False
    
    def replace(self, sentence, lang):
        sentence = re.sub(r"เเ", 'แ', sentence)
        sentence = re.sub(r"ใใ", 'ใ', sentence)

        return sentence
    
class ReplaceFullStopInThaiSentenceRule(ReplaceRule):
    def __init__(self):
        super().__init__()
        
    def test(self, sentence, lang):
        if lang == 'th':
            if re.search(r'\.$', sentence):
                return True
        return False

    def replace(self, sentence, lang):
        sentence = re.sub(r"\.$", '', sentence)
        return sentence
    
    
    
class ReplaceDashInSentenceRule(ReplaceRule):
    
    def __init__(self):
        super().__init__()
        
    def test(self, sentence, lang):
        if ' - ' in sentence:
            return True
        if '- ' in sentence:
            return True
        if ' -' in sentence:
            return True
        if re.search(r"^[-]+", sentence):
            return True
        if re.search(r"[-]+$", sentence):
            return True
        return False

    def replace(self, sentence, lang):
        sentence = re.sub(r" - ", '', sentence)
        sentence = re.sub(r"- ", '', sentence)
        sentence = re.sub(r" -", '', sentence)
        sentence = re.sub(r"^[-]+", '', sentence) # start with space + "-" 
        sentence = re.sub(r"[-]+$", '', sentence) # end with space + "-" 

        return sentence

In [8]:
def testReplaceDashInSentenceRule():
    rule = ReplaceDashInSentenceRule()
    assert rule.test('- ', lang="th") == True
    assert rule.test(' -', lang="th") == True
    assert rule.test(' - ', lang="th") == True
    assert rule.test('-กับใคร', lang="th") == True

    print(rule.replace('- Hello', lang="th"))
    print(rule.replace(' -กHello', lang="th"))
    print(rule.replace('Hello-', lang="th"))
    print(rule.replace('---- Hello- ', lang="th"))
    print(rule.replace('a-sad Hello--- ', lang="th"))
    print(rule.replace('-กับใคร', lang="th"))

testReplaceDashInSentenceRule()


Hello
กHello
Hello
Hello
a-sad Hello
กับใคร


In [9]:
def testReplaceHashtagInSentenceRule():
    rule = ReplaceHashtagInSentenceRule()
    assert rule.test('#..', lang="th") == True
    assert rule.test('##..', lang="th") == True
    assert rule.test('.#', lang="th") == True

    print(rule.replace('#..', lang="th"))
    print(rule.replace('##..', lang="th"))
    print(rule.replace('#### ..', lang="th"))
    print(rule.replace('..', lang="th"))
    print(rule.replace('.#', lang="th"))

testReplaceHashtagInSentenceRule()


..
..
..
..
.


In [10]:
def testReplaceFullStopInThaiSentenceRule():
    rule = ReplaceFullStopInThaiSentenceRule()
    assert rule.test('ฉัน.', lang="th") == True
    print(rule.replace('ฉัน.', lang="th"))

        
testReplaceFullStopInThaiSentenceRule()

ฉัน


In [11]:
def testNormalizeThaiVowel():
    rule = NormalizeThaiVowel()
    assert rule.test('เเ', lang='th') == True
    assert rule.test('ใใ', lang='th') == True

    print(rule.replace('เเรกเกิด', lang='th'))
    print(rule.replace('ใให้', lang='th'))

testNormalizeThaiVowel()
    

แรกเกิด
ให้


In [12]:

class ContainsAdSymbol(BaseRule):
    def __init__(self):
        super().__init__()
        
        
    def test(self, sentence, lang):
        if '@' in sentence:
            return True
        return False

class NoThaiWordInThaiSentence(BaseRule):
    def __init__(self):
        super().__init__()
        
        
    def test(self, sentence, lang):
        if lang == "th":
            if countthai(sentence, ignore_chars='') == 0.0:
                return True
            if re.search(r'^โช [A-z]', sentence):
                return True
            if re.search(r'^โช\s\.', sentence):
                return True
            if re.search(r'^โช\sโช', sentence):
                return True
        return False

class ContainsTokensInThaiSentence(BaseRule):
    def __init__(self):
        super().__init__()
        self.list_of_tokens = ['โ#65533;', 'N#233;', '#/N#', 'ใใใ', '#']
        
    def test(self, sentence, lang):
        if lang == "th":
            if re.search(r'\d\d; \d\d', sentence):
                return True
            for symbol in self.list_of_tokens:
                if symbol in sentence:
                    return True
           
        return False
    
class ContainsUnknownSymbols(BaseRule):
    def __init__(self):
        super().__init__()
        self.list_unknown_symbols = [ b'\x98\xc2', b'\xae\xc2', b'\x99\xc2',
                                      b'\xb1\xc2', b'\xc2\xb7', b'\xc2\x8b',
                                      b'\xc3\x83']
        
    def test(self, sentence, lang):
        for symbol in self.list_unknown_symbols:
            if symbol in sentence.encode('utf-8'):
                return True
        return False

class SentenceLengthLessThanOne(BaseRule):
    def __init__(self):
        super().__init__()
        
        
    def test(self, sentence, lang):
        if len(sentence) <= 1:
            return True
        return False

In [13]:
def testContainsAdSymbol():
    rule = ContainsAdSymbol()
    print(rule.test('@gmail', lang="th"))
    print(rule.test('gmasd', lang="th"))
testContainsAdSymbol()    

True
False


In [14]:
def testContainsUnknownSymbols():
    rule = ContainsUnknownSymbols()
    print(''.encode('utf-8'))
    print(rule.test('', lang="th"))
testContainsUnknownSymbols()    

b'\xc2\x8b'
True


In [15]:
def testNoThaiWordInThaiSentence():
    rule = NoThaiWordInThaiSentence()
    print(rule.test('', lang="th"))
    print(rule.test('en', lang="th"))
    print(rule.test('โช a', lang="th"))

testNoThaiWordInThaiSentence()

True
True
True


In [16]:
DEFAULT_REPLACE_RULES = [ReplaceSymbolOccurenceRule,
                         ReplaceDashInSentenceRule,
                         ReplaceHashtagInSentenceRule,
                         UnescapeString,
                         NormalizeThaiVowel]

DEFAULT_FILTER_OUT_RULES = [ContainsUnknownSymbols,
                            ContainsTokensInThaiSentence,
                            ContainsAdSymbol,
                            NoThaiWordInThaiSentence,
                            SentenceLengthLessThanOne]


In [17]:
# preprocess text

# LIST_OF_TOKENS_TO_REPLACE = ['', '', '​', '', '', '', '', '', 'โ', '​',
#                    '♪', '{\ cHFFFFFF }', '§', 'font color = "# 808080 "']
def unescape_string(text):
    return html.unescape(text)

def sentences_filter(sentences, lang, rules=DEFAULT_FILTER_OUT_RULES):
    indices = []
    for index, sentence in tqdm_notebook(enumerate(sentences), total=len(sentences)):
        
        for rule in rules:
            rule_obj = rule()
            if rule_obj.test(sentence, lang=lang):
                indices.append(index)
    return indices

def clean_sentence(sentence, lang, rules=DEFAULT_REPLACE_RULES):
    for rule in rules:
        
        rule_obj = rule()
        if rule_obj.test(sentence, lang=lang):
            sentence = rule_obj.replace(sentence, lang=lang)
        
    return sentence

In [60]:

def tokenize_worker(sentence, lang, trie):
    
    _tokenizer_newmm = partial(pythainlp.tokenize.word_tokenize, engine='newmm',
                               keep_whitespace=False,
                              custom_dict=(trie if trie != None else DEFAULT_DICT_TRIE))
    return ' '.join(_tokenizer_newmm(sentence))
  
def tokenize_handler(sentences, lang, trie=None):
    toks = []
    p = Pool(12)
    t = time()
    _tokenize_worker = partial(tokenize_worker, lang=lang, trie=trie)
    toks = p.map(_tokenize_worker, sentences)
    
    p.close()
    p.join() # call Pool.join() to wait for the worker processes to terminate.

    print('{} s'.format(time() -t))

    return toks
  

In [61]:
def write_spaced_tokens_to_file(data, folder_name, filename):
    with open('/root/mt-opus/data/{}/{}'.format(folder_name, filename),'w') as f:
        for item in data:
            f.write(item + '\n')
            
            

In [62]:
t = time()
print('sentence filtering (th)')
indices_to_filter_out_th = sentences_filter(th, lang='th')
print(len(indices_to_filter_out_th))

print('sentence filtering (en)')
indices_to_filter_out_en = sentences_filter(en, lang='en')

print(len(indices_to_filter_out_en))

indices_to_filter_out = indices_to_filter_out_th + indices_to_filter_out_en
indices_to_filter_out = set(indices_to_filter_out)


sentence filtering (th)


HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))


102619
sentence filtering (en)


HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))


4560


In [63]:
print(len(indices_to_filter_out))


93721


In [64]:
# Clean Sentence

print('clean sentence (th)')
filtered_th = [clean_sentence(x, lang='th') for i, x in tqdm_notebook(enumerate(th), total=len(th)) if i not in indices_to_filter_out]
print('clean sentence (en)')
filtered_en = [clean_sentence(x, lang='en') for i, x in tqdm_notebook(enumerate(en), total=len(en)) if i not in indices_to_filter_out]

print('{} seconds'.format(time() -t))


clean sentence (th)


HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))


clean sentence (en)


HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))


228.31016874313354 seconds


In [59]:
print(len(th))
print(len(filtered_th))

3281534


NameError: name 'filtered_th' is not defined

In [425]:
# explore




In [33]:
counter = Counter()

for idx, sent in enumerate(th):
    
    if re.search('เเ',sent):
        counter['th found'] += 1
        if counter['th found'] < 10:
            print(en[idx])
            print(sent)
   
counter

Last night, I dreamt I went to Manderley again.
เมื่อคืนฉันฝันว่า ได้กลับไปที่เเมนเดอเลย์อีกครั้ง
The drive wound away in front of me, twisting and turning as it had always done.
ถนนเลื้อยคดเคี้ยวอยู่เบื้องหน้าของฉัน ยังคงคดโค้งเเละวกวนดั่งเช่นเคย
Nature had come into her own again, and little by little had encroached upon the drive with long, tenacious fingers.
ธรรมชาติได้กลับคืนสู่ตนเองอีกครั้ง ค่อยๆ ปกคลุมทางเข้าทีละน้อย ดั่งนิ้วมืออันเรียวยาวและเหนียวเเน่น
On and on wound the poor thread that had once been our drive, and finally, there was Manderley.
แผ่ปกคลุมทางเข้าอันซอมซ่อ ที่ครั้งหนึ่งเคยเป็นทางเดินรถของเรา ในที่สุดก็ถึงเเมนเดอเลย์
Manderley, secretive and silent.
เเมนเดอเลย์-- ลึกลับเเละเงียบงัน
Time could not mar the perfect symmetry of those walls.
กาลเวลามิอาจทําให้กําเเพง ที่ได้สมมาตรนี้ด่างพร้อยได้
Moonlight can play odd tricks upon the fancy, and suddenly it seemed to me that light came from the windows.
เเสงจันทร์อาจทําให้เกิดภาพลวงตา ทําให้ฉันเห็นว่ามีเเสงไฟ ลอดออกมาจา

Counter({'th found': 11472})

In [57]:
counter = Counter()

for idx, sent in enumerate(th):
    
    if re.search(r'โช [A-z]', sent):
        counter['th found'] += 1
        if counter['th found'] < 10:
            print(en[idx])
            print(sent)
            print()
 

 
counter

- Tai-Sho. - Tai-Sho.
ไทโช \ไทโช

♪ Get thee behind me, Satan
โช Get thee behind me, Satan



Counter({'th found': 2})

In [378]:
counter = Counter()

for idx, sent in enumerate(filtered_th):
    
    if re.search('/ N',sent):
        counter['found;'] += 1
        if counter['found'] < 100:
            print(filtered_en[idx])
            print(sent)
            print()

  
counter

Counter()

In [48]:
counter = Counter()
# control_characters = [
#     b'\xc2\x80',
# b'\xc2\x81',
# b'\xc2\x82',
# b'\xc2\x83',
# b'\xc2\x84',
# b'\xc2\x85',
# b'\xc2\x86',
# b'\xc2\x87',
# b'\xc2\x88',
# b'\xc2\x89',
# b'\xc2\x8a',
# b'\xc2\x8b',
# b'\xc2\x8c',
# b'\xc2\x8d',
# b'\xc2\x8e',
# b'\xc2\x8f',
# b'\xc2\x90',
# b'\xc2\x91',
# b'\xc2\x92',
# b'\xc2\x93',
# b'\xc2\x94',
# b'\xc2\x95',
# b'\xc2\x96',
# b'\xc2\x97',
# b'\xc2\x98',
# b'\xc2\x99',
# b'\xc2\x9a',
# b'\xc2\x9b',
# b'\xc2\x9c',
# b'\xc2\x9d',
# b'\xc2\x9e',
# b'\xc2\x9f',
# ]

control_characters = [b'\xc2\x8b']
for idx, sent in enumerate(th):
    for token in control_characters:
        if token in sent.encode('utf-8'):
           
            counter['th found \fn'] += 1
            if counter['th found \fn'] < 10:
                print('found', token)
                print(en[idx])
                print(sent)
                print(sent.encode('utf-8'))
                print('--')
            break
 
 
counter

found b'\xc2\x8b'
'Lf you know someone who has been bitten 'it is absolutely essential that you isolate them immediately.
เธเธฑเนเธเธเนเธเธทเธญเธชเธดเนเธเธเธตเนเธเนเธณเธเธฒเธเธเธณเนเธเธเธญเธเนเธเธฒ เธเธกเนเธเธทเนเธญเธงเนเธฒเธเนเธงเธ 2 เธเธตเนเธฃเธ
b'\xe0\xb9\x80\xe0\xb8\x98\xc2\x99\xe0\xb9\x80\xe0\xb8\x98\xe0\xb8\x91\xe0\xb9\x80\xe0\xb8\x99\xc2\x88\xe0\xb9\x80\xe0\xb8\x98\xc2\x99\xe0\xb9\x80\xe0\xb8\x98\xc2\x81\xe0\xb9\x80\xe0\xb8\x99\xc2\x87\xe0\xb9\x80\xe0\xb8\x98\xc2\x84\xe0\xb9\x80\xe0\xb8\x98\xe0\xb8\x97\xe0\xb9\x80\xe0\xb8\x98\xe0\xb8\x8d\xe0\xb9\x80\xe0\xb8\x98\xe0\xb8\x8a\xe0\xb9\x80\xe0\xb8\x98\xe0\xb8\x94\xe0\xb9\x80\xe0\xb8\x99\xc2\x88\xe0\xb9\x80\xe0\xb8\x98\xc2\x87\xe0\xb9\x80\xe0\xb8\x98\xc2\x97\xe0\xb9\x80\xe0\xb8\x98\xe0\xb8\x95\xe0\xb9\x80\xe0\xb8\x99\xc2\x88\xe0\xb9\x80\xe0\xb8\x98\xc2\x8b\xe0\xb9\x80\xe0\xb8\x99\xc2\x89\xe0\xb9\x80\xe0\xb8\x98\xe0\xb8\x93\xe0\xb9\x80\xe0\xb8\x98\xc2\x8b\xe0\xb9\x80\xe0\xb8\x98\xe0\xb8\x92\xe0\xb9\x80\x

Counter({'th found \x0cn': 524})

In [278]:
counter = Counter()

for idx, sent in enumerate(filtered_th):
    
    if re.search("เเ", sent):
        counter['th found ? '] += 1
        if counter['th found ? '] < 30:
            print(filtered_en[idx])
            print(sent)
            print()
 
 
counter

Counter()

In [279]:
sentence_counter = {
    'th': Counter(),
    'en': Counter()
}
for i in range(len(filtered_th)):
    for lang in ['th', 'en']:
        if lang == 'th':
            sentence_counter[lang][filtered_th[i]] += 1
        if lang == 'en':
            sentence_counter[lang][filtered_en[i]] += 1

In [280]:
sentence_counter['th'].most_common(10)

[('ใช่', 10609),
 ('ไม่', 6691),
 ('โอเค', 6606),
 ('ขอบคุณ', 4760),
 ('เฮ้', 3365),
 ('อะไรนะ?', 3246),
 ('ครับ', 2709),
 ('อะไรนะ', 2664),
 ('อะไร?', 2601),
 ('ไม่!', 2097)]

In [281]:
sentence_counter['en'].most_common(10)

[('What?', 13268),
 ('Yeah.', 12373),
 ('No.', 10795),
 ('Okay.', 7995),
 ('Yes.', 6653),
 ('Thank you.', 6488),
 ('Hey.', 4278),
 ('No!', 3772),
 ("I don't know.", 3650),
 ('Why?', 3565)]

In [282]:
print_only = 100
count = 0
for i in range(len(filtered_th)):
    if filtered_th[i] == 'โซ' and count < print_only:
        print(filtered_th[i], '|', filtered_en[i])
        count += 1

โซ | Zo
โซ | Zoe
โซ | Zo?
โซ | Zo?
โซ | Zo?
โซ | Zo?


In [290]:
print_only = 10
count = 0
for i in range(len(filtered_th)):
    if re.search(r'\.$', filtered_th[i]):
        if count < print_only:
            print(filtered_th[i])
            print(filtered_en[i])
            print()
        count += 1
print('count', count)

กรุณารอ.
Wait, please.

สวัสดี.
Hello there.

นั่นดีกว่า.
That's better.

สมเด็จพระราชินี.
The Queen.

ไปได้.
Now, go.

ใช่.
Yes.

มาสิ ชายไก่ ติดตามฉัน.
Come on, hen... Men. Follow me.

เงียบ.
Quiet.

อย่าปล่อยให้เขา หยุดเขา.
Don't let him.Stop him.

แค่นั้นแหละ.
That's it.

count 168365


In [120]:
toks = {
    'th': {
        'sentencepiece': [],
        'newmm':[]
    },
    'en': {
        'sentencepiece': [],
        'newmm':[]
    }
}

## 1a Segment texts into tokens with `newmm`

In [106]:
toks['th']['newmm'] = tokenize_handler(filtered_th, lang='th')
toks['en']['newmm'] = tokenize_handler(filtered_en, lang='en')


38.81156301498413 s
39.77449178695679 s


In [107]:
toks['th']['newmm'][0:10], toks['en']['newmm'][0:10]

(['ทาส ใน กระจก วิเศษ , มาจาก พื้นที่ ที่ ไกล ที่สุด',
  'ผ่าน ลม และ ความมืด ฉัน เรียก เจ้า',
  'พูด !',
  'ให้ ฉัน เห็น พระพักตร์ ของ พระองค์',
  'สิ่ง ที่ เจ้า จะ รู้ ว่า สมเด็จ พระราชินี ของ ฉัน ได้ อย่างไร',
  'กระจก วิเศษ บน ผนัง ผู้ ที่ เป็น สังขาร หนึ่ง ทั้งหมด หรือไม่',
  'ที่ มีชื่อเสียง เป็น ความงาม ของ เจ้า พระ บาท สมเด็จ พระเจ้าอยู่หัว',
  'แต่ ถือเป็น แม่บ้าน ที่ น่ารัก ที่ ฉัน เห็น',
  'ยาจก ไม่ สามารถ ซ่อน พระคุณ อ่อนโยน ของ เธอ',
  'อนิจจา เธอ มี ความเป็นธรรม มากขึ้น กว่า เจ้า'],
 ['Slave in the Magic Mirror , come from the farthest space .',
  'Through wind and darkness , I summon thee .',
  'Speak !',
  'Let me see thy face .',
  'What wouldst thou know , my Queen ?',
  'Magic Mirror on the wall , who is the fairest one of all ?',
  'Famed is thy beauty , Majesty .',
  'But hold , a lovely maid I see .',
  'Rags cannot hide her gentle grace .',
  'Alas , she is more fair than thee .'])

## 1b Segment texts into BPE tokens with SentencePiece (BPEmb)


In [108]:
def encode_bpe(sentences, lang, n_vocab=25000):
    """Return a list of bpe tokens give a list of sentences"""
    segmented_sentences = []
    for sentence in tqdm_notebook(sentences, total=len(sentences)):
#         print(sentence)
        bpe_tokens = bpemb_pretrained[lang]['{}'.format(n_vocab)].encode(sentence)
        segmented_sentences.append(' '.join(bpe_tokens))
        
    return segmented_sentences

### 1.1 Thai language

In [109]:
toks['th']['sentencepiece'] = encode_bpe(filtered_th, 'th', 25000)

print(toks['th']['sentencepiece'][0:10])

HBox(children=(IntProgress(value=0, max=3202751), HTML(value='')))


['▁ท าส ใน กระจก วิเศษ , ▁มาจาก พื้นที่ ที่ ไกล ที่สุด', '▁ผ่าน ลม และความ มืด ฉัน เรียก เจ้า', '▁พูด !', '▁ให้ ฉัน เห็น พระพักตร์ ของ ▁พระองค์', '▁สิ่งที่ เจ้า จะ รู้ว่า สมเด็จพระราชินี ▁ของ ฉัน ได้อย่างไร', '▁กระจ ก วิเศษ บน ผนัง ▁ผู้ ที่เป็น สัง ขาร หนึ่ง ทั้งหมด ▁หรือไม่', '▁ที่มีชื่อเสียง เป็น ความงาม ของ ▁เจ้า พระบาทสมเด็จพระ เจ้าอยู่หัว', '▁แต่ ถือเป็น แม่ บ้าน ที่น ่ารัก ที่ ฉัน ▁เห็น', '▁ยา จก ไม่สามารถ ซ่อน พระคุณ ▁อ่อน โยน ของเธอ', '▁อน ิจ จา เธอ มีความเป็น ธรรม ▁มาก ขึ้น กว่า เจ้า']


### 1.2 English language

In [110]:
toks['en']['sentencepiece']  = encode_bpe(filtered_en, 'en', 25000)
print(toks['en']['sentencepiece'][0:10])

HBox(children=(IntProgress(value=0, max=3202751), HTML(value='')))


['▁slave ▁in ▁the ▁magic ▁mirror , ▁come ▁from ▁the ▁fart hest ▁space .', '▁through ▁wind ▁and ▁darkness , ▁i ▁summon ▁the e .', '▁speak !', '▁let ▁me ▁see ▁thy ▁face .', '▁what ▁would st ▁thou ▁know , ▁my ▁queen ?', '▁magic ▁mirror ▁on ▁the ▁wall , ▁who ▁is ▁the ▁fa ire st ▁one ▁of ▁all ?', '▁famed ▁is ▁thy ▁beauty , ▁majesty .', '▁but ▁hold , ▁a ▁lov ely ▁maid ▁i ▁see .', '▁ra gs ▁cannot ▁hide ▁her ▁gentle ▁grace .', '▁al as , ▁she ▁is ▁more ▁fair ▁than ▁the e .']


## 2. Split train-valid-test 

In [111]:
#train-valid-test split 80/10/10

n = len(toks['th']['newmm'])

print('N = ',n)
idx = list(range(n))

random.seed(1234) # Set SEED
random.shuffle(idx)

train_idx, valid_idx, test_idx = idx[:int(n*0.8)], idx[int(n*0.8):int(n*0.9)], idx[int(n*0.9):]

dataset_split = {}
dataset_split['train'] = train_idx
dataset_split['valid'] = valid_idx
dataset_split['test'] = test_idx


len(train_idx),len(valid_idx),len(test_idx)



N =  3202751


(2562200, 320275, 320276)

In [112]:
dataset = {
    'train': {
        'en': {
            'sentencepiece': [],
            'newmm':[]
        },
        'th': {
             'sentencepiece': [],
            'newmm':[]
        }
    },
    'valid': {
        'en': {
            'sentencepiece': [],
            'newmm':[]
        },
        'th': {
             'sentencepiece': [],
            'newmm':[]
        }
    },
    'test': {
        'en': {
            'sentencepiece': [],
            'newmm':[]
        },
        'th': {
             'sentencepiece': [],
            'newmm':[]
        }
    }
}

for split_name in ['train', 'valid', 'test']:
    for lang in ['th', 'en']:
        for tok_type in ['sentencepiece', 'newmm']:

            dataset[split_name][lang][tok_type] = [toks[lang][tok_type][i] for i in dataset_split[split_name]] 


In [113]:
print(dataset['train']['th']['newmm'][0:2],'\n')
print(dataset['train']['en']['newmm'][0:2],'\n')
print(dataset['train']['th']['sentencepiece'][0:2],'\n')
print(dataset['train']['en']['sentencepiece'][0:2],'\n')

['เบค กี้ เธอ ทำท่า แปลก ๆ เมื่อกี้ ใน ห้อง', 'อยู่ กับ เธอ แอน นา จะ นำทาง คุณ ผม จะ กลับ ไป'] 

['Becky , um , you were acting particularly strange in there just now .', "Stay with her so Anna can guide you . I ' m going back ."] 

['▁เบ ค กี้ ▁เธอ ทํา ท่า แปลก ๆ ▁เมื่อ กี้ ▁ในห้อง', '▁ อยู่กับ เธอ ▁แอนนา จะนํา ทาง คุณ ▁ผม จะ กลับไป'] 

['▁bec ky , ▁um , ▁you ▁were ▁acting ▁particularly ▁strange ▁in ▁there ▁just ▁now .', "▁stay ▁with ▁her ▁so ▁anna ▁can ▁guide ▁you . ▁i ' m ▁going ▁back ."] 



In [119]:
# Counting number of tokens for train, valid, test
counter = Counter( )
for dataset_type in ['train', 'valid', 'test']:
    for th_sent_toks in dataset[dataset_type]['th']['newmm']:
        counter['th_{}_n_toks'.format(dataset_type)] += len(th_sent_toks)
    for en_sent_toks in dataset[dataset_type]['en']['newmm']:
        counter['en_{}_n_toks'.format(dataset_type)] += len(en_sent_toks)

print(counter) 

Counter({'en_train_n_toks': 92383739, 'th_train_n_toks': 86683223, 'en_valid_n_toks': 11536351, 'en_test_n_toks': 11535798, 'th_test_n_toks': 10833242, 'th_valid_n_toks': 10826042})


In [124]:

for tok_type_src in ['sentencepiece', 'newmm']:
    for tok_type_tgt in ['sentencepiece', 'newmm']:
        langs = ['th', 'en']
        for lang in langs:
            src_lang = lang
            tgt_lang = 'en' if lang =='th' else 'th'
            FOLDER_NAME = "opensubtitles_tok/{}-{}/{}-{}".format(tok_type_src, tok_type_tgt, src_lang, tgt_lang )
            FOLDER_NAME_BIN = "opensubtitles_bin/{}-{}/{}-{}".format(tok_type_src, tok_type_tgt, src_lang, tgt_lang)
           
            
            # Create directories
            print('create directories: ')
            print('dir: ../data/{}'.format(FOLDER_NAME))
            print('dir: ../data/{}'.format(FOLDER_NAME_BIN))

            !mkdir -p ../data/{FOLDER_NAME}
            !mkdir -p ../data/{FOLDER_NAME_BIN}

            for split_name in ['train', 'valid', 'test']:
                
                write_spaced_tokens_to_file(dataset[split_name][src_lang][tok_type_src],
                                            FOLDER_NAME, '{}.{}'.format(split_name, src_lang))
                
                write_spaced_tokens_to_file(dataset[split_name][tgt_lang][tok_type_tgt],
                                            FOLDER_NAME, '{}.{}'.format(split_name, tgt_lang))


create directories: 
dir: ../data/opensubtitles_tok/sentencepiece-sentencepiece/th-en
dir: ../data/opensubtitles_bin/sentencepiece-sentencepiece/th-en
create directories: 
dir: ../data/opensubtitles_tok/sentencepiece-sentencepiece/en-th
dir: ../data/opensubtitles_bin/sentencepiece-sentencepiece/en-th
create directories: 
dir: ../data/opensubtitles_tok/sentencepiece-newmm/th-en
dir: ../data/opensubtitles_bin/sentencepiece-newmm/th-en
create directories: 
dir: ../data/opensubtitles_tok/sentencepiece-newmm/en-th
dir: ../data/opensubtitles_bin/sentencepiece-newmm/en-th
create directories: 
dir: ../data/opensubtitles_tok/newmm-sentencepiece/th-en
dir: ../data/opensubtitles_bin/newmm-sentencepiece/th-en
create directories: 
dir: ../data/opensubtitles_tok/newmm-sentencepiece/en-th
dir: ../data/opensubtitles_bin/newmm-sentencepiece/en-th
create directories: 
dir: ../data/opensubtitles_tok/newmm-newmm/th-en
dir: ../data/opensubtitles_bin/newmm-newmm/th-en
create directories: 
dir: ../data/opens

In [125]:
!head ../data/opensubtitles_tok/newmm-sentencepiece/th-en/train.en


▁bec ky , ▁um , ▁you ▁were ▁acting ▁particularly ▁strange ▁in ▁there ▁just ▁now .
▁stay ▁with ▁her ▁so ▁anna ▁can ▁guide ▁you . ▁i ' m ▁going ▁back .
▁look .
▁oh , ▁no , ▁it ' s ▁the ▁other ▁way ▁around , ▁dr . ▁lewis .
▁sort ▁of .
▁bart ender , ▁something ▁really ▁strong , ▁please .
▁yes , ▁obviously .
▁la ' s ▁so ▁nice .
▁i ' m ▁going ▁to ▁fix ▁it .
▁i ▁get ▁b ored .


In [126]:
!head ../data/opensubtitles_tok/newmm-sentencepiece/th-en/train.th

เบค กี้ เธอ ทำท่า แปลก ๆ เมื่อกี้ ใน ห้อง
อยู่ กับ เธอ แอน นา จะ นำทาง คุณ ผม จะ กลับ ไป
ฟัง นะ
พอดี เลย ดร. ลี วิ ส
แบบ ว่า
เอ่อ บาร์ เท็น เด อร ์ ขอ อะไร ที่
ก็ ใช่ ห น่ะ สิ
แอลเอ สวย เนอะ
ฉัน กำลังจะ แก้ ไขมัน
ฉัน เบื่อ ละ
