In [None]:
# coding=utf-8

import random
from time import time
import io
import copy 
from multiprocessing import Pool
from functools import partial

from tqdm import tqdm_notebook
from pythainlp.tokenize import word_tokenize
from pythainlp.ulmfit import *

from subword_nmt import learn_bpe as learner
from subword_nmt import apply_bpe as subword_tokenizer


In [None]:
# Read the English side of the corpus into a list with one entry per '\n'-separated line.
# Only '\n' is used as the separator so that entry i here lines up with entry i of the Thai file.
with open('data/opensubtitles/OpenSubtitles.en-th.en', mode='r', encoding='utf-8') as en_file:
    en = en_file.read().split('\n')
(len(en), en[:3])

In [None]:
# Read the Thai side of the corpus into a list with one entry per '\n'-separated line.
# Only '\n' is used as the separator so that entry i here lines up with entry i of the English file.
with open('data/opensubtitles/OpenSubtitles.en-th.th', mode='r', encoding='utf-8') as th_file:
    th = th_file.read().split('\n')
(len(th), th[:3])

## 1. subword-nmt

-----

- Pre-tokenize the input text (both source and target) with PyThaiNLP's `newmm` word tokenizer before learning BPE.

In [None]:
counter = 0
# list_of_unk_toks = [b'\xc2\x99', b'\xc2\x9a', b'\xc2\x95',  b'\xc2\x97', b'\xc2\x98',
#                     b'\xc2\x80', b'\xc2\x81', b'\xc2\x82', b'\xc2\x83', b'\xc2\x84',
#                     b'\xc2\x86',
#                     b'\xc2\x87', b'\xc2\x88',
#                     b'\xc2\x89', b'\xc2\x8a']

# Byte sequences whose presence in the UTF-8 form of a line flags that line
# (presumably traces of encoding damage -- confirm against the corpus).
list_of_unk_toks = [b'\x98\xc2', b'\xae\xc2', b'\x99\xc2', b'\xb1\xc2', b'\xc2\xb7']

for index, text in enumerate(th):
    encoded_text = text.encode('utf-8')
    # First marker found in this line, or None when the line contains none of them.
    token = next((tok for tok in list_of_unk_toks if tok in encoded_text), None)
    if token is None:
        continue
    # Report the flagged Thai line together with its English counterpart.
    print('found', token)
    print('index [{}]: {}'.format(index, text))
    print('English:', en[index])
    print('---')
    counter += 1

print('total number of lines that contain "เธ..." =', counter)



In [None]:
substring_lists = ['โช', 'รขโขยช', '', '? ?', '​', '', '', '', '', '', 'โ', '​', '']
# Show the UTF-8 bytes of every substring that will be stripped, so that
# entries which render as empty or identical on screen can be told apart.
for noise in substring_lists:
    noise_bytes = noise.encode('utf-8')
    print(noise_bytes)


In [None]:
# word_tokenize pre-bound to the `newmm` engine, with whitespace tokens dropped.
_tokenizer_newmm = partial(word_tokenize, engine='newmm', keep_whitespace=False)


def tokenize_worker(sentence):
    """Clean one sentence and return its tokens joined by single spaces.

    Every entry of the module-level `substring_lists` is removed from the
    sentence first; the remainder is tokenized with `_tokenizer_newmm`.

    Args:
        sentence: the raw sentence (str).

    Returns:
        A str with the tokens separated by ' '.
    """
    cleaned = sentence
    for noise in substring_lists:
        cleaned = cleaned.replace(noise, '')

    tokens = _tokenizer_newmm(cleaned)
    return ' '.join(tokens)

    
def tokenize_handler(sentences, processes=12):
    """Tokenize many sentences in parallel with `tokenize_worker`.

    Args:
        sentences: iterable of raw sentences (str).
        processes: number of worker processes in the pool. Defaults to 12,
            the value that used to be hard-coded.

    Returns:
        A list with one space-joined token string per input sentence, in
        input order (the result of `Pool.map`).

    Side effects:
        Prints the elapsed wall-clock time in seconds.
    """
    t = time()

    # The context manager releases the worker processes even when a worker
    # raises or the cell is interrupted; previously the pool was only closed
    # on the success path. It calls terminate() on exit, which is safe here
    # because map() has already collected every result by then.
    with Pool(processes) as p:
        toks = p.map(tokenize_worker, sentences)

    print('{} s'.format(time() - t))

    return toks



def sentences_filter(sentences, unk_tokens=None):
    """Return the indices of sentences that should be dropped.

    A sentence is flagged when its UTF-8 encoding contains any of the byte
    sequences in `unk_tokens`, or when it is exactly one character long.
    Empty strings are not flagged.

    Each index is reported at most once. Previously a one-character sentence
    that also matched a marker (e.g. '\u00b7') was appended twice, which
    inflated the counts printed from the returned list.

    Args:
        sentences: iterable of str.
        unk_tokens: iterable of bytes markers. Defaults to the module-level
            `list_of_unk_toks`.

    Returns:
        A list of int indices in increasing order.
    """
    if unk_tokens is None:
        unk_tokens = list_of_unk_toks

    indices = []
    for index, sentence in enumerate(sentences):
        encoded = sentence.encode('utf-8')
        has_marker = any(token in encoded for token in unk_tokens)
        if has_marker or len(sentence) == 1:
            indices.append(index)

    return indices



In [None]:

# Flag unwanted sentences independently on each side of the corpus.
indices_to_filter_out_th = sentences_filter(th)
indices_to_filter_out_en = sentences_filter(en)

# One count per line: Thai first, then English.
print(len(indices_to_filter_out_th), len(indices_to_filter_out_en), sep='\n')

# A sentence pair is dropped when either of its two sides was flagged.
indices_to_filter_out = set(indices_to_filter_out_th) | set(indices_to_filter_out_en)



In [None]:
# Drop the flagged positions from both sides using the same index set,
# so the two filtered lists stay aligned with each other.
filtered_th = [
    sentence
    for position, sentence in enumerate(th)
    if position not in indices_to_filter_out
]
filtered_en = [
    sentence
    for position, sentence in enumerate(en)
    if position not in indices_to_filter_out
]


In [None]:
# Corpus sizes (Thai, English) before and after filtering.
for th_side, en_side in ((th, en), (filtered_th, filtered_en)):
    print(len(th_side), len(en_side))
# Filtered size minus original size, i.e. zero or negative.
print('diff: {} sentences'.format(len(filtered_th) - len(th)))


In [None]:
# Tokenize the filtered English sentences in parallel (a shallow copy of the list is passed).
en_tok = tokenize_handler(list(filtered_en))


In [None]:
# Sanity check: number of tokenized English sentences and the first five of them.
print(len(en_tok), en_tok[:5])


In [None]:
# Tokenize the filtered Thai sentences in parallel (a shallow copy of the list is passed).
th_tok = tokenize_handler(list(filtered_th))


In [None]:
# Sanity check: number of tokenized Thai sentences and the first five of them.
print(len(th_tok), th_tok[:5])


In [None]:
# Train/valid/test split: 80% / 10% / 10% of the shuffled sentence indices.

n = len(th_tok)
print('n=',n)
idx = list(range(n))

# Fixed seed so the shuffle, and therefore the split, is reproducible.
random.seed(1234)
random.shuffle(idx)

# Cut points of the shuffled index list.
train_end, valid_end = int(n*0.8), int(n*0.9)
train_idx = idx[:train_end]
valid_idx = idx[train_end:valid_end]
test_idx = idx[valid_end:]

(len(train_idx), len(valid_idx), len(test_idx))



In [None]:
# Build the English side of each split from the shared index lists.
en_train, en_valid, en_test = (
    [en_tok[i] for i in split_idx]
    for split_idx in (train_idx, valid_idx, test_idx)
)

# Sizes in the order train, valid, test.
for en_split in (en_train, en_valid, en_test):
    print(len(en_split))


In [None]:
# Build the Thai side of each split from the same index lists as the English side.
th_train, th_valid, th_test = (
    [th_tok[i] for i in split_idx]
    for split_idx in (train_idx, valid_idx, test_idx)
)

# Sizes in the order train, valid, test.
for th_split in (th_train, th_valid, th_test):
    print(len(th_split))


In [None]:
# folder name is now `opensubtitles_tok_bpe` for this sanbox_bpe.ipynb
#   FOLDER_NAME     -> substituted into the './data/{}/...' paths of the tokenized and BPE text files
#   FOLDER_NAME_BIN -> used under data/ as the --destdir of fairseq-preprocess

FOLDER_NAME = "opensubtitles_tok_bpe"
FOLDER_NAME_BIN = "opensubtitles_bin_bpe"


In [None]:
def write_spaced_tokens_to_file(data, filename, folder_name=None):
    """Write one item per line to ./data/<folder_name>/<filename>.

    Args:
        data: iterable of str; each item is written followed by '\n'.
        filename: name of the file inside the target folder.
        folder_name: sub-folder of ./data to write into. Defaults to the
            module-level FOLDER_NAME.

    The file is written as UTF-8 explicitly. Without it, open() uses the
    locale's preferred encoding, which can fail on or corrupt Thai text.
    The target folder is created when missing, so a fresh checkout does not
    fail with FileNotFoundError.
    """
    import os  # local import so this cell does not depend on the import cell

    if folder_name is None:
        folder_name = FOLDER_NAME

    target_dir = './data/{}'.format(folder_name)
    os.makedirs(target_dir, exist_ok=True)

    with open('{}/{}'.format(target_dir, filename), 'w', encoding='utf-8') as f:
        for item in data:
            f.write(item + '\n')
            
            

In [None]:
# Write <split>.en and then <split>.th for each split, in the order train, valid, test.
for split_name, en_lines, th_lines in (
    ('train', en_train, th_train),
    ('valid', en_valid, th_valid),
    ('test', en_test, th_test),
):
    write_spaced_tokens_to_file(en_lines, split_name + '.en')
    write_spaced_tokens_to_file(th_lines, split_name + '.th')


In [None]:
# Peek at the first five training pairs (English list, then Thai list).
en_train[:5], th_train[:5]


In [None]:
# Peek at the first five validation pairs (English list, then Thai list).
en_valid[:5], th_valid[:5]


In [None]:
# test

# Print every training pair whose Thai side still contains a 0xC2 byte once UTF-8 encoded.
# (Alternative probe kept from the original cell: `'¶' in en_train[i]`.)
for position, th_line in enumerate(th_train):
    th_bytes = th_line.encode('utf-8')
    if b'\xc2' not in th_bytes:
        continue
    print('en:', en_train[position])
    print('th:', th_line)
    print('th:', th_bytes)
    print('------------')
        

### Learn BPE from the training set


## 1.1 BPE-to-BPE (en -> th): joint BPE codes, separate per-language vocabulary files

In [None]:
# Value passed to subword-nmt as `-s` when learning the BPE codes.
BPE_TOKENS = 25000

# Output path of the joint (TH + EN) BPE codes file.
BPE_CODE_JOINT = './data/{}/code.joint'.format(FOLDER_NAME)

# Tokenized training files the codes are learned from.
TRAIN_EN = './data/{}/train.en'.format(FOLDER_NAME)
TRAIN_TH = './data/{}/train.th'.format(FOLDER_NAME)
# Prefix of the per-language vocabulary files (<prefix>.th, <prefix>.en).
VOCAB_PREFIX = './data/{}/vocab'.format(FOLDER_NAME)


# The banner used to print a literal shell placeholder ("${TRAIN}"), and the
# summary referenced BPE_CODE, which is not defined anywhere (NameError).
# Both now report the values that are actually used.
print('learn joint BPE on {} and {} ...'.format(TRAIN_TH, TRAIN_EN))
print(BPE_TOKENS, BPE_CODE_JOINT, TRAIN_EN, TRAIN_TH, VOCAB_PREFIX)


In [None]:
# run subword-nmt learn-bpe for Joint TH-EN
# One set of BPE codes is learned from both training files:
#   --input             the Thai and English training files (TRAIN_TH, TRAIN_EN)
#   -s                  BPE_TOKENS
#   -o                  output codes file (BPE_CODE_JOINT)
#   --write-vocabulary  one vocabulary file per input, in the same order (VOCAB_PREFIX.th, VOCAB_PREFIX.en)
# IPython substitutes the brace-wrapped names with the Python variables before running the shell command.

!subword-nmt learn-joint-bpe-and-vocab --input {TRAIN_TH} {TRAIN_EN} -s {BPE_TOKENS} -o {BPE_CODE_JOINT} --verbose --write-vocabulary {VOCAB_PREFIX}.th {VOCAB_PREFIX}.en


In [None]:
# Apply the joint BPE codes to every split of both languages.
# Each tokenized file is fed to `subword-nmt apply-bpe` on stdin and the
# segmented text is redirected to the matching `.bpe.` file.
for lang in ['en', 'th']:
    for file_prefix in ['train', 'valid', 'test']:
        # <train/valid/test>.<en/th>
        file_name = './data/{}/{}.{}'.format(FOLDER_NAME, file_prefix, lang)
        
        # <train/valid/test>.bpe.<en/th>
        file_name_bpe = './data/{}/{}.bpe.{}'.format(FOLDER_NAME, file_prefix, lang)
       
        print('apply BPE to', file_name)

        # The shell command interpolates the Python variables above by name, so
        # renaming any of them requires updating the command as well.
        !subword-nmt apply-bpe -c {BPE_CODE_JOINT} --vocabulary {VOCAB_PREFIX}.{lang} < {file_name} > {file_name_bpe}
        
        


In [None]:
# Binarize the BPE-encoded train/valid/test files (en -> th) with fairseq-preprocess.
# $FOLDER_NAME and $FOLDER_NAME_BIN are expanded by IPython from the Python variables
# of the same name; --joined-dictionary builds one dictionary for both languages.
!fairseq-preprocess --source-lang en --target-lang th \
    --trainpref data/$FOLDER_NAME/train.bpe \
    --validpref data/$FOLDER_NAME/valid.bpe \
    --testpref data/$FOLDER_NAME/test.bpe \
    --destdir data/$FOLDER_NAME_BIN \
    --bpe subword_nmt \
    --joined-dictionary \
    --workers 12
    
# shared dictionary

In [None]:
# Namespace(alignfile=None, bpe='subword_nmt', cpu=False, criterion='cross_entropy', dataset_impl='mmap', destdir='data/opensubtitles_bin_bpe', fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, joined_dictionary=True, log_format=None, log_interval=1000, lr_scheduler='fixed', memory_efficient_fp16=False, min_loss_scale=0.0001, no_progress_bar=False, nwordssrc=-1, nwordstgt=-1, only_source=False, optimizer='nag', padding_factor=8, seed=1, source_lang='en', srcdict=None, target_lang='th', task='translation', tbmf_wrapper=False, tensorboard_logdir='', testpref='data/opensubtitles_tok_bpe/test.bpe', tgtdict=None, threshold_loss_scale=None, thresholdsrc=0, thresholdtgt=0, tokenizer=None, trainpref='data/opensubtitles_tok_bpe/train.bpe', user_dir=None, validpref='data/opensubtitles_tok_bpe/valid.bpe', workers=12)
# | [en] Dictionary: 25311 types
# | [en] data/opensubtitles_tok_bpe/train.bpe.en: 2608023 sents, 27457327 tokens, 0.0% replaced by <unk>
# | [en] Dictionary: 25311 types
# | [en] data/opensubtitles_tok_bpe/valid.bpe.en: 326003 sents, 3437093 tokens, 0.000378% replaced by <unk>
# | [en] Dictionary: 25311 types
# | [en] data/opensubtitles_tok_bpe/test.bpe.en: 326003 sents, 3433729 tokens, 0.000146% replaced by <unk>
# | [th] Dictionary: 25311 types
# | [th] data/opensubtitles_tok_bpe/train.bpe.th: 2608023 sents, 22482753 tokens, 0.0% replaced by <unk>
# | [th] Dictionary: 25311 types
# | [th] data/opensubtitles_tok_bpe/valid.bpe.th: 326003 sents, 2811450 tokens, 0.0% replaced by <unk>
# | [th] Dictionary: 25311 types
# | [th] data/opensubtitles_tok_bpe/test.bpe.th: 326003 sents, 2812530 tokens, 0.0% replaced by <unk>
# | Wrote preprocessed data to data/opensubtitles_bin_bpe

In [None]:
# Train the `transformer_iwslt_de_en` architecture on the binarized data.
# Checkpoints are written to --save-dir and TensorBoard logs to --tensorboard-logdir.
# NOTE(review): the data directory is hard-coded here; it matches the value of
# FOLDER_NAME_BIN used as --destdir by fairseq-preprocess -- keep the two in sync.
!fairseq-train \
    data/opensubtitles_bin_bpe \
    --arch transformer_iwslt_de_en --share-decoder-input-output-embed \
    --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
    --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
    --dropout 0.3 --weight-decay 0.0001 \
    --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
    --max-tokens 2048 \
    --bpe subword_nmt \
    --save-dir data/opensubtitles_model/transformers_bpe \
    --tensorboard-logdir data/opensubtitles_model/transformers_bpe/tensorboard_log 

In [None]:
# fairseq-generate data/opensubtitles_bin_bpe \
#     --path data/opensubtitles_model/transformers_bpe/checkpoint_best.pt \
#     --beam 5 --remove-bpe


# for transformer-word2word-en2th
fairseq-generate data/opensubtitles_bin \
    --path checkpoints/checkpoint_best.pt \
    --beam 5