In [193]:
import re
from time import time
import math
import torch
import fairseq
from fairseq import checkpoint_utils, data, options, tasks
from fairseq.sequence_generator import SequenceGenerator

from nltk.translate.bleu_score import sentence_bleu

import sys
sys.path.append('..')
from pythainlp.tokenize import word_tokenize



In [194]:
# Show which fairseq version this notebook was run with.
print(fairseq.__version__)

0.8.0


__Load TH-EN sentences from Wang__

In [195]:
# from Wang -- 2019-06-26_output_Non_Empty_Answer.txt
# The regex captures two groups from a line of the form
#     answer: ['<text 1>'] variable: <text 2>
# Group 1 is stored as the Thai sentence, group 2 as the English sentence.
pattern = r'answer: \[\'(.+)\'\] variable: (\w.+)'
texts = {}
texts['wang'] = {
    'th': [],
    'en': []
}

# Read explicitly as UTF-8: the file contains Thai text and the platform's
# default encoding is not guaranteed to be UTF-8.
with open('../data/wang/2019-06-26_output_Non_Empty_Answer.txt', 'r', encoding='utf-8') as f:
    for line in f:
        match_obj = re.search(pattern, line)
        if match_obj is None:
            # Line does not match the expected format -- skip it instead of
            # hiding every possible error behind a blanket `except`.
            continue

        th_text = match_obj[1]
        # Drop a single leading '?' from the Thai side when present.
        if th_text[0] == '?':
            th_text = th_text[1:]
        en_text = match_obj[2]

        texts['wang']['th'].append(th_text)
        texts['wang']['en'].append(en_text)

print('number of examples = ', len(texts['wang']['th']))


number of examples =  6118


In [196]:
# Define BPE

from bpemb import BPEmb

# Pretrained BPEmb models, indexed as bpemb_pretrained[lang][str(vocab_size)].
# Only the 25000-entry vocabulary is loaded, for Thai and English.
bpemb_pretrained = {
    lang: {'25000': BPEmb(lang=lang, vs=25000)}
    for lang in ('th', 'en')
}

def encode_bpe(sentences, lang, n_vocab=25000):
    """Segment each sentence into BPE pieces with the pretrained BPEmb model.

    Args:
        sentences: iterable of sentences (strings) to segment.
        lang: first-level key into ``bpemb_pretrained`` (e.g. 'th' or 'en').
        n_vocab: vocabulary size; its string form is the second-level key
            into ``bpemb_pretrained``.

    Returns:
        A list with one string per input sentence: the BPE pieces of that
        sentence joined by single spaces.
    """
    vocab_key = str(n_vocab)
    return [
        ' '.join(bpemb_pretrained[lang][vocab_key].encode(text))
        for text in sentences
    ]


In [178]:
# # test BPE and BLEU
# sent = "ฉันกินผัดไทประตูผี"
# ref_word = [['ฉัน', 'กิน', 'ผัด', 'ไท', 'ประตูผี']]
# ref_bpe = [['▁ฉัน', 'กิน', 'ผัด', 'ไท', 'ประตู', 'ผี']]

# # print(word_tokenize(sent, engine="newmm"))
# # print(encode_bpe([sent], lang="th")[0].split(' '))

# print('score word level bleu')
# cand_word = {
#     '1': ['ฉัน', 'กิน', 'ผัด', 'ไท', 'ประตูผี'],
#     '2': ['ฉัน', 'ผัด', 'ประตูผี'],
#     '3': ['ฉัน', 'กิน', 'ผัด', 'ไท'],
#     '4': ['ฉัน', 'ผัด', 'ไท','ประตูผี'],

# }
# print(sentence_bleu(ref_word, cand_word['1']))
# print(sentence_bleu(ref_word, cand_word['2']))
# print(sentence_bleu(ref_word, cand_word['3']))
# print(sentence_bleu(ref_word, cand_word['4']))


# print('score subword level bleu')
# cand_bpe = {
#     '1': ['▁ฉัน', 'กิน', 'ผัด', 'ไท', 'ประตู', 'ผี'],
#     '2': ['▁ฉัน', 'กิน', 'ผัด', 'ประตู', 'ผี'],
#     '3': ['▁ฉัน', 'กิน', 'ผัด', 'ไท'],
#     '4': ['▁ฉัน', 'ผัด', 'ไท', 'ประตู', 'ผี'],

# }
# print(sentence_bleu(ref_bpe, cand_bpe['1']))
# print(sentence_bleu(ref_bpe, cand_bpe['2']))
# print(sentence_bleu(ref_bpe, cand_bpe['3']))
# print(sentence_bleu(ref_bpe, cand_bpe['4']))

In [197]:
# helper function to make inference given a model path, src/tgt language, token types (subword, word), and dictionary

def run_inference(src_lang, tgt_lang,
                  src_tok_type, tgt_tok_type,
                  src_dict_path, tgt_dict_path,
                  model_path,
                  examples=texts,
                  n_examples=10,
                  beam_size=5,
                  use_tokenizer=False,
                  data_root='/root/mt-opus/'):
    """Translate the first ``n_examples`` sentences and print the beam hypotheses.

    For every example this prints the source, its tokens and indices, the
    reference target (raw and tokenized) and up to five hypotheses.

    Args:
        src_lang, tgt_lang: keys into ``examples`` (e.g. 'en', 'th').
        src_tok_type, tgt_tok_type: 'newmm' (word level) or 'sentencepiece'
            (subword level). ``src_tok_type`` is only consulted when
            ``use_tokenizer`` is true.
        src_dict_path, tgt_dict_path: fairseq dictionary files.
        model_path: checkpoint to load.
        examples: mapping ``{lang: [sentence, ...]}``. The default is the
            module-level ``texts`` object, which is keyed by dataset name, so
            callers are expected to pass one dataset, e.g. ``texts['wang']``.
        n_examples: number of leading examples to translate.
        beam_size: beam width; at most 5 hypotheses are printed.
        use_tokenizer: if true, tokenize the raw text on the fly; otherwise
            the source is assumed to be pre-tokenized and is split on ' '.
        data_root: prefix prepended verbatim to the ``data`` path stored in
            the checkpoint's args.

    Raises:
        ValueError: on an unsupported tokenizer type. (The original code
            failed with NameError / TypeError in the middle of the loop.)
    """
    known_tok_types = ('newmm', 'sentencepiece')
    # Fail fast, before the (slow) checkpoint load.
    if use_tokenizer and src_tok_type not in known_tok_types:
        raise ValueError('unsupported src_tok_type: {!r}'.format(src_tok_type))
    if tgt_tok_type not in known_tok_types:
        raise ValueError('unsupported tgt_tok_type: {!r}'.format(tgt_tok_type))

    print('| loading model from {}'.format(model_path))

    # Read the training args out of the checkpoint so the data path can be
    # re-rooted before the ensemble/task is built.
    state = checkpoint_utils.load_checkpoint_to_cpu(model_path)
    args = vars(state['args'])
    args['data'] = data_root + args['data']

    ensemble, _, _ = checkpoint_utils.load_model_ensemble_and_task([model_path], arg_overrides=args, task=None)

    src_dict = data.Dictionary()
    src_dict.add_from_file(src_dict_path)

    tgt_dict = data.Dictionary()
    tgt_dict.add_from_file(tgt_dict_path)

    print('len(src_dict) = ', len(src_dict))
    print('len(tgt_dict) = ', len(tgt_dict))

    _generator = SequenceGenerator(tgt_dict=tgt_dict, beam_size=beam_size,
                                   len_penalty=-100,
                                   unk_penalty=0,
                                   max_len_a=0,
                                   max_len_b=40,
                                   normalize_scores=False,
                                   sampling_topk=-1,
                                   sampling_topp=-1.0,
                                   no_repeat_ngram_size=2)

    n_shown = min(5, beam_size)
    sources = examples[src_lang][:n_examples]
    targets = examples[tgt_lang]

    print('Inference from {} to {}'.format(src_lang, tgt_lang))
    for example_idx, src in enumerate(sources):

        print(example_idx + 1)
        print('Source :', src)
        if not use_tokenizer:
            toks = src.split(' ')
        elif src_tok_type == 'newmm':
            toks = word_tokenize(src, engine='newmm', keep_whitespace=False)
        else:  # 'sentencepiece' (validated above)
            toks = encode_bpe([src], lang=src_lang)[0].split(' ')
        print('tokens :', toks)

        # NOTE(review): no end-of-sentence index is appended to the source
        # here -- confirm this matches how the training data was binarized.
        src_indices = [src_dict.index(t) for t in toks]
        src_len = [len(src_indices)]

        # Batch of one: tokens are (1, n_tokens), length is (1, 1).
        src_indices_tensor = torch.LongTensor(src_indices).view(1, -1)
        src_len_tensor = torch.LongTensor(src_len).view(1, -1)

        print('src indices:', src_indices_tensor.size(), src_indices_tensor)
        print('src len    :', src_len_tensor.size(), src_len_tensor)

        sample = {
            'net_input': {
                'src_tokens': src_indices_tensor, 'src_lengths': src_len_tensor,
            },
        }

        hypos = _generator.generate(models=ensemble, sample=sample)

        target = targets[example_idx]
        print('\nTarget      :', target)
        if tgt_tok_type == 'newmm':
            target_toks = ' '.join(word_tokenize(target, keep_whitespace=False))
        elif use_tokenizer:
            target_toks = encode_bpe([target], lang=tgt_lang)[0]
        else:
            # Pre-tokenized sentencepiece reference: show as is.
            target_toks = target
        print('Target toks : {}'.format(target_toks))
        print('')

        # Slice instead of indexing so a short hypothesis list cannot raise;
        # a distinct loop variable avoids clobbering the example index.
        for rank, hypo in enumerate(hypos[0][:n_shown], start=1):
            symbols = [tgt_dict[idx] for idx in hypo['tokens'].tolist()]

            print('Prediction {}: {}'.format(rank, ' '.join(symbols)))
            if tgt_tok_type == 'sentencepiece':
                # Undo sentencepiece segmentation: glue the pieces together
                # and turn the word-boundary marker back into a space.
                detokenized = ''.join(symbols).replace('▁', ' ').lstrip()
                print('Prediction (detokenized) {}: {}'.format(rank, detokenized))

        print('\n\n')

    
# # print(task)

In [198]:
# helper function to make inference of the model by experiment id
def run_inference_by_exp(dataset_name, n_samples, exp_name_and_checkpoint, src_type, tgt_type, src_lang, tgt_lang, use_tokenizer, beam_size=5):
    """Run ``run_inference`` for one experiment checkpoint on a named dataset.

    Model, dictionary and test-file paths are derived from the experiment
    name, the tokenization types and the language pair.

    Args:
        dataset_name: 'opensubtitles' (tokenized test split read from disk)
            or 'wang' (sentences already held in ``texts['wang']``).
        n_samples: number of leading examples to translate.
        exp_name_and_checkpoint: path below ``../data/opensubtitles_model/``,
            e.g. "exp002-1/transformer_base/checkpoint15.pt".
        src_type, tgt_type: tokenization types ('newmm' / 'sentencepiece').
        src_lang, tgt_lang: language codes (e.g. 'en', 'th').
        use_tokenizer: forwarded to ``run_inference``.
        beam_size: beam width forwarded to ``run_inference``.

    Side effects:
        For 'opensubtitles', ``texts['opensubtitles']`` is replaced with the
        freshly loaded test split. An unknown ``dataset_name`` only prints a
        message.
    """
    model_path = '../data/opensubtitles_model/{}'.format(exp_name_and_checkpoint)

    pair_dir = '{}-{}/{}-{}'.format(src_type, tgt_type, src_lang, tgt_lang)
    src_dict_path = '../data/opensubtitles_bin/{}/dict.{}.txt'.format(pair_dir, src_lang)
    tgt_dict_path = '../data/opensubtitles_bin/{}/dict.{}.txt'.format(pair_dir, tgt_lang)

    test_src_tok_path = '../data/opensubtitles_tok/{}/test.{}'.format(pair_dir, src_lang)
    test_tgt_tok_path = '../data/opensubtitles_tok/{}/test.{}'.format(pair_dir, tgt_lang)

    print('model_path', model_path)
    print('src_dict_path', src_dict_path)
    print('tgt_dict_path', tgt_dict_path)

    if dataset_name not in ('opensubtitles', 'wang'):
        print('dataset_name ', dataset_name, 'not found')
        return

    if dataset_name == 'opensubtitles':
        # Only touch the test files when they are actually needed, so that
        # evaluating on 'wang' does not depend on them being present.
        loaded = {}
        for lang, path in ((src_lang, test_src_tok_path), (tgt_lang, test_tgt_tok_path)):
            # Explicit UTF-8: the files contain Thai text.
            with open(path, 'r', encoding='utf-8') as f:
                loaded[lang] = [line.rstrip('\n') for line in f]
        texts['opensubtitles'] = loaded

    run_inference(src_lang=src_lang, tgt_lang=tgt_lang,
                  src_tok_type=src_type, tgt_tok_type=tgt_type,
                  src_dict_path=src_dict_path, tgt_dict_path=tgt_dict_path,
                  model_path=model_path,
                  examples=texts[dataset_name],
                  n_examples=n_samples,
                  beam_size=beam_size,
                  use_tokenizer=use_tokenizer)


## Prediction examples -- exp002-1.1

### Transformer Base (en->th) (word->word)  

1) Opensubtitles test set

In [180]:
# exp002-1.1 en-th word-word
# OpenSubtitles test set: sentences are read already tokenized, so no tokenizer is applied.

run_inference_by_exp(dataset_name='opensubtitles',
                     n_samples=100,
                     exp_name_and_checkpoint='exp002-1/transformer_base/checkpoint_best.pt',
                     src_type='newmm', tgt_type='newmm',
                     src_lang='en', tgt_lang='th',
                     use_tokenizer=False)

model_path ../data/opensubtitles_model/exp002-1/transformer_base/checkpoint_best.pt
src_dict_path ../data/opensubtitles_bin/newmm-newmm/en-th/dict.en.txt
tgt_dict_path ../data/opensubtitles_bin/newmm-newmm/en-th/dict.th.txt
| loading model from ../data/opensubtitles_model/exp002-1/transformer_base/checkpoint_best.pt
| [en] dictionary: 100000 types
| [th] dictionary: 88848 types
len(src_dict) =  100000
len(tgt_dict) =  88848
Inference from en to th
1
Source : The entire complex has been wired with C- 4
tokens : ['The', 'entire', 'complex', 'has', 'been', 'wired', 'with', 'C-', '4']
src indices: torch.Size([1, 9]) tensor([[  62,  860, 4341,  142,  102, 6988,   44, 9280, 1047]])
src len    : torch.Size([1, 1]) tensor([[9]])

Target      : ที่ ซับซ้อน ทั้งหมด ได้ รับสาย ด้วย C- 4
Target toks : ที่ ซับซ้อน ทั้งหมด ได้ รับสาย ด้วย C- 4

Prediction 1: ส่วน เรื่อง ทั้งหมด นี้ เป็น แค่ ข้อ 4 </s>
Prediction 2: เรื่อง ทั้งหมด นี้ เป็น แค่ เรื่อง ซี - 4 </s>
Prediction 3: ส่วน เรื่อง ทั้งหมด นี้ 

2) Wang dataset

In [181]:
# exp002-1.1 en-th word-word
# Wang dataset: sentences are raw text, so they are tokenized on the fly.

run_inference_by_exp(dataset_name='wang',
                     n_samples=100,
                     exp_name_and_checkpoint='exp002-1/transformer_base/checkpoint_best.pt',
                     src_type='newmm', tgt_type='newmm',
                     src_lang='en', tgt_lang='th',
                     use_tokenizer=True)

model_path ../data/opensubtitles_model/exp002-1/transformer_base/checkpoint_best.pt
src_dict_path ../data/opensubtitles_bin/newmm-newmm/en-th/dict.en.txt
tgt_dict_path ../data/opensubtitles_bin/newmm-newmm/en-th/dict.th.txt
| loading model from ../data/opensubtitles_model/exp002-1/transformer_base/checkpoint_best.pt
| [en] dictionary: 100000 types
| [th] dictionary: 88848 types
len(src_dict) =  100000
len(tgt_dict) =  88848
Inference from en to th
1
Source : He brushed it off once more.
tokens : ['He', 'brushed', 'it', 'off', 'once', 'more', '.']
src indices: torch.Size([1, 7]) tensor([[   64, 18670,    16,   154,   423,   133,     4]])
src len    : torch.Size([1, 1]) tensor([[7]])

Target      : เขาปัดมันอีกครั้ง
Target toks : เขา ปัด มัน อีกครั้ง

Prediction 1: เขา หวี มัน ออก ไป อีกครั้ง </s>
Prediction 2: เขา หวี มัน ออก ไป อีก ครั้ง </s>
Prediction 3: เขา หวี มัน ออก ไป อีกครั้ง แล้ว </s>
Prediction 4: เขา หวี มัน ออก ไป ครั้ง เดียว </s>
Prediction 5: เขา หวี มัน ออก ไป อีก ครั้งห


## Prediction examples -- exp002-2.1

### Transformer Base (en->th) (sentencepiece->word)  



1) Opensubtitles test set

In [182]:
# exp002-2.1 en-th sentencepiece-word
# OpenSubtitles test set: sentences are read already tokenized, so no tokenizer is applied.

run_inference_by_exp(dataset_name='opensubtitles',
                     n_samples=100,
                     exp_name_and_checkpoint='exp002-2/transformer_base/checkpoint_best.pt',
                     src_type='sentencepiece', tgt_type='newmm',
                     src_lang='en', tgt_lang='th',
                     use_tokenizer=False)

model_path ../data/opensubtitles_model/exp002-2/transformer_base/checkpoint_best.pt
src_dict_path ../data/opensubtitles_bin/sentencepiece-newmm/en-th/dict.en.txt
tgt_dict_path ../data/opensubtitles_bin/sentencepiece-newmm/en-th/dict.th.txt
| loading model from ../data/opensubtitles_model/exp002-2/transformer_base/checkpoint_best.pt
| [en] dictionary: 23896 types
| [th] dictionary: 88848 types
len(src_dict) =  23896
len(tgt_dict) =  88848
Inference from en to th
1
Source : ▁the ▁entire ▁complex ▁has ▁been ▁w ired ▁with ▁c -0
tokens : ['▁the', '▁entire', '▁complex', '▁has', '▁been', '▁w', 'ired', '▁with', '▁c', '-0']
src indices: torch.Size([1, 10]) tensor([[  10,  969, 4653,  134,  109,  172, 3817,   47,  224, 3146]])
src len    : torch.Size([1, 1]) tensor([[10]])

Target      : ที่ ซับซ้อน ทั้งหมด ได้ รับสาย ด้วย C- 4
Target toks : ที่ ซับซ้อน ทั้งหมด ได้ รับสาย ด้วย C- 4

Prediction 1: ระบบ ทั้งหมด นี้ ได้รับ การ ติด จาก การ ที่ มี การ ต่อ จาก ซี ซี - ซี บี ซี 4 </s>
Prediction 2: ระบ

2) Wang dataset

In [183]:

# exp002-2.1 en-th sentencepiece-word
# Wang dataset: sentences are raw text, so they are tokenized on the fly.
run_inference_by_exp(dataset_name='wang',
                     n_samples=100,
                     exp_name_and_checkpoint='exp002-2/transformer_base/checkpoint_best.pt',
                     src_type='sentencepiece', tgt_type='newmm',
                     src_lang='en', tgt_lang='th',
                     use_tokenizer=True)

model_path ../data/opensubtitles_model/exp002-2/transformer_base/checkpoint_best.pt
src_dict_path ../data/opensubtitles_bin/sentencepiece-newmm/en-th/dict.en.txt
tgt_dict_path ../data/opensubtitles_bin/sentencepiece-newmm/en-th/dict.th.txt
| loading model from ../data/opensubtitles_model/exp002-2/transformer_base/checkpoint_best.pt
| [en] dictionary: 23896 types
| [th] dictionary: 88848 types
len(src_dict) =  23896
len(tgt_dict) =  88848
Inference from en to th
1
Source : He brushed it off once more.
tokens : ['▁he', '▁br', 'ushed', '▁it', '▁off', '▁once', '▁more', '.']
src indices: torch.Size([1, 8]) tensor([[  28, 1957, 6943,   14,  149,  370,  132,    4]])
src len    : torch.Size([1, 1]) tensor([[8]])

Target      : เขาปัดมันอีกครั้ง
Target toks : เขา ปัด มัน อีกครั้ง

Prediction 1: อีกครั้ง ที่ ใช้ มัน อีกครั้ง </s>
Prediction 2: เขา เคย ใช้ มัน ครั้งหนึ่ง อีกครั้ง </s>
Prediction 3: เขา เคย ใช้ มัน มา ครั้ง ละ ครั้ง </s>
Prediction 4: เขา เคย ใช้ มัน มา ครั้ง นึง </s>
Prediction 5


## Prediction examples -- exp002-3.1

### Transformer Base (en->th) (word->sentencepiece)  



1) Opensubtitles test set

In [192]:
# exp002-3.1 en-th word-sentencepiece
# OpenSubtitles test set: sentences are read already tokenized, so no tokenizer is applied.
# Note: this cell loads checkpoint13.pt, not checkpoint_best.pt.

run_inference_by_exp(dataset_name='opensubtitles',
                     n_samples=100,
                     exp_name_and_checkpoint='exp002-3/transformer_base/checkpoint13.pt',
                     src_type='newmm', tgt_type='sentencepiece',
                     src_lang='en', tgt_lang='th',
                     use_tokenizer=False)

model_path ../data/opensubtitles_model/exp002-3/transformer_base/checkpoint13.pt
src_dict_path ../data/opensubtitles_bin/newmm-sentencepiece/en-th/dict.en.txt
tgt_dict_path ../data/opensubtitles_bin/newmm-sentencepiece/en-th/dict.th.txt
| loading model from ../data/opensubtitles_model/exp002-3/transformer_base/checkpoint13.pt
| [en] dictionary: 90000 types
| [th] dictionary: 22144 types
len(src_dict) =  90000
len(tgt_dict) =  22144
Inference from en to th
1
Source : The entire complex has been wired with C- 4
tokens : ['The', 'entire', 'complex', 'has', 'been', 'wired', 'with', 'C-', '4']
src indices: torch.Size([1, 9]) tensor([[  62,  860, 4341,  142,  102, 6988,   44, 9280, 1047]])
src len    : torch.Size([1, 1]) tensor([[9]])

Target      : ▁ที่ ซับซ้อน ทั้งหมด ได้รับ สาย ด้วย ▁c -0
Target toks : ▁ที่ ซับซ้อน ทั้งหมด ได้รับ สาย ด้วย ▁c -0

Prediction 1: ▁ ใช่ </s>
Prediction (detokenized) 1: ใช่</s>
Prediction 2: ▁# และ # </s>
Prediction (detokenized) 2: #และ#</s>
Prediction 3: ▁โ #

2) Wang dataset

In [191]:
# exp002-3.1 en-th word-sentencepiece
# Wang dataset: sentences are raw text, so they are tokenized on the fly.
# Note: this cell loads checkpoint13.pt, not checkpoint_best.pt.

run_inference_by_exp(dataset_name='wang',
                     n_samples=100,
                     exp_name_and_checkpoint='exp002-3/transformer_base/checkpoint13.pt',
                     src_type='newmm', tgt_type='sentencepiece',
                     src_lang='en', tgt_lang='th',
                     use_tokenizer=True)

model_path ../data/opensubtitles_model/exp002-3/transformer_base/checkpoint13.pt
src_dict_path ../data/opensubtitles_bin/newmm-sentencepiece/en-th/dict.en.txt
tgt_dict_path ../data/opensubtitles_bin/newmm-sentencepiece/en-th/dict.th.txt
| loading model from ../data/opensubtitles_model/exp002-3/transformer_base/checkpoint13.pt
| [en] dictionary: 90000 types
| [th] dictionary: 22144 types
len(src_dict) =  90000
len(tgt_dict) =  22144
Inference from en to th
1
Source : He brushed it off once more.
tokens : ['He', 'brushed', 'it', 'off', 'once', 'more', '.']
src indices: torch.Size([1, 7]) tensor([[   64, 18670,    16,   154,   423,   133,     4]])
src len    : torch.Size([1, 1]) tensor([[7]])

Target      : เขาปัดมันอีกครั้ง
Target toks : ▁เขา ปัด มัน อีกครั้ง

Prediction 1: ▁แ ปร ง อีกครั้ง </s>
Prediction (detokenized) 1: แปรงอีกครั้ง</s>
Prediction 2: ▁แ ปร ง อีกครั้ง ซิ </s>
Prediction (detokenized) 2: แปรงอีกครั้งซิ</s>
Prediction 3: ▁แ ปร ง อีก แล้ว </s>
Prediction (detokenized) 3: 

## Prediction examples -- exp002-4.1

### Transformer Base (en->th) (sentencepiece->sentencepiece [shared])  


__Important Note:__ SentencePiece normalizes the text, so all words are converted to lowercase and all numbers are converted to 0.

1) Opensubtitles test set

In [184]:
# exp002-4.1 en-th sentencepiece-sentencepiece
# OpenSubtitles test set: sentences are read already tokenized, so no tokenizer is applied.

run_inference_by_exp(dataset_name='opensubtitles',
                     n_samples=100,
                     exp_name_and_checkpoint='exp002-4/transformer_base_b/checkpoint_best.pt',
                     src_type='sentencepiece', tgt_type='sentencepiece',
                     src_lang='en', tgt_lang='th',
                     use_tokenizer=False)

model_path ../data/opensubtitles_model/exp002-4/transformer_base_b/checkpoint_best.pt
src_dict_path ../data/opensubtitles_bin/sentencepiece-sentencepiece/en-th/dict.en.txt
tgt_dict_path ../data/opensubtitles_bin/sentencepiece-sentencepiece/en-th/dict.th.txt
| loading model from ../data/opensubtitles_model/exp002-4/transformer_base_b/checkpoint_best.pt
| [en] dictionary: 42760 types
| [th] dictionary: 42760 types
len(src_dict) =  42760
len(tgt_dict) =  42760
Inference from en to th
1
Source : ▁the ▁entire ▁complex ▁has ▁been ▁w ired ▁with ▁c -0
tokens : ['▁the', '▁entire', '▁complex', '▁has', '▁been', '▁w', 'ired', '▁with', '▁c', '-0']
src indices: torch.Size([1, 10]) tensor([[   10,  2526, 10526,   268,   210,   382,  8838,    69,   421,  3658]])
src len    : torch.Size([1, 1]) tensor([[10]])

Target      : ▁ที่ ซับซ้อน ทั้งหมด ได้รับ สาย ด้วย ▁c -0
Target toks : ▁ที่ ซับซ้อน ทั้งหมด ได้รับ สาย ด้วย ▁c -0

Prediction 1: ▁เรื่อง ทั้งหมด นี้ กําลัง ถูก ตรวจสอบ ด้วย เครื่อง ซี โฟ ไฟล์ คอม

2) Wang dataset

In [185]:
# exp002-4.1 en-th sentencepiece-sentencepiece
# Wang dataset: sentences are raw text, so they are tokenized on the fly.

run_inference_by_exp(dataset_name='wang',
                     n_samples=100,
                     exp_name_and_checkpoint='exp002-4/transformer_base_b/checkpoint_best.pt',
                     src_type='sentencepiece', tgt_type='sentencepiece',
                     src_lang='en', tgt_lang='th',
                     use_tokenizer=True)

model_path ../data/opensubtitles_model/exp002-4/transformer_base_b/checkpoint_best.pt
src_dict_path ../data/opensubtitles_bin/sentencepiece-sentencepiece/en-th/dict.en.txt
tgt_dict_path ../data/opensubtitles_bin/sentencepiece-sentencepiece/en-th/dict.th.txt
| loading model from ../data/opensubtitles_model/exp002-4/transformer_base_b/checkpoint_best.pt
| [en] dictionary: 42760 types
| [th] dictionary: 42760 types
len(src_dict) =  42760
len(tgt_dict) =  42760
Inference from en to th
1
Source : He brushed it off once more.
tokens : ['▁he', '▁br', 'ushed', '▁it', '▁off', '▁once', '▁more', '.']
src indices: torch.Size([1, 8]) tensor([[   33,  4228, 14946,    15,   312,   903,   266,     4]])
src len    : torch.Size([1, 1]) tensor([[8]])

Target      : เขาปัดมันอีกครั้ง
Target toks : ▁เขา ปัด มัน อีกครั้ง

Prediction 1: ▁ที่ รัก </s>
Prediction (detokenized) 1: ที่รัก</s>
Prediction 2: ▁มัน คือ </s>
Prediction (detokenized) 2: มันคือ</s>
Prediction 3: ▁มัน คือ ... </s>
Prediction (detokenize

In [171]:
# !ls ../data/opensubtitles_tok/sentencepiece-sentencepiece/en-th
# !head ../data/opensubtitles_tok/sentencepiece-sentencepiece/en-th/train.th

## Prediction examples -- exp003-1.1

### Transformer Base (th->en) (word->word)  



1) Opensubtitles test set

In [187]:

# exp003-1.1 th-en word-word
# OpenSubtitles test set: sentences are read already tokenized, so no tokenizer is applied.
run_inference_by_exp(dataset_name='opensubtitles',
                     n_samples=100,
                     exp_name_and_checkpoint='exp003-1/transformer_base/checkpoint_best.pt',
                     src_type='newmm', tgt_type='newmm',
                     src_lang='th', tgt_lang='en',
                     use_tokenizer=False)


model_path ../data/opensubtitles_model/exp003-1/transformer_base/checkpoint_best.pt
src_dict_path ../data/opensubtitles_bin/newmm-newmm/th-en/dict.th.txt
tgt_dict_path ../data/opensubtitles_bin/newmm-newmm/th-en/dict.en.txt
| loading model from ../data/opensubtitles_model/exp003-1/transformer_base/checkpoint_best.pt
| [th] dictionary: 88848 types
| [en] dictionary: 100000 types
len(src_dict) =  88848
len(tgt_dict) =  100000
Inference from th to en
1
Source : ที่ ซับซ้อน ทั้งหมด ได้ รับสาย ด้วย C- 4
tokens : ['ที่', 'ซับซ้อน', 'ทั้งหมด', 'ได้', 'รับสาย', 'ด้วย', 'C-', '4']
src indices: torch.Size([1, 8]) tensor([[    6,  1799,   168,    21,  2197,    53, 17747,   687]])
src len    : torch.Size([1, 1]) tensor([[8]])

Target      : The entire complex has been wired with C- 4
Target toks : The entire complex has been wired with C- 4

Prediction 1: It ' s all complicated . </s>
Prediction 2: It ' s all complicated , picking up the phone by 4 th , four . </s>
Prediction 3: It ' s all complic

2) Wang dataset

In [188]:

# exp003-1.1 th-en word-word
# Wang dataset: sentences are raw text, so they are tokenized on the fly.
run_inference_by_exp(dataset_name='wang',
                     n_samples=100,
                     exp_name_and_checkpoint='exp003-1/transformer_base/checkpoint_best.pt',
                     src_type='newmm', tgt_type='newmm',
                     src_lang='th', tgt_lang='en',
                     use_tokenizer=True)


model_path ../data/opensubtitles_model/exp003-1/transformer_base/checkpoint_best.pt
src_dict_path ../data/opensubtitles_bin/newmm-newmm/th-en/dict.th.txt
tgt_dict_path ../data/opensubtitles_bin/newmm-newmm/th-en/dict.en.txt
| loading model from ../data/opensubtitles_model/exp003-1/transformer_base/checkpoint_best.pt
| [th] dictionary: 88848 types
| [en] dictionary: 100000 types
len(src_dict) =  88848
len(tgt_dict) =  100000
Inference from th to en
1
Source : เขาปัดมันอีกครั้ง
tokens : ['เขา', 'ปัด', 'มัน', 'อีกครั้ง']
src indices: torch.Size([1, 4]) tensor([[  19, 6285,    9,  383]])
src len    : torch.Size([1, 1]) tensor([[4]])

Target      : He brushed it off once more.
Target toks : He brushed it off once more .

Prediction 1: He turned it again . </s>
Prediction 2: He turned it again again . </s>
Prediction 3: He turned it over again . </s>
Prediction 4: He moved it again again . </s>
Prediction 5: He turned it over again again . </s>



2
Source : เขาปัดมัน
tokens : ['เขา', 'ปัด',