In [1]:
import os 
import sys

from typing import List

from pythainlp.tokenize import word_tokenize

In [2]:
# Add the parent directory to the module search path before importing the
# project-local BLEU helper (presumably the `mt_opus` package lives one level
# above this notebook -- confirm against the repository layout).
sys.path.append("..")
from mt_opus.bleu_score import compute_bleu

In [3]:
# Define BPE

from bpemb import BPEmb

# Pretrained BPEmb models, keyed first by language code and then by the
# vocabulary size written as a string (the form `encode_bpe` looks up).
bpemb_pretrained = {
    lang_code: {'25000': BPEmb(lang=lang_code, vs=25000)}
    for lang_code in ('th', 'en')
}

def encode_bpe(sentences: List[str], lang, n_vocab=25000):
    """Segment each sentence into BPE subwords using a pretrained model.

    Args:
        sentences: Raw sentences to segment.
        lang: First-level key into `bpemb_pretrained` (e.g. "th" or "en").
        n_vocab: Vocabulary size; its string form is the second-level key
            into `bpemb_pretrained`.

    Returns:
        A list with one string per input sentence, each holding that
        sentence's BPE tokens joined by single spaces.
    """
    vocab_key = format(n_vocab)
    # The model is looked up once per sentence, so an empty input never
    # touches `bpemb_pretrained` and simply yields an empty list.
    return [
        ' '.join(bpemb_pretrained[lang][vocab_key].encode(text))
        for text in sentences
    ]


In [42]:
def model_inference(src, tgt, src_type, tgt_type):
    """Stand-in for a translation model: return canned output tokens.

    Args:
        src: Source language code ("th" or "en").
        tgt: Target language code ("en" or "th").
        src_type: Source tokenization scheme; accepted but not used here.
        tgt_type: Target tokenization scheme, "newmm" or "sentencepiece".

    Returns:
        A newly built list of predicted tokens for the requested direction
        and target scheme, or None when the combination is not listed.
    """
    # (source language, target language, target scheme, canned prediction)
    canned_outputs = (
        ("th", "en", "newmm",
         ["I", "call", "to", "the", "Bullet", "chicken", "shop", "today", "."]),
        ("th", "en", "sentencepiece",
         ["_I", "_call", "_to", "_the", "_Bullet", "_chicken", "_shop", "_today."]),
        ("en", "th", "newmm",
         ["วันนี้", "ฉัน", "โทร", "ไป", "ที่", "ร้าน", "ไก่", "กระสุน"]),
        ("en", "th", "sentencepiece",
         ["_วันนี้", "_ฉัน", "โทร", "ไป", "ที่", "_ร้าน", "ไก่", "กระ", "_สุน"]),
    )
    for known_src, known_tgt, known_type, tokens in canned_outputs:
        if src == known_src and tgt == known_tgt and tgt_type == known_type:
            return tokens
    return None
        
        
def nmt(sentence, src, tgt, src_type, tgt_type):
    """Run the mock translation pipeline and return re-tokenized output.

    The source sentence is tokenized (for display only), predicted tokens are
    fetched from `model_inference`, joined back into a plain sentence
    according to `tgt_type`, and that sentence is tokenized again with
    `word_tokenize(..., keep_whitespace=False)`. Intermediate results are
    printed along the way.

    Args:
        sentence: Source sentence as a plain string.
        src: Source language code, forwarded to `encode_bpe` and
            `model_inference`.
        tgt: Target language code, forwarded to `model_inference`.
        src_type: Source tokenization scheme, "newmm" or "sentencepiece".
        tgt_type: Target tokenization scheme, "newmm" or "sentencepiece".

    Returns:
        The tokens produced by `word_tokenize` on the detokenized prediction.

    Raises:
        ValueError: If `src_type` or `tgt_type` is not a supported scheme, or
            if `model_inference` has no prediction for the requested
            combination.
    """
    supported_types = ("newmm", "sentencepiece")
    # Validate up front: previously an unknown src_type surfaced as an
    # UnboundLocalError on `src_toks`, and an unknown tgt_type passed an
    # un-joined value straight into word_tokenize.
    if src_type not in supported_types:
        raise ValueError(
            "unsupported src_type {!r}; expected one of {}".format(src_type, supported_types)
        )
    if tgt_type not in supported_types:
        raise ValueError(
            "unsupported tgt_type {!r}; expected one of {}".format(tgt_type, supported_types)
        )

    if src_type == "newmm":
        src_toks = word_tokenize(sentence, keep_whitespace=False)
    else:
        # encode_bpe returns one space-joined BPE string per input sentence,
        # so this is a one-element list rather than a flat token list.
        src_toks = encode_bpe([sentence], lang=src)

    print("\t- src sentence tokenized: `{}`\n".format(src_toks))

    predicted_tokens = model_inference(src, tgt, src_type, tgt_type)
    if predicted_tokens is None:
        raise ValueError(
            "model_inference returned no prediction for {} -> {} with tgt_type {!r}".format(
                src, tgt, tgt_type
            )
        )

    print("\t- predicted tokens (before retokenize): `{}`".format(predicted_tokens))
    if tgt_type == "newmm":
        predicted_sentence = ' '.join(predicted_tokens)
        print("\t- predicted sentence (aftrer concatenation): `{}`".format(predicted_sentence))
    else:
        # The mock subword output marks word starts with "_": glue the pieces
        # together and turn each marker back into a space.
        predicted_sentence = ''.join(predicted_tokens).replace("_", " ")
        print("\t- predicted sentence (aftrer remobe bpe): `{}`".format(predicted_sentence))

    return word_tokenize(predicted_sentence, keep_whitespace=False)

In [43]:
# Tokenized reference translations, keyed by target language. The entry for
# the target language is passed as the first argument to `compute_bleu`.
# NOTE(review): the nesting looks like corpus -> references per sentence ->
# tokens (one sentence, one reference here) -- confirm against compute_bleu.
refs_tokens = {
    "th": [[["ฉัน", "โทร", "ไป", "ที่", "ร้าน", "ไก่", "กระสุน", "วันนี้"]]],
    "en": [[["Today", ",", "I", "call", "to", "the", "Bullet", "Chicken", "shop", "."]]],
}

# The same sentences as raw, untokenized text, keyed by language. The entry
# for the source language is the input sentence handed to `nmt`.
refs_text = {
    "th": "ฉันโทรไปที่ร้านไก่กระสุน วันนี้",
    "en": "Today, I call to the Bullet Chicken shop."
}


In [46]:
# Walk through both translation directions and every combination of source
# and target tokenization scheme, printing each intermediate step and the
# BLEU result for the mock prediction.
for lang in ["th", "en"]:
      
    src_lang = lang
    # Only two languages are involved, so the target is simply the other one.
    tgt_lang = "en" if src_lang == "th" else "th"

    print("\n\n{} → {}".format(src_lang, tgt_lang))

    for src_type in ["newmm", "sentencepiece"]:
        for tgt_type in ["newmm", "sentencepiece"]:
          
          
            print("\n    - {} → {}\n".format(src_type, tgt_type))

            # The raw reference text of the source language is the input.
            sentence = refs_text[src_lang]
            
            print("\t- src sentence: `{}`\n".format(sentence))
            predicted_tokens = nmt(sentence, src_lang, tgt_lang, src_type, tgt_type)
            print("\n\t- predicted_tokens (after retokenize): `{}`".format(predicted_tokens))
            # The prediction is wrapped in a list so it forms a one-sentence
            # corpus alongside the references for the target language.
            score = compute_bleu(refs_tokens[tgt_lang], [predicted_tokens])            
            
            
            print("\n\t- score = (bleu, precisions, bp, ratio, translation_length, reference_length) \n\t         = {}".format(score))



th → en

    - newmm → newmm

	- src sentence: `ฉันโทรไปที่ร้านไก่กระสุน วันนี้`

	- src sentence tokenized: `['ฉัน', 'โทร', 'ไป', 'ที่', 'ร้าน', 'ไก่', 'กระสุน', 'วันนี้']`

	- predicted tokens (before retokenize): `['I', 'call', 'to', 'the', 'Bullet', 'chicken', 'shop', 'today', '.']`
	- predicted sentence (aftrer concatenation): `I call to the Bullet chicken shop today .`

	- predicted_tokens (after retokenize): `['I', 'call', 'to', 'the', 'Bullet', 'chicken', 'shop', 'today', '.']`

	- score = (bleu, precisions, bp, ratio, translation_length, reference_length) 
	         = (0.43443712531357925, [0.7777777777777778, 0.5, 0.42857142857142855, 0.3333333333333333], 0.8948393168143697, 0.9, 9, 10)

    - newmm → sentencepiece

	- src sentence: `ฉันโทรไปที่ร้านไก่กระสุน วันนี้`

	- src sentence tokenized: `['ฉัน', 'โทร', 'ไป', 'ที่', 'ร้าน', 'ไก่', 'กระสุน', 'วันนี้']`

	- predicted tokens (before retokenize): `['_I', '_call', '_to', '_the', '_Bullet', '_chicken', '_shop', '_today.']`
