In [8]:
from transformer.layers import make_model
from transformer.utils import subsequent_mask, tokenize, detokenize
import torch, unicodedata
from transformer.layers import EncoderDecoder
from torchtext.data.metrics import bleu_score
from tqdm import tqdm

In [9]:
# embedding_data = torch.load('../5. Embedding/embedding_base.h5')
# model_data = torch.load('./model_base.h5')

In [10]:
def load_model_embedding(embedding_file, model_data) -> tuple[EncoderDecoder, dict, list]:
    """Load a trained translation model together with its vocabularies.

    The network is built with ``make_model`` from the notebook-level
    hyper-parameters ``SRC_VOCAB_LEN``, ``DST_VOCAB_LEN``, ``N``, ``D_FF``,
    ``HEAD``, ``DROPOUT``, ``EMBEDDING_SIZE`` and ``DEVICE``; these globals
    must be set to match the checkpoint before this function is called.

    Args:
        embedding_file: File (anything ``torch.load`` accepts) whose
            ``['MAD']['token']`` and ``['IND']['token']`` entries are
            sequences of vocabulary tokens.
        model_data: File (anything ``torch.load`` accepts) holding the model
            weights under the ``'model_state'`` key.

    Returns:
        A ``(model, mad_tokens, ind_tokens)`` tuple: the model in eval mode,
        a token -> index dict built from the ``'MAD'`` tokens, and the
        ``'IND'`` tokens as a list (position == index).
    """
    device = torch.device(DEVICE)
    # NOTE(review): torch.load unpickles its input; only load trusted files.
    embedding_data = torch.load(embedding_file, map_location=device)
    # Use a separate name so the `model_data` argument is not overwritten.
    checkpoint = torch.load(model_data, map_location=device)

    model = make_model(
        SRC_VOCAB_LEN,
        DST_VOCAB_LEN,
        N=N, d_ff=D_FF, h=HEAD, dropout=DROPOUT,
        d_model=EMBEDDING_SIZE,
        device=DEVICE,
    )
    model.load_state_dict(checkpoint['model_state'])
    # A single eval() after loading the weights is enough to disable dropout.
    model.eval()

    mad_tokens = {token: index for index, token in enumerate(embedding_data['MAD']['token'])}
    ind_tokens = list(embedding_data['IND']['token'])
    return model, mad_tokens, ind_tokens

In [11]:
def translate(
    src_teks: list[str],
    model_data: tuple[EncoderDecoder, dict, list],
    debug=False,
    max_len=100,
) -> list[str]:
    """Greedily decode a translation for ``src_teks``.

    The target sequence is seeded as a single 1x1 row and only element 0 of
    each step's arg-max is used, so this effectively decodes one sentence per
    call.

    Args:
        src_teks: Source text(s), passed straight to ``tokenize``.
        model_data: ``(model, mad_tokens, ind_tokens)`` as returned by
            ``load_model_embedding``.
        debug: Forwarded to ``tokenize`` and ``detokenize``.
        max_len: Maximum number of decoding steps (default 100, the value
            that used to be hard-coded).

    Returns:
        Whatever ``detokenize`` produces for the decoded ids (annotated as a
        list of strings).
    """
    model, mad_tokens, ind_tokens = model_data
    # Token ids used by this notebook: 0 starts a sequence, 1 ends it, 2 pads.
    start_id, end_id, pad_id = 0, 1, 2

    src = tokenize(src_teks, mad_tokens, wordpiece=True, debug=debug)
    src_mask = (src != pad_id).unsqueeze(-2)

    # Inference only: skip autograd bookkeeping for the encoder and every
    # decoder step. The predicted ids are unchanged.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.empty(1, 1).type_as(src).fill_(start_id)
        for _step in range(max_len):
            tgt_mask = subsequent_mask(ys.size(1)).type_as(src)
            out = model.decode(memory, src_mask, ys, tgt_mask)
            # Score only the newest position and take the most likely token.
            prob = model.generator(out[:, -1])
            _, best = torch.max(prob, dim=1)
            next_word = best[0].item()
            ys = torch.cat(
                [ys, torch.empty(1, 1).type_as(src).fill_(next_word)], dim=1
            )
            if next_word == end_id:
                break

    # NOTE(review): input is tokenized with wordpiece=True but output is
    # detokenized with wordpiece=False; confirm this is intended for every
    # checkpoint loaded in this notebook.
    return detokenize(ys, ind_tokens, wordpiece=False, debug=debug)

In [12]:
# Configuration for the 'high' / 'base' checkpoint.
# load_model_embedding reads EMBEDDING_SIZE, DEVICE, N, D_FF, HEAD, DROPOUT and
# the two vocabulary sizes as globals, so they must match the checkpoint file.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 512
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 8
DROPOUT = 0.1
LABEL_SMOOTHING = 0.1
LR_MODE = 'warmup'

SRC_VOCAB_LEN = 30994
DST_VOCAB_LEN = 14972

# Despite its name, `model` is the (model, mad_tokens, ind_tokens) tuple that
# translate() takes as its `model_data` argument.
model = load_model_embedding('../5. Embedding/embedding_base.h5', './model_high_base.h5')

In [4]:
# Configuration for the 'high' / 'wp8k' checkpoint.
# Running this cell rebinds the hyper-parameter globals and `model`, replacing
# whichever configuration was loaded before.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 512
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 8
DROPOUT = 0.1
LABEL_SMOOTHING = 0.1
LR_MODE = 'warmup'

SRC_VOCAB_LEN = 8004
DST_VOCAB_LEN = 8004

# `model` is the (model, mad_tokens, ind_tokens) tuple consumed by translate().
model = load_model_embedding('../5. Embedding/embedding_wp8k.h5', './model_high_wp8k.h5')

In [36]:
# Configuration for the 'low' / 'base' checkpoint.
# Running this cell rebinds the hyper-parameter globals and `model`, replacing
# whichever configuration was loaded before.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 256
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 2
DROPOUT = 0.3
LABEL_SMOOTHING = 0.1
LR_MODE = 'decay'

SRC_VOCAB_LEN = 30994
DST_VOCAB_LEN = 14972

# `model` is the (model, mad_tokens, ind_tokens) tuple consumed by translate().
model = load_model_embedding('../5. Embedding/embedding_base.h5', './model_low_base.h5')

In [44]:
# Configuration for the 'low' / 'wp8k' checkpoint.
# Running this cell rebinds the hyper-parameter globals and `model`, replacing
# whichever configuration was loaded before.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 256
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 2
DROPOUT = 0.3
LABEL_SMOOTHING = 0.1
LR_MODE = 'decay'

SRC_VOCAB_LEN = 8004
DST_VOCAB_LEN = 8004

# `model` is the (model, mad_tokens, ind_tokens) tuple consumed by translate().
model = load_model_embedding('../5. Embedding/embedding_wp8k.h5', './model_low_wp8k.h5')

In [13]:
# Smoke test: translate a single word with debug output enabled.
translate(["pulpèn"], model_data=model, debug=True)

src_tokenized ['pulpèn']
src_indexed [0, 5082, 1]
dst_indexed tensor([    0, 14930,     1])
dst_tokenized ['pulpen']


['pulpen']

In [28]:
# One source sentence for a quick end-to-end check.
tests = ["sèngko' ngèbâ pancèng ka songay"]
# For each source sentence: a list of reference translations, each a token list.
truths = [["saya membawa pancing ke sungai".split(' ')]]

In [19]:
# Split every translated sentence into tokens so it can be scored with bleu_score.
predicts = []
for translated in translate(tests, model, debug=True):
    predicts.append(translated.split(' '))

# Show candidates, references and the corpus BLEU side by side.
predicts, truths, bleu_score(predicts, truths)

src_tokenized ["sèngko'", 'ngèbâ', 'pancèng', 'ka', 'songay']
src_indexed [0, 19, 3138, 19270, 64, 1367, 1]
src_tokenized ['sa', 'arè', "pa'", 'ama', 'mancèng', 'è', 'songay']
src_indexed [0, 20095, 656, 2486, 1146, 4128, 100, 1367, 1]
src_tokenized ['ban', 'orèng', "ngala'", 'be', 'è', 'songay']
src_indexed [0, 8947, 78, 717, 2548, 100, 1367, 1]
src_tokenized ['buwâ', 'salak', 'jârèya', 'sèpa', "ènga'", 'be']
src_indexed [0, 1439, 4033, 79, 27609, 782, 2548, 1]
src_tokenized ['tongket', 'jârèya', 'toghel']
src_indexed [0, 8654, 79, 24291, 1]
src_tokenized ['amin', 'ng', 'noghel', 'tongket']
src_indexed [0, 1295, 5439, 30340, 8654, 1]
src_tokenized ['sarè', 'ale']
src_indexed [0, 6670, 17495, 1]
src_tokenized ['èbhu', 'nyarè', 'tang', "alè'"]
src_indexed [0, 322, 376, 65, 1015, 1]
src_tokenized ['kala', 'bârâmpa', "bâ'na"]
src_indexed [0, 4212, 824, 50, 1]
src_tokenized ['sè', 'bhâghus', 'ngala', 'bhâi']
src_indexed [0, 62, 1759, 3882, 291, 1]
src_tokenized ["

AttributeError: 'str' object has no attribute 'decode'

In [13]:
# Spot-check: does the second reference token equal the second predicted token?
truths[0][0][1] == predicts[0][1]

False

In [14]:
# Rebuild the evaluation set from scratch each time this cell runs.
tests, truths, predicts = [], [], []
with open('../7. Testing/kamus.txt', 'r', encoding='utf8') as f:
    # The file is read two lines at a time: a source sentence followed by its
    # reference translation. zip(f, f) takes consecutive line pairs and stops at
    # end of file, dropping a trailing unpaired line.
    for source_line, reference_line in zip(f, f):
        # Source side: NFKD-normalise, trim, lowercase, and map 'q' to an apostrophe.
        tests.append(unicodedata.normalize('NFKD', source_line).strip().lower().replace('q', "'"))
        # Reference side: same normalisation, then tokenise on single spaces.
        # Each entry is wrapped in a list, giving one reference per source sentence.
        truths.append([unicodedata.normalize('NFKD', reference_line).strip().lower().split(' ')])

In [15]:
# Start from an empty list so that re-running this cell (for example after an
# interrupt) does not append a second batch and misalign `predicts` with `truths`.
predicts = []
# Each sentence is passed to translate() as a single-item list.
for sentence in tqdm(tests):
    predicts.append(translate([sentence], model)[0].split(' '))

  0%|          | 241/50084 [00:24<1:25:36,  9.70it/s]


KeyboardInterrupt: 

In [7]:
# Report corpus BLEU for maximum n-gram orders 1 to 4, with uniform weights.
for max_order in (1, 2, 3, 4):
    uniform_weights = [1/max_order]*max_order
    score = bleu_score(predicts, truths, max_n=max_order, weights=uniform_weights)
    print('BLEU w/ n-gram', max_order, ":", score)

BLEU w/ n-gram 1 : 0.1887523101732215
BLEU w/ n-gram 2 : 0.10094028285978175
BLEU w/ n-gram 3 : 0.0496115126037617
BLEU w/ n-gram 4 : 0.02108332851610731


In [32]:
# Write one Question / Truth / Answer record per prediction, separated by a blank line.
with open('result.txt', 'w', encoding='utf8') as report:
    for idx, predicted_tokens in enumerate(predicts):
        report.write(f'Question: {tests[idx]}\n')
        report.write(f'Truth   : {" ".join(truths[idx][0])}\n')
        report.write(f'Answer  : {" ".join(predicted_tokens)}\n')
        report.write('\n')
   

In [33]:
# Inspect the reference translations: one single-reference list of tokens per test sentence.
truths

[[['saya', 'membawa', 'pancing', 'ke', 'sungai']],
 [['tiap', 'hari', 'pak', 'amat', 'memancing', 'di', 'sungai']],
 [['banyak', 'orang', 'mengambil', 'pasir', 'di', 'sungai']],
 [['buah', 'salak', 'itu', 'sifatnya', 'seperti', 'pasir']],
 [['tongkat', 'itu', 'putus']],
 [['amin', 'dapat', 'memutus', 'tongkat']],
 [['cari', 'adiknya!']],
 [['ibu', 'mencari', 'adik', 'saya']],
 [['kalah', 'berapa', 'kamu?']],
 [['yang', 'baik', 'mengalah', 'saja']],
 [['saya', 'mempunyai', 'seorang', 'abdi/pembantu']],
 [['kamu', 'harus', 'mengabdi', 'kepada', 'negara']],
 [['udeng', 'itu', 'baru']],
 [['pak', 'ali', 'memakai', 'udeng']],
 [['telur', 'itu', 'masak']],
 [['tiap', 'hari', 'ayam', 'saya', 'bertelur']],
 [['roti', 'itu', 'dimakan', 'oleh', 'adik', 'saya']],
 [['air', 'itu', 'diminum', 'oleh', 'kuda']],
 [['surat', 'saya', 'dibaca', 'oleh', 'anak', 'saya']],
 [['cincin', 'yang', 'hilang', 'dicari', 'oleh', 'kakak', 'saya']],
 [['pohon', 'itu', 'ditarik', 'sampai', 'roboh']],
 [['bawa', 'buku

In [34]:
# Inspect the normalised source sentences that are passed to translate().
tests

["sèngko' ngèbâ pancèng ka songay",
 "sabhhan arè pa' amat mancèng è songay",
 "banya' orèng ngala' beddhi è songay",
 "buwâ salak jârèya sèpaddha ènga' beddhi",
 'tongket jârèya toghel',
 'amin ngannèng noghel tongket',
 "sarè ale'èn!",
 "èbhu nyarè tang alè'",
 "kala bârâmpa bâ'na?",
 'sè bhâghus ngala bhâi',
 "sèngko' andi' abdhi kasorang",
 "bâ'na kodhu ngabdhi dâ' naghârâ",
 'odheng jârèya anyar',
 "pa' ali aodheng",
 "tellor jârèya massa'",
 'sabbhân arè tang ajâm atellor',
 "roti arowa èkakan bi' tang alè'",
 "aèng jârèya èènom bi' jhârân",
 "tang sorat ebaca bi' tang ana'",
 "sello' sè èlang èsare bi' tang kaka'",
 "bhungka jârèya ètajha' sampè rohghu",
 'kèbâ buku arèya kabengko',
 'kembhâng arèya bârnana mèra',
 'bârna kaèn arèya tamèra ghâllu',
 'è pèrèng bâdâ ghângan',
 'arèya kaghângan',
 'odheng arèya larang',
 'kaèn arèya kaodheng',
 'arèya pèssè ringgit',
 'arèya arghâna saringgit',
 'or