In [1]:
from transformer.layers import make_model
from transformer.utils import subsequent_mask, tokenize, detokenize
import torch, unicodedata
from transformer.layers import EncoderDecoder
from ctypes import Union
from torchtext.data.metrics import bleu_score
from tqdm import tqdm

In [2]:
# embedding_data = torch.load('../5. Embedding/embedding_base.h5')
# model_data = torch.load('./model_base.h5')

In [2]:
def load_model_embedding(embedding_file, model_data) -> tuple[EncoderDecoder, dict, list]:
    """Load a trained model together with its source/target token tables.

    The architecture is not stored in the arguments: it is read from the
    notebook-level globals ``SRC_VOCAB_LEN``, ``DST_VOCAB_LEN``, ``N``,
    ``D_FF``, ``HEAD``, ``DROPOUT``, ``EMBEDDING_SIZE`` and ``DEVICE``, which
    therefore have to be assigned before this function is called.

    Args:
        embedding_file: Path of the file passed to ``torch.load`` that holds
            the token lists under ``['MAD']['token']`` and ``['IND']['token']``.
        model_data: Path of the checkpoint passed to ``torch.load``; its
            ``'model_state'`` entry is loaded into the freshly built model.

    Returns:
        A ``(model, mad_tokens, ind_tokens)`` tuple: the model in eval mode,
        a dict mapping each ``MAD`` token to its position in the token list,
        and the ``IND`` tokens as a list.
    """
    device = torch.device(DEVICE)
    # NOTE(review): torch.load unpickles its input, so only point this at
    # checkpoint/embedding files that come from a trusted source.
    embedding_data = torch.load(embedding_file, map_location=device)
    # Keep the loaded checkpoint under its own name instead of rebinding the
    # `model_data` path parameter.
    checkpoint = torch.load(model_data, map_location=device)

    model = make_model(
        SRC_VOCAB_LEN,
        DST_VOCAB_LEN,
        N=N, d_ff=D_FF, h=HEAD, dropout=DROPOUT,
        d_model=EMBEDDING_SIZE,
        device=DEVICE,
    )
    model.load_state_dict(checkpoint['model_state'])
    # A single eval() after the weights are in place is sufficient.
    model.eval()

    mad_tokens = {token: index for index, token in enumerate(embedding_data['MAD']['token'])}
    ind_tokens = list(embedding_data['IND']['token'])
    return model, mad_tokens, ind_tokens

In [3]:
# NOTE: `model` is only bound by the configuration cells further down, so this
# probe raises NameError when the notebook is executed from top to bottom.
model[1]

NameError: name 'model' is not defined

In [4]:
def translate(src_teks: list[str], model_data: tuple[EncoderDecoder, dict, list], debug=False, max_len: int = 100) -> list[str]:
    """Greedy-decode a translation of ``src_teks`` with a loaded model bundle.

    Only a single output sequence is decoded: the target buffer is created
    with shape ``(1, 1)`` and only element 0 of each step's prediction is used.

    Args:
        src_teks: Source text(s), handed to ``tokenize`` as-is.
        model_data: The ``(model, mad_tokens, ind_tokens)`` tuple produced by
            ``load_model_embedding``.
        debug: Forwarded to ``tokenize`` and ``detokenize``.
        max_len: Maximum number of decoding steps (previously hard-coded
            to 100).

    Returns:
        Whatever ``detokenize`` returns for the generated id sequence.
    """
    # Ids used by the original code; 2 matches PAD_TOKEN in the notebook's
    # configuration cells, and the debug output shows sequences framed by 0
    # and 1.
    start_token, end_token, pad_token = 0, 1, 2

    model, mad_tokens, ind_tokens = model_data
    src = tokenize(src_teks, mad_tokens, wordpiece=True, debug=debug)
    src_mask = (src != pad_token).unsqueeze(-2)

    # Pure inference: no autograd graph is needed for encode/decode.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.zeros(1, 1).type_as(src).fill_(start_token)
        for _ in range(max_len):
            out = model.decode(
                memory, src_mask, ys, subsequent_mask(ys.size(1)).type_as(src.data)
            )
            prob = model.generator(out[:, -1])
            _, best = torch.max(prob, dim=1)
            next_word = best[0].item()
            ys = torch.cat(
                [ys, torch.empty(1, 1).type_as(src.data).fill_(next_word)], dim=1
            )
            # Stop as soon as the end-of-sequence id has been emitted.
            if next_word == end_token:
                break
    return detokenize(ys, ind_tokens, wordpiece=False, debug=debug)

In [7]:
# Settings for the './model_high_base.h5' checkpoint. load_model_embedding
# reads DEVICE, the vocabulary sizes, N, D_FF, HEAD, DROPOUT and
# EMBEDDING_SIZE as globals, so they are assigned before the call.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 512
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 8
DROPOUT = 0.1
LABEL_SMOOTHING = 0.1
LR_MODE = 'warmup'

SRC_VOCAB_LEN = 30994
DST_VOCAB_LEN = 14972

# `model` holds the whole (model, mad_tokens, ind_tokens) tuple.
model = load_model_embedding('../5. Embedding/embedding_base.h5', './model_high_base.h5')

In [18]:
# Settings for the './model_high_wp8k.h5' checkpoint. load_model_embedding
# reads DEVICE, the vocabulary sizes, N, D_FF, HEAD, DROPOUT and
# EMBEDDING_SIZE as globals, so they are assigned before the call.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 512
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 8
DROPOUT = 0.1
LABEL_SMOOTHING = 0.1
LR_MODE = 'warmup'

SRC_VOCAB_LEN = 8004
DST_VOCAB_LEN = 8004

# `model` holds the whole (model, mad_tokens, ind_tokens) tuple.
model = load_model_embedding('../5. Embedding/embedding_wp8k.h5', './model_high_wp8k.h5')

In [5]:
# Settings for the './model_low_base.h5' checkpoint. load_model_embedding
# reads DEVICE, the vocabulary sizes, N, D_FF, HEAD, DROPOUT and
# EMBEDDING_SIZE as globals, so they are assigned before the call.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 256
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 2
DROPOUT = 0.3
LABEL_SMOOTHING = 0.1
LR_MODE = 'decay'

SRC_VOCAB_LEN = 30994
DST_VOCAB_LEN = 14972

# `model` holds the whole (model, mad_tokens, ind_tokens) tuple.
model = load_model_embedding('../5. Embedding/embedding_base.h5', './model_low_base.h5')

In [6]:
# Settings for the './model_low_wp8k.h5' checkpoint. load_model_embedding
# reads DEVICE, the vocabulary sizes, N, D_FF, HEAD, DROPOUT and
# EMBEDDING_SIZE as globals, so they are assigned before the call.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 256
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 2
DROPOUT = 0.3
LABEL_SMOOTHING = 0.1
LR_MODE = 'decay'

SRC_VOCAB_LEN = 8004
DST_VOCAB_LEN = 8004

# `model` holds the whole (model, mad_tokens, ind_tokens) tuple.
model = load_model_embedding('../5. Embedding/embedding_wp8k.h5', './model_low_wp8k.h5')

In [19]:
# Single-word smoke test with debug output of the tokenisation steps.
translate(src_teks=["pulpèn"], model_data=model, debug=True)

src_tokenized ['p', '_u', '_l', '_p', '_èn']
src_indexed [0, 41, 27, 16, 18, 250, 1]
dst_indexed tensor([   0, 5123,    7,   33,  196,    1])
dst_tokenized ['pu', '_l', '_p', '_en']


['pulpen']

In [20]:
# One sample source sentence to translate.
tests = ["sèngko' ngèbâ pancèng ka songay"]

# Nesting is [sentence][reference][token]: one sentence with one tokenised
# reference translation.
truths = [["saya membawa pancing ke sungai".split(' ')]]

In [21]:
# Translate the sample and split every returned sentence on single spaces.
predicts = [
    sentence.split(' ')
    for sentence in translate(tests, model, debug=True)
]

# Show the hypotheses, the references and their BLEU score side by side.
(predicts, truths, bleu_score(predicts, truths))

src_tokenized ["sèngko'", 'ngèbâ', 'pancèng', 'ka', 'song', '_a', '_y']
src_indexed [0, 383, 3663, 562, 300, 2718, 9, 44, 1]
dst_indexed tensor([   0, 5380, 2051,    9, 4400,    8, 2235,    8,    1])
dst_tokenized ['saya', 'diber', '_i', 'sis', '_a', '_-sis', '_a']


([['saya', 'diberi', 'sisa-sisa']],
 [[['saya', 'membawa', 'pancing', 'ke', 'sungai']]],
 0.0)

In [13]:
# Spot check: does the second predicted token of the first sentence equal the
# second token of its first reference?
predicts[0][1] == truths[0][0][1]

False

In [22]:
# Start from empty containers; `predicts` is filled by a later cell.
tests, truths, predicts = [], [], []
import unicodedata
with open('../7. Testing/data.txt', 'r', encoding='utf8') as f:
    # Lines are consumed two at a time (source line, then reference line);
    # zip(f, f) stops as soon as either line of a pair is missing.
    for source_line, reference_line in zip(f, f):
        source_text = unicodedata.normalize('NFD', source_line).strip().lower()
        reference_text = unicodedata.normalize('NFD', reference_line).strip().lower()
        # Every 'q' in the source text is rewritten as an apostrophe.
        tests.append(source_text.replace('q', "'"))
        # One tokenised reference per sentence, wrapped in a list.
        truths.append([reference_text.split(' ')])

In [13]:
# Rebuild `predicts` from scratch instead of appending to the existing list,
# so re-running this cell cannot leave duplicate translations behind.
# Each sentence is translated on its own and split on single spaces.
predicts = [
    translate([sentence], model)[0].split(' ')
    for sentence in tqdm(tests)
]

100%|██████████| 100/100 [00:19<00:00,  5.21it/s]


In [15]:
# BLEU of all tokenised predictions against their reference lists.
bleu_score(predicts, truths)

0.0

In [21]:
# Dump one record per prediction: source, reference, hypothesis, blank line.
with open('result.txt', 'w', encoding='utf8') as report:
    for row in range(len(predicts)):
        print(tests[row], file=report)
        print(" ".join(truths[row][0]), file=report)
        print(" ".join(predicts[row]), file=report)
        print(file=report)
   

In [22]:
# Display the tokenised reference translations read from data.txt.
truths

[[['adiknya', 'ada', 'tujuh']],
 [['jauhnya', 'dari', 'sini', 'tiga', 'puluh', 'kilo']],
 [['yang', 'dinantikan', 'kamu']],
 [['putranya', 'orang', 'itu,', 'dia']],
 [['yang', 'akan', 'mengantarkan,', 'saya']],
 [['adiknya', 'pandai']],
 [['saya',
   'sangat',
   'kasihan',
   'kepada',
   'sapinya,',
   'yang',
   'sampai-sampai',
   'susah',
   'payah',
   'menariknya']],
 [['kuda', 'itu', 'masih', 'muda', 'dan', 'sangat', 'bagus']],
 [['ayah', 'saya', 'guru', 'sd', 'teladan', 'yang', 'terkenal']],
 [['ayah',
   'saya,',
   'orang',
   'yang',
   'pandai',
   'dl',
   'daerah',
   'saya,',
   'guru']],
 [['ayah', 'saya', 'guru', 'sd', 'yang', 'terkenal', 'di', 'bangkalan']],
 [['sesudah',
   'darah',
   'itu',
   'masuk',
   'ke',
   'ruang',
   'yang',
   'atas',
   'yang',
   'kiri,',
   'lalu',
   'masuk',
   'ke',
   'ruang',
   'kiri',
   'yang',
   'bawah',
   'lalu',
   'masuk',
   'lagi',
   'ke',
   'aorta',
   'terus',
   'lagi',
   'ke',
   'seluruh',
   'badan']],
 [['kam

In [23]:
# Display the normalised source sentences read from data.txt.
tests

["alè'en bâda papèttoe",
 "jhâuèpon dâri ka'dinto tello polo kilo",
 "sè èdantè' dhika",
 "pottrana orèng èngghanèka, abâ'na",
 "sè bhâkal ngater raghina sèngko'",
 "ale'èn pènter",
 "kaula sakalangkong neserra dâ' sapèna, sè kantos anjhingjhing panarèggha",
 "jhârân jârèya ghi' ngoḍâ sarta cè' bhâghussâ",
 'tang rama ghuru sd teladan sè terkenal',
 'tang rama, orèng sè pentèr è tang daerah, ghuru',
 'tang rama ghuru sd teladan sè terkenal è bangkalan',
 "saellana dârâ jârèya maso' ka pangkèng sè attas sè kèrè, lajhu maso' ka pangkèng kèrè se bâbâ lajhu maso' polè ka aorta terros polè ka sakabbhinna bhâdhân",
 "bâ'en ma' katondu bhâi, jhâghâ ra, ngaterraghi sorat arèya ka kantor pos; pas lekkas molè",
 "tang eppa' andi' huku bhâb jhâmona jhârân, jârèya cè' parlona sabab mon oreng ngobu jhârân ta' tao ka parkara jârèya, cè' sossana",
 "ya', eppa' ella ḍâteng, ngolok bâ'en",
 'kocèng ngakan tèkos matè',
 '',
