In [2]:
from transformer.layers import make_model
from transformer.utils import subsequent_mask, tokenize, detokenize, split_tokens
from transformer.layers import EncoderDecoder
from torchtext.data.metrics import bleu_score
import torch, unicodedata
from tqdm import tqdm

In [3]:
# embedding_data = torch.load('../5. Embedding/embedding_base.h5')
# model_data = torch.load('./model_base.h5')

In [4]:
def load_model_embedding(embedding_file, model_data) -> tuple[EncoderDecoder, dict, list]:
    """Build the translation model and load its weights and vocabularies.

    The architecture hyper-parameters (SRC_VOCAB_LEN, DST_VOCAB_LEN, N, D_FF,
    HEAD, DROPOUT, EMBEDDING_SIZE) and DEVICE are read from notebook-level
    globals at call time, so they must already be set to values matching the
    checkpoint being loaded.

    Args:
        embedding_file: Path of the saved embedding data. The loaded object
            must expose ``['MAD']['token']`` and ``['IND']['token']``.
        model_data: Path of the saved checkpoint. The loaded object must
            expose the model's state dict under ``'model_state'``.

    Returns:
        ``(model, mad_tokens, ind_tokens)`` where ``model`` is in eval mode,
        ``mad_tokens`` maps each MAD token to its position in the token list,
        and ``ind_tokens`` is the IND token list (position -> token).
    """
    device = torch.device(DEVICE)
    # torch.load unpickles its input; only point this at trusted local files.
    embedding_data = torch.load(embedding_file, map_location=device)
    # Keep the loaded checkpoint under its own name instead of rebinding the
    # `model_data` path parameter.
    checkpoint = torch.load(model_data, map_location=device)
    model = make_model(
        SRC_VOCAB_LEN,
        DST_VOCAB_LEN,
        N=N, d_ff=D_FF, h=HEAD, dropout=DROPOUT,
        d_model=EMBEDDING_SIZE,
        device=DEVICE,
    )
    model.load_state_dict(checkpoint['model_state'])
    # A single eval() after the weights are loaded is sufficient.
    model.eval()
    mad_tokens = {token: index for index, token in enumerate(embedding_data['MAD']['token'])}
    ind_tokens = list(embedding_data['IND']['token'])
    return model, mad_tokens, ind_tokens

In [5]:
def translate(
    src_teks: list[str],
    model_data: tuple[EncoderDecoder, dict, list],
    debug=False,
    max_len: int = 100,
) -> list[str]:
    """Translate source text by greedy decoding.

    Args:
        src_teks: Source sentences, passed to ``tokenize``. The decoder state
            is created with batch size 1 and only the first row's prediction
            is read, so pass a single-sentence list.
        model_data: The ``(model, mad_tokens, ind_tokens)`` tuple, e.g. as
            returned by ``load_model_embedding``.
        debug: Forwarded to ``tokenize`` and ``detokenize``.
        max_len: Maximum number of decoding steps (previously hard-coded to
            100, which remains the default).

    Returns:
        The result of ``detokenize`` on the generated id sequence.
    """
    # Special token ids used by this notebook's vocabularies: decoding starts
    # from id 0, stops after emitting id 1, and id 2 is masked out of the
    # source (it matches the notebook's PAD_TOKEN value).
    bos_token, eos_token, pad_token = 0, 1, 2

    model, mad_tokens, ind_tokens = model_data
    src = tokenize(src_teks, mad_tokens, wordpiece=True, debug=debug)
    src_mask = (src != pad_token).unsqueeze(-2)

    # Inference only: skip autograd bookkeeping for the encoder pass and for
    # every decoding step.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.full((1, 1), bos_token, dtype=src.dtype, device=src.device)
        for _ in range(max_len):
            tgt_mask = subsequent_mask(ys.size(1)).type_as(src)
            out = model.decode(memory, src_mask, ys, tgt_mask)
            # Score only the last position and pick the arg-max token.
            prob = model.generator(out[:, -1])
            _, best = torch.max(prob, dim=1)
            next_word = best[0].item()
            ys = torch.cat(
                [ys, torch.full((1, 1), next_word, dtype=src.dtype, device=src.device)],
                dim=1,
            )
            # The end token is appended before stopping, as before.
            if next_word == eos_token:
                break
    return detokenize(ys, ind_tokens, debug=debug)

In [15]:
# The test file alternates lines: a source sentence, then its reference
# translation. Iterating the same file handle twice inside zip() consumes the
# lines in pairs; a trailing unpaired line is dropped.
tests, truths = [], []
with open('../7. Testing/data.txt', 'r', encoding='utf8') as f:
    for t, t2 in zip(f, f):
        # Source side: Unicode-normalise, trim, lowercase, and turn every 'q'
        # into an apostrophe.
        normalised = unicodedata.normalize('NFKD', t)
        tests.append(normalised.strip().lower().replace('q', "'"))
        # Each sentence gets a one-element list of reference token lists,
        # the nesting later passed to bleu_score.
        truths.append([split_tokens(t2)])

In [16]:
def test_model(model, tests, truths, output):
    """Translate every test sentence, score the result with BLEU, and write a report.

    Args:
        model: The value passed straight through to ``translate`` as its
            ``model_data`` argument (the tuple from ``load_model_embedding``).
        tests: Source sentences, one string each.
        truths: For each sentence, a list of reference token lists;
            ``truths[i][0]`` is printed as the reference in the report.
        output: Path of the UTF-8 text report to (over)write.

    The report starts with BLEU scores for max n-gram orders 1 to 4 (uniform
    weights, scaled by 100), followed by one Question/Truth/Answer entry per
    sentence.
    """
    predicts = [split_tokens(translate([test], model)[0]) for test in tqdm(tests)]
    with open(output, 'w', encoding='utf8') as f:
        for i in range(1, 5):
            score = bleu_score(predicts, truths, max_n=i, weights=[1/i]*i) * 100
            # Single-line f-string: the previous backslash continuations
            # leaked the source indentation into the report as runs of spaces.
            f.write(f'BLEU w/ n-gram {i} : {score}\n')
        f.write('\n')
        for test, truth, predict in zip(tests, truths, predicts):
            f.write(f'Question: {test}\n')
            f.write(f'Truth   : {" ".join(truth[0])}\n')
            f.write(f'Answer  : {" ".join(predict)}\n')
            f.write('\n')

In [17]:
# Settings for the `model_high_base.h5` checkpoint with `embedding_base.h5`.
# load_model_embedding reads these globals, so they must be set before it runs.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 512
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 8
DROPOUT = 0.1
LABEL_SMOOTHING = 0.1
LR_MODE = 'warmup'

SRC_VOCAB_LEN = 30994
DST_VOCAB_LEN = 14972

# `model` is the (model, mad_tokens, ind_tokens) tuple, not a bare module.
model = load_model_embedding(
    '../5. Embedding/embedding_base.h5',
    './model_high_base.h5',
)
test_model(model, tests, truths, './result_high_base.txt')

100%|██████████| 284/284 [00:32<00:00,  8.73it/s]


In [37]:
# Settings for the `model_high_wp8k.h5` checkpoint with `embedding_wp8k.h5`.
# load_model_embedding reads these globals, so they must be set before it runs.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 512
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 8
DROPOUT = 0.1
LABEL_SMOOTHING = 0.1
LR_MODE = 'warmup'

SRC_VOCAB_LEN = 8004
DST_VOCAB_LEN = 8004

# `model` is the (model, mad_tokens, ind_tokens) tuple, not a bare module.
model = load_model_embedding(
    '../5. Embedding/embedding_wp8k.h5',
    './model_high_wp8k.h5',
)
# Evaluation is left disabled for this configuration:
# test_model(model, tests, truths, './result_high_wp8k.txt')

In [38]:
# Settings for the `model_low_base.h5` checkpoint with `embedding_base.h5`.
# load_model_embedding reads these globals, so they must be set before it runs.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 256
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 2
DROPOUT = 0.3
LABEL_SMOOTHING = 0.1
LR_MODE = 'decay'

SRC_VOCAB_LEN = 30994
DST_VOCAB_LEN = 14972

# `model` is the (model, mad_tokens, ind_tokens) tuple, not a bare module.
model = load_model_embedding(
    '../5. Embedding/embedding_base.h5',
    './model_low_base.h5',
)
test_model(model, tests, truths, './result_low_base.txt')

100%|██████████| 285/285 [00:45<00:00,  6.20it/s]


In [39]:
# Settings for the `model_low_wp8k.h5` checkpoint with `embedding_wp8k.h5`.
# load_model_embedding reads these globals, so they must be set before it runs.
BATCH_SIZE = 200
EPOCH = 500
EMBEDDING_SIZE = 256
PAD_TOKEN = 2
DEVICE = 'cpu'

N = 6
D_FF = 2048
HEAD = 2
DROPOUT = 0.3
LABEL_SMOOTHING = 0.1
LR_MODE = 'decay'

SRC_VOCAB_LEN = 8004
DST_VOCAB_LEN = 8004

# `model` is the (model, mad_tokens, ind_tokens) tuple, not a bare module.
model = load_model_embedding(
    '../5. Embedding/embedding_wp8k.h5',
    './model_low_wp8k.h5',
)
test_model(model, tests, truths, './result_low_wp8k.txt')

100%|██████████| 285/285 [00:03<00:00, 77.39it/s]


In [20]:
# Spot-check a single sentence; debug=True is forwarded to tokenize/detokenize.
# NOTE(review): `model` here is whichever tuple the most recently executed
# configuration cell loaded, so the result depends on cell execution order.
translate(["dâpa' è dissa' tèdung sèngko'"], model, debug=True)

src_tokenized ['[UNK]', 'è', '[UNK]', 'tèdung', "sèngko'"]
src_indexed [0, 3, 100, 3, 10752, 19, 1]
dst_indexed tensor([ 0, 16,  1])
dst_tokenized ['saya']


['saya']

In [18]:
# Vocabulary coverage of the test sentences: which words are present in the
# source token table (model[1]) and how many sentences contain a missing word.
known_words, unknown_words = set(), set()
words_len = []
count_unk = count_tot = count_cov = 0
for test in tests:
    words = split_tokens(test)
    words_len.append(len(words))
    has_unk = False
    for word in words:
        count_tot += 1
        # Membership is tested on the UTF-8 encoded form of the word.
        if word.encode('utf-8') not in model[1]:
            unknown_words.add(word)
            count_unk += 1
            has_unk = True
        else:
            known_words.add(word)
    # count_cov counts sentences that contain at least one unknown word.
    count_cov += int(has_unk)
print(f'Total known words: {len(known_words)}')
print(f'Total unknown words: {len(unknown_words)}')
print(f'Total words: {len(known_words) + len(unknown_words)}')
print(f'Average words per sentence: {sum(words_len)/len(words_len)}, Max words: {max(words_len)}, Min words: {min(words_len)}')
print(f'Percentage of unknown words: {count_unk/count_tot*100}%, {count_unk} of {count_tot} words')
print(f'Percentage of sentences with unknown words: {count_cov/len(tests)*100}%, {count_cov} of {len(tests)} sentences')

Total known words: 486
Total unknown words: 393
Total words: 879
Average words per sentence: 5.816901408450704, Max words: 28, Min words: 1
Percentage of unknown words: 27.72397094430993%, 458 of 1652 words
Percentage of sentences with unknown words: 80.98591549295774%, 230 of 284 sentences


In [11]:
# Fraction of distinct test words that are in the vocabulary. The literals
# match the "Total known words" (486) and "Total words" (879) figures printed
# by the unknown-word statistics cell.
486/879

0.552901023890785

In [None]:
# NOTE(review): if `model` still holds the value returned by
# load_model_embedding, this pickles the whole (model, mad_tokens, ind_tokens)
# tuple rather than a state dict — confirm that is intended.
torch.save(model, './model.h5')