In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from dataloader import get_dataloaders
import nltk
from transformer import Transformer,TransformerEncoder,TransformerDecoder
import utils
import pickle
from tabulate import tabulate
# Fetch the NLTK 'punkt' resource (a no-op when it is already up to date).
nltk.download('punkt')

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/wicaksonolxn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# --- Paths and data ------------------------------------------------------
DATA_PATH = "dataset/"
SAVE_DIR = "saved"
BATCH_SIZE = 32

# Only the third object returned by get_dataloaders is kept (used below as the
# test split); the first two are discarded because this notebook only evaluates.
_, _, test_loader = get_dataloaders(
    data_path=DATA_PATH,
    source_lang="min",
    target_lang="eng",
    batch_size=BATCH_SIZE,
    device=device,
)

# --- Model hyper-parameters ----------------------------------------------
# load_state_dict is strict by default, so these values have to reproduce the
# parameter shapes stored in the checkpoint that is loaded below.
SRC_VOCAB_SIZE = 4000
TGT_VOCAB_SIZE = 4000
N_LAYERS = 1
N_HEADS = 1
D_MODEL = 128
FFN_HIDDEN = D_MODEL * 4
DROPOUT = 0.3

encoder = TransformerEncoder(SRC_VOCAB_SIZE, D_MODEL, N_LAYERS, N_HEADS, FFN_HIDDEN, DROPOUT, device)
decoder = TransformerDecoder(TGT_VOCAB_SIZE, D_MODEL, N_LAYERS, N_HEADS, FFN_HIDDEN, DROPOUT, device)
best_model = Transformer(encoder, decoder, device, utils.PAD_TOKEN).to(device)

# --- Checkpoint ----------------------------------------------------------
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a CUDA machine still loads when `device` is the CPU.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
checkpoint_path = os.path.join(SAVE_DIR, "best_dictbase.pt")
best_model.load_state_dict(torch.load(checkpoint_path, map_location=device))

# This notebook only runs inference: switch off dropout right away so later
# cells behave deterministically regardless of which one is executed first.
best_model.eval()

# Loss used for evaluation; positions whose target is PAD are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD_TOKEN)
print("Model initialized on:", device)
print("Loaded best model for testing!")


TrainData - Max 'min' sentence length: 76
TrainData - Max 'eng' sentence length: 107
TestData - Max 'min' sentence length: 61
TestData - Max 'eng' sentence length: 75
ValidData - Max 'min' sentence length: 65
ValidData - Max 'eng' sentence length: 85
Number of examples in train_dataset,train origin,train_raw: 799 799 799
Number of examples in valid_dataset: 100
Number of examples in test_dataset: 100
Model initialized on: cuda
Loaded best model for testing!


In [3]:
# Location of the pickled source/target dictionaries: <pth>/<src>_<tgt>/.
pth, src, tgt = "dataset", "min", "eng"
tp = os.path.join(pth, f"{src}_{tgt}")

input_dic_path = os.path.join(tp, "input_dic.pkl")
output_dic_path = os.path.join(tp, "output_dic.pkl")

# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# these files must come from a trusted source.
with open(input_dic_path, "rb") as f:
    input_lang_dic = pickle.load(f)

with open(output_dic_path, "rb") as f:
    output_lang_dic = pickle.load(f)
def evaluate_test(model, test_dataset):
    """Return the mean per-sample loss and mean BLEU of `model` on `test_dataset`.

    Each sample is scored on its own (batch size 1). The model is fed the
    source sequence and the gold target without its last token, and the loss
    is computed against the gold target without its first token. The BLEU
    hypothesis is the arg-max of those same teacher-forced outputs, so the
    reported BLEU is NOT a free-running (greedy/beam) decoding score.

    Relies on the notebook globals `device`, `criterion` and
    `output_lang_dic`, and on `utils.detokenize` / `utils.get_bleu`.

    Args:
        model: callable as ``model(src, tgt_in)`` returning a 2-tuple whose
            first item is indexed as ``[1, seq_len-1, vocab_size]``; must
            provide ``eval()``.
        test_dataset: object supporting ``len()`` and integer indexing, where
            each item is a mapping with ``"src"`` and ``"tgt"`` token-id
            sequences (tensors or lists).

    Returns:
        tuple[float, float]: ``(avg_loss, avg_bleu)``. Both are ``nan`` when
        the dataset is empty, instead of raising ``ZeroDivisionError``.
    """
    model.eval()

    num_samples = len(test_dataset)
    if num_samples == 0:
        # Nothing to average over; report "undefined" rather than crashing.
        return float("nan"), float("nan")

    total_loss = 0.0
    all_bleu = []
    with torch.no_grad():
        for i in range(num_samples):
            sample = test_dataset[i]
            src_token_ids = sample["src"]
            tgt_token_ids = sample["tgt"]
            # Normalise to plain lists: detokenize below receives the list
            # form, and the tensors are rebuilt on the evaluation device.
            if torch.is_tensor(src_token_ids):
                src_token_ids = src_token_ids.tolist()
            if torch.is_tensor(tgt_token_ids):
                tgt_token_ids = tgt_token_ids.tolist()

            # Add a leading batch dimension of size 1.
            src_tensor = torch.tensor(src_token_ids, dtype=torch.long, device=device).unsqueeze(0)
            tgt_tensor = torch.tensor(tgt_token_ids, dtype=torch.long, device=device).unsqueeze(0)

            # Teacher forcing: decoder input drops the last target token.
            output, _ = model(src_tensor, tgt_tensor[:, :-1])  # [1, seq_len-1, vocab_size]
            vocab_size = output.shape[-1]

            # reshape (unlike view) also works if the output is non-contiguous.
            output_2d = output.reshape(-1, vocab_size)   # [seq_len-1, vocab_size]
            tgt_2d = tgt_tensor[:, 1:].reshape(-1)       # [seq_len-1], shifted by one
            loss = criterion(output_2d, tgt_2d)
            total_loss += loss.item()

            ref_text = utils.detokenize(tgt_token_ids, output_lang_dic)
            pred_ids = output[0].argmax(dim=-1).tolist()  # most likely id per position
            hyp_text = utils.detokenize(pred_ids, output_lang_dic)
            bleu_score = utils.get_bleu(hyp_text.split(), ref_text.split())
            all_bleu.append(bleu_score)

    avg_loss = total_loss / num_samples
    avg_bleu = sum(all_bleu) / len(all_bleu)
    return avg_loss, avg_bleu
# Score the loaded checkpoint on the test split and report both metrics.
test_metrics = evaluate_test(best_model, test_loader)
test_loss, test_bleu = test_metrics
print(f"Test Loss = {test_loss:.4f} | BLEU = {test_bleu:.2f}")

Test Loss = 6.2575 | BLEU = 2.75


In [4]:
import os
import pickle
import torch
from utils import tokenize,detokenize
import torch
from translation import translate_sentence,translate_sentence_beam

# Translate the first few test samples and print them next to the reference.
num_samples_to_translate = 20

# Clamp to the dataset size so a small test set does not raise IndexError.
num_available = min(num_samples_to_translate, len(test_loader))

# Make sure dropout is disabled even if the evaluation cell was skipped.
best_model.eval()

for i in range(num_available):
    sample = test_loader[i]
    src_token_ids = sample["src"]
    tgt_token_ids = sample["tgt"]
    # The helpers below are given plain Python lists of token ids.
    if torch.is_tensor(src_token_ids):
        src_token_ids = src_token_ids.tolist()
    if torch.is_tensor(tgt_token_ids):
        tgt_token_ids = tgt_token_ids.tolist()

    src_text = utils.detokenize(src_token_ids, input_lang_dic)
    real_target_text = utils.detokenize(tgt_token_ids, output_lang_dic)

    # translate_sentence returns the decoded text and the predicted token ids.
    predicted_translation, predicted_tokens = translate_sentence(
        token_ids=src_token_ids,
        input_dic=input_lang_dic,
        output_dic=output_lang_dic,
        model=best_model,
        device=device,
        max_len=utils.MAX_SENT_LEN,
    )

    print("_________________________________________________")
    print(f"|                 SRC NO.{i+1}                  |")
    print("_________________________________________________")
    print(f"Source: {src_text}")
    print(f"Predicted Token : {predicted_tokens}")
    print(f"Predicted Translation: {predicted_translation}")
    print(f"Real Target: {real_target_text}\n")
    print("_______________________________________________")

_________________________________________________
|                 SRC NO.1                  |
_________________________________________________
Source: kangkuangnyo lumayan tapi kapitiang saus UNK mangecewaan kami diagiah kapitiang yang UNK UNK kami ndak makan kapitiang dan dibaliakan .
Predicted Token : [1, 22, 250, 30, 56, 37, 28, 2]
Predicted Translation: the food is also great .
Real Target: the water spinach was alright but the crab with padang sauce was disappointing . we were given a UNK crab . in the end we decided not to eat the crab and UNK it .

_______________________________________________
_________________________________________________
|                 SRC NO.2                  |
_________________________________________________
Source: UNK tarimo bantuan sosial sagadang rp miliar
Predicted Token : [1, 22, 209, 181, 367, 28, 2]
Predicted Translation: the same as well .
Real Target: UNK UNK a total billion rupiah from government UNK in

______________________________

In [5]:
# Show the first test pair as detokenized text (sanity check of the data).
num_preview_samples = 1

# Bound the range directly instead of walking the whole dataset and skipping
# every index but the first; an empty dataset simply prints nothing.
for i in range(min(num_preview_samples, len(test_loader))):
    sample = test_loader[i]
    src_token_ids = sample["src"]
    tgt_token_ids = sample["tgt"]
    # Same tensor-or-list handling as the evaluation and translation cells.
    if torch.is_tensor(src_token_ids):
        src_token_ids = src_token_ids.tolist()
    if torch.is_tensor(tgt_token_ids):
        tgt_token_ids = tgt_token_ids.tolist()
    src_text = utils.detokenize(src_token_ids, input_lang_dic)
    tgt_text = utils.detokenize(tgt_token_ids, output_lang_dic)
    print(f"src: {src_text}\ntgt: {tgt_text}\n")

src: kangkuangnyo lumayan tapi kapitiang saus UNK mangecewaan kami diagiah kapitiang yang UNK UNK kami ndak makan kapitiang dan dibaliakan .
tgt: the water spinach was alright but the crab with padang sauce was disappointing . we were given a UNK crab . in the end we decided not to eat the crab and UNK it .



In [6]:
# Peek at the first ten entries of the target-side index -> word table.
print("Index to word mapping (first 10):")
index_to_word = output_lang_dic.index2word
for i in range(10):
    word = index_to_word[i]
    print(i, word)

# NOTE(review): the listing above reads output_lang_dic, while this size is
# taken from input_lang_dic -- confirm which dictionary was intended.
input_vocab_count = len(input_lang_dic.word2index)
print("Dictionary size:", input_vocab_count)

Index to word mapping (first 10):
0 PAD
1 SOS
2 EOS
3 UNK
4 enjoy
5 instalment
6 for
7 up
8 to
9 months
Dictionary size: 3800


In [7]:
# List the entries stored at indices 0-3 of the source-side dictionary.
print("Special tokens in the dictionary:")
source_index_to_word = input_lang_dic.index2word
for idx in (0, 1, 2, 3):
    token = source_index_to_word[idx]
    print(idx, token)

Special tokens in the dictionary:
0 PAD
1 SOS
2 EOS
3 UNK
