In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from dataloader import get_dataloaders
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformer import Transformer,TransformerEncoder,TransformerDecoder
import utils
nltk.download('punkt')  
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/wicaksonolxn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
BATCH_SIZE = 64
DATA_PATH = "dataset/"  
train_loader, val_loader, test_loader = get_dataloaders(
    data_path=DATA_PATH, 
    source_lang="min", 
    target_lang="eng", 
    batch_size=BATCH_SIZE, 
    device=device
)


In [3]:
for i, batch in enumerate(train_loader):
    if i < 1:
        print("src_batch type:", type(batch['src']))  
        print("tgt_batch type:", type(batch['tgt'])) 
        print("src_batch shape:", batch['src'].shape)  
        print("tgt_batch shape:", batch['tgt'].shape)  
    else:
        break


src_batch type: <class 'torch.Tensor'>
tgt_batch type: <class 'torch.Tensor'>
src_batch shape: torch.Size([32, 52])
tgt_batch shape: torch.Size([32, 52])


  src_batch = [torch.tensor(item['src'], dtype=torch.long) for item in batch]
  tgt_batch = [torch.tensor(item['tgt'], dtype=torch.long) for item in batch]


In [4]:
SRC_VOCAB_SIZE = 3000  
TGT_VOCAB_SIZE = 3000  
DROPOUT = 0.3         
N_LAYERS = 2          
N_HEADS = 2          
FFN_HIDDEN = 256      
D_MODEL = 128        

encoder = TransformerEncoder(SRC_VOCAB_SIZE,D_MODEL,N_LAYERS,N_HEADS,FFN_HIDDEN,DROPOUT,device)
decoder = TransformerDecoder(TGT_VOCAB_SIZE,D_MODEL,N_LAYERS,N_HEADS,FFN_HIDDEN,DROPOUT,device)
model = Transformer(encoder,decoder,device,utils.PAD_TOKEN).to(device)


In [5]:
optimizer = optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD_TOKEN) 
print("Model initialized on:", device)

Model initialized on: cuda


In [6]:
import os
import torch
from tqdm import tqdm

EPOCHS = 40
SAVE_DIR = "saved"
os.makedirs(SAVE_DIR, exist_ok=True)
best_val_loss = float("inf")
best_model_path = None
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")
    model.train()
    total_train_loss = 0.0
    train_bar = tqdm(train_loader, desc="🚀 Training", 
                leave=False, total=len(train_loader))
    for batch in train_loader:
        src_batch = batch['src'].to(device)
        tgt_batch = batch['tgt'].to(device)
        
        optimizer.zero_grad()
        output,_= model(src_batch, tgt_batch)
        output_dim = output.shape[-1]
        
        output_dim = output.shape[-1]
        output = output[:, :-1, :].reshape(-1, output_dim)
        tgt_y  = tgt_batch[:, 1:].reshape(-1)
        
        loss = criterion(output, tgt_y)
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        train_bar.set_postfix(loss=f"{loss.item():.4f}")


    avg_train_loss = total_train_loss / len(train_loader)
    model.eval()
    total_val_loss = 0.0
    val_bar = tqdm(val_loader, desc="🚀 Validation",
              leave=True, total=len(val_loader))
    with torch.no_grad():
        for batch in val_bar:
            src_batch = batch['src'].to(device)
            tgt_batch = batch['tgt'].to(device)
            
            output,_= model(src_batch, tgt_batch)
            output_dim = output.shape[-1]
            output = output[:, :-1, :].reshape(-1, output_dim)
            tgt_y  = tgt_batch[:, 1:].reshape(-1)
            
            loss = criterion(output, tgt_y)
            total_val_loss += loss.item()
            val_bar.set_postfix(loss=f"{loss.item():.4f}")
    
    avg_val_loss = total_val_loss / len(val_loader)
    print(f"[Epoch {epoch}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    if avg_val_loss < best_val_loss:
        if best_model_path and os.path.exists(best_model_path):
            os.remove(best_model_path)
        best_val_loss = avg_val_loss
        best_model_path = os.path.join(SAVE_DIR, "best.pt")
        torch.save(model.state_dict(), best_model_path)
        print(f"  -> New best model saved at {best_model_path}")


Epoch 1/40


  src_batch = [torch.tensor(item['src'], dtype=torch.long) for item in batch]
  tgt_batch = [torch.tensor(item['tgt'], dtype=torch.long) for item in batch]
🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 202.48it/s, loss=5.8679]


[Epoch 1] Train Loss: 6.3013 | Val Loss: 5.8666
  -> New best model saved at saved/best.pt
Epoch 2/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 233.16it/s, loss=6.2520]


[Epoch 2] Train Loss: 4.8139 | Val Loss: 6.1405
Epoch 3/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 210.48it/s, loss=6.7910]


[Epoch 3] Train Loss: 3.9964 | Val Loss: 6.5639
Epoch 4/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 190.70it/s, loss=7.0355]


[Epoch 4] Train Loss: 3.6485 | Val Loss: 6.8089
Epoch 5/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 200.17it/s, loss=7.0741]


[Epoch 5] Train Loss: 3.4814 | Val Loss: 6.8153
Epoch 6/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 228.64it/s, loss=7.1275]


[Epoch 6] Train Loss: 3.3394 | Val Loss: 6.9300
Epoch 7/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 170.98it/s, loss=7.1835]


[Epoch 7] Train Loss: 3.2365 | Val Loss: 6.9612
Epoch 8/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 168.42it/s, loss=7.2649]


[Epoch 8] Train Loss: 3.1629 | Val Loss: 7.1045
Epoch 9/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 183.60it/s, loss=7.3076]


[Epoch 9] Train Loss: 3.1088 | Val Loss: 7.0779
Epoch 10/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 200.28it/s, loss=7.3612]


[Epoch 10] Train Loss: 3.0591 | Val Loss: 7.1917
Epoch 11/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 191.87it/s, loss=7.3515]


[Epoch 11] Train Loss: 3.0060 | Val Loss: 7.1928
Epoch 12/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 154.46it/s, loss=7.4909]


[Epoch 12] Train Loss: 2.9954 | Val Loss: 7.3098
Epoch 13/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 182.84it/s, loss=7.4190]


[Epoch 13] Train Loss: 2.9739 | Val Loss: 7.2139
Epoch 14/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 214.08it/s, loss=7.5437]


[Epoch 14] Train Loss: 2.9317 | Val Loss: 7.3522
Epoch 15/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 192.67it/s, loss=7.5038]


[Epoch 15] Train Loss: 2.9042 | Val Loss: 7.3165
Epoch 16/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 219.52it/s, loss=7.5830]


[Epoch 16] Train Loss: 2.9069 | Val Loss: 7.3752
Epoch 17/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 195.40it/s, loss=7.4480]


[Epoch 17] Train Loss: 2.8889 | Val Loss: 7.2234
Epoch 18/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 228.13it/s, loss=7.6120]


[Epoch 18] Train Loss: 2.8636 | Val Loss: 7.3675
Epoch 19/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 209.72it/s, loss=7.6400]


[Epoch 19] Train Loss: 2.8506 | Val Loss: 7.3839
Epoch 20/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 226.28it/s, loss=7.6084]


[Epoch 20] Train Loss: 2.8207 | Val Loss: 7.3661
Epoch 21/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 200.54it/s, loss=7.6744]


[Epoch 21] Train Loss: 2.8257 | Val Loss: 7.4616
Epoch 22/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 239.91it/s, loss=7.4704]


[Epoch 22] Train Loss: 2.8061 | Val Loss: 7.3408
Epoch 23/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 192.86it/s, loss=7.6001]


[Epoch 23] Train Loss: 2.7991 | Val Loss: 7.4445
Epoch 24/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 189.25it/s, loss=7.6267]


[Epoch 24] Train Loss: 2.7954 | Val Loss: 7.4143
Epoch 25/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 185.56it/s, loss=7.6496]


[Epoch 25] Train Loss: 2.7693 | Val Loss: 7.3866
Epoch 26/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 201.44it/s, loss=7.9169]


[Epoch 26] Train Loss: 2.7734 | Val Loss: 7.6588
Epoch 27/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 172.45it/s, loss=7.7425]


[Epoch 27] Train Loss: 2.7610 | Val Loss: 7.4907
Epoch 28/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 206.66it/s, loss=7.6634]


[Epoch 28] Train Loss: 2.7714 | Val Loss: 7.4773
Epoch 29/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 187.96it/s, loss=7.8558]


[Epoch 29] Train Loss: 2.7350 | Val Loss: 7.6643
Epoch 30/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 227.08it/s, loss=7.7977]


[Epoch 30] Train Loss: 2.7403 | Val Loss: 7.5837
Epoch 31/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 174.11it/s, loss=7.8149]


[Epoch 31] Train Loss: 2.7381 | Val Loss: 7.5911
Epoch 32/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 200.05it/s, loss=7.8211]


[Epoch 32] Train Loss: 2.7327 | Val Loss: 7.6076
Epoch 33/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 201.62it/s, loss=7.8531]


[Epoch 33] Train Loss: 2.7182 | Val Loss: 7.6751
Epoch 34/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 119.19it/s, loss=7.8473]


[Epoch 34] Train Loss: 2.7294 | Val Loss: 7.6988
Epoch 35/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 198.76it/s, loss=8.0075]


[Epoch 35] Train Loss: 2.7111 | Val Loss: 7.8096
Epoch 36/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 225.29it/s, loss=7.9818]


[Epoch 36] Train Loss: 2.7042 | Val Loss: 7.8015
Epoch 37/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 185.97it/s, loss=7.8766]


[Epoch 37] Train Loss: 2.6939 | Val Loss: 7.7438
Epoch 38/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 213.55it/s, loss=8.1027]


[Epoch 38] Train Loss: 2.6899 | Val Loss: 7.9189
Epoch 39/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 198.07it/s, loss=8.0806]


[Epoch 39] Train Loss: 2.6802 | Val Loss: 7.9260
Epoch 40/40


🚀 Validation: 100%|██████████| 6/6 [00:00<00:00, 214.45it/s, loss=8.0661]


[Epoch 40] Train Loss: 2.6913 | Val Loss: 7.8957


## Bleu Score

In [None]:
def greedy_decode(model, src, max_len, start_symbol=utils.SOS_TOKEN):
    model.eval()
    src_mask = model.make_input_mask(src)
    memory = model.encoder(src, src_mask)
    batch_size = src.size(0)
    ys = torch.ones(batch_size, 1).fill_(start_symbol).long().to(src.device)
    
    for i in range(max_len - 1):
        tgt_mask = model.make_target_mask(ys)
        out, _ = model.decoder(ys, memory, tgt_mask, src_mask)
        prob = out[:, -1, :]  # shape: [batch_size, d_model] (logits for last token)
        next_word = prob.argmax(dim=-1).unsqueeze(1)
        ys = torch.cat([ys, next_word], dim=1)
        if (next_word == utils.EOS_TOKEN).all():
            break
    return ys


In [None]:

import torch 
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

encoder = TransformerEncoder(SRC_VOCAB_SIZE, D_MODEL, N_LAYERS, N_HEADS, FFN_HIDDEN, DROPOUT, device)
decoder = TransformerDecoder(TGT_VOCAB_SIZE, D_MODEL, N_LAYERS, N_HEADS, FFN_HIDDEN, DROPOUT, device)
best_model = Transformer(encoder, decoder, device, utils.PAD_TOKEN).to(device)
best_model.load_state_dict(torch.load(os.path.join(SAVE_DIR, "best.pt")))
print("Loaded best model for testing!")
smooth_fn = SmoothingFunction().method1
references = []
hypotheses = []
best_model.eval()

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        src_batch = batch['src'].to(device)
        tgt_batch = batch['tgt'].to(device)
        preds = greedy_decode(best_model, src_batch, max_len=70)
        
        for i in range(src_batch.size(0)):
            gold = tgt_batch[i].tolist()
            gold_tokens = [str(t) for t in gold if t not in [utils.SOS_TOKEN, utils.PAD_TOKEN]]
            if utils.EOS_TOKEN in gold_tokens:
                gold_tokens = gold_tokens[:gold_tokens.index(utils.EOS_TOKEN)]
            pred = preds[i].tolist()
            pred_tokens = [str(t) for t in pred if t != utils.SOS_TOKEN]
            if utils.EOS_TOKEN in pred_tokens:
                pred_tokens = pred_tokens[:pred_tokens.index(utils.EOS_TOKEN)]
            references.append([gold_tokens])
            hypotheses.append(pred_tokens)

            
weights = (0.5, 0.5)  
bleu_score = corpus_bleu(
    references,
    hypotheses,
    weights=weights,
    smoothing_function=smooth_fn
)

print(f"Test BLEU-2: {bleu_score:.4f}")


Loaded best model for testing!


Testing:   0%|          | 0/664 [00:00<?, ?it/s]


IndexError: tuple index out of range

## Translasi

In [None]:
from preprocessing import Tokenize
tokenizer = Tokenize(path="dataset", src_lang="min", tgt_lang="eng")
tokenizer.load_vocab(filename_src="src_vocab.pkl", filename_tgt="tgt_vocab.pkl")

In [None]:
import torch
def translate_sentence(sentence, tokenizer, model, device, max_len=50):
    model.eval()
    with torch.no_grad():
        src_indices = tokenizer.numericalize(sentence, is_source=True)
        src_tensor  = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)
        encoder_outputs, (hidden, cell) = model.encoder(src_tensor)
        sos_idx = tokenizer.tgt_vocab["<sos>"]
        eos_idx = tokenizer.tgt_vocab["<eos>"]
        decoder_input = torch.LongTensor([[sos_idx]]).to(device)
        preds = []
        for _ in range(max_len):
            logits, (hidden, cell) = model.decoder(decoder_input, hidden, cell)

            next_token = logits.argmax(dim=-1)  
            pred_idx = next_token.item()
            if pred_idx == eos_idx:
                break
            preds.append(pred_idx)

            decoder_input = next_token
        translated_tokens = tokenizer.detokenize(preds, is_source=False)

    return translated_tokens


# Translasi: 
Ambo mancari awaknyo besok

    Bahasa Indonesia: Saya akan mencarimu besok.
    English: I will look for you tomorrow.

Alun salama, apo kabar?

    Bahasa Indonesia: Halo, apa kabar?
    English: Hello, how are you?

Dunsanak ka rumah gadang

    Bahasa Indonesia: Saudara, mari ke rumah gadang.
    English: Relatives, let's go to the traditional house.

Urang minang manarimo tradisi

    Bahasa Indonesia: Orang Minang menerima tradisi.
    English: Minangkabau people embrace tradition.

Apo ado di pasar?

    Bahasa Indonesia: Apa ada di pasar?
    English: What's there in the market?

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

minang_sentence = "Ambo mancari awaknyo besok"
translation_en = translate_sentence(minang_sentence, tokenizer, model, device, max_len=50)
print("Minangkabau:", minang_sentence)
print("English:    ", translation_en)
minang_sentences = [
    "Ambo mancari awaknyo besok", # saya mencari 
    "Alun salama, apo kabar?",
    "Dunsanak ka rumah gadang",
    "Urang minang manarimo tradisi",
    "Apo ado di pasar?"
]
for sentence in minang_sentences:
    translation = translate_sentence(sentence, tokenizer, model, device, max_len=50)
    print("Minangkabau:", sentence)
    print("English:    ", translation)


Minangkabau: Ambo mancari awaknyo besok
English:     The place is the the the
Minangkabau: Ambo mancari awaknyo besok
English:     The place is the the the
Minangkabau: Alun salama, apo kabar?
English:     The place is the the the
Minangkabau: Dunsanak ka rumah gadang
English:     The place is the the the
Minangkabau: Urang minang manarimo tradisi
English:     The place is the the the
Minangkabau: Apo ado di pasar?
English:     The place is the the the


Hasil kurang bagus karnea kekurangan dataset Hanya terdapat 400 dataset dan vocabnya terbatas utk proses pelatihan