In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from dataloader import get_dataloader
from model import Seq2SeqModel
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download('punkt')  


[nltk_data] Downloading package punkt to
[nltk_data]     /home/wicaksonolxn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
BATCH_SIZE = 32
DATA_PATH = "dataset/eng_min"  
train_loader, val_loader, test_loader = get_dataloader(
    pth=DATA_PATH,
    batch_size=BATCH_SIZE,
    preprocessed_file="preprocessed_data.pkl" 
)




In [3]:
for i, (src_batch, tgt_batch) in enumerate(train_loader):
    if i < 1:
        print("src_batch shape:", src_batch.size())  
        print("tgt_batch shape:", tgt_batch.size())  
    else:
        break


src_batch shape: torch.Size([32, 81])
tgt_batch shape: torch.Size([32, 69])


In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SRC_VOCAB_SIZE = 5000 
TGT_VOCAB_SIZE = 5000  
EMBED_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 10
DROPOUT = 0.5
PAD_IDX = 3  
model = Seq2SeqModel(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    pad_idx=PAD_IDX,
    dropout=DROPOUT,
    device=device
).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX) 
print("Model initialized on:", device)


Model initialized on: cuda


In [5]:
import os
import torch
from tqdm import tqdm
EPOCHS = 10
SAVE_DIR = "saved"
os.makedirs(SAVE_DIR, exist_ok=True)

best_val_loss = float("inf")
best_model_path = None

for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")
    model.train()
    total_train_loss = 0.0
    train_bar = tqdm(train_loader, desc="Training", leave=False)
    for batch_idx, (src_batch, tgt_batch) in enumerate(train_bar):
        src_batch = src_batch.to(device)  
        tgt_batch = tgt_batch.to(device)  
        
        optimizer.zero_grad()
        output = model(src_batch, tgt_batch)

        output_dim = output.shape[-1]
        output = output[:, :-1, :].reshape(-1, output_dim)  
        tgt_y  = tgt_batch[:, 1:].reshape(-1)              
        loss = criterion(output, tgt_y)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        train_bar.set_postfix({
            "loss": f"{loss.item():.4f}"
        })
    avg_train_loss = total_train_loss / len(train_loader)
    model.eval()
    total_val_loss = 0.0
    val_bar = tqdm(val_loader, desc="Validation", leave=False)
    with torch.no_grad():
        for batch_idx, (src_batch, tgt_batch) in enumerate(val_bar):
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)

            output = model(src_batch, tgt_batch)
            output_dim = output.shape[-1]
            output = output[:, :-1, :].reshape(-1, output_dim)
            tgt_y  = tgt_batch[:, 1:].reshape(-1)
            
            loss = criterion(output, tgt_y)
            total_val_loss += loss.item()
            val_bar.set_postfix({
                "loss": f"{loss.item():.4f}"
            })

    avg_val_loss = total_val_loss / len(val_loader)
    print(f"[Epoch {epoch}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    if avg_val_loss < best_val_loss:
        if best_model_path and os.path.exists(best_model_path):
            os.remove(best_model_path)
        best_val_loss = avg_val_loss
        best_model_path = os.path.join(SAVE_DIR, "best.pt")
        torch.save(model.state_dict(), best_model_path)
        print(f"  -> New best model saved at {best_model_path}")


Epoch 1/10


                                                                      

[Epoch 1] Train Loss: 7.5111 | Val Loss: 7.7419
  -> New best model saved at saved/best.pt
Epoch 2/10


                                                                      

[Epoch 2] Train Loss: 7.0064 | Val Loss: 7.8450
Epoch 3/10


                                                                      

[Epoch 3] Train Loss: 6.8774 | Val Loss: 7.9946
Epoch 4/10


                                                                      

[Epoch 4] Train Loss: 6.7686 | Val Loss: 8.1131
Epoch 5/10


                                                                      

[Epoch 5] Train Loss: 6.7176 | Val Loss: 8.1620
Epoch 6/10


                                                                      

[Epoch 6] Train Loss: 6.6501 | Val Loss: 8.2260
Epoch 7/10


                                                                      

[Epoch 7] Train Loss: 6.5832 | Val Loss: 8.2944
Epoch 8/10


                                                                      

[Epoch 8] Train Loss: 6.5376 | Val Loss: 8.3361
Epoch 9/10


                                                                      

[Epoch 9] Train Loss: 6.4862 | Val Loss: 8.4352
Epoch 10/10


                                                                      

[Epoch 10] Train Loss: 6.4338 | Val Loss: 8.5099




## Bleu Score

In [6]:
def greedy_decode(model, src, max_len=100):
    model.eval()
    with torch.no_grad():
        _, (hidden, cell) = model.seq2seq.encoder(src) 
        batch_size = src.size(0)
        outs = []
        next_token = torch.LongTensor([0]*batch_size).unsqueeze(1).to(model.seq2seq.device) 
        for _ in range(max_len):
            logits, (hidden, cell) = model.seq2seq.decoder(next_token, hidden, cell)
            next_word = logits.argmax(dim=-1) 
            outs.append(next_word.squeeze(1)) 
            next_token = next_word 
        outs = torch.stack(outs, dim=1)  
    return outs



In [7]:
import torch
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
best_model = Seq2SeqModel(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    pad_idx=PAD_IDX,
    dropout=DROPOUT,
    device=device
).to(device)
best_model.load_state_dict(torch.load(os.path.join(SAVE_DIR, "best.pt")))
print("Loaded best model for testing!")
smooth_fn = SmoothingFunction().method1
references = []
hypotheses = []
best_model.eval()
with torch.no_grad():
    for src_batch, tgt_batch in tqdm(test_loader, desc="Testing"):
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)
        preds = greedy_decode(best_model, src_batch, max_len=70)  
        for i in range(src_batch.size(0)):
            gold = tgt_batch[i].tolist()
            pred = preds[i].tolist()
            references.append([gold])  
            hypotheses.append(pred)
weights_for_bleu2 = (0.5, 0.5)
bleu_scores = []
for ref, hyp in zip(references, hypotheses):
    bleu = sentence_bleu(
        ref, 
        hyp, 
        weights=weights_for_bleu2, 
        smoothing_function=smooth_fn
    )
    bleu_scores.append(bleu)
avg_bleu2 = sum(bleu_scores) / len(bleu_scores)
print(f"Test BLEU-2: {avg_bleu2:.4f}")


Loaded best model for testing!


Testing: 100%|██████████| 25/25 [00:02<00:00,  9.15it/s]

Test BLEU-2: 0.0056





## Translasi

In [8]:
from preprocessing import Tokenize
tokenizer = Tokenize(path="dataset", src_lang="eng", tgt_lang="min")
tokenizer.load_vocab(filename_src="src_vocab.pkl", filename_tgt="tgt_vocab.pkl")

In [9]:
import torch
def translate_sentence(sentence, tokenizer, model, device, max_len=50):
    model.eval()
    with torch.no_grad():
        src_indices = tokenizer.numericalize(sentence, is_source=True)
        src_tensor  = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)
        encoder_outputs, (hidden, cell) = model.encoder(src_tensor)
        sos_idx = tokenizer.tgt_vocab["<sos>"]
        eos_idx = tokenizer.tgt_vocab["<eos>"]
        decoder_input = torch.LongTensor([[sos_idx]]).to(device)
        preds = []
        for _ in range(max_len):
            logits, (hidden, cell) = model.decoder(decoder_input, hidden, cell)

            next_token = logits.argmax(dim=-1)  
            pred_idx = next_token.item()
            if pred_idx == eos_idx:
                break
            preds.append(pred_idx)

            decoder_input = next_token
        translated_tokens = tokenizer.detokenize(preds, is_source=False)

    return translated_tokens


In [10]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentence = "I want to bike tomorrow"
prediction = translate_sentence(sentence, tokenizer, model, device, max_len=50)
print("English:    ", sentence)
print("Minangkabau:", prediction)


English:     I want to bike tomorrow
Minangkabau: Awak ka ka di di
