In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from dataloader import get_dataloaders
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformer import Transformer,TransformerEncoder,TransformerDecoder
import utils
nltk.download('punkt')  
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/wicaksonolxn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
BATCH_SIZE = 32
DATA_PATH = "dataset/"  
train_loader, val_loader, test_loader = get_dataloaders(
    data_path=DATA_PATH, 
    source_lang="min", 
    target_lang="eng", 
    batch_size=BATCH_SIZE, 
    device=device
)


In [3]:
for i, batch in enumerate(train_loader):
    if i < 1:
        print("src_batch type:", type(batch['src']))  
        print("tgt_batch type:", type(batch['tgt'])) 
        print("src_batch shape:", batch['src'].shape)  
        print("tgt_batch shape:", batch['tgt'].shape)  
    else:
        break


src_batch type: <class 'torch.Tensor'>
tgt_batch type: <class 'torch.Tensor'>
src_batch shape: torch.Size([32, 52])
tgt_batch shape: torch.Size([32, 52])


  src_batch = [torch.tensor(item['src'], dtype=torch.long) for item in batch]
  tgt_batch = [torch.tensor(item['tgt'], dtype=torch.long) for item in batch]


In [None]:
SRC_VOCAB_SIZE = 5000 
TGT_VOCAB_SIZE = 5000  
DROPOUT = 0.3
N_LAYERS = 7
N_HEADS = 8
FFN_HIDDEN =1024
D_MODEL =256
encoder = TransformerEncoder(SRC_VOCAB_SIZE,D_MODEL,N_LAYERS,N_HEADS,FFN_HIDDEN,DROPOUT,device)
decoder = TransformerDecoder(TGT_VOCAB_SIZE,D_MODEL,N_LAYERS,N_HEADS,FFN_HIDDEN,DROPOUT,device)
model = Transformer(encoder,decoder,device,utils.PAD_TOKEN).to(device)


In [5]:
optimizer = optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD_TOKEN) 
print("Model initialized on:", device)

Model initialized on: cuda


In [None]:
import os
import torch
from tqdm import tqdm

EPOCHS = 40
SAVE_DIR = "saved"
os.makedirs(SAVE_DIR, exist_ok=True)
best_val_loss = float("inf")
best_model_path = None
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")
    model.train()
    total_train_loss = 0.0
    train_bar = tqdm(train_loader, desc="🚀 Training", 
                leave=False, total=len(train_loader))
    for batch in train_loader:
        src_batch = batch['src'].to(device)
        tgt_batch = batch['tgt'].to(device)
        
        optimizer.zero_grad()
        output,_= model(src_batch, tgt_batch)
        output_dim = output.shape[-1]
        
        output_dim = output.shape[-1]
        output = output[:, :-1, :].reshape(-1, output_dim)
        tgt_y  = tgt_batch[:, 1:].reshape(-1)
        
        loss = criterion(output, tgt_y)
        loss.backward()
        optimizer.step()
        
        total_train_loss += loss.item()
        train_bar.set_postfix(loss=f"{loss.item():.4f}")


    avg_train_loss = total_train_loss / len(train_loader)
    model.eval()
    total_val_loss = 0.0
    val_bar = tqdm(val_loader, desc="🚀 Validation",
              leave=True, total=len(val_loader))
    with torch.no_grad():
        for batch in val_bar:
            src_batch = batch['src'].to(device)
            tgt_batch = batch['tgt'].to(device)
            
            output,_= model(src_batch, tgt_batch)
            output_dim = output.shape[-1]
            output = output[:, :-1, :].reshape(-1, output_dim)
            tgt_y  = tgt_batch[:, 1:].reshape(-1)
            
            loss = criterion(output, tgt_y)
            total_val_loss += loss.item()
            val_bar.set_postfix(loss=f"{loss.item():.4f}")
    
    avg_val_loss = total_val_loss / len(val_loader)
    print(f"[Epoch {epoch}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
    if avg_val_loss < best_val_loss:
        if best_model_path and os.path.exists(best_model_path):
            os.remove(best_model_path)
        best_val_loss = avg_val_loss
        best_model_path = os.path.join(SAVE_DIR, "best.pt")
        torch.save(model.state_dict(), best_model_path)
        print(f"  -> New best model saved at {best_model_path}")


Epoch 1/40


Training:   0%|          | 0/26 [00:01<?, ?it/s, loss=5.2564]

KeyboardInterrupt: 

## Bleu Score

In [None]:
def greedy_decode(model, src, max_len=100):
    model.eval()
    with torch.no_grad():
        _, (hidden, cell) = model.seq2seq.encoder(src) 
        batch_size = src.size(0)
        outs = []
        next_token = torch.LongTensor([0]*batch_size).unsqueeze(1).to(model.seq2seq.device) 
        for _ in range(max_len):
            logits, (hidden, cell) = model.seq2seq.decoder(next_token, hidden, cell)
            next_word = logits.argmax(dim=-1) 
            outs.append(next_word.squeeze(1)) 
            next_token = next_word 
        outs = torch.stack(outs, dim=1)  
    return outs



In [None]:
import torch
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

encoder = TransformerEncoder(SRC_VOCAB_SIZE,D_MODEL,N_LAYERS,N_HEADS,FFN_HIDDEN,DROPOUT,device)
decoder = TransformerDecoder(TGT_VOCAB_SIZE,D_MODEL,N_LAYERS,N_HEADS,FFN_HIDDEN,DROPOUT,device)
best_model = Transformer(encoder,decoder,device,utils.PAD_TOKEN).to(device)
best_model.load_state_dict(torch.load(os.path.join(SAVE_DIR, "best.pt")))
print("Loaded best model for testing!")
smooth_fn = SmoothingFunction().method1
references = []
hypotheses = []
best_model.eval()

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        src_batch = batch['src'].to(device) 
        tgt_batch = batch['tgt'].to(device) 
        preds = greedy_decode(best_model, src_batch, max_len=70)
        
        for i in range(src_batch.size(0)):
            gold = tgt_batch[i].tolist()
            gold_tokens = [str(t) for t in gold if t not in [utils.SOS_TOKEN, utils.PAD_TOKEN]]
            if utils.EOS_TOKEN in gold_tokens:
                gold_tokens = gold_tokens[:gold_tokens.index(utils.EOS_TOKEN)]
            pred = preds[i].tolist()
            pred_tokens = [str(t) for t in pred if t != utils.SOS_TOKEN]
            if utils.EOS_TOKEN in pred_tokens:
                pred_tokens = pred_tokens[:pred_tokens.index(utils.EOS_TOKEN)]
            references.append([gold_tokens])  # Wrap in list for multiple references
            hypotheses.append(pred_tokens)
weights = (0.5, 0.5)  # Bigram weights for BLEU-2
bleu_score = corpus_bleu(
    references,
    hypotheses,
    weights=weights,
    smoothing_function=smooth_fn
)

print(f"Test BLEU-2: {bleu_score:.4f}")


Loaded best model for testing!


Testing:   0%|          | 0/664 [00:00<?, ?it/s]


AttributeError: 'str' object has no attribute 'to'

## Translasi

In [None]:
from preprocessing import Tokenize
tokenizer = Tokenize(path="dataset", src_lang="min", tgt_lang="eng")
tokenizer.load_vocab(filename_src="src_vocab.pkl", filename_tgt="tgt_vocab.pkl")

In [None]:
import torch
def translate_sentence(sentence, tokenizer, model, device, max_len=50):
    model.eval()
    with torch.no_grad():
        src_indices = tokenizer.numericalize(sentence, is_source=True)
        src_tensor  = torch.tensor(src_indices, dtype=torch.long).unsqueeze(0).to(device)
        encoder_outputs, (hidden, cell) = model.encoder(src_tensor)
        sos_idx = tokenizer.tgt_vocab["<sos>"]
        eos_idx = tokenizer.tgt_vocab["<eos>"]
        decoder_input = torch.LongTensor([[sos_idx]]).to(device)
        preds = []
        for _ in range(max_len):
            logits, (hidden, cell) = model.decoder(decoder_input, hidden, cell)

            next_token = logits.argmax(dim=-1)  
            pred_idx = next_token.item()
            if pred_idx == eos_idx:
                break
            preds.append(pred_idx)

            decoder_input = next_token
        translated_tokens = tokenizer.detokenize(preds, is_source=False)

    return translated_tokens


# Translasi: 
Ambo mancari awaknyo besok

    Bahasa Indonesia: Saya akan mencarimu besok.
    English: I will look for you tomorrow.

Alun salama, apo kabar?

    Bahasa Indonesia: Halo, apa kabar?
    English: Hello, how are you?

Dunsanak ka rumah gadang

    Bahasa Indonesia: Saudara, mari ke rumah gadang.
    English: Relatives, let's go to the traditional house.

Urang minang manarimo tradisi

    Bahasa Indonesia: Orang Minang menerima tradisi.
    English: Minangkabau people embrace tradition.

Apo ado di pasar?

    Bahasa Indonesia: Apa ada di pasar?
    English: What's there in the market?

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

minang_sentence = "Ambo mancari awaknyo besok"
translation_en = translate_sentence(minang_sentence, tokenizer, model, device, max_len=50)
print("Minangkabau:", minang_sentence)
print("English:    ", translation_en)
minang_sentences = [
    "Ambo mancari awaknyo besok", # saya mencari 
    "Alun salama, apo kabar?",
    "Dunsanak ka rumah gadang",
    "Urang minang manarimo tradisi",
    "Apo ado di pasar?"
]
for sentence in minang_sentences:
    translation = translate_sentence(sentence, tokenizer, model, device, max_len=50)
    print("Minangkabau:", sentence)
    print("English:    ", translation)


Minangkabau: Ambo mancari awaknyo besok
English:     The place is the the the
Minangkabau: Ambo mancari awaknyo besok
English:     The place is the the the
Minangkabau: Alun salama, apo kabar?
English:     The place is the the the
Minangkabau: Dunsanak ka rumah gadang
English:     The place is the the the
Minangkabau: Urang minang manarimo tradisi
English:     The place is the the the
Minangkabau: Apo ado di pasar?
English:     The place is the the the


Hasil kurang bagus karnea kekurangan dataset Hanya terdapat 400 dataset dan vocabnya terbatas utk proses pelatihan