In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from dataloader import get_dataloaders
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from transformer import Transformer,TransformerEncoder,TransformerDecoder
import utils
import pickle
# Make sure the NLTK "punkt" data package is present locally; nltk reports
# "already up-to-date" when nothing needs to be fetched.
nltk.download('punkt')

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/wicaksonolxn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# --- Reproducibility ---------------------------------------------------------
# Seed torch's random number generators before anything stochastic is built so
# that weight initialisation and dropout are repeatable across
# Restart Kernel -> Run All.
# NOTE(review): whether batch shuffling also becomes repeatable depends on how
# get_dataloaders draws its randomness — confirm in dataloader.py.
SEED = 42
torch.manual_seed(SEED)

# --- Data --------------------------------------------------------------------
BATCH_SIZE = 8  # needs more updates (small batch => more optimizer steps per epoch)
DATA_PATH = "dataset/"
SAVE_DIR = "saved"

train_loader, val_loader, test_loader = get_dataloaders(
    data_path=DATA_PATH,
    source_lang="min",
    target_lang="eng",
    batch_size=BATCH_SIZE,
    device=device,
)

# --- Model / optimisation hyperparameters -----------------------------------
# NOTE(review): vocabulary sizes are hard-coded; presumably they must be at
# least as large as the largest token id the dataloaders emit — confirm.
SRC_VOCAB_SIZE = 4000
TGT_VOCAB_SIZE = 4000
N_LAYERS = 1
N_HEADS = 1
D_MODEL = 64
FFN_HIDDEN = D_MODEL * 4  # feed-forward width is tied to the model width
DROPOUT = 0.3
EPOCHS = 150
# NOTE(review): the recorded run shows the loss falling only ~0.03-0.04 per
# epoch at this rate — consider revisiting.
LEARNING_RATE = 1e-5

# --- Model, optimiser and loss ----------------------------------------------
encoder = TransformerEncoder(SRC_VOCAB_SIZE, D_MODEL, N_LAYERS, N_HEADS, FFN_HIDDEN, DROPOUT, device)
decoder = TransformerDecoder(TGT_VOCAB_SIZE, D_MODEL, N_LAYERS, N_HEADS, FFN_HIDDEN, DROPOUT, device)
# utils.PAD_TOKEN is handed to the Transformer wrapper — presumably for padding
# masks; confirm in transformer.py.
model = Transformer(encoder, decoder, device, utils.PAD_TOKEN).to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Target positions equal to PAD_TOKEN do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD_TOKEN)
print("Model initialized on:", device)


TrainData - Max 'min' sentence length: 76
TrainData - Max 'eng' sentence length: 107
TestData - Max 'min' sentence length: 61
TestData - Max 'eng' sentence length: 75
ValidData - Max 'min' sentence length: 71
ValidData - Max 'eng' sentence length: 80
Number of examples in train_dataset,train origin,train_raw: 800 800 800
Number of examples in valid_dataset: 100
Number of examples in test_dataset: 100
Model initialized on: cuda


In [3]:
# Special-token ids exposed by utils, keyed by a human-readable label.
tokens = {
    "Padding": utils.PAD_TOKEN,
    "Start of Sequence": utils.SOS_TOKEN,
    "End of Sequence": utils.EOS_TOKEN,
    "Unknown": utils.UNK_TOKEN
}

# Show the source/target tensor shapes of the first training batch only.
# Written as an explicit "first item then stop" loop: a for/else attached to
# the token-printing loop would always fire and end the scan after one batch,
# making any "first N batches" guard dead code.
for batch in train_loader:
    src = batch["src"]
    tgt = batch["tgt"]
    print(src.shape, tgt.shape)
    break

# Print the special-token ids once; they do not depend on the batch.
for name, token in tokens.items():
    print(f"{name}: {token}")

torch.Size([8, 109]) torch.Size([8, 109])
Padding: 0
Start of Sequence: 1
End of Sequence: 2
Unknown: 3


Testing the input batches: are they correct?

In [4]:
# Pull the first seven batches by hand from a fresh iterator and print the
# shape of each source tensor.
data_iter = iter(train_loader)
batches_seen = 0
while batches_seen < 7:
    batch = next(data_iter)
    src_shape = batch["src"].shape
    print(src_shape)
    batches_seen += 1


torch.Size([8, 109])
torch.Size([8, 109])
torch.Size([8, 109])
torch.Size([8, 109])
torch.Size([8, 109])
torch.Size([8, 109])
torch.Size([8, 109])


In [5]:
# Smoke-test a single forward pass on the first training batch and print the
# flattened label tensor that the loss will be computed against.
# no_grad: nothing is back-propagated here, so skip building the autograd graph.
with torch.no_grad():
    for batch in train_loader:
        src_batch = batch['src'].to(device)
        tgt_batch = batch['tgt'].to(device)
        # The decoder input drops the last target position; the labels drop the
        # first one, so position t of the output is scored against token t+1.
        output, _ = model(src_batch, tgt_batch[:, :-1])
        output_dim = output.shape[-1]
        output = output.reshape(-1, output_dim)  # one row per target position
        tgt_y = tgt_batch[:, 1:].contiguous().view(-1)
        print(tgt_y)
        # Only the first batch is needed; stop instead of loading the rest of
        # the dataset for no purpose.
        break

tensor([  47,  152,  153, 1300,  260, 1731, 2410,   36,  725,    6,  189,  123,
          21,  163,   10,   21, 3620,   76,  934,   34,   85,  119,  102, 2335,
          21, 2410,   33,  101, 1186,  345,   36,  351,   54,  531,  204,  784,
          47,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         133, 2070,  805, 1098,  151,   58,  178, 1965,  756,  696,   21, 1966,
        1004,   21,  993,  542,  504,   21,   11,   36,  943, 1940,   76,   54,
         119,  134,    2,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [None]:
import os
import torch
from tqdm import tqdm
# Train for EPOCHS epochs, validating after every epoch. Whenever the average
# validation loss improves, the model's state_dict is checkpointed to
# SAVE_DIR/best_dict.pt. Per-epoch average losses are collected and handed to
# utils.plot_loss once training finishes.
os.makedirs(SAVE_DIR, exist_ok=True)
checkpoint_path = os.path.join(SAVE_DIR, "best_dict.pt")
best_val_loss = float("inf")
best_model_path = None  # set to checkpoint_path after the first successful save
atl = {}  # epoch -> average training loss
avl = {}  # epoch -> average validation loss
for epoch in range(1, EPOCHS + 1):
    print(f"Epoch {epoch}/{EPOCHS}")

    # ---- training pass ----
    model.train()
    total_train_loss = 0.0
    train_bar = tqdm(train_loader, desc="🚀 Training",
                leave=True, total=len(train_loader))
    for batch in train_bar:
        optimizer.zero_grad()
        src_batch = batch['src'].to(device)
        tgt_batch = batch['tgt'].to(device)

        # The decoder input drops the last target position; the labels drop the
        # first one, so each output position is scored against the next token.
        output, _ = model(src_batch, tgt_batch[:, :-1])
        output_dim = output.shape[-1]
        output = output.reshape(-1, output_dim)
        tgt_y = tgt_batch[:, 1:].contiguous().view(-1)
        loss = criterion(output, tgt_y)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        train_bar.set_postfix(loss=f"{loss.item():.4f}")

    # Mean of the per-batch losses (not weighted by batch size).
    avg_train_loss = total_train_loss / len(train_loader)
    atl[epoch] = avg_train_loss

    # ---- validation pass (no gradient tracking, eval mode) ----
    model.eval()
    total_val_loss = 0.0
    val_bar = tqdm(val_loader, desc="🚀 Validation",
              leave=True, total=len(val_loader))
    with torch.no_grad():
        for batch in val_bar:
            src_batch = batch['src'].to(device)
            tgt_batch = batch['tgt'].to(device)

            output, _ = model(src_batch, tgt_batch[:, :-1])
            output_dim = output.shape[-1]
            output = output.reshape(-1, output_dim)
            tgt_y = tgt_batch[:, 1:].contiguous().view(-1)

            loss = criterion(output, tgt_y)
            total_val_loss += loss.item()
            val_bar.set_postfix(loss=f"{loss.item():.4f}")

    avg_val_loss = total_val_loss / len(val_loader)
    avl[epoch] = avg_val_loss
    print(f"[Epoch {epoch}] Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

    # ---- checkpoint on improvement ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        # Write to a temporary file and swap it into place. The previous best
        # checkpoint is never deleted up front, so an interrupted or failed
        # save cannot leave the run without a usable checkpoint.
        tmp_checkpoint_path = checkpoint_path + ".tmp"
        torch.save(model.state_dict(), tmp_checkpoint_path)
        os.replace(tmp_checkpoint_path, checkpoint_path)
        best_model_path = checkpoint_path
        print(f"  -> New best model saved at {best_model_path}")

# NOTE(review): plot_loss is defined in utils; presumably it renders both loss
# curves into SAVE_DIR under the name "loss_dictbase" — confirm there.
utils.plot_loss(atl, avl, SAVE_DIR,"loss_dictbase")


Epoch 1/150


🚀 Training: 100%|██████████| 100/100 [00:00<00:00, 133.74it/s, loss=8.4283]
🚀 Validation: 100%|██████████| 13/13 [00:00<00:00, 333.05it/s, loss=8.4775]


[Epoch 1] Train Loss: 8.4411 | Val Loss: 8.4469
  -> New best model saved at saved/best.pt
Epoch 2/150


🚀 Training: 100%|██████████| 100/100 [00:00<00:00, 161.01it/s, loss=8.3577]
🚀 Validation: 100%|██████████| 13/13 [00:00<00:00, 350.96it/s, loss=8.4461]


[Epoch 2] Train Loss: 8.4106 | Val Loss: 8.4131
  -> New best model saved at saved/best.pt
Epoch 3/150


🚀 Training: 100%|██████████| 100/100 [00:00<00:00, 156.88it/s, loss=8.4096]
🚀 Validation: 100%|██████████| 13/13 [00:00<00:00, 392.60it/s, loss=8.4120]


[Epoch 3] Train Loss: 8.3700 | Val Loss: 8.3772
  -> New best model saved at saved/best.pt
Epoch 4/150


🚀 Training: 100%|██████████| 100/100 [00:00<00:00, 142.75it/s, loss=8.3173]
🚀 Validation: 100%|██████████| 13/13 [00:00<00:00, 321.00it/s, loss=8.3731]


[Epoch 4] Train Loss: 8.3420 | Val Loss: 8.3368
  -> New best model saved at saved/best.pt
Epoch 5/150


🚀 Training: 100%|██████████| 100/100 [00:00<00:00, 137.75it/s, loss=8.2737]
🚀 Validation: 100%|██████████| 13/13 [00:00<00:00, 316.58it/s, loss=8.3287]


[Epoch 5] Train Loss: 8.2964 | Val Loss: 8.2907
  -> New best model saved at saved/best.pt
Epoch 6/150


🚀 Training:  13%|█▎        | 13/100 [00:00<00:00, 128.92it/s, loss=8.2550]