In [1]:
from build_dataset_dataloader import get_dataset

train_dataset, val_dataset, vocabulary = get_dataset(case=2)

  from .autonotebook import tqdm as notebook_tqdm


Kích thước từ vựng: 9125
Index của '<pad>': 0
Index của một từ ngẫu nhiên 'hello': 2
Index của một từ không có trong từ điển: 2


In [2]:
len(vocabulary)

9125

In [3]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [4]:
next(iter(train_dataloader))

[tensor([[2620,    8, 2396,  ...,    0,    0,    0],
         [  26,  329,    4,  ...,    0,    0,    0],
         [  10,   14,    7,  ...,    0,    0,    0],
         ...,
         [   7,  308,   16,  ...,    0,    0,    0],
         [ 421,   81,   48,  ...,    0,    0,    0],
         [   3,   96,  116,  ...,    0,    0,    0]]),
 tensor([[   8, 2396,    4,  ...,    0,    0,    0],
         [  26,  329,    4,  ...,  571,  572,  573],
         [   7, 1285, 2626,  ...,  630, 1669,  807],
         ...,
         [6510,  467,  147,  ...,    0,    0,    0],
         [   7,  156, 1941,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0]]),
 tensor([0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0.])]

In [5]:
import torch
from torch import nn
from LSTM import SiameseLSTM

EMBEDDING_DIM = 64
HIDDEN_DIM = 128
OUTPUT_DIM = 1 # Output 1 giá trị logit cho binary classification
NUM_EPOCHS = 20
LEARNING_RATE = 0.001
VOCAB_SIZE = len(vocabulary)
SEQ_LENGTH = 600

model = SiameseLSTM(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)


In [8]:
def evaluate(model, valid_dataloader, criterion):
    model.eval()
    total_loss = 0
    running_correct = 0
    total = 0
    with torch.no_grad():
        for seq1, seq2, labels in valid_dataloader:
            seq1, seq2, labels = seq1.to(device), seq2.to(device), labels.to(device).float()
            outputs = model(seq1, seq2)
            loss = criterion(outputs.squeeze(1), labels)
            total_loss += loss.item()
            
            predicted = (torch.sigmoid(outputs.squeeze(1)) > 0.5).float()
            running_correct += (predicted == labels).sum().item()
            total += labels.size(0)
    accuracy = 100 * running_correct / total
    total_loss = total_loss / total
    return total_loss, accuracy

In [9]:
import time

def train(model, max_epoch, train_dataloader, valid_dataloader, criterion, optimizer, device):
    model.to(device)
    train_losses = []
    train_accuracies = []
    test_losses = []
    test_accuracies = []

    # callbacks (save best params)
    best_weights = None
    best_test_acc, best_test_loss = -1, float('inf')


    for epoch in range(max_epoch):
        model.train()
        running_loss = 0.0
        running_correct = 0   # to track number of correct predictions
        total = 0   
        
        for i, (seq1, seq2, labels) in enumerate(train_dataloader):
            epoch_start_time = time.time()
            
            seq1, seq2, labels = seq1.to(device), seq2.to(device), labels.to(device).float()
            
            optimizer.zero_grad()
            
            # Forward
            outputs = model(seq1, seq2)
            loss = criterion(outputs.squeeze(1), labels)
            running_loss += loss.item()
            
            predicted = (torch.sigmoid(outputs.squeeze(1)) > 0.5).float()
            total += labels.size(0)
            running_correct += (predicted == labels).sum().item()
            
            # Backward
            loss.backward()
            optimizer.step()

        epoch_accuracy = 100 * running_correct / total
        epoch_loss = running_loss / total

        test_loss, test_accuracy = evaluate(model, valid_dataloader, criterion)

        if test_loss < best_test_loss:
            best_test_loss = test_loss
            # best_test_acc = test_accuracy
            best_weights = model.state_dict()
            
        print(
            "| Epoch {:3d} | Time: {:5.2f}s | Train Accuracy {:8.3f}% | Train Loss {:8.3f} "
            "| Valid Accuracy {:8.3f}% | Valid Loss {:8.3f} ".format(
                epoch+1, time.time() - epoch_start_time, epoch_accuracy, epoch_loss, test_accuracy, test_loss
            )
        )
        
        # save for plot
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_accuracy)
        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)
        
    values_dict = {
        "train_losses": train_losses,
        "train_accuracies": train_accuracies,
        "test_losses": test_losses,
        "test_accuracies": test_accuracies
    }

    return values_dict, best_weights

In [10]:
model.to(device)

history, best_weights = train(model, 2, train_dataloader, val_dataloader, criterion, optimizer, device)

| Epoch   1 | Time:  0.32s | Train Accuracy   58.068% | Train Loss    0.042 | Valid Accuracy   66.000% | Valid Loss    0.043 
| Epoch   2 | Time:  0.31s | Train Accuracy   60.895% | Train Loss    0.040 | Valid Accuracy   67.000% | Valid Loss    0.042 
