In [2]:
!pip install -q transformers datasets torch


In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer, get_scheduler
from tqdm import tqdm
import numpy as np
from sklearn.metrics import classification_report

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [3]:
train_data = torch.load("/kaggle/input/enpchatbot/train.pt")
valid_data = torch.load("/kaggle/input/enpchatbot/valid.pt")
test_data = torch.load("/kaggle/input/enpchatbot/test.pt")

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

In [4]:
class PhoBERTClassifier(nn.Module):
    def __init__(self, num_classes=5):
        super(PhoBERTClassifier, self).__init__()
        self.phobert = AutoModel.from_pretrained("vinai/phobert-base")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.phobert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Lấy vector CLS
        logits = self.classifier(self.dropout(pooled_output))
        return logits

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = PhoBERTClassifier().to(device)

config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [5]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2.0, reduction="mean"):
        super(FocalLoss, self).__init__()
        self.alpha = alpha.to(device) if alpha is not None else None
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction="none")
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            focal_loss *= self.alpha[targets]

        if self.reduction == "mean":
            return focal_loss.mean()
        elif self.reduction == "sum":
            return focal_loss.sum()
        return focal_loss

alpha = torch.tensor([2.32, 5.47, 7.47, 19.94, 4.04]).to(device) 
loss_fn = FocalLoss(alpha=alpha)

In [6]:
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=100, num_training_steps=len(train_loader) * 5)

In [7]:
def train_model(model, train_loader, valid_loader, loss_fn, optimizer, scheduler, num_epochs=5):
    best_accuracy = 0.0
    for epoch in range(num_epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask=(input_ids != 1))
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            correct += (logits.argmax(dim=1) == labels).sum().item()
            total += labels.size(0)

        train_acc = correct / total
        valid_acc = evaluate(model, valid_loader)
        print(f"Epoch {epoch+1}: Train Loss: {total_loss/len(train_loader):.4f}, Train Acc: {train_acc:.4f}, Valid Acc: {valid_acc:.4f}")

        if valid_acc > best_accuracy:
            best_accuracy = valid_acc
            torch.save(model.state_dict(), "best_model.pt")  # Lưu model tốt nhất

def evaluate(model, valid_loader):
    model.eval()
    correct, total = 0, 0
    all_preds, all_labels = [], []
    
    with torch.no_grad():
        for batch in valid_loader:
            input_ids, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)

            logits = model(input_ids, attention_mask=(input_ids != 1))
            preds = logits.argmax(dim=1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print("\nClassification Report:\n", classification_report(all_labels, all_preds, target_names=["Vui vẻ", "Tức giận", "Buồn bã", "Sợ hãi", "Trung lập"]))
    return correct / total

In [9]:
print("\nTesting on test set...")
model.load_state_dict(torch.load("best_model.pt"))  # Load model tốt nhất
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}")


Testing on test set...

Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.92      0.92      0.92       960
    Tức giận       0.57      0.65      0.61       407
     Buồn bã       0.56      0.56      0.56       298
      Sợ hãi       0.76      0.89      0.82       111
   Trung lập       0.74      0.63      0.68       551

    accuracy                           0.76      2327
   macro avg       0.71      0.73      0.72      2327
weighted avg       0.76      0.76      0.76      2327

Test Accuracy: 0.7568


In [10]:
torch.save(model, "/kaggle/working/final_model_full.pth")

In [8]:
train_model(model, train_loader, valid_loader, loss_fn, optimizer, scheduler, num_epochs=3)

Epoch 1: 100%|██████████| 582/582 [12:32<00:00,  1.29s/it]



Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.93      0.84      0.88       959
    Tức giận       0.57      0.60      0.59       407
     Buồn bã       0.50      0.56      0.53       298
      Sợ hãi       0.53      0.92      0.67       112
   Trung lập       0.71      0.64      0.67       551

    accuracy                           0.72      2327
   macro avg       0.65      0.71      0.67      2327
weighted avg       0.74      0.72      0.73      2327

Epoch 1: Train Loss: 2.8088, Train Acc: 0.5797, Valid Acc: 0.7207


Epoch 2: 100%|██████████| 582/582 [12:38<00:00,  1.30s/it]



Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.95      0.84      0.89       959
    Tức giận       0.58      0.62      0.60       407
     Buồn bã       0.53      0.60      0.56       298
      Sợ hãi       0.50      0.91      0.65       112
   Trung lập       0.74      0.66      0.70       551

    accuracy                           0.73      2327
   macro avg       0.66      0.73      0.68      2327
weighted avg       0.76      0.73      0.74      2327

Epoch 2: Train Loss: 1.6253, Train Acc: 0.7392, Valid Acc: 0.7327


Epoch 3: 100%|██████████| 582/582 [12:38<00:00,  1.30s/it]



Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.93      0.87      0.90       959
    Tức giận       0.66      0.61      0.64       407
     Buồn bã       0.54      0.70      0.61       298
      Sợ hãi       0.65      0.88      0.75       112
   Trung lập       0.73      0.68      0.70       551

    accuracy                           0.76      2327
   macro avg       0.70      0.75      0.72      2327
weighted avg       0.77      0.76      0.76      2327

Epoch 3: Train Loss: 1.3129, Train Acc: 0.7691, Valid Acc: 0.7589


In [9]:
print("\nTesting on test set...")
model.load_state_dict(torch.load("best_model.pt"))  # Load model tốt nhất
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}")


Testing on test set...

Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.94      0.89      0.91       960
    Tức giận       0.62      0.57      0.60       407
     Buồn bã       0.49      0.66      0.57       298
      Sợ hãi       0.74      0.89      0.81       111
   Trung lập       0.72      0.66      0.69       551

    accuracy                           0.75      2327
   macro avg       0.70      0.74      0.72      2327
weighted avg       0.77      0.75      0.76      2327

Test Accuracy: 0.7516


In [10]:
train_model(model, train_loader, valid_loader, loss_fn, optimizer, scheduler, num_epochs=3)

Epoch 1: 100%|██████████| 582/582 [12:39<00:00,  1.30s/it]



Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.92      0.90      0.91       959
    Tức giận       0.66      0.60      0.63       407
     Buồn bã       0.54      0.75      0.62       298
      Sợ hãi       0.74      0.85      0.79       112
   Trung lập       0.75      0.65      0.70       551

    accuracy                           0.77      2327
   macro avg       0.72      0.75      0.73      2327
weighted avg       0.78      0.77      0.77      2327

Epoch 1: Train Loss: 1.0999, Train Acc: 0.7927, Valid Acc: 0.7654


Epoch 2: 100%|██████████| 582/582 [12:39<00:00,  1.30s/it]



Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.92      0.89      0.91       959
    Tức giận       0.66      0.60      0.63       407
     Buồn bã       0.53      0.72      0.61       298
      Sợ hãi       0.71      0.87      0.78       112
   Trung lập       0.74      0.66      0.70       551

    accuracy                           0.76      2327
   macro avg       0.71      0.75      0.73      2327
weighted avg       0.77      0.76      0.77      2327

Epoch 2: Train Loss: 1.0235, Train Acc: 0.8002, Valid Acc: 0.7632


Epoch 3: 100%|██████████| 582/582 [12:38<00:00,  1.30s/it]



Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.92      0.90      0.91       959
    Tức giận       0.65      0.62      0.63       407
     Buồn bã       0.55      0.72      0.62       298
      Sợ hãi       0.72      0.88      0.79       112
   Trung lập       0.76      0.64      0.70       551

    accuracy                           0.76      2327
   macro avg       0.72      0.75      0.73      2327
weighted avg       0.77      0.76      0.77      2327

Epoch 3: Train Loss: 1.0060, Train Acc: 0.8049, Valid Acc: 0.7649


In [None]:
print("\nTesting on test set...")
model.load_state_dict(torch.load("best_model.pt")) 
test_acc = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}")


Testing on test set...

Classification Report:
               precision    recall  f1-score   support

      Vui vẻ       0.93      0.91      0.92       960
    Tức giận       0.62      0.57      0.59       407
     Buồn bã       0.48      0.67      0.56       298
      Sợ hãi       0.80      0.88      0.84       111
   Trung lập       0.74      0.64      0.69       551

    accuracy                           0.75      2327
   macro avg       0.71      0.73      0.72      2327
weighted avg       0.77      0.75      0.76      2327

Test Accuracy: 0.7529


In [12]:
model_path = "/kaggle/working/phobert_sentiment_model.pth"  
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /kaggle/working/phobert_sentiment_model.pth
