## Bài 1: Thực hiện xây dựng mô hình dự đoán cho bài toán phân tích cảm xúc (sentiment-based) dựa trên bộ dữ liệu UIT-VSFC. Có thể chọn 1 bộ pre-trained embedding khác. (Tham khảo tại đây: https://github.com/vietnlp/etnlp)

In [1]:
!pip install pyvi

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.11 pyvi-0.1.1 sklearn-crfsui

In [2]:
import json
import numpy as np
from tqdm import tqdm 

import torch 
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from typing import List, Dict, Tuple
from collections import Counter
from pyvi import ViTokenizer

from sklearn.metrics import f1_score, accuracy_score, classification_report, precision_score, recall_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def read_data_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        
    return data

# train
train_data = read_data_json('/kaggle/input/uit-vsfc/UIT-VSFC-train.json')
# dev
dev_data = read_data_json('/kaggle/input/uit-vsfc/UIT-VSFC-dev.json')
# test
test_data = read_data_json('/kaggle/input/uit-vsfc/UIT-VSFC-test.json')

# Dataset preparation

### Build vocab

In [4]:
# Build vocab
train_sentences = [item['sentence'] for item in train_data]
V = []
for t in train_sentences:
    tokenized_sentence = ViTokenizer.tokenize(t)
    V = V + tokenized_sentence.split()
    
V = list(set(V))

### Create label mapping 

In [5]:
# Word to index
word2idx = {word: idx for idx, word in enumerate(V)}
word2idx['PAD'] = 0
word2idx['UNK'] = 1

# Index to word
idx2word = {idx: word for word, idx in word2idx.items()}

# Label to index
unique_labels = sorted(list(set([item['sentiment'] for item in train_data])))
label2idx = {label: idx for idx, label in enumerate(unique_labels)}
idx2label = {idx: label for label, idx in label2idx.items()}

### Encoding sentences

In [6]:
# ============================
# DATASET + DATALOADER
# ============================
class SentimentDataset(Dataset):
    def __init__(self, encoded_sentences, labels):
        self.encoded_sentences = encoded_sentences
        self.labels = labels
    
    def __len__(self):
        return len(self.encoded_sentences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.encoded_sentences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

# Encode function
def encode_sentence(sentence, max_length):
    tokenized = ViTokenizer.tokenize(sentence).split()
    indices = [word2idx.get(token, word2idx['UNK']) for token in tokenized]
    
    if max_length:
        if len(indices) > max_length:
            indices = indices[:max_length]
        else:
            indices = indices + [word2idx['PAD']] * (max_length - len(indices))
    
    return indices

# Encode sentences
train_encoded = [encode_sentence(item['sentence'], max_length=256) for item in train_data]
train_labels = [label2idx[item['sentiment']] for item in train_data]

dev_encoded = [encode_sentence(item['sentence'], max_length=256) for item in dev_data]
dev_labels = [label2idx[item['sentiment']] for item in dev_data]

test_encoded = [encode_sentence(item['sentence'], max_length=256) for item in test_data]
test_labels = [label2idx[item['sentiment']] for item in test_data]

# Create DataLoaders
train_dataset = SentimentDataset(train_encoded, train_labels)
dev_dataset = SentimentDataset(dev_encoded, dev_labels)
test_dataset = SentimentDataset(test_encoded, test_labels)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training

## Model

In [7]:
# ============================
# LOAD PRETRAINED W2V_NER.VEC
# ============================
embedding_dim = 300
embeddings_index = {}

with open("/kaggle/input/embedding-model/FastText_ner.vec", "r", encoding="utf-8") as f:
    for line in f:
        values = line.rstrip().split(" ")
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector


# ============================
# BUILD EMBEDDING MATRIX
# ============================
embedding_matrix = np.zeros((len(word2idx), embedding_dim))

for word, idx in word2idx.items():
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[idx] = vector
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

In [None]:
# ============================
# MODEL WITH PRETRAINED EMBEDDING
# ============================
class Classifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, embedding_matrix):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embedding.weight = nn.Parameter(embedding_matrix, requires_grad=True)

        self.output = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        embeds = self.embedding(x) # [batch, seq_len, embed_dim]
        doc_vec = embeds.mean(dim=1) # Mean-pooling
        logits = self.output(doc_vec) 
        return logits

model = Classifier(vocab_size=len(word2idx), embed_dim=embedding_dim, num_classes=len(idx2label), embedding_matrix=embedding_matrix)
model.to(device)

Classifier(
  (embedding): Embedding(3706, 300)
  (output): Linear(in_features=300, out_features=3, bias=True)
)

## Train setup

In [9]:
# ============================
# TRAINING SETUP
# ============================
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.5)
epochs = 20


# ============================
# TRAIN LOOP
# ============================
def evaluate(loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            logits = model(X)
            pred = logits.argmax(dim=1)
            preds.extend(pred.cpu().tolist())
            trues.extend(y.cpu().tolist())
    return (
        accuracy_score(trues, preds),
        f1_score(trues, preds, average="macro"),
        precision_score(trues, preds, average="macro"),
        recall_score(trues, preds, average="macro"),
    )

## Train

In [10]:
best_f1 = 0.0
best_model_state = None

for epoch in range(epochs):
    model.train()
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

    for X, y in train_bar:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        train_bar.set_postfix(loss=loss.item())

    # Evaluate on dev set
    acc, f1, prec, rec = evaluate(dev_loader)
    print(f"Epoch {epoch+1}/{epochs} — Dev Acc: {acc:.4f}, F1: {f1:.4f}")

    # Save best model
    if f1 > best_f1:
        best_f1 = f1
        best_model_state = model.state_dict()
        torch.save(best_model_state, "best_model.pth")
        print(f"*** Best model updated at epoch {epoch+1} with F1: {best_f1:.4f} ***")

                                                                           

Epoch 1/20 — Dev Acc: 0.8907, F1: 0.7186
*** Best model updated at epoch 1 with F1: 0.7186 ***


                                                                           

Epoch 2/20 — Dev Acc: 0.6147, F1: 0.3981


                                                                            

Epoch 3/20 — Dev Acc: 0.7833, F1: 0.6600


                                                                            

Epoch 4/20 — Dev Acc: 0.8181, F1: 0.5987


                                                                             

Epoch 5/20 — Dev Acc: 0.8402, F1: 0.6190


                                                                             

Epoch 6/20 — Dev Acc: 0.8686, F1: 0.6988


                                                                             

Epoch 7/20 — Dev Acc: 0.8876, F1: 0.6768


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 8/20 — Dev Acc: 0.8579, F1: 0.5857


                                                                             

Epoch 9/20 — Dev Acc: 0.8427, F1: 0.7223
*** Best model updated at epoch 9 with F1: 0.7223 ***


                                                                              

Epoch 10/20 — Dev Acc: 0.8459, F1: 0.7260
*** Best model updated at epoch 10 with F1: 0.7260 ***


                                                                              

Epoch 11/20 — Dev Acc: 0.6627, F1: 0.5109


                                                                             

Epoch 12/20 — Dev Acc: 0.8945, F1: 0.7356
*** Best model updated at epoch 12 with F1: 0.7356 ***


                                                                              

Epoch 13/20 — Dev Acc: 0.8882, F1: 0.6146


                                                                              

Epoch 14/20 — Dev Acc: 0.8661, F1: 0.7456
*** Best model updated at epoch 14 with F1: 0.7456 ***


                                                                              

Epoch 15/20 — Dev Acc: 0.8478, F1: 0.7206


                                                                              

Epoch 16/20 — Dev Acc: 0.8939, F1: 0.7682
*** Best model updated at epoch 16 with F1: 0.7682 ***


                                                                              

Epoch 17/20 — Dev Acc: 0.8787, F1: 0.7226


                                                                              

Epoch 18/20 — Dev Acc: 0.8692, F1: 0.7241


                                                                              

Epoch 19/20 — Dev Acc: 0.8238, F1: 0.6941


                                                                              

Epoch 20/20 — Dev Acc: 0.8604, F1: 0.7454


## Evaluation

In [11]:
# ============================
#  TEST
# ============================

# Load best model
model.load_state_dict(torch.load("best_model.pth"))

# Evaluate on test set
acc, f1, prec, rec = evaluate(test_loader)
print("\n=== TEST RESULT ===")
print(f"Accuracy : {acc:.4f}")
print(f"F1       : {f1:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall   : {rec:.4f}")


=== TEST RESULT ===
Accuracy : 0.8762
F1       : 0.7364
Precision: 0.7472
Recall   : 0.7298
