In [1]:
# import thư viện cần thiết

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from collections import Counter
import os


In [2]:
# Data paths.
# The dataset directory can be overridden with the UD_EWT_DIR environment
# variable so the notebook is not tied to one machine's absolute path; when the
# variable is unset the original location is used unchanged.
DATA_DIR = os.environ.get(
    "UD_EWT_DIR",
    r"C:\Users\DoubleDD\HUS\NLP&DL\datasets\UD_English-EWT",
)
TRAIN_PATH = os.path.join(DATA_DIR, "en_ewt-ud-train.conllu")
DEV_PATH   = os.path.join(DATA_DIR, "en_ewt-ud-dev.conllu")


#### Task 1.1 – Hàm đọc file .conllu

In [3]:
def load_conllu(file_path):
    """Read a CoNLL-U file into a list of tagged sentences.

    Args:
        file_path: Path to a UTF-8 encoded ``.conllu`` file.

    Returns:
        A list of sentences; each sentence is a list of ``(word, upos)``
        tuples taken from columns 2 and 4 of every kept token line.

    Lines starting with ``#`` are ignored, and token lines whose ID column
    contains ``-`` or ``.`` are dropped. A blank line closes the current
    sentence; a trailing sentence with no blank line after it is still kept.
    """
    sentences = []
    tokens = []

    with open(file_path, encoding="utf-8") as handle:
        for raw in handle:
            row = raw.strip()

            if row == "":
                # Sentence boundary: flush whatever has been collected.
                if tokens:
                    sentences.append(tokens)
                    tokens = []
            elif not row.startswith("#"):
                columns = row.split("\t")
                token_id = columns[0]
                # IDs containing "-" or "." are skipped.
                if not any(marker in token_id for marker in "-."):
                    tokens.append((columns[1], columns[3]))

    # The file may end without a final blank line.
    if tokens:
        sentences.append(tokens)

    return sentences


#### Task 1.2 – Load dữ liệu & xây dựng Vocabulary

In [4]:
# Parse both splits with the same reader (training split first, then dev).
train_data, dev_data = (
    load_conllu(split_path) for split_path in (TRAIN_PATH, DEV_PATH)
)

# Report the number of sentences per split.
print(f"Số câu train: {len(train_data)}")
print(f"Số câu dev  : {len(dev_data)}")


Số câu train: 12544
Số câu dev  : 2001


In [5]:
# Build `word_to_ix` and `tag_to_ix` from the training split only.
# Words are lower-cased before counting; tags are kept verbatim.
word_counter = Counter(
    word.lower() for sentence in train_data for word, _ in sentence
)
tag_counter = Counter(
    tag for sentence in train_data for _, tag in sentence
)

# Special tokens
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

# Index 0 is padding and index 1 is the unknown word; every distinct training
# word then receives the next free index, in first-occurrence order.
word_to_ix = {PAD_TOKEN: 0, UNK_TOKEN: 1}
for vocab_word in word_counter:
    word_to_ix[vocab_word] = len(word_to_ix)

# Tags only need a padding entry (index 0) ahead of the real labels.
tag_to_ix = {PAD_TOKEN: 0}
for vocab_tag in tag_counter:
    tag_to_ix[vocab_tag] = len(tag_to_ix)

# Reverse lookup: tag index -> tag string.
ix_to_tag = {index: label for label, index in tag_to_ix.items()}

print("Vocabulary size:", len(word_to_ix))
print("Number of POS tags:", len(tag_to_ix))


Vocabulary size: 16656
Number of POS tags: 18


#### Task 2.1 – Tạo POSDataset

In [6]:
class POSDataset(Dataset):
    """Dataset that turns tagged sentences into index tensors.

    Args:
        sentences: List of sentences, each a list of ``(word, tag)`` pairs.
        word_to_ix: Mapping from lower-cased word to integer id; must contain
            the ``"<UNK>"`` key used for words missing from the mapping.
        tag_to_ix: Mapping from tag string to integer id.
    """

    def __init__(self, sentences, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_ids, tag_ids)`` as two 1-D ``torch.long`` tensors.

        Words are lower-cased before lookup and fall back to the ``"<UNK>"``
        id; a tag missing from ``tag_to_ix`` raises ``KeyError``.
        """
        encoded = [
            (
                self.word_to_ix.get(word.lower(), self.word_to_ix["<UNK>"]),
                self.tag_to_ix[tag],
            )
            for word, tag in self.sentences[idx]
        ]
        word_ids = [word_id for word_id, _ in encoded]
        tag_ids = [tag_id for _, tag_id in encoded]

        return (
            torch.tensor(word_ids, dtype=torch.long),
            torch.tensor(tag_ids, dtype=torch.long),
        )


#### Task 2.2 – `collate_fn` & `DataLoader`

In [7]:
def collate_fn(batch):
    """Pad a batch of ``(word_ids, tag_ids)`` pairs to a common length.

    Args:
        batch: Sequence of ``(word_ids, tag_ids)`` tensor pairs, as produced
            by ``POSDataset.__getitem__``.

    Returns:
        ``(words_padded, tags_padded)``, both batch-first tensors. Words are
        padded with ``word_to_ix["<PAD>"]`` and tags with
        ``tag_to_ix["<PAD>"]`` (both read from module-level vocabularies).
    """
    word_seqs, tag_seqs = zip(*batch)

    def _pad(seqs, pad_id):
        # Right-pad every sequence to the longest one in the batch.
        return pad_sequence(seqs, batch_first=True, padding_value=pad_id)

    return (
        _pad(word_seqs, word_to_ix["<PAD>"]),
        _pad(tag_seqs, tag_to_ix["<PAD>"]),
    )


# Wrap both splits with the same vocabularies (built from the training data).
train_dataset, dev_dataset = (
    POSDataset(split, word_to_ix, tag_to_ix) for split in (train_data, dev_data)
)

# Both loaders share batch size and collation; only training is shuffled.
_loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
dev_loader = DataLoader(dev_dataset, shuffle=False, **_loader_kwargs)


#### Task 3 – Mô hình `SimpleRNNForTokenClassification`

In [8]:
class SimpleRNNForTokenClassification(nn.Module):
    """Embedding -> single-layer RNN -> linear layer, one prediction per token.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the RNN hidden state.
        num_tags: Number of output classes per token.
        padding_idx: Word index treated as padding by the embedding layer.
            When ``None`` (the default) it falls back to the module-level
            ``word_to_ix["<PAD>"]``, which is what the class always did
            before; pass it explicitly to use the model without that global.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags,
                 padding_idx=None):
        super().__init__()

        if padding_idx is None:
            # Backward-compatible default: read the notebook's vocabulary.
            padding_idx = word_to_ix["<PAD>"]

        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx
        )

        self.rnn = nn.RNN(
            embedding_dim, hidden_dim, batch_first=True
        )

        self.fc = nn.Linear(hidden_dim, num_tags)

    def forward(self, x):
        """Return per-token logits.

        Args:
            x: Word-index tensor of shape ``(batch_size, seq_len)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, num_tags)``.
        """
        emb = self.embedding(x)               # (batch, seq_len, emb_dim)
        out, _ = self.rnn(emb)                # (batch, seq_len, hidden_dim)
        logits = self.fc(out)                 # (batch, seq_len, num_tags)
        return logits


#### Task 4 – Khởi tạo & Huấn luyện

In [9]:
# Seed PyTorch's global RNG before anything random happens in this cell, so
# that weight initialisation (and any later draws from the global generator,
# such as DataLoader shuffling) are repeatable under Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Hyperparameters, named so they can be tuned in one place.
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
LEARNING_RATE = 0.001

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SimpleRNNForTokenClassification(
    vocab_size=len(word_to_ix),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_tags=len(tag_to_ix)
).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Padded tag positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(
    ignore_index=tag_to_ix["<PAD>"]
)


# Training loop
def train_one_epoch(model, dataloader):
    """Run one optimisation pass over ``dataloader``.

    Uses the module-level ``DEVICE``, ``optimizer`` and ``criterion``.

    Args:
        model: Network called as ``model(words)`` and returning logits whose
            last dimension is the number of classes.
        dataloader: Iterable of ``(words, tags)`` batches that supports ``len``.

    Returns:
        The mean of the per-batch loss values (sum of batch losses divided by
        ``len(dataloader)``).
    """
    model.train()
    batch_losses = []

    for word_ids, gold_tags in dataloader:
        word_ids, gold_tags = word_ids.to(DEVICE), gold_tags.to(DEVICE)

        optimizer.zero_grad()
        logits = model(word_ids)

        # Flatten batch and time so each token is one classification example.
        num_classes = logits.shape[-1]
        batch_loss = criterion(
            logits.view(-1, num_classes),
            gold_tags.view(-1),
        )

        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)


#### Task 5 – Đánh giá mô hình

In [10]:
def evaluate(model, dataloader):
    """Compute token-level accuracy over ``dataloader``, ignoring padding.

    Uses the module-level ``DEVICE`` and ``tag_to_ix``.

    Args:
        model: Network called as ``model(words)`` and returning per-token
            logits; the predicted tag is the argmax over the last dimension.
        dataloader: Iterable of ``(words, tags)`` batches.

    Returns:
        Correctly predicted non-padding tokens divided by the total number of
        non-padding tokens.
    """
    model.eval()
    n_correct = 0
    n_tokens = 0

    with torch.no_grad():
        for word_ids, gold_tags in dataloader:
            word_ids = word_ids.to(DEVICE)
            gold_tags = gold_tags.to(DEVICE)

            predicted = model(word_ids).argmax(dim=-1)

            # Only positions holding a real (non-padding) tag are scored.
            is_real = gold_tags.ne(tag_to_ix["<PAD>"])
            hits = predicted.eq(gold_tags) & is_real

            n_correct += hits.sum().item()
            n_tokens += is_real.sum().item()

    return n_correct / n_tokens


In [11]:
# FULL TRAINING RUN
# Number of passes over the training loader.
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    # One optimisation pass; returns the mean per-batch training loss.
    train_loss = train_one_epoch(model, train_loader)
    # Accuracy is measured after the epoch's updates, on both splits.
    train_acc = evaluate(model, train_loader)
    dev_acc = evaluate(model, dev_loader)

    # `epoch` is zero-based, so it is shifted by one for display.
    print(f"Epoch {epoch+1}")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Train Acc : {train_acc:.4f}")
    print(f"  Dev Acc   : {dev_acc:.4f}")


Epoch 1
  Train Loss: 1.0603
  Train Acc : 0.7859
  Dev Acc   : 0.7679
Epoch 2
  Train Loss: 0.5758
  Train Acc : 0.8498
  Dev Acc   : 0.8227
Epoch 3
  Train Loss: 0.4321
  Train Acc : 0.8817
  Dev Acc   : 0.8508
Epoch 4
  Train Loss: 0.3448
  Train Acc : 0.9076
  Dev Acc   : 0.8633
Epoch 5
  Train Loss: 0.2843
  Train Acc : 0.9223
  Dev Acc   : 0.8744
