In [1]:
# import thư viện

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from datasets import load_dataset
from collections import Counter
import numpy as np


In [5]:
# Load the CoNLL 2003 dataset.
# The `revision` argument selects the "refs/convert/parquet" ref of the dataset repository.
dataset = load_dataset("conll2003", revision="refs/convert/parquet")

# Show the available splits, their features and row counts.
print(dataset)


conll2003/train/0000.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


0000.parquet:   0%|          | 0.00/312k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


#### Task 1.2: Trích xuất câu và nhãn

In [6]:
# Lookup table from integer tag id to its string label.
tag_names = dataset["train"].features["ner_tags"].feature.names


def _ids_to_labels(tag_id_seqs):
    """Map each sequence of integer tag ids to the matching label strings."""
    return [[tag_names[tag_id] for tag_id in seq] for seq in tag_id_seqs]


# Training split: token lists, raw tag ids and decoded tag labels.
train_split = dataset["train"]
train_sentences = train_split["tokens"]
train_tags_id = train_split["ner_tags"]
train_tags = _ids_to_labels(train_tags_id)

# Validation split, processed the same way.
val_split = dataset["validation"]
val_sentences = val_split["tokens"]
val_tags_id = val_split["ner_tags"]
val_tags = _ids_to_labels(val_tags_id)

# Sanity check: first training sentence next to its labels.
print(train_sentences[0])
print(train_tags[0])


['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


#### Task 1.3: Xây dựng Vocabulary (word_to_ix, tag_to_ix)

In [7]:
# -------- WORD VOCAB --------
# Count every token of the training split. Counter preserves first-seen order,
# which determines the order in which indices are assigned below.
word_counter = Counter(word for sent in train_sentences for word in sent)

# Reserved entries: "<PAD>" is used to pad batches, "<UNK>" replaces words
# that are not in the vocabulary.
word_to_ix = {
    "<PAD>": 0,
    "<UNK>": 1
}

for word in word_counter:
    # A corpus token that happens to be spelled "<PAD>" or "<UNK>" must not
    # overwrite the reserved index of the special token.
    if word not in word_to_ix:
        word_to_ix[word] = len(word_to_ix)

# NOTE(review): every training word receives its own index, so "<UNK>" never
# occurs during training and its embedding is never updated. Consider a
# minimum-frequency cutoff if unseen-word behaviour matters.

# -------- TAG VOCAB --------
# Tags are indexed in order of first appearance in the training labels.
tag_to_ix = {}
for sent_tags in train_tags:
    for tag in sent_tags:
        tag_to_ix.setdefault(tag, len(tag_to_ix))

print("Vocabulary size:", len(word_to_ix))
print("Number of NER tags:", len(tag_to_ix))


Vocabulary size: 23625
Number of NER tags: 9


#### Task 2.1: Dataset cho NER

In [8]:
class NERDataset(Dataset):
    """Dataset that converts tokenised sentences and their tags into index tensors.

    Args:
        sentences: Indexable collection of sentences, each a sequence of word strings.
        tags: Indexable collection of tag-string sequences, aligned with ``sentences``.
        word_to_ix: Mapping from word to integer index; must contain ``"<UNK>"``.
        tag_to_ix: Mapping from tag string to integer index.
    """

    def __init__(self, sentences, tags, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.tags = tags
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``(word_indices, tag_indices)`` for sentence ``idx``.

        Words missing from ``word_to_ix`` are mapped to the ``"<UNK>"`` index.
        Both tensors are 1-D and always have dtype ``torch.long``.

        Raises:
            KeyError: If a tag is not present in ``tag_to_ix``.
        """
        sentence = self.sentences[idx]
        tags = self.tags[idx]

        sent_idx = [
            self.word_to_ix.get(word, self.word_to_ix["<UNK>"])
            for word in sentence
        ]

        tag_idx = [
            self.tag_to_ix[tag] for tag in tags
        ]

        # The dtype is explicit because torch.tensor([]) would otherwise be
        # float32, which cannot be used as embedding indices or class targets.
        return (
            torch.tensor(sent_idx, dtype=torch.long),
            torch.tensor(tag_idx, dtype=torch.long),
        )


#### Task 2.2: Collate function + DataLoader

In [9]:
# Fill values used when padding a batch to a common length.
PAD_WORD_IDX = word_to_ix["<PAD>"]
PAD_TAG_IDX = -1  # meant to be given to the loss as ignore_index


def collate_fn(batch):
    """Pad a list of ``(word_indices, tag_indices)`` pairs into two batch tensors.

    Returns:
        A tuple ``(sentences_padded, tags_padded)``; with ``batch_first=True``
        each has the batch as its first dimension. Word sequences are padded
        with ``PAD_WORD_IDX`` and tag sequences with ``PAD_TAG_IDX``.
    """
    # Regroup the per-example pairs into one tuple of word tensors and one of tag tensors.
    word_seqs, tag_seqs = zip(*batch)

    return (
        pad_sequence(word_seqs, batch_first=True, padding_value=PAD_WORD_IDX),
        pad_sequence(tag_seqs, batch_first=True, padding_value=PAD_TAG_IDX),
    )


# Wrap both splits with the same vocabularies.
train_dataset = NERDataset(train_sentences, train_tags, word_to_ix, tag_to_ix)
val_dataset = NERDataset(val_sentences, val_tags, word_to_ix, tag_to_ix)

# Settings shared by both loaders.
_loader_options = {"batch_size": 32, "collate_fn": collate_fn}

# Only the training data is shuffled; validation keeps its original order.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)


#### Task 3: Mô hình RNN cho Token Classification

In [10]:
class SimpleRNNForNER(nn.Module):
    """Token classifier: embedding layer, then an ``nn.RNN``, then a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the RNN hidden state.
        num_tags: Number of output scores per token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags):
        super().__init__()

        # PAD_WORD_IDX is registered as the embedding's padding index.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=PAD_WORD_IDX,
        )
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_tags)

    def forward(self, x):
        """Return per-token tag logits for a batch of word-index sequences."""
        token_vectors = self.embedding(x)             # (B, T, E)
        hidden_states, _ = self.rnn(token_vectors)    # (B, T, H)
        return self.fc(hidden_states)                 # (B, T, C)


In [11]:
# Create the model, loss and optimizer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's global RNG before any weights are created so that parameter
# initialisation (and, by default, DataLoader shuffling, which draws from the
# same generator) repeats on Restart & Run All.
# NOTE(review): some CUDA kernels may still be non-deterministic.
SEED = 42
torch.manual_seed(SEED)

model = SimpleRNNForNER(
    vocab_size=len(word_to_ix),
    embedding_dim=100,
    hidden_dim=128,
    num_tags=len(tag_to_ix)
).to(device)

# Positions whose target equals PAD_TAG_IDX do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TAG_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [12]:
# Training function
def train_epoch(model, loader):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Uses the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: Module mapping a batch of word indices to per-token logits.
        loader: Iterable yielding ``(sentences, tags)`` batch tensors.

    Returns:
        The average of the per-batch losses, or ``0.0`` when ``loader`` yields
        no batches (instead of raising ``ZeroDivisionError``).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for sentences, tags in loader:
        sentences = sentences.to(device)
        tags = tags.to(device)

        optimizer.zero_grad()

        outputs = model(sentences)
        # Flatten batch and time dimensions so every token is one classification example.
        loss = criterion(
            outputs.view(-1, outputs.shape[-1]),
            tags.view(-1)
        )

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Counting batches avoids requiring len(loader) and makes the empty case explicit.
    if num_batches == 0:
        return 0.0
    return total_loss / num_batches

# Evaluation function
def evaluate(model, loader):
    """Return the token-level accuracy of ``model`` over ``loader``.

    Positions whose tag equals the module-level ``PAD_TAG_IDX`` are excluded
    from both the numerator and the denominator. Uses the module-level ``device``.

    Args:
        model: Module mapping a batch of word indices to per-token logits.
        loader: Iterable yielding ``(sentences, tags)`` batch tensors.

    Returns:
        ``correct / total`` over all non-padding tokens, or ``0.0`` when there
        are no such tokens (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    correct, total = 0, 0

    with torch.no_grad():
        for sentences, tags in loader:
            sentences = sentences.to(device)
            tags = tags.to(device)

            outputs = model(sentences)
            predictions = torch.argmax(outputs, dim=-1)

            # Only real (non-padding) tokens count towards the accuracy.
            mask = tags != PAD_TAG_IDX
            correct += ((predictions == tags) & mask).sum().item()
            total += mask.sum().item()

    if total == 0:
        return 0.0
    return correct / total



#### Task 4: Huấn luyện mô hình

In [13]:
EPOCHS = 5

for epoch in range(EPOCHS):
    # One pass of training followed by a validation check.
    train_loss = train_epoch(model, train_loader)
    val_acc = evaluate(model, val_loader)

    # A single print call; the newline separator yields the four-line report.
    print(
        f"Epoch {epoch+1}/{EPOCHS}",
        f"Train Loss: {train_loss:.4f}",
        f"Validation Accuracy: {val_acc:.4f}",
        "-" * 40,
        sep="\n",
    )


Epoch 1/5
Train Loss: 0.6412
Validation Accuracy: 0.8699
----------------------------------------
Epoch 2/5
Train Loss: 0.3720
Validation Accuracy: 0.8992
----------------------------------------
Epoch 3/5
Train Loss: 0.2572
Validation Accuracy: 0.9119
----------------------------------------
Epoch 4/5
Train Loss: 0.1859
Validation Accuracy: 0.9278
----------------------------------------
Epoch 5/5
Train Loss: 0.1381
Validation Accuracy: 0.9305
----------------------------------------


In [14]:
# Predict tags for one new sentence and run a quick test.
# Inverse of tag_to_ix: maps a tag index back to its tag string.
ix_to_tag = {v: k for k, v in tag_to_ix.items()}

def predict_sentence(sentence):
    """Print the predicted tag for every whitespace-separated token of ``sentence``.

    Uses the module-level ``model``, ``word_to_ix``, ``ix_to_tag`` and
    ``device``. Tokens missing from ``word_to_ix`` are looked up as ``"<UNK>"``.

    Args:
        sentence: Text whose tokens are separated by whitespace.

    Returns:
        None. One ``token -> tag`` line is printed per token; nothing is
        printed for an empty or whitespace-only sentence.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to tag. Without this guard the empty index list would become
        # a float tensor, which the embedding layer rejects.
        return

    model.eval()

    indices = [
        word_to_ix.get(word, word_to_ix["<UNK>"])
        for word in tokens
    ]

    # Add a batch dimension of size 1; embedding indices must be integer typed.
    x = torch.tensor(indices, dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(x)
        preds = torch.argmax(outputs, dim=-1).squeeze(0)

    for word, tag_idx in zip(tokens, preds.tolist()):
        print(f"{word:15} -> {ix_to_tag[tag_idx]}")


# Quick smoke test on a hand-written sentence whose tokens are already space-separated.
predict_sentence("U.N. official Ekeus heads for Baghdad .")


U.N.            -> B-ORG
official        -> O
Ekeus           -> B-ORG
heads           -> O
for             -> O
Baghdad         -> B-LOC
.               -> O
