# Task 1

In [13]:
from datasets import load_dataset

dataset = load_dataset("lhoestq/conll2003")

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})


In [14]:
train_sentences = dataset["train"]["tokens"]
train_tags = dataset["train"]["ner_tags"]
val_sentences = dataset["validation"]["tokens"]
val_tags = dataset["validation"]["ner_tags"]
test_sentences = dataset["test"]["tokens"]
test_tags = dataset["test"]["ner_tags"]

tag_names = ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"]
train_tags_str = [[tag_names[tag] for tag in seq] for seq in train_tags]
val_tags_str = [[tag_names[tag] for tag in seq] for seq in val_tags]
test_tags_str = [[tag_names[tag] for tag in seq] for seq in test_tags]

print("Example sentence: ", train_sentences[0])
print("NER tags: ", train_tags_str[0])

Example sentence:  ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
NER tags:  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [15]:
# Từ điển từ
vocab = set(word for sentence in train_sentences for word in sentence)
word_to_ix = {word: idx for idx, word in enumerate(vocab)}

word_to_ix["<PAD>"] = len(word_to_ix)
word_to_ix["<UNK>"] = len(word_to_ix)

print("Vocabulary size:", len(word_to_ix))

# Từ điển nhãn
unique_tags = set(tag for seq in train_tags_str for tag in seq)
tag_to_ix = {tag: idx for idx, tag in enumerate(unique_tags)}

tag_to_ix["<PAD>"] = len(tag_to_ix)

print("Number of unique NER tags:", len(tag_to_ix))
print("NER tags mapping:", tag_to_ix)

Vocabulary size: 23625
Number of unique NER tags: 10
NER tags mapping: {'I-PER': 0, 'B-LOC': 1, 'I-ORG': 2, 'B-PER': 3, 'I-MISC': 4, 'I-LOC': 5, 'B-ORG': 6, 'O': 7, 'B-MISC': 8, '<PAD>': 9}


# Task 2

In [16]:
import torch
from torch.utils.data import Dataset

PAD_WORD_IDX = word_to_ix["<PAD>"]
UNK_IDX = word_to_ix["<UNK>"]
PAD_TAG_IDX = tag_to_ix["<PAD>"]

class NERDataset(Dataset):
    def __init__(self, sentences, tags, word_to_ix, tag_to_ix):
        self.sentences = sentences
        self.tags = tags
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        tags = self.tags[idx]

        sentence_idx = [self.word_to_ix.get(word, UNK_IDX) for word in sentence]
        tag_idx = [self.tag_to_ix.get(tag, PAD_TAG_IDX) for tag in tags]

        return torch.tensor(sentence_idx, dtype=torch.long), torch.tensor(tag_idx, dtype=torch.long)


In [17]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    sentences, tags = zip(*batch)

    # pad các câu và nhãn về cùng độ dài
    sentences_padded = pad_sequence(sentences, batch_first=True, padding_value=PAD_WORD_IDX)
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=PAD_TAG_IDX)

    return sentences_padded, tags_padded


In [18]:
from torch.utils.data import DataLoader

# Dataset
train_dataset = NERDataset(train_sentences, train_tags_str, word_to_ix, tag_to_ix)
val_dataset = NERDataset(val_sentences, val_tags_str, word_to_ix, tag_to_ix)
test_dataset = NERDataset(test_sentences, test_tags_str, word_to_ix, tag_to_ix)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Test batch
for sentences_batch, tags_batch in train_loader:
    print("Sentences batch shape:", sentences_batch.shape)
    print("Tags batch shape:", tags_batch.shape)
    break


Sentences batch shape: torch.Size([32, 39])
Tags batch shape: torch.Size([32, 39])


# Task 3

In [19]:
import torch
import torch.nn as nn

class SimpleRNNForTokenClassification(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size, num_layers=1, dropout=0.1):
        super(SimpleRNNForTokenClassification, self).__init__()

        # 1. Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=PAD_WORD_IDX)

        # 2. Bi-LSTM
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=True
        )

        # 3. Linear layer
        self.fc = nn.Linear(hidden_dim * 2, output_size)

    def forward(self, x):
        # embedding
        embeds = self.embedding(x)  # [batch_size, seq_len, embedding_dim]

        # Bi-LSTM
        rnn_out, _ = self.rnn(embeds)  # [batch_size, seq_len, hidden_dim]

        # Ánh xạ sang output_size
        logits = self.fc(rnn_out)      # [batch_size, seq_len, output_size]

        return logits

# Tham số
vocab_size = len(word_to_ix)       # từ điển từ
embedding_dim = 100                # kích thước embedding
hidden_dim = 128                   # số hidden units
output_size = len(tag_to_ix)       # số lượng nhãn NER

model = SimpleRNNForTokenClassification(vocab_size, embedding_dim, hidden_dim, output_size)
print(model)


SimpleRNNForTokenClassification(
  (embedding): Embedding(23625, 100, padding_idx=23623)
  (rnn): LSTM(100, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=10, bias=True)
)


# Task 4

In [20]:
import torch.optim as optim
import torch.nn as nn

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Loss function, bỏ qua padding tokens
criterion = nn.CrossEntropyLoss(ignore_index=PAD_TAG_IDX)


In [21]:
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    count = 0

    for sentences_batch, tags_batch in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(sentences_batch)  # [batch, seq_len, num_classes]

        # Reshape outputs & targets để tính loss
        # CrossEntropyLoss expects [batch*seq_len, num_classes] and [batch*seq_len]
        outputs = outputs.view(-1, outputs.shape[-1])
        tags_batch = tags_batch.view(-1)

        loss = criterion(outputs, tags_batch)

        # Backward
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        count += 1

    avg_loss = total_loss / count
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")


Epoch 1/3, Average Loss: 0.5952
Epoch 2/3, Average Loss: 0.2667
Epoch 3/3, Average Loss: 0.1550


# Task 5

In [22]:
!pip install seqeval
import torch
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score

def evaluate(model, data_loader, tag_to_ix):
    model.eval()
    idx_to_tag = {idx: tag for tag, idx in tag_to_ix.items()}

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for sentences_batch, tags_batch in data_loader:
            outputs = model(sentences_batch)  # [batch, seq_len, num_classes]
            preds = torch.argmax(outputs, dim=-1)  # [batch, seq_len]

            # Chuyển tensor về list
            preds = preds.tolist()
            tags_batch = tags_batch.tolist()

            for pred_seq, true_seq in zip(preds, tags_batch):
                pred_tags = []
                true_tags = []
                for p, t in zip(pred_seq, true_seq):
                    if t == PAD_TAG_IDX:  # bỏ qua token padding
                        continue
                    pred_tags.append(idx_to_tag[p])
                    true_tags.append(idx_to_tag[t])
                all_preds.append(pred_tags)
                all_labels.append(true_tags)

    # Token-level accuracy
    correct = sum([p==t for seq_p, seq_t in zip(all_preds, all_labels) for p,t in zip(seq_p, seq_t)])
    total = sum([len(seq) for seq in all_labels])
    accuracy = correct / total
    print(f"Token-level Accuracy: {accuracy:.4f}")

    # F1, Precision, Recall
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")

    return accuracy, precision, recall, f1




In [23]:
def predict_sentence(model, sentence, word_to_ix, tag_to_ix):
    if isinstance(sentence, str):
        sentence = sentence.split()

    model.eval()
    idx_to_tag = {idx: tag for tag, idx in tag_to_ix.items()}

    # Chuyển words sang indices
    sentence_idx = [word_to_ix.get(w, UNK_IDX) for w in sentence]
    sentence_tensor = torch.tensor(sentence_idx, dtype=torch.long).unsqueeze(0)  # [1, seq_len]

    with torch.no_grad():
        outputs = model(sentence_tensor)  # [1, seq_len, num_classes]
        preds = torch.argmax(outputs, dim=-1).squeeze(0).tolist()  # [seq_len]

    predicted_tags = [idx_to_tag[p] for p in preds]
    return list(zip(sentence, predicted_tags))


In [24]:
# Đánh giá trên tập validation
print("Validation:")
accuracy, precision, recall, f1 = evaluate(model, val_loader, tag_to_ix)
# Đánh giá trên tập test
print("\nTest:")
accuracy, precision, recall, f1 = evaluate(model, test_loader, tag_to_ix)

# Dự đoán cho một câu mới
sentence = "VNU University is located in Hanoi"
predictions = predict_sentence(model, sentence, word_to_ix, tag_to_ix)
print("\n", predictions)


Validation:
Token-level Accuracy: 0.9332
Precision: 0.7534, Recall: 0.6082, F1-score: 0.6731

Test:
Token-level Accuracy: 0.9100
Precision: 0.6697, Recall: 0.4873, F1-score: 0.5641

 [('VNU', 'B-ORG'), ('University', 'I-ORG'), ('is', 'O'), ('located', 'O'), ('in', 'O'), ('Hanoi', 'O')]
