In [None]:
# Install required libraries
!pip install datasets pytorch-crf --quiet

In [None]:
from datasets import load_dataset

# Download the Broad Twitter Corpus and keep only its "train" split in `ds`.
ds = load_dataset("GateNLP/broad_twitter_corpus")["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

broad_twitter_corpus.py:   0%|          | 0.00/6.58k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/449k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/163k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/192k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5342 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2002 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2002 [00:00<?, ? examples/s]

In [None]:
# Show the feature names and row count of the training split.
print(ds)

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 5342
})


In [None]:
# Number of examples in the training split.
len(ds)

5342

In [None]:
# Resolve the NER label names. `ds` normally exposes `features` directly; if it
# does not, fall back to looking the feature up on its "train" entry.
if hasattr(ds, 'features'):
    ner_feature = ds.features['ner_tags']
else:
    ner_feature = ds['train'].features['ner_tags']
tag_list = ner_feature.feature.names
print("All tags:", tag_list)

All tags: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']


In [None]:
# Tags kept for modelling, and their integer ids in the dataset's own label space.
selected_tags = [
    'O',
    'B-PER', 'I-PER',
    'B-ORG', 'I-ORG',
    'B-LOC', 'I-LOC',
]
selected_tag_indices = list(map(tag_list.index, selected_tags))

In [None]:
def contains_selected_tag(example):
    """Return True if at least one tag id of `example` is in `selected_tag_indices`.

    An example with an empty `ner_tags` list yields False.
    """
    for tag in example['ner_tags']:
        if tag in selected_tag_indices:
            return True
    return False


# Keep only the examples that carry at least one of the selected tags.
ds_filtered = ds.filter(contains_selected_tag)

Filter:   0%|          | 0/5342 [00:00<?, ? examples/s]

In [None]:
from collections import Counter
from itertools import chain
from sklearn.model_selection import train_test_split

In [None]:
# Building vocab
# Tokens are lower-cased before counting because `encode` looks words up with
# `t.lower()`; keying the vocabulary on the original casing would leave every
# capitalised entry unreachable and map many known words to <UNK>.
# NOTE(review): the vocabulary is built from all of `ds_filtered`, i.e. before the
# train/test split, so held-out tokens are never <UNK> at evaluation time.
all_tokens = [token.lower() for token in chain.from_iterable(ds_filtered['tokens'])]
word_counts = Counter(all_tokens)
# Ids 0 and 1 are reserved for the special tokens below, so real words start at 2.
vocab = {word: idx + 2 for idx, word in enumerate(word_counts)}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1

In [None]:
# Two-way lookup between tag names and their contiguous model-side ids
# (ids follow the order of `selected_tags`).
id2tag = dict(enumerate(selected_tags))
tag2id = {tag: idx for idx, tag in id2tag.items()}

In [None]:
def encode(example):
    """Convert one example into integer model inputs.

    Each token is lower-cased and looked up in `vocab`, falling back to the
    `<UNK>` id when absent. Each dataset tag id is translated to its name via
    `tag_list` and then to the model-side id via `tag2id`.

    Returns a dict with the keys "input_ids" and "labels".
    """
    unk_id = vocab["<UNK>"]
    input_ids = []
    for token in example['tokens']:
        input_ids.append(vocab.get(token.lower(), unk_id))

    label_ids = []
    for tag_idx in example['ner_tags']:
        label_ids.append(tag2id[tag_list[tag_idx]])

    return {"input_ids": input_ids, "labels": label_ids}

In [None]:
# Apply `encode` to every filtered example, adding "input_ids" and "labels".
encoded_dataset = ds_filtered.map(encode)

Map:   0%|          | 0/5338 [00:00<?, ? examples/s]

In [None]:
# Pad sequences
def pad_sequences(inputs, pad_token=0):
    """Right-pad every sequence to the length of the longest one.

    Args:
        inputs: iterable of sequences (lists, tuples, ...). It is consumed
            once, so one-shot iterables such as generators are accepted.
        pad_token: value appended to the shorter sequences.

    Returns:
        A new list of new lists, all of equal length. An empty `inputs`
        yields an empty list instead of raising.
    """
    # Materialise once: `inputs` is needed both for the max length and for padding.
    sequences = [list(seq) for seq in inputs]
    if not sequences:
        return []
    max_len = max(map(len, sequences))
    return [seq + [pad_token] * (max_len - len(seq)) for seq in sequences]

In [None]:
# Hold out 20% of the encoded examples for evaluation. The fixed seed keeps the
# partition identical across kernel restarts, so reported metrics and fixed
# example indices into `test_ds` stay comparable between runs.
split_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)
train_ds = split_dataset['train']
test_ds = split_dataset['test']

In [None]:
# Sizes of the training and held-out test partitions.
len(train_ds), len(test_ds)

(4270, 1068)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class NERDataset(Dataset):
    """Torch dataset over encoded examples, padded to a common length.

    `data` must expose "input_ids" and "labels" columns. Both are padded with
    `pad_sequences` (pad value 0) to the longest sequence found in `data`.
    """

    def __init__(self, data):
        self.inputs = pad_sequences(data['input_ids'])
        self.labels = pad_sequences(data['labels'])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the `idx`-th example as a dict of tensors.

        Keys:
            input_ids: long tensor of token ids.
            labels: long tensor of tag ids (padded positions hold 0).
            mask: bool tensor, True wherever the token id is not the pad id 0.
        """
        input_ids = torch.tensor(self.inputs[idx], dtype=torch.long)
        return {
            'input_ids': input_ids,
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
            # A bool mask avoids the deprecation warning raised when a uint8
            # tensor is used as the condition of `torch.where`.
            'mask': input_ids != 0,
        }

# Wrap both partitions as padded torch datasets and batch them; only the
# training loader shuffles its examples.
train_dataset, test_dataset = NERDataset(train_ds), NERDataset(test_ds)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [None]:
import torch.nn as nn
from torchcrf import CRF

In [None]:
class CRFTagger(nn.Module):
    """Tagger that feeds per-token embeddings through a linear layer into a CRF.

    There is no recurrent encoder: the emission scores of a token depend only
    on that token's embedding.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc = nn.Linear(embedding_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def _emissions(self, input_ids):
        """Map token ids to per-token tag scores."""
        return self.fc(self.embedding(input_ids))

    def forward(self, input_ids, mask):
        """Return the result of CRF decoding for the positions enabled by `mask`."""
        scores = self._emissions(input_ids)
        return self.crf.decode(scores, mask=mask)

    def loss(self, input_ids, labels, mask):
        """Return the negated CRF score of `labels` (mean reduction) as the loss."""
        scores = self._emissions(input_ids)
        crf_score = self.crf(scores, labels, mask=mask, reduction='mean')
        return -crf_score

In [None]:
class BiLSTM_CRF(nn.Module):
    """Tagger: embeddings -> bidirectional LSTM -> linear emissions -> CRF.

    Args:
        vocab_size: number of rows of the embedding table.
        tagset_size: number of tags scored by the linear layer and the CRF.
        embedding_dim: size of each token embedding.
        hidden_dim: combined size of both LSTM directions; each direction
            gets `hidden_dim // 2` units.
    """

    def __init__(self, vocab_size, tagset_size, embedding_dim=100, hidden_dim=128):
        super().__init__()
        # padding_idx=0 keeps the <PAD> embedding at zero and out of the
        # gradient updates, matching CRFTagger.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        per_direction = hidden_dim // 2
        self.lstm = nn.LSTM(embedding_dim, per_direction, bidirectional=True, batch_first=True)
        # Size the projection from what the LSTM really outputs, so an odd
        # `hidden_dim` does not produce a shape mismatch.
        self.fc = nn.Linear(2 * per_direction, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

    def _emissions(self, input_ids, mask):
        """Compute per-token tag scores, running the LSTM on real tokens only.

        The batch is packed with the per-row lengths taken from `mask`, so the
        backward LSTM direction starts at the last real token instead of first
        consuming the padding. Assumes `mask` enables a contiguous prefix of
        each row (right padding) with at least one enabled position.
        """
        embedded = self.embedding(input_ids)
        # pack_padded_sequence requires the lengths on the CPU.
        lengths = mask.long().sum(dim=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        # total_length restores the padded width so emissions line up with `mask`.
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=input_ids.size(1)
        )
        return self.fc(lstm_out)

    def forward(self, input_ids, mask):
        """Return the result of CRF decoding for the positions enabled by `mask`."""
        emissions = self._emissions(input_ids, mask)
        return self.crf.decode(emissions, mask=mask)

    def loss(self, input_ids, labels, mask):
        """Return the negated CRF score of `labels` (mean reduction) as the loss."""
        emissions = self._emissions(input_ids, mask)
        return -self.crf(emissions, labels, mask=mask, reduction='mean')

In [None]:
def train_and_evaluate(model, name, epochs=20, lr=0.001):
    """Train `model` on `train_loader`, then report token-level metrics on `test_loader`.

    Relies on the module-level `device`, `train_loader`, `test_loader`,
    `tag2id` and `id2tag`.

    Args:
        model: module exposing `loss(input_ids, labels, mask)` for training and
            `model(input_ids, mask)` returning one list of tag ids per row.
        name: label used in the printed progress and report lines.
        epochs: number of passes over `train_loader`.
        lr: learning rate of the Adam optimizer.

    Prints the summed batch loss per epoch, a classification report, the
    macro F1 score and the accuracy. Returns None.
    """
    from sklearn.metrics import classification_report, f1_score, accuracy_score

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    print(f"\nTraining {name} model")

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            gold = batch['labels'].to(device)
            mask = batch['mask'].to(device)

            optimizer.zero_grad()
            loss = model.loss(input_ids, gold, mask)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            mask = batch['mask'].to(device)

            preds = model(input_ids, mask)
            # Gold labels are only needed for the metrics, so they stay on the
            # CPU; row lengths are computed once per batch instead of per row.
            lengths = batch['mask'].sum(dim=1).tolist()
            for pred, gold, length in zip(preds, batch['labels'], lengths):
                all_preds.extend(pred[:length])
                all_labels.extend(gold[:length].tolist())

    label_ids = list(tag2id.values())
    target_names = [id2tag[i] for i in label_ids]
    print(f"\n{name} Classification Report:")
    print(classification_report(all_labels, all_preds, labels=label_ids,
                                target_names=target_names, zero_division=0))

    # Pass the same label set as the report, so the macro average always covers
    # every tag even if one is absent from both gold and predicted labels.
    f1 = f1_score(all_labels, all_preds, labels=label_ids, average='macro', zero_division=0)
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"{name} F1 Score (macro): {f1:.4f}")
    print(f"{name} Accuracy: {accuracy:.4f}\n")


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed torch's global RNG before the models are built, so weight initialisation
# and the training loader's shuffling order are repeatable across runs.
torch.manual_seed(42)
crf_model = CRFTagger(len(vocab), len(tag2id))
bilstm_crf_model = BiLSTM_CRF(len(vocab), len(tag2id))

train_and_evaluate(crf_model, "CRF")
train_and_evaluate(bilstm_crf_model, "BiLSTM-CRF")


Training CRF model


  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Epoch 1, Loss: 3110.0081
Epoch 2, Loss: 1783.2852
Epoch 3, Loss: 1269.2930
Epoch 4, Loss: 1026.9514
Epoch 5, Loss: 887.0492
Epoch 6, Loss: 794.4233
Epoch 7, Loss: 729.7925
Epoch 8, Loss: 680.0085
Epoch 9, Loss: 640.1908
Epoch 10, Loss: 605.2791
Epoch 11, Loss: 579.0151
Epoch 12, Loss: 554.8577
Epoch 13, Loss: 533.6405
Epoch 14, Loss: 515.2682
Epoch 15, Loss: 499.5163
Epoch 16, Loss: 485.3540
Epoch 17, Loss: 471.6583
Epoch 18, Loss: 460.1391
Epoch 19, Loss: 451.6211
Epoch 20, Loss: 441.7846

CRF Classification Report:
              precision    recall  f1-score   support

           O       0.93      0.96      0.94     15231
       B-PER       0.32      0.30      0.31       588
       I-PER       0.29      0.37      0.33       189
       B-ORG       0.52      0.22      0.31       430
       I-ORG       0.51      0.25      0.34       144
       B-LOC       0.53      0.23      0.32       415
       I-LOC       0.48      0.27      0.34       120

    accuracy                           0.89

In [None]:
from collections import Counter

# Count how often each tag id occurs over all tokens of the filtered data,
# then report the counts keyed by tag name.
tag_counts = Counter()
for tags in ds_filtered['ner_tags']:
    tag_counts.update(tags)
named_counts = {tag_list[tag_id]: count for tag_id, count in tag_counts.items()}
print(named_counts)

{'O': 75816, 'B-LOC': 1877, 'B-PER': 2955, 'B-ORG': 2309, 'I-ORG': 803, 'I-PER': 1084, 'I-LOC': 636}


In [None]:
import torch

# Choose a tweet index from the test set
index = 569  # You can change this to inspect different samples
example = test_ds[index]

# Reconstruct words from token IDs (ids missing from the vocabulary show as <UNK>)
inv_vocab = {v: k for k, v in vocab.items()}
tokens = [inv_vocab.get(tid, "<UNK>") for tid in example['input_ids']]
true_tags = [id2tag[label] for label in example['labels']]

# Prepare input and mask for model: a batch of one, with a bool mask that is
# True wherever the token id is not the pad id 0.
input_tensor = torch.tensor([example['input_ids']], dtype=torch.long).to(device)
mask_tensor = input_tensor != 0

# Run predictions; gradients are not needed for inspection.
with torch.no_grad():
    crf_preds = crf_model(input_tensor, mask_tensor)[0]
    bilstm_preds = bilstm_crf_model(input_tensor, mask_tensor)[0]

# Decode predicted tag IDs
crf_tags = [id2tag[p] for p in crf_preds]
bilstm_tags = [id2tag[p] for p in bilstm_preds]

# Print formatted comparison. The token column grows with the longest token so
# that long tokens no longer run into the tag columns.
tok_width = max(15, max((len(tok) for tok in tokens), default=0) + 2)
print(f"{'Token':<{tok_width}}{'True':<12}{'CRF':<12}{'BiLSTM-CRF'}")
print("-" * (tok_width + 40))
for tok, true, crf, bilstm in zip(tokens, true_tags, crf_tags, bilstm_tags):
    print(f"{tok:<{tok_width}}{true:<12}{crf:<12}{bilstm}")


Token          True        CRF         BiLSTM-CRF
-------------------------------------------------------
<UNK>          B-ORG       B-ORG       B-LOC
3-0            O           I-ORG       O
manchester     B-ORG       B-ORG       B-ORG
united         I-ORG       I-ORG       I-ORG
...            O           O           I-ORG
hahahahahahahahahahahahahahaO           O           O


In [None]:
# Index you want to inspect
index = 569

# Retrieve the example
example = test_ds[index]

# Convert token IDs back to words (ids missing from the vocabulary show as <UNK>)
inv_vocab = {v: k for k, v in vocab.items()}
tokens = [inv_vocab.get(tid, "<UNK>") for tid in example['input_ids']]

# Convert label IDs to tag names
true_tags = [id2tag[label] for label in example['labels']]

# Print token-tag pairs. The token column grows with the longest token so that
# long tokens no longer run into the tag column.
tok_width = max(15, max((len(token) for token in tokens), default=0) + 2)
print(f"{'Token':<{tok_width}}{'True Tag'}")
print("-" * (tok_width + 15))
for token, tag in zip(tokens, true_tags):
    print(f"{token:<{tok_width}}{tag}")


Token          True Tag
------------------------------
<UNK>          B-ORG
3-0            O
manchester     B-ORG
united         I-ORG
...            O
hahahahahahahahahahahahahahaO
