In [5]:
import pandas as pd

### 1.1. Download Data

In [6]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("naseralqaydeh/named-entity-recognition-ner-corpus")

print("Path to dataset files:", path)

Path to dataset files: /Users/emulie/.cache/kagglehub/datasets/naseralqaydeh/named-entity-recognition-ner-corpus/versions/3


### 1.2. Read Data

In [7]:
data = pd.read_csv(f"{path}/ner.csv")

In [8]:
print(data['Sentence'].iloc[0])
print(data['POS'].iloc[0])
print(data['Tag'].iloc[0])

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [9]:
len(data)

47959

In [10]:
sentences = data['Sentence']
pos = data['POS']
tags = data['Tag']

In [11]:
# --- read tags as array instead of string
tags = [tag[2:-2].split("', '") for tag in tags]

### 1.3. Split the data

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_tmp, y_train, y_tmp = train_test_split(sentences, tags, test_size=0.5, random_state=420)
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.2, random_state=420)


### 2.1. Preprocessing - Tokenization

In [13]:
import spacy

tokenizer = spacy.load("en_core_web_sm")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/py

In [14]:
doc = tokenizer(sentences[0])

In [15]:
from collections import defaultdict 

vocab = defaultdict(lambda: len(vocab))
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
vocab[UNK_TOKEN]
vocab[PAD_TOKEN]

1

In [16]:
def tokenize_sentence(sentence, vocab):
    doc = tokenizer(sentence)
    return [vocab[token.text.lower()] for token in doc]

In [17]:
tokenized_sentences = [tokenize_sentence(sentence, vocab) for sentence in sentences]

In [18]:
# --- OPTIONAL: pad sequence to be the same length (for Neural Network Solution)

In [19]:
longest = max([len(seq) for seq in tokenized_sentences])
padded_sentences = [seq + [vocab[PAD_TOKEN]] * (longest - len(seq)) for seq in tokenized_sentences]

### 2.2. Preprocessing - Encoding Label

In [20]:
labels = defaultdict(lambda: len(labels))
PAD_TAG = "B-UNK"
labels[PAD_TAG]

0

In [24]:
labels

defaultdict(<function __main__.<lambda>()>,
            {'B-UNK': 0,
             '<pad>': 1,
             'O': 2,
             'B-geo': 3,
             'B-gpe': 4,
             'B-per': 5,
             'I-geo': 6,
             'B-org': 7,
             'I-org': 8,
             'B-tim': 9,
             'B-art': 10,
             'I-art': 11,
             'I-per': 12,
             'I-gpe': 13,
             'I-tim': 14,
             'B-nat': 15,
             'B-eve': 16,
             'I-eve': 17,
             'I-nat': 18})

In [21]:
padded_tags = [seq + [labels[PAD_TOKEN]] * (longest - len(seq)) for seq in tags]

In [22]:
def tokenize_label(tag, labels):
    return [labels[t] for t in tag]

In [23]:
tokenized_tags = [tokenize_label(tag, labels) for tag in tags]

In [None]:
print(len(vocab))
print(len(labels))

### 3.0. Defining Masked Loss and Masked Accuracy Functions

Notes:
- We need to define a "masked" loss and "masked" accuracy to account for sequence with
  different lengths. Because neural network need fixed-size input, we need to pad shorter
  sequence with a special token
- This padding shouldn't be taken into account toward loss and accuracy, so we need to
  ignore these "padded" parts during loss and accuracy calculation

In [20]:
def masked_loss(y_pred, y_true, ignore_index=0):
    
    pass

In [21]:
def masked_accuracy(y_pred, y_true, ignore_index=0):
    pass

### 3.1. Model - Bidirectional LSTM

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [23]:
class NERLSTM(nn.Module):
    """
    1. Embedding => vocab_size, embedding_dim
    2. LSTM => 
    3. Classifier (Linear Layer) =>
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0, 
                device='cpu'):
        super(NERLSTM, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, 
            padding_idx=padding_idx).to(device)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, 
                            batch_first=True).to(device)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim).to(device)

    def forward(self, x):
        x = x.to(device)
        embedded = self.embedding(x)
        output, _ = self.lstm(x)
        pred = self.linear(output)
        return F.log_softmax(pred, dim=1)

In [24]:
device = torch.device('mps')

In [25]:
model = NERLSTM(vocab_size=len(vocab), embedding_dim=128, hidden_dim=256, 
                output_dim=len(labels), device=device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [26]:
X = torch.tensor(padded_sentences).long()

In [27]:
x0 = X[:32].to(device)

NameError: name 'vocab' is not defined

In [None]:
model(x0)