In [None]:
import pandas as pd

### 1.1. Download Data

In [None]:
import kagglehub

# Download the latest version of the NER corpus dataset through kagglehub.
# `path` holds the location returned by the download call and is printed below
# so the reader can see where the dataset files ended up.
path = kagglehub.dataset_download("naseralqaydeh/named-entity-recognition-ner-corpus")

print("Path to dataset files:", path)

In [None]:
# Load `ner.csv` from the downloaded dataset location into a DataFrame.
data = pd.read_csv(f"{path}/ner.csv")

In [None]:
# Show the first row of each column, one value per line.
for column in ('Sentence', 'POS', 'Tag'):
    print(data[column].iloc[0])

In [None]:
import ast

# --- read tags as a list instead of its string representation
# The previous slicing/splitting approach relied on the exact "['a', 'b']" layout
# and turned an empty list into ['']; `ast.literal_eval` parses the list literal
# itself. The isinstance guard makes the cell safe to re-run: values that were
# already converted to lists are left untouched instead of raising.
data['Tag'] = [
    ast.literal_eval(tag) if isinstance(tag, str) else tag
    for tag in data['Tag']
]

### 2. Create Train/Test Dataset

#### 2.1. Create Dataset in spacy format

Create the training data in the format spaCy's NER component expects: `("Some text here", {"entities": [(start_char, end_char, "LABEL")]})`

In [None]:
def bio_to_spacy_format(data):
    """Convert BIO-tagged sentences into spaCy-style character-offset annotations.

    Args:
        data: Mapping-like object (e.g. a pandas DataFrame) with a 'Sentence'
            column of strings and a 'Tag' column holding, per sentence, a
            sequence of BIO tags ('B-<label>', 'I-<label>', 'O') aligned with
            the whitespace-separated words of that sentence.

    Returns:
        A list of ``(sentence, {"entities": [(start_char, end_char, label), ...]})``
        tuples, one per row. ``end_char`` is exclusive and points right after
        the last character of the entity (no trailing whitespace), so
        ``sentence[start_char:end_char]`` is exactly the entity text.
    """
    formatted_data = []

    # Iterate by position rather than by index label so the function also works
    # on frames whose index is not 0..n-1 (e.g. after filtering or shuffling).
    for sentence, ner_tags in zip(data['Sentence'], data['Tag']):
        entities = []
        current = None  # [start, end, label] of the entity being built, if any
        cursor = 0      # position in `sentence` up to which words were consumed

        for word, tag in zip(sentence.split(), ner_tags):  # whitespace tokenization
            # Locate the word in the original string instead of assuming exactly
            # one space between words; words come from split(), so find() succeeds.
            word_start = sentence.find(word, cursor)
            word_end = word_start + len(word)
            cursor = word_end

            prefix, _, label = tag.partition('-')

            # An I- tag with the same label extends the currently open entity.
            if prefix == 'I' and current is not None and current[2] == label:
                current[1] = word_end
                continue

            # Anything else closes the open entity (O, a new B-, or a mismatched I-).
            if current is not None:
                entities.append(tuple(current))
                current = None

            # B- starts a new entity; a dangling I- is treated as a start too so
            # the labelled word is not silently dropped.
            if prefix in ('B', 'I') and label:
                current = [word_start, word_end, label]

        # Flush an entity that runs up to the end of the sentence.
        if current is not None:
            entities.append(tuple(current))

        formatted_data.append((sentence, {"entities": entities}))

    return formatted_data

In [None]:
# Convert the whole frame into (text, {"entities": [...]}) tuples.
DATASET = bio_to_spacy_format(data)

In [None]:
# Preview the first five converted examples.
DATASET[:5]

#### 2.2. Split Train/Test

In [None]:
# Sequential 80/20 split: the first N examples train, the remainder test.
N = int(0.8 * len(DATASET))
TRAIN_DATA, TEST_DATA = DATASET[:N], DATASET[N:]

### 3. Create Spacy Model from scratch

#### 3.1. Init spacy model

In [None]:
# `spacy` must be imported before its first use; previously the import only
# appeared in a later cell, so this cell failed on a fresh kernel.
import spacy

# Start from an empty English pipeline.
nlp = spacy.blank("en")

# Add the NER component if it is missing, otherwise reuse the existing one.
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

#### 3.2. Add labels from the training data to the NER component

In [None]:
# Register every entity label that occurs in the training annotations.
for _text, annotation in TRAIN_DATA:
    for _start, _end, entity_label in annotation.get("entities"):
        ner.add_label(entity_label)

#### 3.3. Create DocBin

In [None]:
import spacy
from spacy.tokens import DocBin

# Build one Doc per training example, attach the entity spans that line up
# with token boundaries, and collect the docs in a DocBin.
doc_bin = DocBin()
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    aligned_spans = []
    for start, end, label in annotations.get("entities"):
        candidate = doc.char_span(start, end, label=label)
        if candidate is not None:
            aligned_spans.append(candidate)
            continue
        # char_span returned None for these offsets, so report and skip them.
        print(f"Skipping entity: ({start}, {end}, {label}) in '{text}'")
    doc.ents = aligned_spans
    doc_bin.add(doc)

#### 3.4. Train Model

In [None]:
import random
from spacy.training.example import Example
from spacy.util import fix_random_seed

SEED = 0
n_iterations = 30

# Seed the random number generators before initialization so that weight
# initialization, shuffling and dropout repeat across runs.
fix_random_seed(SEED)
random.seed(SEED)

# Train only the NER component. `select_pipes` / `initialize` are the spaCy v3
# replacements for the deprecated `disable_pipes` / `begin_training`.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.select_pipes(disable=other_pipes):
    optimizer = nlp.initialize()
    for iteration in range(n_iterations):
        # Present the examples in a different order on every pass.
        random.shuffle(TRAIN_DATA)
        losses = {}

        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)

        print(f"Iteration {iteration + 1}: Losses {losses}")

In [None]:
# Write the pipeline to a local directory.
OUTPUT_MODEL = "./custom_ner_model"
nlp.to_disk(OUTPUT_MODEL)

### 4. Compare with an out-of-the-box solution

#### 4.1. Custom Model Performance

In [None]:
# Run the custom model on the first held-out sentence and show the entities it
# predicts; displaying the Doc itself would only echo the input text.
t0 = TEST_DATA[0][0]
t0_doc = nlp(t0)
[(ent.text, ent.label_) for ent in t0_doc.ents]

#### 4.2. Out-of-Box Performance