In [2]:
import pandas as pd

### 1.1. Download Data

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("naseralqaydeh/named-entity-recognition-ner-corpus")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/emulie/.cache/kagglehub/datasets/naseralqaydeh/named-entity-recognition-ner-corpus/versions/3


In [5]:
data = pd.read_csv(f"{path}/ner.csv")

In [6]:
print(data['Sentence'].iloc[0])
print(data['POS'].iloc[0])
print(data['Tag'].iloc[0])

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [31]:
# --- read tags as array instead of string
data['Tag'] = [tag[2:-2].split("', '") for tag in data['Tag']]

### 2. Create Train/Test Dataset

#### 2.1. Create Dataset in spacy format

Create train data for what spacy NER expect: ` ("Some text here", {"entities": [(start_char, end_char, "LABEL")]}),`

In [81]:
def bio_to_spacy_format(data):
    formatted_data = []
    
    for i in range(len(data)):
        sentence = data['Sentence'][i]
        pos_tags = data['POS'][i]      # Not needed for NER
        ner_tags = data['Tag'][i]
        
        words = sentence.split()       # Assuming space tokenization
        entities = []
        start, end = 0, 0

        # print(sentence, ner_tags)
        
        for word, tag in zip(words, ner_tags):
            # print(word, tag)
            if tag.startswith('B-'):    # Beginning of an entity
                label = tag[2:]
                end += len(word) + 1
            elif tag.startswith('I-'):
                end += len(word) + 1
            elif tag.startswith('O') and start != end:
                entities.append((start, end, label))
                end += len(word) + 1
                start = end
            else: 
                end += len(word) + 1
                start = end

        formatted_data.append((sentence, {"entities": entities}))
    
    return formatted_data

In [82]:
DATASET = bio_to_spacy_format(data)

In [60]:
DATASET[:5]

[('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
  {'entities': [(48, 55, 'geo'), (77, 82, 'geo'), (111, 119, 'gpe')]}),
 ('Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "',
  {'entities': [(109, 114, 'per')]}),
 ('They marched from the Houses of Parliament to a rally in Hyde Park .',
  {'entities': [(57, 67, 'geo')]}),
 ('Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .',
  {'entities': []}),
 ("The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton .",
  {'entities': [(57, 65, 'geo'),
    (75, 87, 'org'),
    (103, 111, 'gpe'),
    (129, 138, 'geo')]})]

#### 2.2. Split Train/Test

In [79]:
N = int(len(DATASET) * 0.8)
TRAIN_DATA = DATASET[:N]
TEST_DATA = DATASET[N:]

### 3. Create Spacy Model from scratch

#### 3.1. Init spacy model

In [62]:
nlp = spacy.blank("en")

if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

#### 3.2. Add labels to training data

In [67]:
for _, annotations in TRAIN_DATA:
    for _, _, label in annotations.get("entities"):
        ner.add_label(label)

#### 3.3. Create DocBin

In [69]:
import spacy
from spacy.tokens import DocBin

doc_bin = DocBin()
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annotations.get("entities"):
        span = doc.char_span(start, end, label=label)
        if span is None:
            print(f"Skipping entity: ({start}, {end}, {label}) in '{text}'")
        else:
            ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)

Skipping entity: (48, 55, B-geo) in 'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'
Skipping entity: (77, 82, B-geo) in 'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'
Skipping entity: (111, 119, B-gpe) in 'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'
Skipping entity: (109, 114, B-per) in 'Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "'
Skipping entity: (57, 67, B-geo) in 'They marched from the Houses of Parliament to a rally in Hyde Park .'


#### 3.4. Train Model

In [73]:
import random
from spacy.training.example import Example

n_iterations = 30
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for iteration in range(n_iterations):
        random.shuffle(TRAIN_DATA)
        losses = {}

        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, sgd=optimizer, losses=losses)

        print(f"Iteration {iteration + 1}: Losses {losses}")



Iteration 1: Losses {'ner': np.float32(68.996544)}
Iteration 2: Losses {'ner': np.float32(60.278744)}
Iteration 3: Losses {'ner': np.float32(41.84245)}
Iteration 4: Losses {'ner': np.float32(19.052116)}
Iteration 5: Losses {'ner': np.float32(6.4499598)}
Iteration 6: Losses {'ner': np.float32(0.6770032)}
Iteration 7: Losses {'ner': np.float32(0.0059725484)}
Iteration 8: Losses {'ner': np.float32(0.0043411567)}
Iteration 9: Losses {'ner': np.float32(0.0016240174)}
Iteration 10: Losses {'ner': np.float32(4.35259e-09)}
Iteration 11: Losses {'ner': np.float32(3.9147862e-11)}
Iteration 12: Losses {'ner': np.float32(1.074587e-11)}
Iteration 13: Losses {'ner': np.float32(1.2737922e-09)}
Iteration 14: Losses {'ner': np.float32(1.9643908e-13)}
Iteration 15: Losses {'ner': np.float32(1.4210276e-10)}
Iteration 16: Losses {'ner': np.float32(2.5201201e-05)}
Iteration 17: Losses {'ner': np.float32(3.8844406e-15)}
Iteration 18: Losses {'ner': np.float32(5.472135e-13)}
Iteration 19: Losses {'ner': np.f

In [74]:
OUTPUT_MODEL = "./custom_ner_model"
nlp.to_disk(OUTPUT_MODEL)

### 4. Compare with out-of-box solution

#### 4.1. Custom Model Performance

In [75]:
TEST_DATA

[('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
  {'entities': [(48, 55, 'B-geo'), (77, 82, 'B-geo'), (111, 119, 'B-gpe')]}),
 ('Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as " Bush Number One Terrorist " and " Stop the Bombings . "',
  {'entities': [(109, 114, 'B-per')]}),
 ('They marched from the Houses of Parliament to a rally in Hyde Park .',
  {'entities': [(57, 67, 'B-geo')]}),
 ('Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .',
  {'entities': []})]

#### 4.2. Out-of-Box Performance