In [1]:
%run -i "../util/lang_utils.ipynb"

In [2]:
article = "../data/nvidia2025q3.txt"
with open(article, 'r', encoding='utf-8') as f:
    text = f.read()
doc = small_model(text)

In [3]:
for ent in doc.ents[:15]:
    print(f"{ent.text:50}: {ent.label_}")
print(f"\nTotal number of entities: {len(doc.ents)}")

Nvidia                                            : GPE
NVDA                                              : ORG
Q3 2025                                           : PERSON
5:00 p.m.                                         : TIME
afternoon                                         : TIME
Jay                                               : PERSON
today                                             : DATE
NVIDIA                                            : ORG
third-quarter                                     : DATE
Stewart Stecker                                   : PERSON
Stewart Stecker                                   : PERSON
afternoon                                         : TIME
NVIDIA                                            : ORG
the third quarter of fiscal 2025                  : DATE
today                                             : DATE

Total number of entities: 683


In [4]:
doc2 = large_model(text)
for ent in doc2.ents[:15]:
    print(f"{ent.text:50}: {ent.label_}")
print(f"\nTotal number of entities: {len(doc.ents)}")

Nvidia                                            : ORG
NVDA                                              : ORG
2025                                              : DATE
5:00 p.m. ET                                      : TIME
afternoon                                         : TIME
Jay                                               : PERSON
today                                             : DATE
NVIDIA                                            : ORG
third-quarter                                     : DATE
Stewart Stecker                                   : PERSON
Stewart Stecker                                   : PERSON
afternoon                                         : TIME
NVIDIA                                            : ORG
the third quarter of fiscal 2025                  : DATE
today                                             : DATE

Total number of entities: 683


## Train a custom spaCy NER model

In [5]:
import pandas as pd
from spacy.cli.train import train
from spacy.cli.evaluate import evaluate
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [6]:
music_ner_df = pd.read_csv("../data/music_ner.csv")
music_ner_df.head()

Unnamed: 0,id,text,start_offset,end_offset,label
0,13434,i love radioheads kid a something similar | ki...,7,17,Artist_known
1,13434,i love radioheads kid a something similar | ki...,61,71,Artist_or_WoA_deduced
2,13435,anything similar to i fight dragons,20,35,WoA_deduced
3,13436,music similar to ccrs travelin band,17,30,Artist_deduced
4,13437,songs similar to blackout by boris,17,25,WoA_deduced


In [7]:
def change_label(input_label):
    input_label = input_label.replace("_deduced", "")
    return input_label
music_ner_df['label'] = music_ner_df['label'].apply(change_label)
music_ner_df.head()

Unnamed: 0,id,text,start_offset,end_offset,label
0,13434,i love radioheads kid a something similar | ki...,7,17,Artist_known
1,13434,i love radioheads kid a something similar | ki...,61,71,Artist_or_WoA
2,13435,anything similar to i fight dragons,20,35,WoA
3,13436,music similar to ccrs travelin band,17,30,Artist
4,13437,songs similar to blackout by boris,17,25,WoA


In [8]:
len(music_ner_df)

427

In [9]:
train_db = DocBin()
test_db = DocBin()

In [10]:
# Get a list of unique IDs
ids = list(set(music_ner_df['id'].values))
print(len(ids))
train_ids, test_ids = train_test_split(ids, test_size=0.2, random_state=42)
print(len(train_ids), len(test_ids))

226
180 46


In [11]:
# Extract named entities and add to train_db and test_db
for id in ids:
    entity_rows = music_ner_df[music_ner_df['id'] == id]
    text = entity_rows.head(1)['text'].values[0]
    doc = small_model(text)
    ents = []
    for _, row in entity_rows.iterrows():
        start = row['start_offset']
        end = row['end_offset']
        label = row['label']
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
    doc.ents = ents
    if id in train_ids:
        train_db.add(doc)
    else:
        test_db.add(doc)

train_db.to_disk("../data/music_ner_train.spacy")
test_db.to_disk("../data/music_ner_test.spacy")

In [12]:
# You can run the following from Jupyter Notebook: train("../data/spacy_config_ner.cfg", output_path="../models/spacy_music_ner", use_gpu=-1)
# However, it's preferred to run the following at the command line:
# You need to ensure that you are in the correct directory nlp_cookbook/notebooks/ when you run the command below
# python -m spacy train ../data/spacy_config_ner.cfg --output ../models/spacy_music_ner --gpu-id -1

In [13]:
# Load the trained model and evaluate on the test set
import spacy

nlp = spacy.load("../models/spacy_music_ner/model-last")
first_test_id = test_ids[0]
test_rows = music_ner_df[music_ner_df['id'] == first_test_id]
test_text = test_rows.head(1)['text'].values[0]
print(f"Test text: {test_text}\n")
print("Ground truth entities:")
for _, row in test_rows.iterrows():
    print(f"{row['label']:10}: {test_text[row['start_offset']:row['end_offset']]}")
print("\nPredicted entities:")
doc = nlp(test_text)
for ent in doc.ents:
    print(f"{ent.label_:10}: {ent.text}")


Test text: looking for similar tracks to amon tobin & kid koala untitled

Ground truth entities:
Artist    : amon tobin & kid koala
WoA       : untitled

Predicted entities:
Artist    : amon
Artist    : tobin
Artist_or_WoA: kid koala


In [14]:
evaluate("../models/spacy_music_ner/model-last", "../data/music_ner_test.spacy")

{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'tag_acc': 0.7984031936127745,
 'sents_p': 0.7407407407407407,
 'sents_r': 0.8333333333333334,
 'sents_f': 0.7843137254901961,
 'dep_uas': 0.7212787212787213,
 'dep_las': 0.6073926073926074,
 'dep_las_per_type': {'csubj': {'p': 0.0, 'r': 0.0, 'f': 0.0},
  'prep': {'p': 0.8, 'r': 0.7671232876712328, 'f': 0.7832167832167832},
  'amod': {'p': 0.7894736842105263,
   'r': 0.7317073170731707,
   'f': 0.7594936708860759},
  'pobj': {'p': 0.7887323943661971, 'r': 0.8115942028985508, 'f': 0.8},
  'compound': {'p': 0.5432098765432098,
   'r': 0.6470588235294118,
   'f': 0.5906040268456375},
  'cc': {'p': 0.5625, 'r': 0.6, 'f': 0.5806451612903225},
  'conj': {'p': 0.125, 'r': 0.15384615384615385, 'f': 0.13793103448275862},
  'root': {'p': 0.7037037037037037,
   'r': 0.7916666666666666,
   'f': 0.7450980392156864},
  'advmod': {'p': 0.3333333333333333,
   'r': 0.23809523809523808,
   'f': 0.2777777777777778},
  'dep': {'p': 0.0