In [18]:
import spacy
from spacy.tokens import DocBin
from spacy.util import minibatch, compounding
from pathlib import Path

In [19]:
TRAIN_DATA = [
    ("Barack Obama was born in Hawaii.", {"entities": [(0, 12, "PERSON"), (25, 31, "GPE")]}),
    ("Apple is looking at buying a U.K. startup.", {"entities": [(0, 5, "ORG"), (27, 30, "GPE")]}),
    ("Elon Musk founded SpaceX.", {"entities": [(0, 9, "PERSON"), (18, 24, "ORG")]}),
    ("Google is a tech company.", {"entities": [(0, 6, "ORG")]}),
]

In [20]:
nlp = spacy.blank("en")

In [21]:
doc_bin = DocBin()

In [22]:
for text, annot in TRAIN_DATA:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label)
        if span:
            ents.append(span)
    doc.ents = ents
    doc_bin.add(doc)
doc_bin.to_disk("train.spacy")
doc_bin.to_disk("dev.spacy")

In [26]:
!python -m spacy init config config.cfg --lang en --pipeline ner --force

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [27]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy --verbose

[2025-05-04 09:57:19,891] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2025-05-04 09:57:20,864] [INFO] Set up nlp object from config
[2025-05-04 09:57:20,893] [DEBUG] Loading corpus from path: dev.spacy
[2025-05-04 09:57:20,897] [DEBUG] Loading corpus from path: train.spacy
[2025-05-04 09:57:20,898] [INFO] Pipeline: ['tok2vec', 'ner']
[2025-05-04 09:57:20,905] [INFO] Created vocabulary
[2025-05-04 09:57:20,905] [INFO] Finished initializing nlp object

Load the table in your config with:

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]

[2025-05-04 09:57:21,094] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2025-05-04 09:57:21,136] [DEBUG] Loading corpus from path: dev.spacy
[2025-05-04 09:57:21,143] [DEBUG] Loading corpus from path: 

In [28]:
import spacy

nlp_ner = spacy.load("output/model-best")
doc = nlp_ner("Barack Obama visited Berlin in 2008.")

print("\nEntities found:")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)


Entities found:
Barack Obama -> PERSON
2008 -> GPE


In [31]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

In [32]:
from datasets import load_dataset
dataset = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [33]:
label_list = dataset["train"].features["ner_tags"].feature.names
print("NER labels:", label_list)

NER labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


In [34]:
from tqdm import tqdm
def convert_to_spacy_format(dataset_split, nlp):
    doc_bin = DocBin()
    for example in tqdm(dataset_split):
        words = example["tokens"]
        tags = example["ner_tags"]
        doc = nlp.make_doc(" ".join(words))
        ents = []
        start = 0
        for word, tag in zip(words, tags):
            word_start = doc.text.find(word, start)
            word_end = word_start + len(word)
            if tag != 0:  # 0 = "O" (no entity)
                ents.append(doc.char_span(word_start, word_end, label=label_list[tag]))
            start = word_end
        ents = [e for e in ents if e is not None]
        doc.ents = ents
        doc_bin.add(doc)
    return doc_bin

In [35]:
nlp_blank = spacy.blank("en")

doc_bin_train = convert_to_spacy_format(dataset["train"].select(range(1000)), nlp_blank)
doc_bin_train.to_disk("train.spacy")

doc_bin_dev = convert_to_spacy_format(dataset["validation"].select(range(200)), nlp_blank)
doc_bin_dev.to_disk("dev.spacy")

100%|██████████| 1000/1000 [00:01<00:00, 542.37it/s]
100%|██████████| 200/200 [00:00<00:00, 1615.27it/s]


In [36]:
!python -m spacy init config config.cfg --lang en --pipeline ner --force



[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[2025-05-04 10:05:18,394] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2025-05-04 10:05:19,132] [INFO] Set up nlp object from config
[2025-05-04 10:05:19,150] [DEBUG] Loading corpus from path: dev.spacy
[2025-05-04 10:05:19,153] [DEBUG] Loading corpus from path: train.spacy
[2025-05-04 10:05:19,153] [INFO] Pipeline

In [39]:
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy --verbose

[2025-05-04 10:13:50,403] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2025-05-04 10:13:51,057] [INFO] Set up nlp object from config
[2025-05-04 10:13:51,073] [DEBUG] Loading corpus from path: dev.spacy
[2025-05-04 10:13:51,075] [DEBUG] Loading corpus from path: train.spacy
[2025-05-04 10:13:51,076] [INFO] Pipeline: ['tok2vec', 'ner']
[2025-05-04 10:13:51,079] [INFO] Created vocabulary
[2025-05-04 10:13:51,079] [INFO] Finished initializing nlp object

Load the table in your config with:

[initialize.lookups]
@misc = "spacy.LookupsDataLoader.v1"
lang = ${nlp.lang}
tables = ["lexeme_norm"]

[2025-05-04 10:13:51,716] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2025-05-04 10:13:51,736] [DEBUG] Loading corpus from path: dev.spacy
[2025-05-04 10:13:51,739] [DEBUG] Loading corpus from path: train.spacy
[2025-05-04 10:13:51,741] [DEBUG] Re

In [40]:
nlp_trained = spacy.load("./output/model-best")


In [41]:
test_text = "Barack Obama visited Germany in 2008."
print("\nEntities found:")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)


Entities found:
Barack Obama -> PERSON
2008 -> GPE
