In [1]:
import spacy

In [2]:
# Download spaCy's large English pipeline (en_core_web_lg) through a shell escape.
# NOTE(review): `python3` is resolved by the shell, which is not necessarily the
# interpreter backing this kernel — confirm the package lands in the kernel's
# environment.

!python3 -m spacy download en_core_web_lg

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:05[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [3]:
# Load the downloaded en_core_web_lg pipeline and display it to confirm that the
# package is importable.
# NOTE(review): `nlp` is rebound to a blank English pipeline a few cells below
# (spacy.blank("en")), so this object is not the one that tokenizes the
# training texts.

nlp = spacy.load("en_core_web_lg")
nlp

  from .autonotebook import tqdm as notebook_tqdm


<spacy.lang.en.English at 0x7f456d931ed0>

In [4]:
import json

# Read the annotated NER examples from the JSON file and parse them into `data`.

with open('training data/ner spacy data/training_data_NER.json', encoding='utf-8') as f:
    data = json.loads(f.read())

In [5]:
# Sanity check: show the raw text of a single record (index 15).

data[15]['text']

'How many orders were placed in Lyon?'

In [6]:
# Build the NER training examples as a list of dicts, one per input record:
#   'text'     -> the raw sentence
#   'entities' -> list of (startIndex, endIndex, TAG) tuples, tag upper-cased

training_data = [
    {
        'text': record['text'],
        'entities': [
            (entity['startIndex'], entity['endIndex'], entity['tag'].upper())
            for entity in record['entities']
        ],
    }
    for record in data
]

training_data[0]

{'text': 'What is the total amount of payments received from Singapore?',
 'entities': [(51, 59, 'CITY')]}

In [7]:
from spacy.tokens import DocBin
from tqdm import tqdm

# Start from a blank English pipeline (rebinding `nlp`) and create one empty
# DocBin container each for the training and validation documents.

nlp = spacy.blank("en")
doc_bin_train, doc_bin_val = DocBin(), DocBin()

In [10]:
from spacy.util import filter_spans
import random


# Converts the character-offset annotations into spaCy Doc objects and saves
# them as two binary corpora: train.spacy (training) and val.spacy (validation).

TRAIN_FRACTION = 0.8  # share of examples routed to the training corpus
SPLIT_SEED = 42       # fixed seed so the train/validation split is reproducible

# Start from empty containers on every run of this cell; reusing DocBins that
# were created elsewhere would append every document again on a re-run.
doc_bin_train = DocBin()
doc_bin_val = DocBin()

# Dedicated generator: deterministic split without touching the global
# `random` state.
split_rng = random.Random(SPLIT_SEED)

# Counts annotations that could not be mapped onto tokens, across ALL examples.
# It must be initialised outside the loop, otherwise only the last example's
# misses would be reported.
num_null_spans = 0

for training_example in tqdm(training_data):
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # "contract" keeps only the tokens lying completely inside the character
        # range; char_span returns None when no such token exists.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            num_null_spans += 1
        else:
            ents.append(span)
    # doc.ents does not accept overlapping spans, so drop overlaps first.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents

    # random() is uniform on [0.0, 1.0), so this is an exact 80/20 split in
    # expectation (randint(1, 100) < 80 would only send 79% to training).
    if split_rng.random() < TRAIN_FRACTION:
        doc_bin_train.add(doc)
    else:
        doc_bin_val.add(doc)

doc_bin_train.to_disk("train.spacy")
doc_bin_val.to_disk("val.spacy")
print("number of nulls: " + str(num_null_spans))

100%|██████████| 4807/4807 [00:00<00:00, 13061.16it/s]


number of nulls: 0


In [11]:
# Expand the partial base_config.cfg (a starter config downloaded from the
# official spaCy website) into a complete training config, written to config.cfg.

!python3 -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [12]:
# Train the pipeline described by config.cfg, reading training examples from
# train.spacy and evaluation examples from val.spacy; results are saved under
# the ./spacy output directory.

!python3 -m spacy train config.cfg --output ./spacy --paths.train ./train.spacy --paths.dev ./val.spacy

[38;5;4mℹ Saving to output directory: spacy[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-07-08 02:13:12,818] [INFO] Set up nlp object from config
[2023-07-08 02:13:12,831] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-07-08 02:13:12,834] [INFO] Created vocabulary
[2023-07-08 02:13:17,503] [INFO] Added vectors: en_core_web_lg
[2023-07-08 02:13:17,503] [INFO] Finished initializing nlp object
[2023-07-08 02:13:19,383] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     55.82    0.00    0.00    0.00    0.00
  0     200         17.27   1654.44   88.71   91.97   85.67    0.89
  0     400         18.75    526.75   92.93   92.22   93.65    0.93
  1     600         10.96    483.84   92.98   97.60   88.78    0