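### Prepare training data
Annotate the NER training data with the doccano tool and export it as JSON.
The exported file should have the same structure as `samples.json`: a list of records, each with a `text` string and a `label` list of `[start, end, entity_type]` character spans.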

In [1]:
from spacy.util import filter_spans
import json
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

In [2]:
# Read the doccano export. The encoding is pinned to UTF-8 so the text (and
# therefore the character offsets stored in 'label') decodes identically on
# every platform instead of depending on the locale's default encoding.
with open('samples.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show the first record to confirm the expected layout.
print(data[0])

{'id': 1, 'text': 'Yes, it is a product', 'label': [[13, 20, 'PRODUCT']], 'Comments': []}


In [3]:
# Reshape every doccano record into {'text': ..., 'entities': [(start, end, label), ...]}
# and collect the rows under 'annotations', next to the list of entity classes.
training_data = {
    'classes': ['PRODUCT', "SERVICE", "PROCESS"],
    'annotations': [
        {
            'text': record['text'],
            # Each doccano label is a [start, end, label] triple; store it as a tuple.
            'entities': [(triple[0], triple[1], triple[2]) for triple in record['label']],
        }
        for record in data
    ],
}

# Spot-check the second converted row.
print(training_data['annotations'][1])

{'text': 'Yes, sure, it is a process', 'entities': [(19, 26, 'PROCESS')]}


In [4]:
# Container that accumulates the annotated Doc objects for serialization.
doc_bin = DocBin()

# Blank English pipeline; this notebook uses it to turn raw text into Doc objects.
nlp = spacy.blank("en")

In [5]:
# Build a fresh DocBin here so that re-running this cell does not append
# duplicate docs to a container left over from an earlier run.
doc_bin = DocBin()

# (text, start, end, label) for every annotation that could not be aligned.
skipped_spans = []

for training_row in tqdm(training_data['annotations']):
    text = training_row['text']
    labels = training_row['entities']
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in labels:
        # "contract" snaps the character offsets inward to token boundaries;
        # char_span returns None when no whole token fits inside the range.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Record the dropped annotation instead of losing it silently.
            skipped_spans.append((text, start, end, label))
        else:
            ents.append(span)
    # doc.ents cannot contain overlapping spans, so resolve overlaps first.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

if skipped_spans:
    print(f"Skipped {len(skipped_spans)} annotation(s) that do not align with token boundaries:")
    for row_text, span_start, span_end, span_label in skipped_spans:
        print(f"  {span_label} [{span_start}:{span_end}] {row_text[span_start:span_end]!r}")

doc_bin.to_disk("training.spacy")

100%|█████████████████████████████████████████| 40/40 [00:00<00:00, 3300.59it/s]


### Run the commands to train the model

In [None]:
# Expand base_config.cfg into a complete training config (config.cfg).
!python -m spacy init fill-config base_config.cfg config.cfg

# Train with config.cfg and write the output to the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# scores reported during training are measured on the training data itself;
# consider exporting a separate held-out dev set.
!python -m spacy train config.cfg --output ./ --paths.train ./training.spacy --paths.dev ./training.spacy

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-06-08 18:30:17,322] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2023-06-08 18:30:17,328] [INFO] Pipeline: ['tok2vec', 'ner']
INFO:spacy:Pipeline: ['tok2vec', 'ner']
[2023-06-08 18:30:17,330] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2023-06-08 18:30:17,330] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
[2023-06-08 18:30:17,423] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
INFO:spacy:Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  L

In [8]:
# Load the trained pipeline from the "model-best" directory.
nlp_ner = spacy.load("model-best")

doc = nlp_ner("It’s a complicated question, but let’s start with a product. It’s a complicated question, but let’s start with a service. It’s a complicated question, but let’s start with a process")

# displaCy looks colors up by entity label, so the keys must match the labels
# the data was annotated with (PRODUCT / SERVICE / PROCESS). The previous
# "MSERVICE" key never matched, leaving SERVICE entities with the default color.
colors = {"PRODUCT": "#F67DE3", "SERVICE": "#7DF6D9", "PROCESS": "#FFFFFF"}
options = {"colors": colors}

spacy.displacy.render(doc, style="ent", options=options, jupyter=True)