In [11]:
# Import required libraries (spaCy, tqdm and the standard-library json module)
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
import json

# Check the installed version of spaCy
spacy.__version__

'3.7.5'

In [10]:
# Read the annotation file. JSON is UTF-8 by specification, so the encoding is
# given explicitly instead of relying on the platform default.
with open('./ner_dataset/annotations/annotations.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# data['annotations'] is a mapping whose values are collections of annotated
# examples; flatten them into a single list (the keys are not needed here).
annotations = []
for values in data['annotations'].values():
    annotations.extend(values)

print(len(annotations))

2868


In [None]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Convert character-offset NER annotations into a spaCy ``DocBin``.

  Args:
    file: Writable text file-like object used as an error log. One line is
      written for every annotation or document that could not be converted.
    data: Iterable of ``(text, annot)`` pairs, where ``annot['entities']`` is
      an iterable of ``(start, end, label)`` character offsets into ``text``.

  Returns:
    A ``DocBin`` containing one ``Doc`` per text whose entities could be set.
  """
  # Create a blank English pipeline; only make_doc() is used on it.
  nlp = spacy.blank('en')
  db = DocBin()

  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character offsets claimed by entities that were actually accepted.
    # A set keeps the overlap test cheap, and offsets are only reserved after
    # the span is validated so a rejected annotation cannot block a later,
    # valid one covering the same characters.
    taken = set()

    for start, end, label in annot['entities']:
      chars = range(start, end)
      if not taken.isdisjoint(chars):
        # Overlaps an already accepted entity: spaCy does not allow
        # overlapping doc.ents, so the first one wins.
        continue

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Best effort: record the failure and move on instead of hiding it.
        file.write(str([start, end]) + "    " + str(text) + "    " + repr(exc) + "\n")
        continue

      if span is None:
        # Offsets do not line up with token boundaries: log and drop.
        file.write(str([start, end]) + "    " + str(text) + "\n")
        continue

      taken.update(chars)
      ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # Keep processing the remaining documents, but leave a trace of the
      # one that was dropped.
      file.write("doc skipped    " + str(text) + "    " + repr(exc) + "\n")

  return db

In [None]:
# NOTE: the train/test split below is currently disabled, so every annotation
# goes into the training set.
from sklearn.model_selection import train_test_split
# train, test = train_test_split(cv_data, test_size=0.2)

# Display the number of items in the training and testing sets
# len(train), len(test)

# Open the error log with a context manager so it is flushed and closed even
# if the conversion raises. UTF-8 is set explicitly so annotation text with
# non-ASCII characters can always be written.
with open('./ner_dataset/train_file.txt', 'w', encoding='utf-8') as file:
  # Create the spaCy DocBin for the training data
  db = get_spacy_doc(file, annotations)

# NOTE(review): this is an absolute path at the filesystem root, unlike the
# relative './ner_dataset/...' paths used elsewhere. It is left unchanged
# because the training command reads from the same location; confirm it is
# intended.
db.to_disk('/ner_dataset/train_data.spacy')

# Disabled test-set export (needs the split above and an open log file):
# db = get_spacy_doc(file, test)
# db.to_disk('/content/drive/MyDrive/Custom_NER/trained_models/test_data.spacy')

In [None]:
!python -m spacy train /config.cfg  --output /content/drive/MyDrive/Custom_NER/trained_models/output  --paths.train /ner_dataset/train_data.spacy --gpu-id 0