In [1]:
import spacy
import json
from spacy.tokens import DocBin
from tqdm import tqdm
import inflect
from transformers import pipeline



In [2]:
# Load the tagged data. The context manager closes the file handle as soon as
# parsing finishes, and the explicit UTF-8 encoding keeps character offsets
# stable regardless of the platform's default codec.
with open('annotations.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [3]:
nlp = spacy.blank("en")  # blank English pipeline; only its tokenizer is used below
db = DocBin()  # container that serialises the annotated Doc objects

# Convert every (text, annotation) pair into a Doc that carries its entity spans.
for text, annot in tqdm(TRAIN_DATA['annotations']):
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # "contract" shrinks the character range to the tokens lying fully inside it;
        # char_span yields None when no token fits, so that annotation is reported and skipped.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print(f"Skipping entity {label!r} at [{start}:{end}] ({text[start:end]!r}): no token-aligned span")
        else:
            ents.append(span)
    # Assigning doc.ents fails when spans overlap or repeat (two annotations can
    # contract onto the same tokens), so keep only a non-overlapping subset.
    kept = spacy.util.filter_spans(ents)
    if len(kept) < len(ents):
        print(f"Dropped {len(ents) - len(kept)} overlapping entity span(s) in: {text!r}")
    doc.ents = kept
    db.add(doc)

db.to_disk("./training_data.spacy")  # save the DocBin in the format `spacy train` reads

100%|██████████| 90/90 [00:00<00:00, 1920.24it/s]

Skipping entity





In [4]:
#This generates the spaCy model configuration. Uncomment and run it the first time you use this file; on later runs you don't need to run it again.
##! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

In [5]:
#This trains the model and builds the pipeline. Uncomment and run it the first time you use this file. On later runs you can skip it; to update the model with a new dataset, run it again.
##! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

In [6]:
# Load the trained NER pipeline named "model-best".
# NOTE(review): presumably this is the directory written by the (commented-out)
# `spacy train ... --output ./` command in the previous cell — confirm it exists
# in the working directory before running this cell.
nlp_ner = spacy.load("model-best") #Load the model pipeline

In [7]:
# Sample requirement sentence that the following cells run through the NER pipeline.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of
# the kernel session; later cells read this variable, so any rename has to be
# applied notebook-wide.
input = '''employee only can be hired by manager'''

In [8]:
# Normalise the text to lowercase before it is handed to the NER model.
# NOTE(review): presumably the training annotations are lowercase — confirm.
# The commented-out lines are an optional spelling-correction step (currently
# disabled) built on the transformers `pipeline` helper imported in the first cell.
# fix_spelling = pipeline("text2text-generation",model="oliverguhr/spelling-correction-english-base")
input = input.lower()
# input2 = fix_spelling(input)
# input2

In [9]:
# Run the trained NER pipeline on the lowercased text; the resulting `doc` is
# read again by the entity-extraction cell that follows.
doc = nlp_ner(input)
# Render the recognised entities inline in the notebook using the entity visualiser.
spacy.displacy.render(doc, style="ent", jupyter=True)



In [10]:
p = inflect.engine()

# Actor mentions are reduced to singular form so plural and singular variants
# collapse into one entry; when singular_noun() gives a falsy result the
# original entity text is kept unchanged.
actors = {
    p.singular_noun(entity.text) or entity.text
    for entity in doc.ents
    if entity.label_ == "ACTOR"
}

# Activity mentions are collected verbatim.
activities = {
    entity.text
    for entity in doc.ents
    if entity.label_ == "ACTIVITY"
}

In [11]:
# Report both collections, one labelled line each.
for heading, collected in (("Actors:", actors), ("Activities:", activities)):
    print(heading, collected)

Actors: set()
Activities: set()


In [12]:
# Rule-based sentence splitter: a blank pipeline with only the sentencizer component.
nlp_sents = spacy.blank("en")
sentencizer = nlp_sents.add_pipe("sentencizer")


# Maps each (singularised) actor to the set of activities attributed to it.
# Activities seen before any actor has been found are grouped under the key None.
actors_dict = {}
current_actor = None  # most recent actor seen; deliberately carried across sentences

for sent in nlp_sents(input).sents:
    for ent in nlp_ner(sent.text).ents:
        if ent.label_ == "ACTOR":
            # Fall back to the raw text when singular_noun() gives a falsy result.
            current_actor = p.singular_noun(ent.text) or ent.text
        elif ent.label_ == "ACTIVITY":
            # Key directly on current_actor. The previous fallback,
            # `current_actor or actors_dict.get(None)`, returned the stored *set*
            # on the second actor-less activity and then failed with
            # "TypeError: unhashable type: 'set'" when it was used as a dict key.
            actors_dict.setdefault(current_actor, set()).add(ent.text)


print("Output:", actors_dict)

Output: {}
