### **Named Entity Recognition Model**
1. spaCy
2. Spark NLP

In [25]:
import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans
import torch
from tqdm import tqdm
import sparknlp
import pyspark
import utils
import json

In [17]:
# Report GPU availability explicitly: a bare expression that is not the last
# line of a cell is evaluated but its value is never displayed.
print("CUDA available:", torch.cuda.is_available())

# Check the installed version of spaCy
spacy.__version__

'3.7.5'

In [11]:
# Run the packaged "en_core_web_sm" pipeline on a sample instruction.
nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "declare a new variable named name and assign it the value of 'John'"
)

# Entities detected in the sample, as plain text.
print(doc.ents)

# Same entities, rendered as highlighted markup.
spacy.displacy.render(doc, style="ent")


(John,)


### Prepare Dataset

In [50]:
# Run the project-local `utils.reformat_json` helper on the array-operations
# annotation file.
# NOTE(review): the helper's effect (in-place rewrite vs. returned value) is
# not visible from this notebook; confirm against utils.py.
utils.reformat_json("./ner_dataset/annotations/array_operations.json")

In [24]:
# Load the annotated data from a JSON file.
# The context manager closes the file handle as soon as parsing finishes
# (a bare open() call leaves that to the garbage collector), and the explicit
# encoding keeps the read independent of the platform's default codec.
with open('./annotations/annotations.json', 'r', encoding='utf-8') as annotations_file:
    variabel_declaration_data = json.load(annotations_file)

# Display the number of items in the dataset
print(len(variabel_declaration_data["annotations"]))

# Display the first item in the dataset
variabel_declaration_data["annotations"][0]

41


["temp equals 'string'.\r", {'entities': [[0, 4, 'VAR'], [12, 20, 'VAL']]}]

In [26]:
# Container that will hold the annotated Doc objects.
doc_bin = DocBin()

# Blank English pipeline; rebinds `nlp`, replacing the pretrained pipeline
# loaded earlier in the notebook.
nlp = spacy.blank("en")

In [30]:
# Start from an empty DocBin on every run so that re-executing this cell does
# not append the same documents again to a container left over from a
# previous execution (which would duplicate the training data on disk).
doc_bin = DocBin()

for example in tqdm(variabel_declaration_data["annotations"]):
    # Each annotation is a pair: [text, {"entities": [[start, end, label], ...]}]
    text = example[0]
    entities = example[1]["entities"]

    doc = nlp.make_doc(text)  # tokenize the text into a Doc

    ents = []
    for start, end, label in entities:
        # Map character offsets to a token span; the result is None when no
        # span can be built from these offsets, in which case the entity is
        # dropped.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    # Run the spans through filter_spans before assigning them as entities.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

# Serialize the collected documents to disk.
doc_bin.to_disk("training_data.spacy")

100%|██████████| 41/41 [00:00<00:00, 1773.23it/s]


In [31]:
# Load the pipeline stored under "model-best" (path relative to the working
# directory).
# NOTE(review): nothing in the visible notebook creates "model-best";
# presumably it comes from a separate training run on training_data.spacy.
# Confirm and document that step.
nlp_ner = spacy.load("model-best")

In [49]:
# Try the loaded NER pipeline on a sample instruction.
doc = nlp_ner(
    "declare a new variable of type string named name and assign it the value of John"
)

# One highlight color per entity label.
colors = {
    "VAR": "#F67DE3",
    "VAL": "#7DC2F6",
    "TYPE": "#C99BFA",
}
options = {"colors": colors}

spacy.displacy.render(doc, style="ent", options=options, jupyter=True)