In [1]:
import json
from pathlib import Path
from spacy.tokens import DocBin
import spacy

def convert_json_to_spacy(input_path, output_path, nlp, labels):
    """Convert a JSON annotation export into a serialized spaCy ``DocBin``.

    The input file must contain a list of entries shaped like::

        {"data": {"text": ...},
         "annotations": [{"result": [{"value": {"start": ..., "end": ...,
                                                "labels": [...]}}]}]}

    (This layout looks like a Label Studio JSON export -- TODO confirm the
    data source.)  One ``Doc`` is written per entry, including entries that
    end up with no entities.

    Args:
        input_path: Path of the UTF-8 JSON file to read.
        output_path: Path the ``DocBin`` is written to.
        nlp: Object providing ``make_doc(text)``; used only to tokenize.
        labels: Container of label names to keep.  Only the first label of
            each result is considered; results with any other label are
            ignored.

    Raises:
        KeyError: If an entry lacks ``data``/``text``/``annotations`` or an
            annotation lacks ``result``.

    Warns:
        UserWarning: Once per call, if any span was dropped because its
            character offsets did not align with token boundaries or because
            it overlapped another kept span.
    """
    # Function-scope imports keep this block self-contained.
    import warnings
    from spacy.util import filter_spans

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    doc_bin = DocBin()
    misaligned = 0
    overlapping = 0
    for entry in data:
        doc = nlp.make_doc(entry["data"]["text"])
        spans = []
        for ann in entry["annotations"]:
            for result in ann["result"]:
                # Result items that are not labelled text spans (no "value",
                # or a value without offsets/labels) are skipped instead of
                # raising KeyError/IndexError.
                value = result.get("value") or {}
                start = value.get("start")
                end = value.get("end")
                result_labels = value.get("labels")
                if start is None or end is None or not result_labels:
                    continue
                label = result_labels[0]
                if label not in labels:
                    continue
                span = doc.char_span(start, end, label=label)
                # char_span yields None when the offsets do not fall on token
                # boundaries; such spans cannot become entities.
                if not span:
                    misaligned += 1
                    continue
                spans.append(span)
        # doc.ents rejects overlapping or duplicate spans (e.g. the same text
        # labelled in several annotations).  filter_spans keeps the longest
        # span of each overlapping group, preferring the earliest on ties.
        kept = filter_spans(spans)
        overlapping += len(spans) - len(kept)
        doc.ents = kept
        doc_bin.add(doc)

    doc_bin.to_disk(output_path)

    if misaligned or overlapping:
        warnings.warn(
            f"convert_json_to_spacy: dropped {misaligned} span(s) not aligned "
            f"to token boundaries and {overlapping} overlapping/duplicate "
            f"span(s) while converting {input_path!r}.",
            stacklevel=2,
        )

if __name__ == "__main__":
    # A blank English pipeline is enough here: the converter only calls
    # nlp.make_doc() on it.
    nlp = spacy.blank("en")

    # Entity labels to keep; results carrying any other label are ignored
    # by the converter.
    labels = ["SKILL_REQUIRED", "SKILL_PREFERRED", "SKILL_BONUS"]

    # JSON annotations in, serialized DocBin out.
    input_path, output_path = "jd_data.json", "jd_data.spacy"

    convert_json_to_spacy(
        input_path=input_path,
        output_path=output_path,
        nlp=nlp,
        labels=labels,
    )


In [2]:
import spacy

# Directory of the trained pipeline ("output/model-last" is the alternative).
model_dir = "output/model-best"
nlp = spacy.load(model_dir)

# Sample job-description text to run through the model.
test_text = """
We are looking for someone with .NET/C# experience, solid SQL knowledge,
and familiarity with REST APIs, Azure, AWS, and DevOps tools. React or Angular is a plus.
"""

# Run the full pipeline on the sample.
doc = nlp(test_text)

# Report every predicted entity as "<text> -> <label>", one per line.
detected = doc.ents
print("Entities detected:")
for ent in detected:
    print(f"{ent.text} -> {ent.label_}")


Entities detected:
.NET/C# -> SKILL_REQUIRED
SQL -> SKILL_REQUIRED
REST APIs -> SKILL_PREFERRED
Azure -> SKILL_REQUIRED
DevOps tools -> SKILL_BONUS
Angular -> SKILL_BONUS
