In [4]:
import spacy
from spacy.training import Example
import random

# Refined and expanded training data
TRAIN_DATA = [
    ("Patient complains of severe headache and dizziness.", {"entities": [(21, 29, "SYMPTOM"), (34, 43, "SYMPTOM")]}),
    ("He has been experiencing nausea and vomiting.", {"entities": [(25, 31, "SYMPTOM"), (36, 44, "SYMPTOM")]}),
    ("She reports a persistent cough and shortness of breath.", {"entities": [(18, 23, "SYMPTOM"), (28, 47, "SYMPTOM")]}),
    ("Patient has a fever and chills.", {"entities": [(13, 18, "SYMPTOM"), (23, 29, "SYMPTOM")]}),
    ("He is suffering from back pain and leg cramps.", {"entities": [(21, 30, "SYMPTOM"), (35, 45, "SYMPTOM")]}),
    ("She has had a sore throat and swollen glands.", {"entities": [(16, 27, "SYMPTOM"), (32, 45, "SYMPTOM")]}),
    ("Patient complains of abdominal pain and diarrhea.", {"entities": [(21, 35, "SYMPTOM"), (40, 48, "SYMPTOM")]}),
    ("He has been experiencing chest tightness and palpitations.", {"entities": [(25, 40, "SYMPTOM"), (45, 58, "SYMPTOM")]}),
    ("The patient reports fatigue and a sore throat.", {"entities": [(21, 28, "SYMPTOM"), (33, 44, "SYMPTOM")]}),
    ("She complains of fatigue and body aches.", {"entities": [(17, 24, "SYMPTOM"), (29, 39, "SYMPTOM")]}),
    ("Patient suffers from a persistent cough and fever.", {"entities": [(20, 35, "SYMPTOM"), (40, 45, "SYMPTOM")]}),
    ("He has been experiencing weight loss and night sweats.", {"entities": [(25, 36, "SYMPTOM"), (41, 52, "SYMPTOM")]}),
    # Adding more examples to improve boundary detection
    ("The patient reports feeling tired and has a sore throat.", {"entities": [(21, 26, "SYMPTOM"), (36, 47, "SYMPTOM")]}),
    ("Fatigue and headaches have been persistent.", {"entities": [(0, 7, "SYMPTOM"), (12, 21, "SYMPTOM")]}),
    ("Experiencing sore throat and fatigue.", {"entities": [(13, 24, "SYMPTOM"), (29, 36, "SYMPTOM")]}),
]

# Load the spaCy model
nlp = spacy.blank('en')

# Add the NER component to the pipeline
ner = nlp.add_pipe('ner')

# Add labels to the NER
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Disable other pipelines during training to speed up
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for i in range(30):  # Increased number of iterations
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], losses=losses, drop=0.35, sgd=optimizer)  # Added dropout
        print(f"Iteration {i} - Losses: {losses}")

# Save the model
nlp.to_disk("symptom_ner_model")

# Debugging: Verify the model has been saved
print("Model saved successfully to 'symptom_ner_model'")

# Load the model for prediction
nlp = spacy.load("symptom_ner_model")

# Debugging: Verify the model has been loaded
print("Model loaded successfully from 'symptom_ner_model'")

# Example prediction
example_text = "The patient reports experiencing fatigue and a sore throat."
doc = nlp(example_text)

# Debugging: Print the text and entities found
print("Example text:", example_text)
print("Entities found:")
for ent in doc.ents:
    print(ent.text, ent.label_)

# If no entities are found, print a message
if not doc.ents:
    print("No entities found in the text.")






Iteration 0 - Losses: {'ner': 58.661210727412254}
Iteration 1 - Losses: {'ner': 27.579522136237582}
Iteration 2 - Losses: {'ner': 26.783685070355006}
Iteration 3 - Losses: {'ner': 28.039269514012688}
Iteration 4 - Losses: {'ner': 18.86654132587911}
Iteration 5 - Losses: {'ner': 14.916660465507949}
Iteration 6 - Losses: {'ner': 7.889804914272909}
Iteration 7 - Losses: {'ner': 5.230797578994348}
Iteration 8 - Losses: {'ner': 2.944683617489017}
Iteration 9 - Losses: {'ner': 2.0396894199997146}
Iteration 10 - Losses: {'ner': 0.028142137888378597}
Iteration 11 - Losses: {'ner': 0.029320044172177602}
Iteration 12 - Losses: {'ner': 0.3837996164736159}
Iteration 13 - Losses: {'ner': 0.3473510399120913}
Iteration 14 - Losses: {'ner': 0.2519995595679265}
Iteration 15 - Losses: {'ner': 2.4493962630755077}
Iteration 16 - Losses: {'ner': 0.012599696186268777}
Iteration 17 - Losses: {'ner': 0.00011818684129091169}
Iteration 18 - Losses: {'ner': 0.00029061461746272935}
Iteration 19 - Losses: {'ner': 