**Code for performing named entity recognition on very simple labeled patient health records**

In [18]:
import json
import spacy

# from google.colab import drive
# drive.mount('/content/drive')

# Consolidated Chia annotations: a JSON object that later cells index as
# data[<record id>]["Text"] and data[<record id>]["Annotations"].
# JSON is read as UTF-8 explicitly so decoding does not depend on the
# platform's default encoding.
with open("consolidated_chia_outfile.json", "r", encoding="utf-8") as chia_file:
    data = json.load(chia_file)

# Second dataset. NOTE(review): `data2` is not referenced by any later cell
# visible in this notebook -- confirm whether it is still needed.
with open("data.json", "r", encoding="utf-8") as data_file:
    data2 = json.load(data_file)


In [3]:
# Download the en_core_web_lg pipeline package so that
# spacy.load("en_core_web_lg") can find it.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [27]:
# Inspect the first annotation of record NCT00894712_inc to see the raw
# annotation string format.
data["NCT00894712_inc"]["Annotations"][0]

'Procedure 10 24--*--pathologically'

In [28]:
# Inspect the full text of record NCT00894712_inc.
data["NCT00894712_inc"]["Text"]

'Must have pathologically confirmed invasive adenocarcinoma or ductal carcinoma in situ of the breast. . Patients must have undergone segmental mastectomy (i.e., lumpectomy). . Patients must not have received prior radiation therapy to the breast. . Patients must not have active local-regional disease prior to registration. . Patients must not be pregnant because of the potential for fetal harm as a result of radiation treatment. Women of child-bearing age will be given a serum pregnancy test prior to study entry to ensure they are not pregnant. They will also be counseled on the importance of avoiding pregnancy and hormonal contraception while undergoing radiation therapy. . Patients must not have a serious medical or psychiatric illness which prevents informed consent or compliance with treatment. . All patients must be informed of the investigational nature of this study and give written informed consent in accordance with institutional and federal guidelines. . '

In [19]:
def is_valid_text(text):
    """Return True when ``text`` is a string with non-whitespace content.

    Args:
        text: Candidate value, e.g. the "Text" field of a record.

    Returns:
        bool: False for None, for non-string values, and for strings that
        are empty or contain only whitespace; True otherwise.
    """
    # Non-string values (a number, list, ... from malformed JSON) are rejected
    # rather than raising AttributeError on .strip().
    return isinstance(text, str) and text.strip() != ''

def is_valid_annotation(annotation):
    """Check that an annotation has the form ``"Label Start End--*--Text"``.

    Args:
        annotation: Candidate annotation value.

    Returns:
        bool: True only when ``annotation`` is a string with exactly one
        ``--*--`` separator, the part before it consists of exactly three
        space-separated fields whose second and third fields are decimal
        numbers, and the part after it is not blank. False otherwise,
        including for None and other non-string values.
    """
    # Reject None / non-strings up front instead of failing on .split().
    if not isinstance(annotation, str):
        return False
    parts = annotation.split('--*--')
    if len(parts) != 2:
        return False
    label_info, text = parts
    label_parts = label_info.split(' ')
    if len(label_parts) != 3:
        return False
    _, start, end = label_parts
    # isdecimal() only accepts characters that int() can parse; isdigit()
    # would also accept e.g. superscript digits, which make int() raise.
    return start.isdecimal() and end.isdecimal() and text.strip() != ''

def clean_json(data):
    """Return a copy of ``data`` restricted to usable records and annotations.

    Args:
        data: Mapping of record id -> record. A usable record is a dict with
            a "Text" entry accepted by ``is_valid_text``.

    Returns:
        dict: For every usable record, ``{"Text": ..., "Annotations": [...]}``
        where the list holds only the annotations accepted by
        ``is_valid_annotation``. Records that are not dicts or whose text is
        missing/invalid are dropped. The input mapping is not modified.
    """
    cleaned_data = {}
    for key, value in data.items():
        # Skip entries that are not records at all (e.g. null or a bare string)
        # and records without valid text.
        if not isinstance(value, dict) or not is_valid_text(value.get('Text')):
            continue
        # "Annotations" may be absent or explicitly null; treat both as empty.
        annotations = value.get('Annotations') or []
        cleaned_data[key] = {
            'Text': value['Text'],
            'Annotations': [ann for ann in annotations if is_valid_annotation(ann)],
        }
    return cleaned_data

# Keep only records with valid text, and only annotations that match the
# expected "Label Start End--*--Text" format.
data1 = clean_json(data)

In [20]:
from spacy.tokens import DocBin

# nlp = spacy.blank("en")

# Load the en_core_web_lg pipeline; in the cells shown it is used through
# nlp.make_doc() to turn raw text into Doc objects.
nlp = spacy.load("en_core_web_lg")

# Container that collects the annotated Docs before they are written to disk.
doc_bin = DocBin()

In [23]:
# training from chia data
# Builds train.spacy from the cleaned Chia records and holds out a small set
# of annotated records in `test_data` for the evaluation cell.
from spacy.tokens import DocBin
from spacy.util import filter_spans

# Number of annotated records held out of training and kept in `test_data`.
TEST_SET_SIZE = 50

test_data = []

# Start from an empty DocBin on every run so that re-executing this cell does
# not append the same documents to train.spacy a second time.
doc_bin = DocBin()

for example, record in data1.items():
    # A record needs both fields to be usable; skip it when either is missing.
    if "Text" not in record or "Annotations" not in record:
        continue

    # The first TEST_SET_SIZE records that carry annotations are held out and
    # never added to the training corpus.
    if len(test_data) < TEST_SET_SIZE and len(record["Annotations"]) > 0:
        test_data.append(record)
        continue

    text = record["Text"]

    # Empty or punctuation-only texts carry nothing to train on.
    if text in ('.', '', ',', ':'):
        continue

    doc = nlp.make_doc(text)
    ents = []

    for entity in record["Annotations"]:
        if not entity:
            continue
        # Annotation format: "Label Start End--*--Text"; only the header
        # (label and character offsets) is needed to build the span.
        header = entity.split('--*--')[0].split(' ')
        if len(header) < 3:
            continue
        label, start, end = header[0], header[1], header[2]
        # isdecimal() guarantees that int() below can parse the offsets.
        if not (start.isdecimal() and end.isdecimal()):
            continue
        # char_span returns None when the character offsets do not map onto
        # token boundaries; such annotations are dropped.
        span = doc.char_span(int(start), int(end), label=label)
        if span is not None:
            ents.append(span)

    # doc.ents cannot contain overlapping spans; filter_spans removes
    # duplicates/overlaps, preferring the longest span.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

doc_bin.to_disk("train.spacy")


[{'Text': 'Must have pathologically confirmed invasive adenocarcinoma or ductal carcinoma in situ of the breast. . Patients must have undergone segmental mastectomy (i.e., lumpectomy). . Patients must not have received prior radiation therapy to the breast. . Patients must not have active local-regional disease prior to registration. . Patients must not be pregnant because of the potential for fetal harm as a result of radiation treatment. Women of child-bearing age will be given a serum pregnancy test prior to study entry to ensure they are not pregnant. They will also be counseled on the importance of avoiding pregnancy and hormonal contraception while undergoing radiation therapy. . Patients must not have a serious medical or psychiatric illness which prevents informed consent or compliance with treatment. . All patients must be informed of the investigational nature of this study and give written informed consent in accordance with institutional and federal guidelines. . ', 'Annota

In [5]:
# from spacy.util import filter_spans

# training_data = []

# for example in data1["data"]:
#   temp_lst = []

#   text = example['entities'][0]['text']

#   labels = []
#   for entity in example['entities']:
#     start = entity['indexes'].split(',')[0]
#     end = entity['indexes'].split(',')[1]
#     label = entity['label']
#     labels.append((start, end, label))

#   doc = nlp.make_doc(text)
#   ents = []

#   for start, end, label in labels:
#     span = doc.char_span(int(start), int(end), label=label, alignment_mode="contract")
#     if span is None:
#       print("Skipping entity")
#     else:
#       ents.append(span)
#   filtered_ents = filter_spans(ents)
#   doc.ents = filtered_ents
#   doc_bin.add(doc)

# doc_bin.to_disk("train.spacy")

In [24]:
# Fill in base_config.cfg with all remaining settings and save the result as
# config.cfg, which the training command reads.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [25]:
# Train the pipeline described by config.cfg, saving output to the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R columns reported during training are measured on
# the training data itself, not on held-out data.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     77.41    0.00    0.00    0.00    0.00
  0     200       1188.28   8357.38   12.67   36.50    7.67    0.13
  0     400       1645.19   8427.26   14.47   16.62   12.81    0.14
  0     600        831.48   7351.95   20.04   41.80   13.18    0.20
  0     800       3458.30   7953.56   31.40   47.59   23.43    0.31
  0    1000       1079.01   8314.19   35.84   52.49   27.21    0.36
  0    1200        894.21   8803.76   38.99   44.67   34.60    0.39
^C


In [26]:
# Load the pipeline stored in the "model-best" directory (relative to the
# working directory) -- presumably the best checkpoint written by the
# `spacy train --output ./` run; confirm it exists before running this cell.
nlp_ner = spacy.load("model-best")

In [66]:
# doc = nlp_ner("Severe cancer and Leukemia Group A. Cardiac MRI, and/or lung function tests (PFT) performed and reviewed by transplant center (for individuals with an ejection fraction and diffusing capacity [DLCO] of 40-50%, the appropriate cardiology or pulmonary consultations should be considered if the individual has severe heart or lung disease at the initiation of therapy) Sufficient number of umbilical cord blood units available for transplantation If male, willing to use contraception.")
# colors = {"CONDITION": "#F67DE3", "QUALIFIER": "#7DF6D9", "PROCEDURE":"#a6e22d"}
# options = {"colors": colors}
# spacy.displacy.render(doc, style="ent", options= options, jupyter=True)
# print(doc.ents)

# For every held-out record, collect its expected (label, value) pairs.
# patient_data[i] corresponds to test_data[i].
patient_data = []
for record in test_data:
    attributes = []
    for entity in record["Annotations"]:
        # Annotation format: "Label Start End--*--Text". Only the label and
        # the annotated text are needed here; the offsets are ignored.
        header, expected_value = entity.split('--*--')[:2]
        label = header.split(' ')[0]
        attributes.append((label, expected_value))
    patient_data.append(attributes)

def generateScore(attributes,doc):
    """Return the fraction of expected attribute values found among ``doc.ents``.

    Args:
        attributes: Sequence of ``(label, expected_value)`` pairs.
        doc: Object exposing ``ents``, an iterable of entities that each have
            a ``text`` attribute (a spaCy Doc in this notebook).

    Returns:
        float: Number of attributes whose expected value is exactly equal to
        the text of some entity in ``doc.ents``, divided by the number of
        attributes. Returns 0.0 when ``attributes`` is empty.

    Note:
        Only the entity text is compared; the label in each pair is not
        checked against the predicted entity label.
    """
    attributes = list(attributes)
    # No expectations means nothing can match; avoid dividing by zero.
    if not attributes:
        return 0.0
    # A set makes each membership test constant-time.
    predicted_texts = {ent.text for ent in doc.ents}
    matched = sum(1 for attribute in attributes if attribute[1] in predicted_texts)
    return matched / len(attributes)


# For each held-out record, run the trained model on its text and find the
# entry of patient_data whose expected attributes best match the predicted
# entities. A record counts as correct when it is matched to its own entry.
resultarr = []
for testidx, test in enumerate(test_data):
    doc = nlp_ner(test["Text"])
    best_score = 0
    # None (rather than a magic index) marks "no candidate scored above 0",
    # so it can never be mistaken for a real position.
    matchidx = None
    for i, attributes in enumerate(patient_data):
        score = generateScore(attributes, doc)
        # Strict '>' keeps the earliest candidate when scores tie.
        if score > best_score:
            matchidx = i
            best_score = score
    resultarr.append((testidx, matchidx))

finaldenom = len(resultarr)
finalscore = sum(1 for expected_idx, matched_idx in resultarr if expected_idx == matched_idx)
if finaldenom:
    print("Final score: " + str(finalscore/finaldenom) + " proportion of ideal patients identified correctly")
else:
    # Nothing was held out, so the proportion is undefined.
    print("Final score: undefined (test_data is empty)")

            
        
    
#     print("expected values: ")
#     txtarr = []
#     for ent in doc.ents:
#         txtarr.append(ent.text)
#     for entity in entities:
#         score+=1
#         finalscore+=1
#         sp = entity.split('--*--')[0].split(' ');
#         label = sp[0]
#         expected_value = entity.split('--*--')[1]
#         print(expected_value + " should have label " + label)
#         if expected_value in txtarr:
#             print(expected_value + " correctly identified ")
#         else:
#             print(expected_value + " not identified")
#             score-=1
            
#     print("actual identifications of model: ")
#     for ent in doc.ents:
#         print(ent.text + " has label " + ent.label_)

# print("Final Score: " + str(score/finalscore))
        
        
   
            

    


Final score: 0.9 proportion of ideal patients identified correctly


In [61]:
#Patient data to test




        





NameError: name 'Object' is not defined