Paper: https://dl.acm.org/doi/pdf/10.1145/3459930.3469560  
Performance: https://github.com/ctgatecci/Clinical-trial-eligibility-criteria-NER  
Code: https://github.com/uf-hobi-informatics-lab/ClinicalTransformerNER  

In [1]:
import os
import re
from more_itertools import locate

import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
# Directory locations (relative paths) keyed by a short name; later cells
# build file paths by indexing this dict, e.g. Paths['Chia_w_scope'].
Paths = {
    'Data': './../Data/',
    'Chia_w_scope': './../Data/chia_with_scope/',
    'Chia_wo_scope': './../Data/chia_without_scope/',
    'output_path': './../Data/output/'
}

In [3]:
# Peek at five entries of the with-scope directory (in os.listdir order).
os.listdir(Paths['Chia_w_scope'])[:5]

['NCT02984475_exc.ann',
 'NCT02952378_inc.txt',
 'NCT03058835_exc.txt',
 'NCT02283905_exc.ann',
 'NCT02721017_exc.txt']

In [4]:
# Collect the distinct file-name prefixes (the part before the first '_',
# e.g. 'NCT02984475') of every .ann file in the with-scope directory.
# Scanning Paths['Chia_wo_scope'] as well is currently disabled.
inputfiles = set()
for f in os.listdir(Paths['Chia_w_scope']):
    if not f.endswith('.ann'):
        continue
    inputfiles.add(f.partition('.')[0].partition('_')[0])
len(inputfiles)

1000

In [5]:
# Entity types of interest: database columns for which we want the
# relationship.
entity_types = [
    'Condition',
    'Drug',
    'Procedure',
    'Measurement',
    'Observation',
    'Person',
    'Device',
    'Value',
    'Temporal',
    'Qualifier',
    'Negation',
]

# Other domains worth checking: Mood

# Relation types of interest.
relation_type = [
    'OR',
    'AND',
    'Has_qualifier',
    'Has_value',
    'Has_negation',
    'Has_temporal',
    'Has_context',
]

# Not included above: Has_scope, Subsumes, multi, Has_index

In [7]:
# Pick one sample file pair (annotations + raw text) for the cells below;
# the commented-out lines are alternative samples.
# ann = Paths['Chia_w_scope'] + 'NCT02348918_inc.ann'
# txt = Paths['Chia_w_scope'] + 'NCT02348918_inc.txt'
ann = Paths['Chia_w_scope'] + 'NCT01320579_exc.ann'
txt = Paths['Chia_w_scope'] + 'NCT01320579_exc.txt'

# ann = Paths['Chia_w_scope'] + 'NCT00050349_inc.ann'
ann, txt

('./../Data/chia_with_scope/NCT01320579_exc.ann',
 './../Data/chia_with_scope/NCT01320579_exc.txt')

In [8]:
def get_annotation_entities(ann_file, entity_types=None):
    """Read the entity ("T") lines of an annotation file.

    Every line starting with 'T' must hold three tab-separated fields: the
    entity id, a "<type> <start> ... <end>" descriptor and the annotated
    text.  All other lines are ignored.

    Args:
        ann_file: Path of the annotation file (read as UTF-8).
        entity_types: Optional collection of type names to keep.  When None,
            entities of every type are returned.

    Returns:
        A list of (entity_id, start, end, entity_type, entity_text) tuples
        sorted by end offset.  start is the second token of the descriptor
        and end is its last token, so a descriptor carrying more than two
        offsets yields the outermost pair.

    Raises:
        AssertionError: If a 'T' line does not have exactly three
            tab-separated fields.
    """
    entities = []
    with open(ann_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.startswith('T'):
                continue

            # Split the line once and reuse the pieces.
            fields = line.strip().split('\t')
            assert len(fields) == 3, "Malformed entity line: %r" % line
            entity_identity, descriptor, entity_token = fields

            term = descriptor.split()
            # Filter on the type before parsing the offsets.
            if entity_types is not None and term[0] not in entity_types:
                continue

            start, end = int(term[1]), int(term[-1])
            # Suspicious spans are reported but still kept.
            if end <= start:
                print('Starting and Ending Indices are off.')
            entities.append((entity_identity, start, end, term[0], entity_token))

    return sorted(entities, key=lambda x: (x[2]))

# Parse the sample annotation file, keeping only the types in entity_types,
# and display the result.
sorted_entities= get_annotation_entities(ann, entity_types)
sorted_entities

Starting and Ending Indices are off.


[('T3', 0, 7, 'Observation', 'History'),
 ('T2', 17, 28, 'Qualifier', 'significant'),
 ('T1', 29, 41, 'Condition', 'skin disease'),
 ('T6', 46, 65, 'Condition', 'skin manifestations'),
 ('T4', 69, 85, 'Condition', 'allergic illness'),
 ('T5', 95, 117, 'Condition', 'dermatologic condition'),
 ('T13', 119, 125, 'Negation', 'except'),
 ('T10', 126, 142, 'Qualifier', 'chronic moderate'),
 ('T9', 146, 152, 'Qualifier', 'severe'),
 ('T8', 153, 170, 'Condition', 'atopic dermatitis'),
 ('T14', 327, 340, 'Condition', 'skin diseases'),
 ('T16', 342, 348, 'Negation', 'except'),
 ('T15', 349, 374, 'Condition', 'chronic atopic dermatitis'),
 ('T17',
  381,
  442,
  'Qualifier',
  'could disturb the study assessment and evaluation of the skin'),
 ('T22', 463, 469, 'Temporal', 'active'),
 ('T21', 470, 489, 'Drug', 'systemic medication'),
 ('T19', 494, 519, 'Condition', 'chronic atopic dermatitis'),
 ('T20', 520, 536, 'Temporal', 'within one month'),
 ('T26', 553, 559, 'Temporal', 'active'),
 ('T25', 

In [9]:
def remove_overlap_entities(sorted_entities):
    """Return the entities untouched together with the list of their ids.

    Despite the name, no overlap filtering is performed: the input is passed
    through as-is.

    Args:
        sorted_entities: Sequence of entity tuples whose first item is the
            entity id.

    Returns:
        A pair (keep_entities, uniqueEntity): the very object that was passed
        in, and a list holding the first item of each entity in order.
    """
    entity_ids = [entity[0] for entity in sorted_entities]
    return sorted_entities, entity_ids

# Split the parsed entities into the kept list and the list of their ids,
# then display the kept entities.
keep_entities, uniqueEntity = remove_overlap_entities(sorted_entities)
keep_entities

[('T1', 0, 4, 'Person', 'Male'),
 ('T2', 8, 14, 'Person', 'female'),
 ('T4', 28, 31, 'Person', 'age'),
 ('T3', 16, 40, 'Value', '18 years or older'),
 ('T6', 57, 79, 'Qualifier', 'clinically significant'),
 ('T5', 80, 108, 'Condition', 'diabetic macular edema (DME)'),
 ('T7', 114, 140, 'Measurement', 'central subfield thickness'),
 ('T8', 141, 148, 'Value', '≥ 350µm'),
 ('T9', 152, 171, 'Procedure', 'spectral domain OCT'),
 ('T10', 172, 207, 'Measurement', 'Best corrected visual acuity (BCVA)'),
 ('T11', 211, 243, 'Value', '20/50 to 20/320 ETDRS equivalent'),
 ('T12', 245, 269, 'Value', '65 letters to 23 letters'),
 ('T13', 271, 287, 'Qualifier', 'in the study eye'),
 ('T15', 340, 355, 'Observation', 'Treatment naïve'),
 ('T16', 363, 365, 'Negation', 'no'),
 ('T17', 366, 374, 'Temporal', 'previous'),
 ('T18', 375, 394, 'Procedure', 'anti-VEGF treatment'),
 ('T19', 395, 411, 'Qualifier', 'in the study eye'),
 ('T20', 415, 417, 'Negation', 'no'),
 ('T21', 418, 437, 'Procedure', 'anti-VEG

In [10]:
def checkEntityValue(e):
    """Return the bare entity id carried by a relation argument.

    A token that already starts with 'T' is returned as-is; otherwise only
    the part after the last ':' is kept (e.g. 'Arg1:T5' -> 'T5').  In both
    cases surrounding whitespace is stripped.
    """
    entity_id = e if e.startswith('T') else e.split(':')[-1]
    return entity_id.strip()


# https://datagy.io/python-list-find-all-index/
def get_annotation_relations(ann_file, uniqueEntity):
    """Read the relation lines ('R...' and '*') of an annotation file.

    Each such line must hold two tab-separated fields: the relation id and a
    "<type> <arg> <arg> ..." descriptor.  A relation is kept only when its
    type is listed in the module-level ``relation_type`` and every one of its
    arguments (normalised with ``checkEntityValue``) is a known entity id.

    Args:
        ann_file: Path of the annotation file (read as UTF-8).
        uniqueEntity: Iterable of entity ids that relations may refer to.

    Returns:
        A list of (relation_id, relation_type, "id1 id2 ...") tuples in file
        order.

    Raises:
        AssertionError: If a relation line does not have exactly two
            tab-separated fields.
    """
    # Build the lookup once; membership is tested for every argument.
    known_entities = set(uniqueEntity)
    relations = []
    with open(ann_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.startswith(('R', '*')):
                continue

            fields = line.strip().split('\t')
            assert len(fields) == 2, "Malformed relation line: %r" % line
            rel, descriptor = fields

            tokens = descriptor.split()
            rel_type = tokens[0]
            if rel_type not in relation_type:
                continue

            entities = [checkEntityValue(e) for e in tokens[1:]]
            # A relation needs at least two arguments; skip anything shorter
            # instead of failing on a missing index.
            if len(entities) < 2:
                continue
            # Check every argument, not just the first two: '*' lines may
            # list more than two entities.
            if any(e not in known_entities for e in entities):
                continue
            relations.append((rel, rel_type, ' '.join(entities)))

    return relations
# Display the relations of the sample file that refer to the kept entity ids.
get_annotation_relations(ann, uniqueEntity)

[('*', 'OR', 'T1 T2'),
 ('R1', 'Has_value', 'T4 T3'),
 ('R2', 'Has_qualifier', 'T5 T6'),
 ('R3', 'AND', 'T9 T7'),
 ('R4', 'Has_value', 'T7 T8'),
 ('R5', 'AND', 'T5 T9'),
 ('R7', 'Has_qualifier', 'T10 T13'),
 ('R8', 'Has_value', 'T10 T11'),
 ('R9', 'Has_qualifier', 'T18 T19'),
 ('R10', 'Has_temporal', 'T18 T17'),
 ('R11', 'Has_negation', 'T18 T16'),
 ('R13', 'Has_temporal', 'T21 T22'),
 ('R14', 'Has_negation', 'T21 T20'),
 ('*', 'OR', 'T21 T18'),
 ('R17', 'Has_qualifier', 'T27 T26'),
 ('R18', 'Has_qualifier', 'T27 T28'),
 ('R20', 'Has_value', 'T35 T36'),
 ('R21', 'Has_qualifier', 'T35 T37'),
 ('R22', 'Has_value', 'T33 T34'),
 ('R23', 'Has_negation', 'T38 T39'),
 ('R24', 'Has_qualifier', 'T38 T40')]

In [11]:
# Load the raw lines of the sample text file.
with open(txt, "r", encoding="utf-8") as f:
    text_array = f.readlines()

# Name of the loaded file without directory or extension (e.g.
# 'NCT01320579_exc'), so the special-case list below can actually match it.
file = os.path.splitext(os.path.basename(txt))[0]

# Files flagged as having inconsistent offsets are joined with a single
# space; every other file is joined with two spaces.
if file in ['NCT02348918_exc', 'NCT02348918_inc', 'NCT01735955_exc']: # Inconsistent offsets
    text = ' '.join([i.strip() for i in text_array])
else:
    text = '  '.join([i.strip() for i in text_array])

text

"Male or female, 18 years of age or older.  Study eye with clinically significant diabetic macular edema (DME) with central subfield thickness ≥ 350µm on spectral domain OCT  Best corrected visual acuity (BCVA) of 20/50 to 20/320 ETDRS equivalent (65 letters to 23 letters) in the study eye, with BCVA decrement primarily attributable to DME.  Treatment naïve, i.e., no previous anti-VEGF treatment in the study eye or no anti-VEGF treatment in the 45 days prior to study enrollment.  In the investigator's opinion, the subject still has significant intraretinal fluid with room for improvement in both macular edema and BCVA.  Intra-Ocular Pressure (IOP) is under control (i.e., IOP ≤ 25 mm in the study eye) and study eye is not receiving any IOP lowering drops.  Willing and able to return for all study visits.  Able to meet the extensive post-op evaluation regimen.  Understands and signs the informed consent form."

In [12]:
def removePunctuation(word):
    """Strip at most one '.', ',', '(' or ')' from each end of ``word``.

    Only a single character is removed per side, so 'i.e.,' becomes 'i.e.'
    and '(DME)' becomes 'DME'.
    """
    without_leading = re.sub(r'^(\.|,|\(|\))', '', word)
    return re.sub(r'(\.|,|\(|\))$', '', without_leading)

In [13]:
# Tokenise every line of text_array and record the character span it covers.
# Each entry is (tokens, start_offset, end_offset); offsets accumulate the
# full length of each line as read, before any characters are removed.
globalText = []
offset = 0
# The loop variable is deliberately not called `txt`, so the file path held
# in `txt` is not overwritten.
for raw_line in text_array:
    textlen = len(raw_line)
    # Drop a period that ends the line, then any remaining newline.
    cleaned = raw_line.replace('.\n', '').replace('\n', '')

    globalText.append(([removePunctuation(w) for w in cleaned.split()], offset, offset + textlen))
    offset += textlen

globalText

[(['Male', 'or', 'female', '18', 'years', 'of', 'age', 'or', 'older'], 0, 42),
 (['Study',
   'eye',
   'with',
   'clinically',
   'significant',
   'diabetic',
   'macular',
   'edema',
   'DME',
   'with',
   'central',
   'subfield',
   'thickness',
   '≥',
   '350µm',
   'on',
   'spectral',
   'domain',
   'OCT'],
  42,
  172),
 (['Best',
   'corrected',
   'visual',
   'acuity',
   'BCVA',
   'of',
   '20/50',
   'to',
   '20/320',
   'ETDRS',
   'equivalent',
   '65',
   'letters',
   'to',
   '23',
   'letters',
   'in',
   'the',
   'study',
   'eye',
   'with',
   'BCVA',
   'decrement',
   'primarily',
   'attributable',
   'to',
   'DME'],
  172,
  340),
 (['Treatment',
   'naïve',
   'i.e.',
   'no',
   'previous',
   'anti-VEGF',
   'treatment',
   'in',
   'the',
   'study',
   'eye',
   'or',
   'no',
   'anti-VEGF',
   'treatment',
   'in',
   'the',
   '45',
   'days',
   'prior',
   'to',
   'study',
   'enrollment'],
  340,
  480),
 (['In',
   'the',
   "investigat

In [33]:
# Project the entities onto the tokens of each line: for every line print its
# tokens, one tag per token and one entity id per token ('O' = no entity).
# NOTE(review): the loop variable `text` rebinds the module-level `text`
# string built in the file-loading cell.
for text in globalText:
    words = text[0]
    tags = ['O']*len(words)
    entity_identity = ['O']*len(words)
    # Token positions on this line already assigned to an entity.
    sent_indices = set()
    for k in keep_entities:
        # Only entities whose start/end offsets fall inside this line's span.
        if k[1] >= text[1] and k[2] <= text[2]:
            break_down = [removePunctuation(v) for v in k[-1].split()]
            main_index = 0
            # Single-word entities keep an empty prefix, so their tag is the
            # bare type name; multi-word entities get 'B-' then 'I-'.
            label = ''
            for i, w in enumerate(break_down):
                # Every position on the line where this word occurs.
                indices = list(locate(words, lambda x: x == w))

                if i == 0:
                    # NOTE(review): raises IndexError when the entity's first
                    # word is not among this line's tokens.
                    main_index = indices[0]
                    if len(break_down) > 1: label = 'B-'
                else:
                    label = 'I-'
                # Keep candidates at or after the first word's first
                # occurrence that are not yet taken; use the earliest one.
                indices= list(filter(lambda x: x >= main_index, indices))
                indices= list(filter(lambda x: x not in sent_indices, indices))
                if len(indices) != 0:
                    sent_indices.add(indices[0])
                    tags[indices[0]] = label + k[3]
                    entity_identity[indices[0]] = k[0]
    print(words)
    print(tags)
    print(entity_identity)
    print('-'*20)

['Male', 'or', 'female', '18', 'years', 'of', 'age', 'or', 'older']
['Person', 'O', 'Person', 'B-Value', 'I-Value', 'O', 'Person', 'I-Value', 'I-Value']
['T1', 'O', 'T2', 'T3', 'T3', 'O', 'T4', 'T3', 'T3']
--------------------
['Study', 'eye', 'with', 'clinically', 'significant', 'diabetic', 'macular', 'edema', 'DME', 'with', 'central', 'subfield', 'thickness', '≥', '350µm', 'on', 'spectral', 'domain', 'OCT']
['O', 'O', 'O', 'B-Qualifier', 'I-Qualifier', 'B-Condition', 'I-Condition', 'I-Condition', 'I-Condition', 'O', 'B-Measurement', 'I-Measurement', 'I-Measurement', 'B-Value', 'I-Value', 'O', 'B-Procedure', 'I-Procedure', 'I-Procedure']
['O', 'O', 'O', 'T6', 'T6', 'T5', 'T5', 'T5', 'T5', 'O', 'T7', 'T7', 'T7', 'T8', 'T8', 'O', 'T9', 'T9', 'T9']
--------------------
['Best', 'corrected', 'visual', 'acuity', 'BCVA', 'of', '20/50', 'to', '20/320', 'ETDRS', 'equivalent', '65', 'letters', 'to', '23', 'letters', 'in', 'the', 'study', 'eye', 'with', 'BCVA', 'decrement', 'primarily', 'attrib