# configurations

In [4]:
import os
import json
import copy
from glob import glob
from tqdm import tqdm
import numpy as np
def read_json( file_path ) :
    """Load a JSON file and return the parsed Python object.

    Parameters
    ----------
    file_path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file (dict, list, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # JSON interchange files are UTF-8; opening with the locale default
    # encoding can mis-decode (or fail on) non-ASCII text on some platforms.
    with open( file_path, 'r', encoding='utf-8' ) as f :
        return json.load( f )


# Substring filter on prediction file names: cells that check it only touch
# prediction files whose name contains target_model_name.
# Set it to '' to include every model.
# This assumes your prediction files carry the model name in their file name.
target_model_name = "YOUR-SYSTEM-MODEL"

# where you put the reference file
reference_dir = "YOUR-REFERENCE-DIR"
# where you put your prediction, e.g. gpt4vision-run1.json
prediction_dir = "YOUR-PREDICTION-DIR"
# where to output the score.json file
score_dir = "YOUR-SCORE-DIR"
# where to store the intermediate files for assertion labels
intermediate_dir = "YOUR-INTERMEDIA_DIR-DIR"

# The directory where the UMLS concepts are installed.
# To set up QuickUMLS, see the directions at: https://github.com/Georgetown-IR-Lab/QuickUMLS
quickumls_fp = "YOUR-QUICKUMLS-INSTALLATION"
# Stop-word lists are read from UMLS_stop_words.json inside reference_dir.
UMLS_stop_words = read_json(os.path.join(reference_dir, "UMLS_stop_words.json"))

# extracting UMLS concepts from the predictions

In [5]:
# configuring QuickUMLS

# (semantic type code, semantic type name) pairs; the two parallel lists below
# are derived from this single table so that positions always line up.
_UMLS_SEMANTIC_PAIRS = [
    ('T116', 'Amino Acid, Peptide, or Protein'),
    ('T195', 'Antibiotic'),
    ('T123', 'Biologically Active Substance'),
    ('T122', 'Biomedical or Dental Material'),
    ('T200', 'Clinical Drug'),
    ('T196', 'Element, Ion, or Isotope'),
    ('T126', 'Enzyme'),
    ('T131', 'Hazardous or Poisonous Substance'),
    ('T125', 'Hormone'),
    ('T129', 'Immunologic Factor'),
    ('T130', 'Indicator, Reagent, or Diagnostic Aid'),
    ('T197', 'Inorganic Chemical'),
    ('T114', 'Nucleic Acid, Nucleoside, or Nucleotide'),
    ('T109', 'Organic Chemical'),
    ('T121', 'Pharmacologic Substance'),
    ('T127', 'Vitamin'),
    ('T020', 'Acquired Abnormality'),
    ('T190', 'Anatomical Abnormality'),
    ('T049', 'Cell or Molecular Dysfunction'),
    ('T019', 'Congenital Abnormality'),
    ('T047', 'Disease or Syndrome'),
    ('T050', 'Experimental Model of Disease'),
    ('T033', 'Finding'),
    ('T037', 'Injury or Poisoning'),
    ('T048', 'Mental or Behavioral Dysfunction'),
    ('T191', 'Neoplastic Process'),
    ('T046', 'Pathologic Function'),
    ('T184', 'Sign or Symptom'),
    ('T060', 'Diagnostic Procedure'),
    ('T059', 'Laboratory Procedure'),
    ('T063', 'Molecular Biology Research Technique'),
    ('T061', 'Therapeutic or Preventive Procedure'),
]
UMLS_semantics_types = [code for code, _ in _UMLS_SEMANTIC_PAIRS]
UMLS_semantics_names = [name for _, name in _UMLS_SEMANTIC_PAIRS]

# Coarse concept category -> semantic type names that belong to it.
# Some names listed here ("Chemical", "Chemical Viewed Functionally",
# "Chemical Viewed Structurally", "Receptor") do not appear in
# UMLS_semantics_names above.
UMLS_type_map = {
    "Treatment": [
        "Amino Acid, Peptide, or Protein",
        "Antibiotic",
        "Biologically Active Substance",
        "Biomedical or Dental Material",
        "Chemical",
        "Chemical Viewed Functionally",
        "Chemical Viewed Structurally",
        "Clinical Drug",
        "Element, Ion, or Isotope",
        "Enzyme",
        "Hazardous or Poisonous Substance",
        "Hormone",
        "Immunologic Factor",
        "Indicator, Reagent, or Diagnostic Aid",
        "Inorganic Chemical",
        "Nucleic Acid, Nucleoside, or Nucleotide",
        "Organic Chemical",
        "Pharmacologic Substance",
        "Receptor",
        "Vitamin",
        "Therapeutic or Preventive Procedure",
    ],
    "Disease": [
        "Acquired Abnormality",
        "Anatomical Abnormality",
        "Cell or Molecular Dysfunction",
        "Congenital Abnormality",
        "Disease or Syndrome",
        "Experimental Model of Disease",
        "Finding",
        "Injury or Poisoning",
        "Mental or Behavioral Dysfunction",
        "Neoplastic Process",
        "Pathologic Function",
        "Sign or Symptom",
    ],
    "Test": [
        "Diagnostic Procedure",
        "Laboratory Procedure",
        "Molecular Biology Research Technique",
    ],
}

from quickumls import QuickUMLS
# Build the QuickUMLS matcher from the local installation at quickumls_fp,
# passing UMLS_semantics_types as the accepted semantic types.
matcher = QuickUMLS(
    quickumls_fp,
    window=5,
    threshold=0.9,
    accepted_semtypes=UMLS_semantics_types,
)

2024-03-07 00:00:14.273124: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-07 00:00:14.460214: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-07 00:00:15.038602: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.4/lib64
2024-03-07 00:00:15.038668: W tensorflow/compiler/xla/stream

In [6]:
# extracting concepts from QuickUMLS
normalized_concepts_dir='final_UMLS_sets.json'
normalized_concepts=read_json(normalized_concepts_dir)


def _normalize_term(term, cui):
    """Return the normalized concept name for a matched term.

    Walks normalized_concepts in order and returns the first key whose entry
    lists `term` under "concepts" or `cui` under "cui". When no entry matches,
    `term` is returned unchanged.
    """
    for name, entry in normalized_concepts.items():
        if term in entry["concepts"] or cui in entry["cui"]:
            return name
    return term


def _summarize_match(candidates):
    """Aggregate the candidates of one QuickUMLS match into a concept record.

    Candidates are skipped when their CUI or term is a stop word, or when none
    of their semantic types is in UMLS_semantics_types. For the remaining ones
    the (normalized) term, CUI, character span and semantic type names are
    collected without duplicates. "type" is assigned once, right after the
    first accepted candidate, from the semantic type names seen so far; it
    stays "" when no candidate is accepted.
    """
    concept = {
        'term': [],
        'cui': [],
        'semantic_types': [],
        "type": "",
        'index': [],
    }
    for cand in candidates:
        if cand['cui'] in UMLS_stop_words['cuis']:
            continue
        if cand['term'] in UMLS_stop_words['terms']:
            continue
        if not any(code in UMLS_semantics_types for code in cand['semtypes']):
            continue

        term = _normalize_term(cand['term'].lower(), cand['cui'])
        if term not in concept['term']:
            concept['term'].append(term)

        if cand['cui'] not in concept['cui']:
            concept['cui'].append(cand['cui'])

        span = (cand['start'], cand['end'])
        if span not in concept['index']:
            concept['index'].append(span)

        for code in cand['semtypes']:
            if code not in UMLS_semantics_types:
                continue
            type_name = UMLS_semantics_names[UMLS_semantics_types.index(code)]
            if type_name not in concept['semantic_types']:
                concept['semantic_types'].append(type_name)

        if not concept['type']:
            seen_names = set(concept['semantic_types'])
            # first category (in UMLS_type_map order) sharing a semantic type name
            concept['type'] = [
                label for label, names in UMLS_type_map.items()
                if not seen_names.isdisjoint(names)
            ][0]
    return concept


for exp in tqdm(glob(f'{prediction_dir}/*.json')):
    if target_model_name not in exp:
        continue
    prediction = read_json(exp)
    for UMLS_source in ['UMLS_en']:
        # the original notes "content_zh2en" as an alternative non-English key
        response_key = "content_zh" if "en" not in UMLS_source else "content_en"
        for idx, response in enumerate(prediction):
            # only the first response of each entry is annotated
            first_response = prediction[idx]["responses"][0]
            first_response[UMLS_source] = []
            text = response["responses"][0][response_key]
            for match in matcher.match(text, ignore_syntax=True):
                concept = _summarize_match(match)
                if concept['type'] != "":
                    first_response[UMLS_source].append(concept)
    # the prediction file is overwritten in place with the added concepts
    with open(exp, 'w') as f:
        json.dump(prediction, f, indent=4)

100%|██████████| 16/16 [00:14<00:00,  1.08it/s]


# generating assertion labels for those concepts

In [8]:
# Build the assertion-classifier input: one record per response that has
# extracted concepts, with every concept wrapped in place as "<...>(type)".
output=[]
for file in glob('{}/*.json'.format(prediction_dir)):
    if target_model_name not in file:
        continue
    # file name part of the record id (named file_id to avoid shadowing builtin id)
    file_id=file.split('/')[-1]
    dics=read_json(file)
    for dic in dics:
        for idx,r in enumerate(dic['responses']):
            # The extraction step only annotates the first response of each
            # entry, so other responses may have no "UMLS_en" key at all.
            concepts=r.get("UMLS_en")
            if not concepts:
                continue
            # work on a list of characters so markers can be attached in place
            # without shifting the character offsets of later concepts
            text=list(r["content_en"])
            for concept in concepts:
                # only the first recorded span of each concept is marked
                start,end=concept["index"][0]
                text[start]="<"+text[start]
                text[end-1]=text[end-1]+f">({concept['type']})"

            record_id='[SEP]'.join([file_id,dic['encounter_id'],str(idx)])
            output.append({
                "id": record_id,
                "instruction": "Decide the status value for each medical problem event. Choose from present, absent, possible, conditional, hypothetical, not_patient.",
                "input": ''.join(text),
                # the record id is also stored in the "output" field
                "output": record_id
            })
# make sure the target folder exists before writing
os.makedirs(intermediate_dir, exist_ok=True)
with open(f'{intermediate_dir}/{target_model_name}_assertion_input.json','w') as f:
    json.dump(output,f,indent=4)

In [None]:
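# Run the assertion classifier on the input file written above (using its "instruction" field) and save the predictions as a JSONL file whose name ends in "assertion_output.jsonl" inside intermediate_dir (e.g. a folder named "intermediate_files").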

In [9]:
# Parse the assertion classifier output into
# assertion_labels[record label][lower-cased concept text] = status.
# Each non-empty line of a *assertion_output.jsonl file is a JSON object with a
# 'label' and a 'predict' field; 'predict' holds '[SEP]'-separated items of the
# form "<concept text><status><status value>".
_valid_statuses=['present', 'absent', 'possible', 'conditional','hypothetical', 'not_patient']
assertion_labels={}
for assertion_file in glob(f'{intermediate_dir}/*assertion_output.jsonl'):
    # context manager so the file handle is closed after reading
    with open(assertion_file, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            dic=json.loads(line)
            assertion_labels[dic['label']]={}
            for w in dic['predict'].split('[SEP]'):
                if '<status>' not in w:
                    continue
                # Keep only the first two parts: an item with a repeated
                # '<status>' marker would otherwise fail to unpack.
                key,status=w.split('<status>')[:2]
                if status.strip() in _valid_statuses:
                    assertion_labels[dic['label']][key.strip().lower()]=status.strip()
# display the labels parsed from the last line read, as a quick sanity check
assertion_labels[dic['label']]

{'comedo': 'present',
 'acne': 'present',
 'blockage': 'present',
 'topical creams': 'hypothetical',
 'benzoyl peroxide': 'hypothetical',
 'salicylic acid': 'hypothetical',
 'exfoliation': 'present',
 'dermatological': 'hypothetical',
 'procedure': 'hypothetical'}

In [11]:
# Attach the assertion status to every extracted concept and write the
# prediction files back in place.
for file in glob('{}/*.json'.format(prediction_dir)):
    if target_model_name not in file:
        continue
    # file name part of the lookup key; it is the same for the whole file
    file_id=file.split('/')[-1]
    dics=read_json(file)
    for dic in dics:
        for idx,r in enumerate(dic['responses']):
            # The extraction step only annotates the first response of each
            # entry, so other responses may have no "UMLS_en" key at all.
            concepts=r.get("UMLS_en")
            if not concepts:
                continue
            # same key layout as the record ids of the assertion input
            label_key='[SEP]'.join([file_id,dic['encounter_id'],str(idx)])
            for concept in concepts:
                # the classifier output is keyed by the lower-cased surface
                # text of the concept's first span
                start,end=concept["index"][0]
                concept_key=r["content_en"][start:end].strip().lower()
                # concepts the classifier did not label default to 'present'
                concept['status']=assertion_labels[label_key].get(concept_key,'present')
    with open(file,'w') as f:
        json.dump(dics,f,indent=4)

# calculating the UMLS set scores

In [12]:
# Score each prediction file against its reference set.
# For every encounter, the (term, type, status) set of each predicted response
# is compared with the set of each reference response; the best F1 over all
# pairs is stored on the encounter and the mean over encounters is written to
# the score file.
for lang in  ['UMLS_en']:
    for file in glob(f'{prediction_dir}/*.json'):
        # same file filter as the other cells: only files for the target model
        if target_model_name not in file:
            continue
        # strip only the final extension so dotted names (e.g. "model-4.0-run1")
        # are not truncated at their first dot
        exp=os.path.splitext(os.path.basename(file))[0]
        task='iiyi_test' if 'iiyi' in exp else 'reddit_test'

        truth = read_json( os.path.join(reference_dir, f'{task}.json') )
        prediction = read_json( file )

        # make sure the score folder exists; keep scores already stored there
        os.makedirs(score_dir, exist_ok=True)
        score_path = os.path.join(score_dir, f'{exp}.json')
        scores={} if not os.path.isfile(score_path) else read_json(score_path)

        all_scores=[]
        for pred, ref in zip(prediction,truth):
            assert pred["encounter_id"]==ref["encounter_id"], \
                f"encounter_id mismatch in {exp}: prediction {pred['encounter_id']!r} vs reference {ref['encounter_id']!r}"
            max_F1=0

            for p in pred['responses']:
                # responses without extracted concepts count as an empty set
                p=set([(c["term"][0],c["type"],c.get('status','present')) for c in p.get(lang, [])])
                # Number of predicted concepts. This used to be len(pred),
                # i.e. the number of keys of the encounter dict, which made
                # precision wrong and dependent on earlier scoring runs.
                NP=len(p)

                for r in ref['responses']:
                    r=set([(c['UMLS_term'],c["type"],c.get('status','present')) for c in r[lang]])

                    NT=len(r)
                    TP=len(p & r)
                    P=TP/NP if NP else 0
                    R=TP/NT if NT else 0
                    F1=R*P*2/(P+R) if P+R else 0
                    max_F1=max(max_F1,F1)

            pred[lang+' F1']=max_F1
            all_scores.append(max_F1)

        # per-encounter scores are written back into the prediction file
        with open(file,'w') as f:
            json.dump(prediction,f,indent=4)

        scores[lang+' F1'] = np.mean(all_scores)
        with open(score_path,'w') as f:
            json.dump(scores,f,indent=4)
