In [None]:
#install QuickUMLS
#!python -m quickumls.install <umls_installation_path> <destination_path> -E CHI
#Then move the custom MRCONSO.RRF into <umls_installation_path>/META, replacing the original MRCONSO.RRF file there.
#The custom MRCONSO.RRF was created from the manually collected UMLS_CHI_v02.xlsx file, written in the format QuickUMLS expects.

## Extracting CHI Concepts

In [None]:
import pandas as pd
# Load the manually collected English/Chinese concept table.
df = pd.read_excel('UMLS_CHI_v02.xlsx')


def _strip_if_str(value):
    """Return ``value`` without surrounding whitespace when it is a string; otherwise return it unchanged."""
    return value.strip() if isinstance(value, str) else value


# Strip per value instead of using Series.str.strip(): the .str accessor replaces
# every non-string entry of a mixed column with NaN, silently dropping data.
# Numeric columns are passed through untouched; all other columns are cleaned,
# whichever dtype pandas chose for their text.
df = df.apply(
    lambda column: column
    if pd.api.types.is_numeric_dtype(column)
    else column.map(_strip_if_str)
)
df.head()

Unnamed: 0,concept_from_response,UMLS_term,cui,type,semantic_subtype,CHI_term,ENG_URL,CHI_URL,machine_translated,type_modified,CHI_term_modified,concept added
0,deformity,deformity,C0000768|C2117111|C0302142,Disease,",Congenital Abnormality ,Finding ,Anatomical A...",畸形,https://en.wikipedia.org/wiki/Deformity,https://zh.wikipedia.org/wiki/%E7%95%B8%E5%BD%A2,0,Disease,畸形,
1,malformation,deformity,C0000768|C2117111|C0302142,Disease,",Congenital Abnormality ,Finding ,Anatomical A...",畸形,https://en.wikipedia.org/wiki/Deformity,https://zh.wikipedia.org/wiki/%E7%95%B8%E5%BD%A2,0,Disease,畸形,
2,malformations,deformity,C0000768|C2117111|C0302142,Disease,",Congenital Abnormality ,Finding ,Anatomical A...",畸形,https://en.wikipedia.org/wiki/Deformity,https://zh.wikipedia.org/wiki/%E7%95%B8%E5%BD%A2,0,Disease,畸形,
3,abscess,abscess,C0000833,Disease,",Disease or Syndrome",脓疡,https://en.wikipedia.org/wiki/Abscess,https://zh.wikipedia.org/wiki/%E8%86%BF%E7%98%8D,0,Disease,脓疡,
4,acanthosis,acanthosis nigricans,C0000889|C0221270,Disease,",Finding",黑棘皮症,https://en.wikipedia.org/wiki/Acanthosis_nigri...,https://zh.wikipedia.org/wiki/%E9%BB%91%E6%A3%...,0,Disease,黑棘皮症,


In [None]:
# Get unique CHI_term and type combinations
def _join_unique(values):
    """Comma-join the distinct non-null values of one group, in first-seen order.

    ``dict.fromkeys`` de-duplicates while keeping insertion order, so the joined
    string is identical on every run (joining a ``set`` is not, because string
    hashing is randomised per interpreter session).
    """
    return ','.join(dict.fromkeys(str(value) for value in values if pd.notnull(value)))


# One row per (CHI_term_modified, type_modified) pair; every other column is
# collapsed to a comma-separated string of its distinct values.
df_aggregated = (
    df.groupby(['CHI_term_modified', 'type_modified'])
    .agg(_join_unique)
    .reset_index()
)

In [None]:
from quickumls import QuickUMLS

# Path handed to QuickUMLS as its data directory.
# NOTE(review): the leading '{}' is never filled in (no .format() call is visible
# here) — presumably a placeholder for the project root; confirm before running.
quickumls_filepath = '{}/data/quickUMLS_CHI'

# Shared matcher instance used by the extraction cells below.
matcher = QuickUMLS(
    quickumls_filepath,
    threshold=0.7,
    min_match_length=1,
    window=20,
)

def find_matches(text, matcher, df_aggregated):
    """Extract concepts from ``text`` and annotate them from the concept table.

    Args:
        text: The text to run the matcher on.
        matcher: Object exposing ``match(text, best_match=..., ignore_syntax=...)``
            that returns an iterable of matches, each an iterable of candidate
            dicts with at least the keys ``'term'`` and ``'cui'``.
        df_aggregated: DataFrame with the columns ``CHI_term_modified``,
            ``type_modified`` and ``semantic_subtype``.

    Returns:
        A list with one dict per distinct matched term, in first-seen order:
        ``'term'`` (one-element list holding the term), ``'cui'`` (sorted list of
        the distinct CUIs reported for the term), ``'type'`` (``type_modified`` of
        the first table row whose ``CHI_term_modified`` equals the term, or
        ``None`` when there is no such row), ``'semantic_subtype'`` (list of the
        non-empty, stripped comma-separated parts of that row's
        ``semantic_subtype``; empty when there is no such row) and ``'status'``
        (always ``'present'``).
    """
    matches = matcher.match(text, best_match=True, ignore_syntax=False)

    # Gather the distinct CUIs per term; dict insertion order keeps the terms
    # in the order the matcher first reported them.
    cuis_by_term = {}
    for match in matches:
        for candidate in match:
            cuis_by_term.setdefault(candidate['term'], set()).add(candidate['cui'])

    extracted_terms = []
    for term, cuis in cuis_by_term.items():
        matching_row = df_aggregated[df_aggregated['CHI_term_modified'] == term]
        if matching_row.empty:
            # The matcher can report a term that is absent from the table; keep
            # it with empty annotations instead of failing on None.split(',').
            concept_type = None
            raw_subtypes = ''
        else:
            concept_type = matching_row['type_modified'].iloc[0]
            raw_subtypes = matching_row['semantic_subtype'].iloc[0]

        # A missing cell (NaN/None) has no subtypes to split.
        if not isinstance(raw_subtypes, str):
            raw_subtypes = ''

        extracted_terms.append({
            'term': [term],
            # Sorted so the output does not depend on set iteration order.
            'cui': sorted(cuis),
            'type': concept_type,
            'semantic_subtype': [
                subtype.strip() for subtype in raw_subtypes.split(',') if subtype.strip()
            ],
            'status': 'present',
        })
    return extracted_terms


# Try the extractor on one short Chinese sentence; the returned list is the cell output.
find_matches(
    text="银屑病，似与胸腔积液没有关系",
    matcher=matcher,
    df_aggregated=df_aggregated,
)

In [5]:
# PERFORM CONCEPT EXTRACTION FOR YOUR SYSTEM FILE
import json

# Read the system predictions that are to be annotated.
with open(SYSTEM_PREDICTION_FILE, mode='r', encoding='utf-8') as predictions_in:
    data = json.load(predictions_in)

# Attach the concepts found in each response's Chinese text under 'UMLS_zh'.
for item in data:
    for response in item['responses']:
        response['UMLS_zh'] = find_matches(response['content_zh'], matcher, df_aggregated)

# Write the annotated predictions back over the same file, keeping Chinese text readable.
with open(SYSTEM_PREDICTION_FILE, mode='w', encoding='utf-8') as predictions_out:
    json.dump(data, predictions_out, ensure_ascii=False, indent=4)

In [None]:
# Please use gpt_Chinese_assertion.ipynb to insert assertions before running the code below.
# Assertions must be inserted before the UMLS terms are normalized.
# The medcon F1 score is calculated on the normalized UMLS terms.

## Medical Terms Normalization

In [None]:
# IMPORT NORMALIZED UMLS DICTIONARY
# Load the dictionary used to normalize extracted terms.
with open('final_UMLS_sets_CHI.json', 'r', encoding='utf-8') as f:
    UMLS_set = json.load(f)

# Preview only the first few entries; displaying every item would flood the cell output.
list(UMLS_set.items())[:5]

In [8]:
# NORMALIZING MEDICAL CONCEPTS FOR YOUR SYSTEM FILE

# Read the predictions whose 'UMLS_zh' concepts are to be normalized.
with open(SYSTEM_PREDICTION_FILE, mode='r', encoding='utf-8') as predictions_in:
    data = json.load(predictions_in)

# Sentinel that distinguishes "no set contains this term" from any real key.
_NO_MATCH = object()

for item in data:
    for response in item['responses']:
        for term in response['UMLS_zh']:
            umls_term = term['term']
            # Key of the first UMLS_set entry whose 'concepts' contains the term.
            # NOTE(review): find_matches stores 'term' as a one-element list —
            # confirm that this membership test against value['concepts'] is
            # comparing the shapes it is meant to compare.
            canonical = next(
                (key for key, value in UMLS_set.items() if umls_term in value['concepts']),
                _NO_MATCH,
            )
            if canonical is not _NO_MATCH:
                term['term'] = canonical

# Write the normalized predictions back over the same file.
with open(SYSTEM_PREDICTION_FILE, mode='w', encoding='utf-8') as predictions_out:
    json.dump(data, predictions_out, ensure_ascii=False, indent=4)

## Chinese medcon F1 Calculation

In [None]:
from glob import glob
import os
import numpy as np
def read_json( file_path ) :
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly, matching how the other cells read
    and write these files (they contain non-ASCII Chinese text); relying on the
    platform default encoding can mis-decode them.
    """
    with open( file_path, 'r', encoding='utf-8' ) as f :
        return json.load( f )
    

def _concept_triples(concepts, term_field, unwrap_term=False):
    """Return the set of ``(term, type, status)`` triples for a list of concept dicts.

    Args:
        concepts: Iterable of dicts with the keys ``term_field`` and ``'type'``,
            and optionally ``'status'`` (treated as ``'present'`` when absent).
        term_field: Name of the key that holds the concept term.
        unwrap_term: When true, a term that is not a plain string is replaced by
            its first element; a plain string is used whole rather than being
            indexed down to its first character.
    """
    triples = set()
    for concept in concepts:
        term = concept[term_field]
        if unwrap_term and not isinstance(term, str):
            term = term[0]
        triples.add((term, concept["type"], concept.get('status', 'present')))
    return triples


def _f1(pred_triples, ref_triples):
    """Return the F1 score between a predicted and a reference triple set (0 when undefined)."""
    true_pos = len(pred_triples & ref_triples)
    precision = true_pos / len(pred_triples) if pred_triples else 0
    recall = true_pos / len(ref_triples) if ref_triples else 0
    return recall * precision * 2 / (precision + recall) if precision + recall else 0


_UPDATED_SUFFIX = '_updated'

for lang in ['UMLS_zh']:
    for pred_path in sorted(glob(os.path.join(prediction_dir, '*.json'))):
        # basename/splitext works with any path separator and keeps dots
        # that are part of the experiment name.
        exp = os.path.splitext(os.path.basename(pred_path))[0]

        # This cell writes "<exp>_updated.json" into the directory it globs.
        # Skip such a file when the prediction it was derived from is present,
        # so that re-running the cell does not score its own output.
        if exp.endswith(_UPDATED_SUFFIX) and os.path.isfile(
            os.path.join(prediction_dir, exp[:-len(_UPDATED_SUFFIX)] + '.json')
        ):
            continue

        task = 'iiyi_test'

        truth = read_json(os.path.join(reference_dir, f'{task}.json'))
        prediction = read_json(pred_path)

        # Merge into an existing score file instead of discarding its other entries.
        score_path = os.path.join(score_dir, f'{exp}.json')
        scores = read_json(score_path) if os.path.isfile(score_path) else {}

        all_scores = []
        for pred, ref in zip(prediction, truth):
            assert pred["encounter_id"] == ref["encounter_id"], (
                f'encounter mismatch: {pred["encounter_id"]!r} != {ref["encounter_id"]!r}'
            )

            ref_sets = [
                _concept_triples(ref_response[lang], 'UMLS_term')
                for ref_response in ref['responses']
            ]

            # Score every predicted response against every reference response
            # and keep the best F1 for the encounter.
            max_F1 = 0
            for pred_response in pred['responses']:
                # Precision is taken over this response's own concept set. The
                # previous denominator, len(pred), was the number of keys in
                # the encounter dict and unrelated to the predictions.
                pred_set = _concept_triples(pred_response[lang], 'term', unwrap_term=True)
                for ref_set in ref_sets:
                    max_F1 = max(max_F1, _f1(pred_set, ref_set))

            pred[lang + ' F1'] = max_F1
            all_scores.append(max_F1)

        # UTF-8 with ensure_ascii=False matches how the prediction files are
        # written by the other cells and keeps Chinese text readable.
        with open(os.path.join(prediction_dir, f'{exp}_updated.json'), 'w', encoding='utf-8') as f:
            json.dump(prediction, f, ensure_ascii=False, indent=4)

        # np.mean([]) is NaN, which json.dump would emit as invalid JSON; report 0.0 instead.
        scores[lang + ' F1'] = float(np.mean(all_scores)) if all_scores else 0.0
        with open(score_path, 'w', encoding='utf-8') as f:
            json.dump(scores, f, indent=4)