In [1]:
import os
from tqdm import tqdm
from utils import load_json, load_jsonl_iteratively 

lang = "en_jstage"

In [6]:
import json 

CODE_SWITCH_ROOT = "/data/xzhao/dataset/roman-pretrain/datasets/medical/en_jstage/code-switch/"
def load_nered_docs(start=0, end=None):
    """
    Iteratively load paired NER documents produced by bionlp and scibert.

    The two JSONL files under CODE_SWITCH_ROOT are read in lockstep, one line
    from each file per step. Every pair that is read must carry the same
    "docid", including pairs that precede `start` and are therefore not yielded.

    Parameters:
    - start: The starting index (inclusive)
    - end: The ending index (exclusive); None reads until either file is exhausted
    Return: A generator that yields tuples of (bionlp_doc, scibert_doc)
    Raises: AssertionError when `end` is given and is not greater than `start`,
        or when the docids of a pair of lines differ.
    """
    assert end is None or end > start
    bionlp_filename = f"{CODE_SWITCH_ROOT}/bionlp_merged.full.jsonl"
    scibert_filename = f"{CODE_SWITCH_ROOT}/scibert_merged.full.jsonl"
    with open(bionlp_filename, 'r', encoding="utf8") as f1, open(scibert_filename, 'r', encoding="utf8") as f2:
        cnt = 0
        for line1, line2 in tqdm(zip(f1, f2), "Processing NER documents to perform code-switching"):
            # Remove trailing newlines if needed
            bionlp_doc = json.loads(line1.rstrip('\n'))
            scibert_doc = json.loads(line2.rstrip('\n'))
            assert bionlp_doc["docid"] == scibert_doc["docid"], "IDs do not match"
            if cnt >= start:
                yield bionlp_doc, scibert_doc
            cnt += 1
            # Stop as soon as document `end - 1` has been handled, so that no
            # extra line is read and parsed only to be thrown away.
            if end is not None and cnt >= end:
                break
            

In [7]:
# Build the lookup CUI -> {NER tool name -> list of Japanese translations}
# from the per-tool translation files, then report how many CUIs each tool covers.
en2ja_codes = {}
for ner_tool in ["scibert", "bionlp"]:
    codeswitch_datapath = f"/data/xzhao/dataset/roman-pretrain/datasets/medical/en_jstage/code-switch/cui_ja-{ner_tool}.jsonl"
    for item in tqdm(load_jsonl_iteratively(codeswitch_datapath)):
        translations = item["translations"]
        # Entries without any translation contribute nothing to the lookup.
        if len(translations) == 0:
            continue
        per_tool = en2ja_codes.setdefault(item["cui"], {})
        per_tool[ner_tool] = translations['JPN']

bionlp_cnt = sum(1 for tools in en2ja_codes.values() if "bionlp" in tools)
scibert_cnt = sum(1 for tools in en2ja_codes.values() if "scibert" in tools)
print(f"bionlp: {bionlp_cnt}, scibert: {scibert_cnt}, all: {len(en2ja_codes)}")


215900it [00:01, 196604.83it/s]
128145it [00:00, 200361.85it/s]

bionlp: 27123, scibert: 36680, all: 36917





In [26]:
import re
import nltk
import random
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


FILTER_PATTERNS = {
    "en": [" (qualifier value)", " (observable entity)"],
    "ja": ["Not Translated["]
}

def _filter_alias_by_patterns(aliases, lang):
    if lang not in FILTER_PATTERNS:
        return aliases
    else:
        return [alias for alias in aliases if not any(pat in alias for pat in FILTER_PATTERNS[lang])]
    
def _has_hankaku_katakana(text):
    return bool(re.search(r'[\uff65-\uff9f]', text))

def get_aliases_by_cuis(cuis, cui_codes):
    """Collect the unique aliases registered in `cui_codes` for the given CUIs.

    Parameters:
    - cuis: iterable of CUI identifiers; identifiers absent from `cui_codes` are ignored
    - cui_codes: mapping CUI -> {source name -> list of aliases}
    Return: A list of unique aliases in first-seen order. No pattern filtering
        is applied here.
    """
    # A dict is used as an insertion-ordered set. Converting a plain set of
    # strings to a list gives an order that can change between interpreter
    # runs (string hash randomization), which would make a seeded
    # random.choice over the result non-reproducible.
    ordered_aliases = {}
    for cui in cuis:
        if cui in cui_codes:
            for _aliases in cui_codes[cui].values():
                ordered_aliases.update(dict.fromkeys(_aliases))
    return list(ordered_aliases)

def get_umls_cuis(
        cuis, strategy, 
        exact_match=False, target_text=None, 
        random_seed=None, **kwargs):
    """Select CUI identifiers from a list of candidate CUI entries.

    Parameters:
    - cuis: list of candidate dicts; each is expected to carry 'cui', 'score'
      and 'aliases'. Candidates without a non-empty 'aliases' list are dropped.
    - strategy: one of
        "max_score" - the single candidate with the highest score (first one on ties)
        "random"    - one candidate picked with random.choice
        "all"       - every remaining candidate
        "threshold" - every candidate whose score >= kwargs["threshold"]
    - exact_match: when True, keep only candidates having an alias equal to
      `target_text`, compared case-insensitively
    - target_text: the text to match; required when `exact_match` is True
    - random_seed: when not None, the global `random` module is re-seeded with
      it before a "random" pick
    Return: A list of CUI identifier strings (possibly empty).
    Raises: AssertionError when a required argument is missing;
        NotImplementedError for an unknown strategy.
    """
    cuis = [cui for cui in cuis if 'aliases' in cui and len(cui['aliases']) > 0]

    if exact_match:
        assert target_text is not None, "Target text must be provided for exact match"
        lowered_target = target_text.lower()
        filtered_cuis = [
            cui for cui in cuis
            if lowered_target in {alias.lower() for alias in cui['aliases']}
        ]
    else:
        filtered_cuis = cuis

    if len(filtered_cuis) == 0:
        return []

    if strategy == "max_score":
        # max() keeps the first candidate among equal scores. The previous
        # hand-written loop never updated its running maximum, so it
        # effectively returned the last candidate instead of the best one.
        best_cand = max(filtered_cuis, key=lambda cui_ent: cui_ent['score'])
        return [best_cand['cui']]
    elif strategy == "random":
        if random_seed is not None:
            random.seed(random_seed)
        return [random.choice(filtered_cuis)['cui']]
    elif strategy == "all":
        return [cui_ent['cui'] for cui_ent in filtered_cuis]
    elif strategy == "threshold":
        assert "threshold" in kwargs, "Threshold value is required for 'threshold' strategy"
        return [cui_ent['cui'] for cui_ent in filtered_cuis if cui_ent['score'] >= kwargs["threshold"]]
    else:
        raise NotImplementedError("Unknown strategy: {} for sampling CUI".format(strategy))
    
def choice_cui_alias(aliases, strategy, replaced_text, lang='en', random_seed=None):
    """Choose one alias from a list of aliases based on the specified strategy.

    Parameters:
    - aliases: candidate alias strings
    - strategy: "random", "max_diff" or "min_diff". The *_diff strategies rank
      aliases by Jaccard distance between the lower-cased whitespace tokens of
      the alias and of `replaced_text`, then pick randomly among the ties.
    - replaced_text: the original text that the alias would replace
    - lang: language of the aliases; selects the pattern filter, and for 'ja'
      additionally drops aliases containing half-width katakana
    - random_seed: when not None, the global `random` module is re-seeded with it
    Return: The chosen alias, or None when no alias survives the filtering.
    Raises: NotImplementedError for an unknown strategy, or for a *_diff
        strategy with a language other than 'en'.
    """
    if random_seed is not None:
        random.seed(random_seed)

    # Filter aliases
    aliases = _filter_alias_by_patterns(aliases, lang)
    if lang == 'ja':
        aliases = [alias for alias in aliases if not _has_hankaku_katakana(alias)]
    if len(aliases) == 0:
        return None

    if strategy == 'random':
        return random.choice(aliases)
    if not strategy.endswith('_diff'):
        raise NotImplementedError("Unknown strategy: {} for selecting alias from CUI retrieval".format(strategy))

    y = set(replaced_text.lower().split())
    if lang != 'en':
        raise NotImplementedError("Language {} not supported for strategy 'max_diff' in function choice_cui_alias".format(lang))

    diffs = []
    for alias in aliases:
        x = set(alias.lower().split())
        union = x | y
        # Two empty token sets are identical, so distance 0; this also avoids
        # dividing by zero.
        diffs.append(1 - len(x & y) / len(union) if union else 0.0)

    # The extreme value is computed once and paired with the aliases via zip,
    # instead of rescanning the lists for every alias.
    if strategy == 'max_diff':
        target = max(diffs)
    elif strategy == 'min_diff':
        target = min(diffs)
    else:
        raise NotImplementedError("Unknown strategy: {} for selecting alias from CUI retrieval".format(strategy))
    cands = [alias for alias, diff in zip(aliases, diffs) if diff == target]
    return random.choice(cands)

def code_switch(text, switch_recipe):
    """Apply the replacements listed in `switch_recipe` to `text` and return the new string.

    Each entry supplies 'start' and 'end' offsets and the replacement string
    under 'code'. Entries are applied in the order given; a running shift
    accounts for the length change caused by the entries applied so far, so
    the offsets are adjusted as if every earlier entry lay before the current one.
    """
    shift = 0
    for step in switch_recipe:
        begin, stop, replacement = step['start'], step['end'], step['code']
        head = text[:begin + shift]
        tail = text[stop + shift:]
        text = head + replacement + tail
        shift += len(replacement) - (stop - begin)
    return text

def process_docs_by_code_switching(bionlp_doc, scibert_doc, cui_codes, switch_ratio=1.0, random_seed=None):
    """Replace recognized entities in a text with aliases looked up in `cui_codes`.

    Parameters:
    - bionlp_doc, scibert_doc: dicts with an 'entities' list; the replaced text
      is taken from bionlp_doc['text'] only. Each entity carries 'text',
      'start', 'end' and 'ents' (candidate CUI entries).
    - cui_codes: mapping CUI -> {source name -> list of aliases}
    - switch_ratio: an entity is replaced only when random.random() <= switch_ratio
    - random_seed: base seed; entity number i uses `random_seed + i`. None
      leaves the random state untouched (no re-seeding at all).
    Return: The text with the selected entities replaced.
    """
    if random_seed is not None:
        random.seed(random_seed)

    switch_recipe, examined_indexes = [], []
    merged_entities = bionlp_doc['entities'] + scibert_doc['entities']
    for i, ent in enumerate(merged_entities):
        if len(ent['ents']) == 0:
            continue

        # Skip if this entity overlaps with any already examined entity.
        # Spans that merely touch (one ends where another starts) also count
        # as overlapping under this comparison.
        if any(not (ent['end'] < start or ent['start'] > end) for start, end in examined_indexes):
            continue

        examined_indexes.append((ent['start'], ent['end']))
        # The default random_seed=None used to crash on `None + i`.
        ent_seed = None if random_seed is None else random_seed + i
        cuis = get_umls_cuis(
            cuis=ent['ents'], strategy="threshold", random_seed=ent_seed,
            exact_match=True, target_text=ent['text'], threshold=0.8)
        filtered_aliases = get_aliases_by_cuis(cuis=cuis, cui_codes=cui_codes)
        ent_alias = choice_cui_alias(
            aliases=filtered_aliases, strategy="random", 
            replaced_text=ent["text"], lang='ja', random_seed=ent_seed)
        if ent_alias is None:
            continue

        if random.random() > switch_ratio:
            continue
        switch_recipe.append({
            "start": ent['start'],
            "end": ent['end'],
            "raw": ent["text"],
            "code": ent_alias,
            "type": "en->ja"
        })
        print(f"Switching '{ent['text']}' to '{ent_alias}'")
    # code_switch shifts offsets cumulatively, so replacements go in ascending start order.
    sorted_recipe = sorted(switch_recipe, key=lambda x: x['start'])
    switched_text = code_switch(text=bionlp_doc['text'], switch_recipe=sorted_recipe)
    return switched_text
    

In [28]:
def switch_code(doc1, doc2):
    """Check that both documents hold the same string under 'text'; currently a stub that returns "".

    Raises AssertionError when either document lacks 'text', when either
    'text' value is not a str, or when the two texts differ.
    """
    docs = (doc1, doc2)
    assert all("text" in doc for doc in docs), "Both documents must contain 'text' field"
    text1, text2 = doc1['text'], doc2['text']
    assert all(isinstance(text, str) for text in (text1, text2)), "Both 'text' fields must be strings"
    assert text1 == text2, "Text fields do not match"
    return ""

# Debugging walk-through of the code-switching steps on the abstracts of the first 10 documents.
# NOTE: the loop variables (bionlp_doc, scibert_doc, ent, cuis, ...) stay bound after the loop,
# and later cells in this notebook read them.
for bionlp_doc, scibert_doc in load_nered_docs(start=0, end=10):
    print("====" * 20)
    assert bionlp_doc["docid"] == scibert_doc["docid"], "Document IDs do not match"
    # new_doc is built here but not used anywhere else in this cell.
    new_doc = {
        "docid": bionlp_doc["docid"], 
        "keywords": [], "subjects": [], "sentences": []}
    # Rebind both names to the per-document 'ner' payload; from here on they no longer hold the full records.
    bionlp_doc = bionlp_doc['ner']
    scibert_doc = scibert_doc['ner'] 
    
    random_seed = 42
    switch_recipe, examined_indexes = [], []
    merged_entities = bionlp_doc["abstract"]['entities'] + scibert_doc["abstract"]['entities']
    for i, ent in enumerate(merged_entities):
        # Entities without any linked CUI candidate cannot be switched.
        if len(ent['ents']) == 0:
            continue
        
        # Skip if this entity overlaps with any already examined entity
        # (spans that only touch at a boundary also count as overlapping here).
        if next((True for start, end in examined_indexes if not (ent['end'] < start or ent['start'] > end)), False):
            continue

        examined_indexes.append((ent['start'], ent['end']))
        # With strategy="all" the `threshold` keyword is accepted but not used by get_umls_cuis.
        cuis = get_umls_cuis(
                cuis=ent['ents'], strategy="all", random_seed=random_seed+i,
                exact_match=True, target_text=ent['text'], threshold=0.8)
        # Debug output for one specific entity text.
        if ent['text'] == "increased":
            print("===========>", cuis)
        
        filtered_aliases = get_aliases_by_cuis(cuis=cuis, cui_codes=en2ja_codes)
        # print("All aliases: ", filtered_aliases)
        ent_alias = choice_cui_alias(
            aliases=filtered_aliases, strategy="random", 
            replaced_text=ent["text"], lang='ja', random_seed=random_seed+i)
        # None means no usable alias was left after filtering.
        if ent_alias is None:
            continue

        switch_recipe.append({
            "start": ent['start'],
            "end": ent['end'],
            "raw": ent["text"],
            "code": ent_alias,
            "type": "en->ja"
        })
        print(f"'{ent['text']}' -> '{ent_alias}'")
        # if ent['text'] == "increased":
        #     break

    # NOTE: `cuis` holds only the result for the last entity that reached get_umls_cuis;
    # it is unbound (NameError) if no entity got that far since the kernel started.
    print("===> Count of CUIS:", len(cuis))
    print("===> Count of switched entities:", len(switch_recipe))
    # code_switch applies cumulative offsets, so the recipe is ordered by start position first.
    sorted_recipe = sorted(switch_recipe, key=lambda x: x['start'])
    switched_text = code_switch(text=bionlp_doc["abstract"]['text'], switch_recipe=sorted_recipe)
    print("===> Original text: ", bionlp_doc["abstract"]['text'])
    print("===> Switched text: ", switched_text)

Processing NER documents to perform code-switching: 0it [00:00, ?it/s]

Processing NER documents to perform code-switching: 10it [00:00, 67.90it/s]

'superoxide dismutase' -> 'ヘモクプレイン'
'SOD' -> '銅亜鉛スーパーオキシドジスムターゼ'
'catalase' -> 'ヘムカタラーゼ'
'SOD' -> 'スーパーオキサイドディスムターゼ'
'SOD' -> 'ヘモクプレイン'
'SOD' -> '銅亜鉛スーパーオキシドジスムターゼ'
'mental stress' -> 'ストレス-精神的'
'physiological values' -> '生理学的現象'
'mental stress' -> 'ストレス-心理的'
'activity' -> '身体的活動'
'mental stress' -> '情動性ストレス'
'activity' -> '運動活動'
'cortisol' -> 'コルチゾール'
'activity' -> '身体的活動'
'saliva' -> '唾液'
'mental stress' -> 'ストレス-精神的'
'cortisol' -> '17-ヒドロキシコルチコステロン'
'activity' -> '運動活性'
'mental stress' -> '情動緊張'
'cortisol' -> 'ハイドロコルチゾン'
'activity' -> '運動活性'
'health' -> '健康'
'anxiety' -> '不安障害'
'mental stress' -> 'ストレス-心理的'
===> Count of CUIS: 1
===> Count of switched entities: 24
===> Original text:  Recent studies showed that makeup reduces mental stress, indicated by not only psychological but also physiological values. In this study, we examined the relationship between mental stress and the activity of reactive oxygen scavenging enzymes such as superoxide dismutase (SOD) and catalase (CAT), and




In [61]:
# Show, for every CUI currently held in `cuis`, the translations recorded in en2ja_codes (None when absent).
for cui_id in cuis:
    print(cui_id, en2ja_codes.get(cui_id))

C0205217 None
C0442805 None
C5236002 None
C0151904 {'scibert': ['アスパラギン酸アミノトランスフェラーゼ増加', 'ｱｽﾊﾟﾗｷﾞﾝｻﾝｱﾐﾉﾄﾗﾝｽﾌｪﾗｰｾﾞｿﾞｳｶ', 'ｱｽﾊﾟﾗｷﾞﾝｻﾝｱﾐﾉﾄﾗﾝｽﾌｪﾗｰｾﾞｿﾞｳｶ', 'ｹｯｾｲGOTｿﾞｳｶ', 'アスパラギン酸アミノトランスフェラーゼ増加', 'ＡＳＴ増加', 'ＧＯＴ増加', '血清グルタミン酸オキサロ酢酸トランスアミナーゼ増加', 'ASTｿﾞｳｶ', 'ｸﾞﾙﾀﾐﾝｻﾝｵｷｻﾛｻｸｻﾝﾄﾗﾝｽｱﾐﾅｰｾﾞｿﾞｳｶ', '血清ＧＯＴ増加', 'ｹｯｾｲｸﾞﾙﾀﾐﾝｻﾝｵｷｻﾛｻｸｻﾝﾄﾗﾝｽｱﾐﾅｰｾﾞｿﾞｳｶ', 'GOTｿﾞｳｶ', 'グルタミン酸オキサロ酢酸トランスアミナーゼ増加'], 'bionlp': ['アスパラギン酸アミノトランスフェラーゼ増加', 'ｱｽﾊﾟﾗｷﾞﾝｻﾝｱﾐﾉﾄﾗﾝｽﾌｪﾗｰｾﾞｿﾞｳｶ', 'ｱｽﾊﾟﾗｷﾞﾝｻﾝｱﾐﾉﾄﾗﾝｽﾌｪﾗｰｾﾞｿﾞｳｶ', 'ｹｯｾｲGOTｿﾞｳｶ', 'アスパラギン酸アミノトランスフェラーゼ増加', 'ＡＳＴ増加', 'ＧＯＴ増加', '血清グルタミン酸オキサロ酢酸トランスアミナーゼ増加', 'ASTｿﾞｳｶ', 'ｸﾞﾙﾀﾐﾝｻﾝｵｷｻﾛｻｸｻﾝﾄﾗﾝｽｱﾐﾅｰｾﾞｿﾞｳｶ', '血清ＧＯＴ増加', 'ｹｯｾｲｸﾞﾙﾀﾐﾝｻﾝｵｷｻﾛｻｸｻﾝﾄﾗﾝｽｱﾐﾅｰｾﾞｿﾞｳｶ', 'GOTｿﾞｳｶ', 'グルタミン酸オキサロ酢酸トランスアミナーゼ増加']}
C0221106 {'scibert': ['ｐＨ上昇', 'ｱﾙｶﾘｹｯｼｮｳ', 'pHｼﾞｮｳｼｮｳ', 'アルカリ血症', 'ｱﾙｶﾘｹﾂｼｮｳ'], 'bionlp': ['ｐＨ上昇', 'ｱﾙｶﾘｹｯｼｮｳ', 'pHｼﾞｮｳｼｮｳ', 'アルカリ血症', 'ｱﾙｶﾘｹﾂｼｮｳ']}


In [85]:
import spacy
# Load spaCy's "en_core_web_sm" pipeline; `nlp` is called on text by later cells.
nlp = spacy.load("en_core_web_sm")

# Probe: what does the WordNet lemmatizer return for the inflected form "increased"?
lemmatizer.lemmatize("increased")

  from .autonotebook import tqdm as notebook_tqdm


'increased'

In [69]:
# Display `ent` as left bound by an earlier cell (hidden kernel state - presumably the last entity of the debugging loop; verify after a fresh run).
ent

{'text': 'increased',
 'label': 'ENTITY',
 'start': 583,
 'end': 592,
 'ents': [{'cui': 'C0205217',
   'score': 0.9793238043785095,
   'tui': ['T081'],
   'aliases': ['Increases',
    'increased',
    'Increased by',
    'augmented',
    'augment',
    'Increased',
    'Augmented',
    'Increased (qualifier value)']},
  {'cui': 'C0442805',
   'score': 0.9793238043785095,
   'tui': ['T169'],
   'aliases': ['Increase (qualifier value)',
    'Heightened',
    'increases',
    'increase',
    'Increased',
    'Increase']},
  {'cui': 'C5236002',
   'score': 0.9793238043785095,
   'tui': ['T033'],
   'aliases': ['Increased']},
  {'cui': 'C0151904',
   'score': 0.8560683131217957,
   'tui': ['T033'],
   'aliases': ['SGOT, ELEVATED',
    'ast elevated',
    'GLUTAMIC-OXALOACETIC TRANSAM INCR',
    'Elevated serum AST',
    'AST INCREASED',
    'Elevated circulating aspartate aminotransferase concentration',
    'Elevated serum glutamic oxaloacetic transaminase',
    'elevated ast',
    'Aspart

In [12]:
# Code-switch the abstract of the document currently bound to bionlp_doc / scibert_doc,
# replacing every eligible entity (switch_ratio=1), and print the text before and after.
switched_text = process_docs_by_code_switching(
    bionlp_doc['abstract'],
    scibert_doc['abstract'],
    en2ja_codes,
    switch_ratio=1,
    random_seed=42,
)
print("Before switching: ", bionlp_doc['abstract']['text'])
print("After switching: ", switched_text)

Switching 'superoxide dismutase' to 'スーパーオキサイドディスムターゼ'
Switching 'SOD' to '銅-亜鉛スーパーオキシドジスムターゼ'
Switching 'catalase' to 'カタラーゼ'
Switching 'CAT' to 'ヘムカタラーゼ'
Switching 'SOD' to '銅亜鉛スーパーオキシドジスムターゼ'
Switching 'SOD' to 'スーパーオキサイドディスムターゼ'
Switching 'SOD' to '銅-亜鉛スーパーオキシドジスムターゼ'
Switching 'CAT' to 'ヘムカタラーゼ'
Switching 'mental stress' to 'メンタルストレス'
Switching 'psychological' to '心理検査'
Switching 'physiological values' to '生理学的現象'
Switching 'mental stress' to '心理ストレス'
Switching 'activity' to '身体活動'
Switching 'mental stress' to '心理的ストレス'
Switching 'activity' to '身体的活動'
Switching 'cortisol' to 'コルチゾン酢酸エステル'
Switching 'activity' to '身体活動'
Switching 'saliva' to '流涎'
Switching 'mental stress' to 'メンタルストレス'
Switching 'cortisol' to '酢酸コルチゾン'
Switching 'increased' to 'アスパラギン酸アミノトランスフェラーゼ増加'
Switching 'activity' to '運動活動'
Switching 'decreased' to 'プロトロンビン時間短縮'
Switching 'mental stress' to '心的ストレス'
Switching 'cortisol' to 'コルチゾン'
Switching 'decreased' to 'プロトロンビン時間短縮'
Switching 'activity' to '運動活動'
Switchin

In [None]:
# For every entity span in the title, record the best-scoring CUI that has an entry in `cui_codes`.
# Entities whose span coincides exactly with a token tagged VERB are printed and skipped.
# NOTE(review): `cui_codes` is not assigned at module level in the visible cells - confirm where it comes from.
span2cuis = {}
sent1, sent2 = bionlp_doc['title'], scibert_doc['title']
text = sent1['text']
filter_words = [word for word in nlp(text) if word.pos_ == "VERB"]
# sent1 is visited last, so for identical spans its entry overwrites the one from sent2.
for doc in [sent2, sent1]:
    for ent_item in doc["entities"]:
        if ent_item["ents"] == []:
            continue
        span_start, span_end = ent_item["start"], ent_item["end"]
        verb_hit = next(
            (word for word in filter_words
             if word.idx == span_start and word.idx + len(word) == span_end),
            None)
        if verb_hit:
            print(ent_item)
            continue
        available_cuis = [ent for ent in ent_item['ents'] if ent['cui'] in cui_codes]
        if len(available_cuis) == 0:
            continue
        # Highest score wins; the first candidate is kept on ties.
        best_cui = max(available_cuis, key=lambda x: x["score"])
        span2cuis[(span_start, span_end)] = {
            "text": ent_item["text"],
            "cui": best_cui["cui"],
            "start": span_start,
            "end": span_end
        }
sorted_codes = sorted(span2cuis.values(), key=lambda x: x["start"])



In [None]:
# start_indice = sorted_codes[0]['start']


    

In [15]:
# Display the 'title' entry (text plus recognized entities) of the document currently bound to bionlp_doc.
bionlp_doc['title']

{'text': 'Makeup Inhibits Reduction of Reactive Oxygen Scavenging Enzyme Activity Induced by Mental Stress',
 'entities': [{'text': 'Reactive Oxygen',
   'label': 'SIMPLE_CHEMICAL',
   'start': 29,
   'end': 44,
   'ents': [{'cui': 'C0162772',
     'score': 0.8334328532218933,
     'tui': ['T123', 'T196'],
     'aliases': ['Reactive Oxygen Intermediates',
      'Active Oxygen Species',
      'oxygen radicals',
      'Oxygen Radical',
      'Oxygen Species, Reactive',
      'reactive oxygen species',
      'active oxygen',
      'Active oxygen',
      'Pro Oxidants',
      'ROS',
      'Reactive Oxygen Species',
      'oxygen radical',
      'pro oxidant',
      'Oxygen, Active',
      'Active Oxygen',
      'Oxygen Radicals',
      'Radical, Oxygen',
      'oxygen reactive species',
      'Pro-Oxidant',
      'Pro Oxidant',
      'Pro-Oxidants']},
    {'cui': 'C0205332',
     'score': 0.7410761713981628,
     'tui': ['T080'],
     'aliases': ['Reactive', 'Reactive (qualifier value)', '

In [None]:
# For each document and sentence, group the recognized entities under their best-scoring
# CUI that has an entry in `cui_codes`, merging the output of both NER tools.
# NOTE(review): this cell reads `umls_data` and `cui_codes` without assigning them; it only
# runs if both already exist in the kernel - confirm the intended source.
for ner_tool in ["bionlp", "scibert"]:
    entity_path = f"/data/xzhao/dataset/roman-pretrain/datasets/medical/en_jstage/code-switch/{ner_tool}_merged.jsonl"
    for item_with_ent in tqdm(load_jsonl_iteratively(entity_path, request_num=None)):
        docid = item_with_ent["docid"]
        sentences = item_with_ent["sentences"]
        # One dict per sentence, created the first time a document is seen.
        if docid not in umls_data:
            umls_data[docid] = [{} for _ in range(len(sentences))]
        per_sentence = umls_data[docid]

        for idx, sent in enumerate(sentences):
            for ent_item in sent["entities"]:
                if ent_item["ents"] == []:
                    continue
                available_cuis = [ent for ent in ent_item['ents'] if ent['cui'] in cui_codes]
                if len(available_cuis) == 0:
                    continue
                # Highest score wins; the first candidate is kept on ties.
                best_cui = max(available_cuis, key=lambda x: x["score"])
                per_sentence[idx].setdefault(best_cui["cui"], []).append({
                    "text": ent_item["text"],
                    "label": ent_item["label"],
                    "score": best_cui["score"],
                    "indice": (ent_item["start"], ent_item["end"]),
                    "tool": ner_tool,
                })



In [63]:
from tqdm import tqdm
from utils import load_jsonl
# Load the per-document UMLS entity data from the cache, or build it from the NER output and
# write the cache. Either way `umls_data` ends up as {docid: {"docid": ..., "code-switch": [...]}}.
# Previously a fresh build left {docid: [per-sentence dicts]} instead, which differs from the
# shape produced on a cache hit.
umls_data_cachepath = "./caches/umls_by_doc.jsonl"
if os.path.exists(umls_data_cachepath):
    print(f"Loading cached UMLS data from {umls_data_cachepath}")
    umls_data_ls = load_jsonl(umls_data_cachepath, verbose=True)
else:
    # NOTE(review): `cui_codes` is not assigned in the visible cells - confirm where it comes from.
    umls_by_doc = {}
    for ner_tool in ["bionlp", "scibert"]:
        entity_path = f"/data/xzhao/dataset/roman-pretrain/datasets/medical/en_jstage/code-switch/{ner_tool}_merged.jsonl"
        for item_with_ent in tqdm(load_jsonl_iteratively(entity_path, request_num=None)):
            # One dict per sentence, created the first time a document is seen.
            if item_with_ent["docid"] not in umls_by_doc:
                umls_by_doc[item_with_ent["docid"]] = [{} for _ in range(len(item_with_ent["sentences"]))]
            per_sentence = umls_by_doc[item_with_ent["docid"]]

            for idx, sent in enumerate(item_with_ent["sentences"]):
                for ent_item in sent["entities"]:
                    if ent_item["ents"] == []:
                        continue
                    avaliable_cuis = [ent for ent in ent_item['ents'] if ent['cui'] in cui_codes]
                    if len(avaliable_cuis) == 0:
                        continue
                    # Highest score wins; the first candidate is kept on ties.
                    best_cui = max(avaliable_cuis, key=lambda x: x["score"])
                    per_sentence[idx].setdefault(best_cui["cui"], []).append({
                        "text": ent_item["text"],
                        "label": ent_item["label"],
                        "score": best_cui["score"],
                        # A list rather than a tuple, so the in-memory value equals what a JSON round trip gives back.
                        "indice": [ent_item["start"], ent_item["end"]],
                        "tool": ner_tool,
                    })

    from utils import dump_jsonl
    umls_data_ls = [{"docid": key, "code-switch": item} for key, item in umls_by_doc.items()]
    dump_jsonl(umls_data_ls, umls_data_cachepath)

# Index the records by docid in both branches.
umls_data = {umls['docid']: umls for umls in umls_data_ls}


Loading cached UMLS data from ./caches/umls_by_doc.jsonl


Loading JSONL from ./caches/umls_by_doc.jsonl: 404364it [01:23, 4865.09it/s] 


In [None]:
import re
import spacy
import random

# spaCy "en_core_web_sm" pipeline; used below to tokenize text and to read each token's POS tag.
nlp = spacy.load("en_core_web_sm")
def has_hankaku_katakana(text):
    """Tell whether `text` contains any character in U+FF65..U+FF9F (half-width katakana forms)."""
    found = re.search(r'[\uff65-\uff9f]', text)
    return found is not None

def get_code_map(nered_sents):
    """
    Create a mapping from entity text to one randomly chosen UMLS code.

    Parameters: 
    - nered_sents: the sentences with the NER information preprocessed; each
      sentence is a dict mapping a CUI to a list of entity records that carry
      a 'text' key
    Output:
    - code_map: a dictionary mapping entity text to a UMLS code. Only the
      first occurrence of each entity text is considered. An entity whose
      CUI has no usable code (after dropping non-string values and codes
      containing half-width katakana) is left out of the map.

    NOTE(review): reads the module-level `cui_codes` (CUI -> {source -> list
    of codes}), which is not assigned in the visible cells - confirm its origin.
    """
    codes = {}
    for sent in nered_sents:
        for cui, cui_items in sent.items():
            for cui_item in cui_items:
                if cui_item['text'] in codes:
                    continue
                available_codes = set()
                for codes_per_cui in cui_codes[cui].values():
                    available_codes.update([
                        code 
                        for code in codes_per_cui 
                        if isinstance(code, str) and not has_hankaku_katakana(code)])

                # Nothing usable: skip rather than let random.choice fail on an empty list.
                if not available_codes:
                    continue
                # Sort before choosing: the iteration order of a set of strings can change
                # between interpreter runs, which would defeat the caller's random.seed.
                codes[cui_item['text']] = random.choice(sorted(available_codes))
    return codes

def get_switch_recipe(sent, code_index):
    """  Generate a switch recipe for the given sentence based on the code index.
    Parameters:
    - sent: The target sentence to be processed
    - code_index: the index of code-switching dictionary with the first character as key. 
        Each value is a list of (en_code, ja_code, en_code_tokens) tuples.
        The index is created to reduce computational complexity
    Return: 
    - switch_recipe: a list of dictionaries containing the switch information
      ('start' and 'end' character offsets into `sent`, 'en_code', 'ja_code').
      Matches found at different token positions or for different codes are all
      reported, so entries may overlap.
    """ 
    sent_seqs = nlp(sent)
    switch_recipe = []
    for i, token in enumerate(sent_seqs):
        if token.text[0] not in code_index:
            continue
        if token.pos_ == "VERB": 
            # UMLS alias includes hyponymy for verbs (e.g, decreased -> プロトロンビン時間短縮)
            continue
        
        for en_code, ja_code, en_code_tokens in code_index[token.text[0]]:
            span_end = i + len(en_code_tokens)
            # Not enough tokens left for this code. A code that ends exactly on the
            # last token (span_end == len(sent_seqs)) is still a valid candidate.
            if span_end > len(sent_seqs):
                continue
            
            target_token_seqs = sent_seqs[i:span_end]
            raw_tokens = [tok.text for tok in target_token_seqs]
            if raw_tokens == en_code_tokens:
                start = target_token_seqs[0].idx
                end = target_token_seqs[-1].idx + len(target_token_seqs[-1])
                assert sent[start:end] == en_code
                switch_recipe.append({
                    "start": start,
                    "end": end,
                    "en_code": en_code,
                    "ja_code": ja_code
                })
    return switch_recipe

# Work-in-progress driver: for the first 100 documents, pick one translation per entity text
# and compute a switch recipe for every raw sentence.
data_path = f"/data/xzhao/dataset/roman-pretrain/datasets/medical/{lang}/full.jsonl"
for item in tqdm(load_jsonl_iteratively(data_path, request_num=100)):
    # Re-seed per document so the random picks depend on the docid rather than on processing order.
    random.seed(item["docid"])
    # Named `code_switch_result` because the name `code_switch` already belongs to the
    # replacement function defined earlier in this notebook; rebinding it to a dict would
    # break every later call of that function in the same kernel.
    code_switch_result = {"sentences": [], "causal": {}, "qa": []}
    nered_sents = umls_data[item["docid"]]['code-switch']
    codes_per_item = get_code_map(nered_sents)
    
    # Create index for codes using the first character to reduce complexity
    code_index = {}
    for en_code in codes_per_item:
        code_index.setdefault(en_code[0], []).append(
            (en_code, codes_per_item[en_code], [token.text for token in nlp(en_code)]))
    
    ## Code-switching for sentences
    # TODO: the recipe is computed but not applied or stored yet.
    for raw_sent in item["raw"]["sentences"]:
        switch_recipe = get_switch_recipe(raw_sent, code_index)

    


100it [00:14,  6.70it/s]


In [67]:
# Display the 'raw' entry of the last document bound to `item` by the loop above.
item["raw"]

{'title': 'Effectiveness of edaravone in patients with minor ischemic stroke and hyperglycemia',
 'abstract': 'It has been reported that hyperglycemia associated with neurological deterioration in patients with acute ischemic stroke. We evaluated the effectiveness of edaravone in minor stroke patients with hyperglycemia. We retrospectively analyzed 32 consecutive acute stroke patients with hyperglycemia of over 200 mg/dl and neurological symptoms of between 1 and 4 on the National Institutes of Health Stroke Scale (NIHSS) score. We investigated which factors were associated with a good outcome of 0 or 1 on the modified Rankin Scale at hospital discharge. A good outcome was significantly related to a lower score on the NIHSS at admission, the administration of edaravone, and abnormality of serum creatinine. Multiple logistic regression analysis demonstrated a significant association of both a low score on the NIHSS and the administration of edaravone with a good outcome. The results obt

In [None]:
# Inline, step-through copy of the matching loop in get_switch_recipe, run on a single `sent`.
# NOTE(review): `sent` and `code_index` must already exist in the kernel before this cell runs.
sent_seqs = nlp(sent)
switch_recipe = []
for i, token in enumerate(sent_seqs):
    if token.text[0] not in code_index:
        continue
    if token.pos_ == "VERB": 
        # UMLS alias includes hyponymy for verbs (e.g, decreased -> プロトロンビン時間短縮)
        continue
    
    for en_code, ja_code, en_code_tokens in code_index[token.text[0]]:
        span_end = i + len(en_code_tokens)
        # Not enough tokens left for this code. A code that ends exactly on the
        # last token (span_end == len(sent_seqs)) is still a valid candidate.
        if span_end > len(sent_seqs):
            continue
        
        target_token_seqs = sent_seqs[i:span_end]
        raw_tokens = [tok.text for tok in target_token_seqs]
        if raw_tokens == en_code_tokens:
            start = target_token_seqs[0].idx
            end = target_token_seqs[-1].idx + len(target_token_seqs[-1])
            assert sent[start:end] == en_code
            switch_recipe.append({
                "start": start,
                "end": end,
                "en_code": en_code,
                "ja_code": ja_code
            })

In [None]:
# Bind `sent` to the abstract of the document currently held in `item`.
# NOTE(review): the cell above reads `sent`, so this assignment has to run before it.
sent = item['raw']['abstract']


