In [3]:
import pandas as pd
import json
import re
import pronto

In [7]:
def flatten(items):
    '''
    arg: iterable of sub-iterables (falsy entries such as None or [] are skipped)
    return: one flat list holding the elements of every remaining sub-iterable, in order
    '''
    flat = []
    for sublist in items:
        if sublist:
            flat.extend(sublist)
    return flat

def get_asynonyms_from_wikidata(entry):
    '''
    arg: json of WikiData entry
    return: list made of the English aliases, then the Japanese aliases,
            then the English label (only when the entry has one)
    '''
    # Only the first entity of the entry is inspected.
    first_entity = list(entry['entities'].values())[0]
    aliases_by_lang = first_entity['aliases']

    names = []
    if aliases_by_lang:
        for lang in ('en', 'ja'):
            # A missing or empty language bucket contributes nothing.
            names.extend(alias['value'] for alias in aliases_by_lang.get(lang) or [])

    # The English label is appended as one more synonym.
    english_title = get_title_from_wikidata(entry, lang='en')
    if english_title:
        names.append(english_title)

    return names

def get_title_from_wikidata(entry, lang='ja'):
    '''
    arg: json of WikiData entry, language code of the label to read (default 'ja')
    return: label text of the first entity in that language, or None when it has no such label
    '''
    first_entity = list(entry['entities'].values())[0]
    label = first_entity['labels'].get(lang)
    return label['value'] if label else None

def get_CID(article: dict) -> str:
    '''
    arg: json of PubChem article
    return: article['Record']['RecordNumber'] converted to a string,
            or None when the record or its number is missing or falsy
    '''
    record = article.get('Record')
    if not record:
        return None

    record_number = record.get('RecordNumber')
    if not record_number:
        return None

    return str(record_number)
    
def get_synonyms_from_PubChem(article):
    '''
    arg: json of PubChem article
    return: flat list of the StringValueList entries found under
            'Names and Identifiers' > 'Synonyms' > 'MeSH Entry Terms'
    '''
    string_value_lists = []
    for section in article['Record']['Section']:
        if section['TOCHeading'] != 'Names and Identifiers':
            continue
        for sub_section in section['Section']:
            if sub_section['TOCHeading'] != 'Synonyms':
                continue
            for synonym_info in sub_section['Section']:
                if synonym_info['TOCHeading'] != 'MeSH Entry Terms':
                    continue
                for information in synonym_info['Information']:
                    string_value_lists.append(information['StringValueList'])

    return flatten(string_value_lists)

def append_dict(dic, key, data):
    '''
    arg: dictionary, key, value (or list of values) to add under that key
    return: new dictionary in which dic[key] is a list extended with data;
            neither dic, the list already stored under key, nor data is modified
    '''
    if not isinstance(data, list):
        data = [data]

    # dict.copy() is shallow: the lists inside are still shared with `dic`,
    # so every branch below builds a fresh list instead of extending in place.
    _dic = dic.copy()
    current = _dic.get(key)
    if current is None:
        # Copy so that later appends never reach back into the caller's `data`.
        _dic[key] = list(data)
    elif isinstance(current, list):
        _dic[key] = current + data
    else:
        # A scalar stored under key is promoted to a list first.
        _dic[key] = [current] + data

    return _dic

def remove_disambiguation(word):
    '''
    arg: title string
    return: the string without a trailing parenthesised suffix that is preceded by
            whitespace and runs to the end (ASCII "(...)" or full-width parentheses)
    '''
    trailing_parenthetical = re.compile(r'\s\(.+\)$|\s（.+）$')
    return trailing_parenthetical.sub('', word)

def reverse_table(table: dict):
    '''
    arg: dictionary mapping key -> value, or key -> list of values
    return: inverted dictionary
            - empty table: {}
            - first value is not a list: {value: key} (a repeated value keeps the last key)
            - first value is a list: {value: [keys whose list contains that value]}
    '''
    # An empty table has no first value to inspect.
    if not table:
        return {}

    # The shape of the whole table is decided from its first value only.
    first_value = next(iter(table.values()))
    if not isinstance(first_value, list):
        return {v: k for k, v in table.items()}

    new_table = {}
    for k, values in table.items():
        for v in values:
            # Append in place: avoids copying the whole dictionary for every pair.
            new_table.setdefault(v, []).append(k)

    return new_table

In [12]:
# Fetch synonym data from WikiData and count it
synonym_count = 0
with open("../../data/compound_wikidata.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        entry = json.loads(line)
        title = get_title_from_wikidata(entry)
        # Entries without a Japanese label are not counted.
        if title is not None:
            synonyms = get_asynonyms_from_wikidata(entry)
            synonym_count += len(synonyms)

print(synonym_count)

35010


In [9]:
# Fetch synonym data from the ChEBI ontology
ont = pronto.Ontology("../../data/chebi.owl")

# Read the JSON with an explicit UTF-8 encoding instead of the platform default,
# as the WikiData cell already does.
with open("../../data/pageid2ChEBI.json", 'r', encoding='utf-8') as f:
    pageid2ChEBI_table = json.load(f)

ChEBI2pageid_table = reverse_table(pageid2ChEBI_table)

synonym_count = 0
for entry in ont:
    # Only entries whose id appears in the reversed table are counted.
    pageid_list = ChEBI2pageid_table.get(entry.id)
    if pageid_list is None:
        continue

    synonym_count += len(entry.synonyms)

print(synonym_count)

In [8]:
# Fetch synonym data from PubChem
# Both JSON files are read with an explicit UTF-8 encoding instead of the
# platform default, as the WikiData cell already does.
with open("../../data/pageID2CID_using_wikidata.json", 'r', encoding='utf-8') as f:
    cid2pageid_table = reverse_table(json.load(f))

synonym_count = 0
with open("../../data/pubchem_articles.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        entry = json.loads(line)

        # Skip articles for which get_CID finds no record number.
        cid = get_CID(entry)
        if cid is None:
            continue

        # Skip CIDs that do not appear in the reversed table.
        pageid_list = cid2pageid_table.get(cid)
        if pageid_list is None:
            continue

        synonyms = get_synonyms_from_PubChem(entry)
        synonym_count += len(synonyms)

print(synonym_count)

18274


In [13]:
# Load the synonym dictionary with an explicit UTF-8 encoding instead of the
# platform default, as the WikiData cell already does.
with open("../../data/compound_synonyms.json", 'r', encoding='utf-8') as f:
    synonyms_dict = json.load(f)

In [14]:
# Total number of synonyms over all entries; empty or None entries contribute nothing
print(sum(len(names) for names in synonyms_dict.values() if names))

80921
