In [2]:
import pandas as pd
import json
import re
import pronto
import mojimoji

In [3]:
with open("../../data/compound_train.json", 'r') as f:
    raw_train = json.load(f)

In [5]:
with open("../../data/ENE_compound_name.txt", 'r', encoding='utf-8') as f:
    pageid2title = {str(json.loads(line)['pageid']): json.loads(line)['title'] for line in f}
    entry_title = list(pageid2title.values())

In [2]:
train_sentence_df = pd.read_pickle("../../dump/Production_train_split_words_using_compound_dict.pkl")
train_sentence_df.head()

Unnamed: 0,_id,label,repl_sentence,sentence,title,words
0,10166,False,[title-compound] (英: [title-compound]) は分子式が N...,アンモニア (英: ammonia) は分子式が NH 3 で表される無機化合物。,アンモニア,"[アンモニア, (, 英, :, ammonia, ), は, 分子, 式, が, NH, ..."
1,10166,False,常温常圧では無色の気体で、特有の強い刺激臭を持つ。,常温常圧では無色の気体で、特有の強い刺激臭を持つ。,アンモニア,"[常温, 常, 圧, で, は, 無色, の, 気体, で, 、, 特有, の, 強い, 刺..."
2,10166,False,水に良く溶けるため、水溶液（[compound]）として使用されることも多く、化学工業では基...,水に良く溶けるため、水溶液（アンモニア水）として使用されることも多く、化学工業では基礎的な窒...,アンモニア,"[水, に, 良く, 溶ける, ため, 、, 水溶液, （, アンモニア水, ）, として,..."
3,10166,False,塩基の程度は[compound]より弱い。,塩基の程度は水酸化ナトリウムより弱い。,アンモニア,"[塩基, の, 程度, は, 水酸化ナトリウム, より, 弱い, 。]"
4,10166,False,窒素原子上の孤立電子対のはたらきにより、金属錯体の配位子となり、その場合はアンミンと呼ばれる。,窒素原子上の孤立電子対のはたらきにより、金属錯体の配位子となり、その場合はアンミンと呼ばれる。,アンモニア,"[窒素, 原子, 上, の, 孤立, 電子, 対, の, はたらき, により, 、, 金属,..."


In [8]:
flatten = lambda items: [i for sublist in items if sublist for i in sublist]

def get_asynonyms_from_wikidata(entry):
    '''
    arg: json of WikiData entry
    return: English and Japanese synonyms list
    '''
    synonyms = []
    
    aliases_dict = list(entry['entities'].values())[0]['aliases']
    if aliases_dict and aliases_dict.get('en'):
        synonyms += [alias['value'] for alias in aliases_dict.get('en')]
    if aliases_dict and aliases_dict.get('ja'):
        synonyms += [alias['value'] for alias in aliases_dict.get('ja')]
    if get_title_from_wikidata(entry, lang='en'):
        synonyms += [get_title_from_wikidata(entry, lang='en')]
    
    return synonyms

def get_title_from_wikidata(entry, lang='ja'):
    label_dict = list(entry['entities'].values())[0]['labels']
    if label_dict.get(lang):
        return label_dict.get(lang)['value']
    return None

def get_CID(article: dict) -> str:
    if article.get('Record') and article.get('Record').get('RecordNumber'):
        return str(article.get('Record').get('RecordNumber'))
    else:
        return None
    
def get_synonyms_from_PubChem(article):
    synonym_list = \
    [synonyms['StringValueList']
     for section in article['Record']['Section'] if section['TOCHeading'] == 'Names and Identifiers'
     for sub_section in section['Section'] if sub_section['TOCHeading'] == 'Synonyms'
     for synonym_info in sub_section['Section'] if synonym_info['TOCHeading'] == 'MeSH Entry Terms'
     for synonyms in synonym_info['Information']
    ]
    
    return flatten(synonym_list)

def append_dict(dic, key, data):
    if not isinstance(data, list):
        data = [data]
    
    _dic = dic.copy()
    if _dic.get(key) is None:
        _dic[key] = data
    elif isinstance(_dic.get(key), list):
        _dic[key] += data
    else:
        _dic[key] = [_dic[key]] + data
        
    return _dic

def remove_disambiguation(word):
    return re.sub(r'\s\(.+\)$|\s（.+）$', '', word)

def reverse_table(table: dict):
    if not isinstance(list(table.values())[0], list):
        return {v: k for k, v in table.items()}
    
    new_table = {}
    for k, values in table.items():
        for v in values:
            new_table = append_dict(new_table, v, k)
                
    return new_table

## 同義語辞書作成

In [5]:
# 同義語辞書 {title: synonyms}
synonyms_dict = {}

In [6]:
# WikiDataから同義語データを取得
with open("../../data/compound_wikidata.jsonl", 'r', encoding='utf-8') as f:
    for line in f:
        entry = json.loads(line)
        title = get_title_from_wikidata(entry)
        if title is None:
            continue
        synonyms = get_asynonyms_from_wikidata(entry)
        synonyms_dict = append_dict(synonyms_dict, title, synonyms)

In [7]:
# ChEBI Ontology から同義語データを取得
ont = pronto.Ontology("../../data/chebi.owl")

with open("../../data/pageid2ChEBI.json", 'r') as f:
    pageid2ChEBI_table = json.load(f)

ChEBI2pageid_table = reverse_table(pageid2ChEBI_table)

for entry in ont:    
    pageid_list = ChEBI2pageid_table.get(entry.id)
    if pageid_list is None:
        continue
    
    synonyms = list(entry.synonyms)
    if len(synonyms) is 0:
        continue
    synonyms = [synonym.desc for synonym in synonyms]
    
    for pageid in pageid_list:
        title = remove_disambiguation(pageid2title.get(pageid))
        synonyms_dict = append_dict(synonyms_dict, title, synonyms)

In [8]:
# PubChem から同義語データを取得
with open("../../data/pageID2CID_using_wikidata.json", 'r') as f:
    cid2pageid_table = reverse_table(json.load(f))

with open("../../data/pubchem_articles.jsonl", 'r') as f:
    for line in f:
        entry = json.loads(line)
        
        cid = get_CID(entry)
        if cid is None: continue
        
        pageid_list = cid2pageid_table.get(cid)
        if pageid_list is None: continue
            
        synonyms = get_synonyms_from_PubChem(entry)
        for pageid in pageid_list:
            title = remove_disambiguation(pageid2title.get(pageid))
            synonyms_dict = append_dict(synonyms_dict, title, synonyms)

## 置換

In [3]:
with open("../../data/compound_synonyms.json", 'r') as f:
    synonyms_dict = json.load(f)

In [6]:
# 日化辞辞書読み込み
nikkaji_compounds = pd.read_csv("../../data/mecab_nikkaji.csv", encoding='cp932')[['Surface form']].rename(columns={'Surface form': 'name'})
## 全角英数字を半角に
nikkaji_compounds.name = nikkaji_compounds.name.apply(lambda x: mojimoji.zen_to_han(x, kana=False))

In [9]:
compound_name_list = [remove_disambiguation(compound) for compound in entry_title]
compound_name_list += nikkaji_compounds.name.tolist()
compound_name_list += flatten(list(synonyms_dict.values()))
compound_name_list = list(set(compound_name_list))

compound_name_list.sort(key=len)
for i, name in enumerate(compound_name_list):
    if len(name) > 1:
        compound_name_list = compound_name_list[i:]
        break
compound_name_list.remove("生産")

In [27]:
def repl_compound(word: str, title: str):
    synonyms = (synonyms_dict.get(title) if synonyms_dict.get(title) else [])
    if (word == title) or (word in synonyms):
        return "[title-compound]"
    elif word in compound_name_list:
        return "[compound]"
    else:
        return word

In [28]:
%%time
replace_compound_train_df = train_sentence_df.copy()
replace_compound_train_df['repl_words'] = replace_compound_train_df.apply(
    lambda x: [repl_compound(w, x.title) for w in x.words]
    , axis=1
)

CPU times: user 8min 21s, sys: 152 ms, total: 8min 21s
Wall time: 8min 21s


In [36]:
replace_compound_train_df = replace_compound_train_df.drop(columns=['repl_sentence'])

In [37]:
replace_compound_train_df.to_pickle("../../data/Production_train_split_word_with_repl_compound.pkl")

In [32]:
for label, g in replace_compound_train_df.groupby('label'):
    count_sentence = len(g)
    count_sentence_in_compound = g.repl_words.apply(lambda x: '[compound]' in x).sum()
    print(label, "\t in compound rate:", count_sentence_in_compound / count_sentence)

False 	 in compound rate: 0.3634526215669956
True 	 in compound rate: 0.7431972789115646


In [33]:
for label, g in replace_compound_train_df.groupby('label'):
    count_sentence = len(g)
    count_sentence_in_compound = g.repl_words.apply(lambda x: '[title-compound]' in x).sum()
    print(label, "\t in compound rate:", count_sentence_in_compound / count_sentence)

False 	 in compound rate: 0.3601236476043277
True 	 in compound rate: 0.36904761904761907
