In [1]:
import pandas as pd
import json
import re

In [65]:
def append_dict(dic, key, data):
    """Return a copy of ``dic`` with ``data`` appended to the list stored at ``key``.

    Neither ``dic`` nor any list it holds is modified, and the returned dict
    does not share the ``data`` list object with the caller.

    Args:
        dic: Source dictionary. It is shallow-copied; values under other keys
            are shared with the result.
        key: Key whose value should receive the new item(s).
        data: A single item, or a list of items, to append.

    Returns:
        A new dict in which ``key`` maps to a list:
        - ``data`` alone when ``key`` is absent or its value is ``None``;
        - the existing list followed by ``data`` when the value is a list;
        - ``[existing_value]`` followed by ``data`` otherwise.
    """
    # Always work on a fresh list so the caller's ``data`` is never aliased.
    new_items = list(data) if isinstance(data, list) else [data]

    result = dic.copy()
    current = result.get(key)
    if current is None:
        result[key] = new_items
    elif isinstance(current, list):
        # Concatenate instead of ``+=``: the copy above is shallow, so an
        # in-place extend would also change the list inside the caller's dict.
        result[key] = current + new_items
    else:
        result[key] = [current] + new_items

    return result

def reverse_table(table: dict):
    """Invert a lookup table so that its values become keys.

    The shape of the table is decided from its first value only:

    - If the first value is not a list, the table is treated as one-to-one and
      ``{value: key}`` is returned (a later duplicate value overwrites an
      earlier one).
    - If the first value is a list, every value is iterated and each element
      maps to the list of keys it appeared under, in iteration order.

    Args:
        table: Mapping to invert. May be empty.

    Returns:
        The inverted dict; an empty dict for an empty ``table``.
    """
    # An empty table has no first value to inspect.
    if not table:
        return {}

    first_value = next(iter(table.values()))
    if not isinstance(first_value, list):
        return {v: k for k, v in table.items()}

    new_table = {}
    for k, values in table.items():
        for v in values:
            # Grow the key list in place rather than copying the whole
            # accumulator for every (key, value) pair.
            new_table.setdefault(v, []).append(k)

    return new_table

def remove_disambiguation(word):
    """Strip a trailing parenthesised suffix from ``word``.

    The suffix must be preceded by a whitespace character and end the string.
    Both ASCII parentheses ``(...)`` and full-width parentheses ``（...）`` are
    recognised. A string without such a suffix is returned unchanged.
    """
    trailing_suffix = re.compile(r'\s\(.+\)$|\s（.+）$')
    return trailing_suffix.sub('', word)

def get_CID(s):
    """Extract the distinct compound IDs from PubChem internal links in ``s``.

    A link is recognised by the literal prefix
    ``<a class="pubchem-internal-link CID-`` followed by digits and a closing
    double quote; the digits are the captured ID.

    Args:
        s: Text to scan.

    Returns:
        A list of ID strings without duplicates, in order of first appearance.
        Empty when no link is found.
    """
    patt = r'<a class=\"pubchem\-internal\-link CID\-([\d]+)\"'
    # dict.fromkeys de-duplicates while keeping first-occurrence order, so the
    # result is the same on every run (iteration order of a set of strings
    # depends on hash randomization).
    return list(dict.fromkeys(re.findall(patt, s)))

def CID2compound(cid):
    """Collect compound names (with synonyms) for the pages linked to ``cid``.

    Reads the module-level tables ``cid2pageid_table``, ``pageid2title`` and
    ``synonyms_dict``.

    Args:
        cid: Key into ``cid2pageid_table``.

    Returns:
        A list that contains, for each page id registered under ``cid``, the
        synonyms of the page title (disambiguation suffix removed) followed by
        that title itself. Empty when ``cid`` has no page ids. Duplicates are
        not removed.

    Raises:
        KeyError: If a page id is missing from ``pageid2title`` or a cleaned
            title is missing from ``synonyms_dict``.
    """
    names = []
    for page in cid2pageid_table.get(cid, []):
        title = remove_disambiguation(pageid2title[page])
        # Synonyms first, canonical title last.
        names.extend(synonyms_dict[title] + [title])
    return names

def get_manufacturing_compounds(page_id):
    """Return the names of compounds linked from a page's manufacturing text.

    Reads the module-level tables ``pageid2cid_table`` and ``manufacturings``.
    The manufacturing entries of every CID registered for ``page_id`` are
    joined into one string, the PubChem CIDs linked inside it are extracted
    with ``get_CID``, and each is expanded to names with ``CID2compound``.

    Args:
        page_id: Key into ``pageid2cid_table``.

    Returns:
        A de-duplicated list of names in unspecified order. Empty when the
        page has no CIDs or its manufacturing text links no compound.
    """
    pieces = []
    for cid in pageid2cid_table.get(page_id, []):
        pieces.append(''.join(manufacturings.get(cid, [])))
    manufacturing_text = ''.join(pieces)

    collected = []
    for linked_cid in get_CID(manufacturing_text):
        collected += CID2compound(linked_cid)

    unique_names = set(collected)
    return list(unique_names)

def repl_compound(words, repl_words, compounds, ignore_words=(), repl="[PubChem-compound]"):
    """Mask the tokens that name a known compound.

    ``words`` and ``repl_words`` are walked in parallel (stopping at the
    shorter one). At each position the output is ``repl`` when the original
    word is in ``compounds`` and the current replacement token is not in
    ``ignore_words``; otherwise the current replacement token is kept.

    Args:
        words: Original tokens.
        repl_words: Tokens after earlier replacements, aligned with ``words``.
        compounds: Container of compound names to mask.
        ignore_words: Replacement tokens that must never be overwritten.
        repl: Placeholder written in place of a matched token.

    Returns:
        A new list of tokens.
    """
    # The default for ``ignore_words`` is an immutable tuple so that no
    # mutable object is shared between calls.
    masked = []
    for word, repl_word in zip(words, repl_words):
        if word in compounds and repl_word not in ignore_words:
            masked.append(repl)
        else:
            masked.append(repl_word)
    return masked

In [34]:
# All inputs are opened as UTF-8 explicitly so that loading does not depend on
# the platform's default encoding.

# Looked up by CID in get_manufacturing_compounds.
with open("../../data/raw_manufacturings_using_wikidata.json", 'r', encoding='utf-8') as f:
    manufacturings = json.load(f)

# Page id -> CIDs, plus the inverse table used by CID2compound.
with open("../../data/pageID2CID_using_wikidata.json", 'r', encoding='utf-8') as f:
    pageid2cid_table = json.load(f)
cid2pageid_table = reverse_table(pageid2cid_table)

# Compound name -> list of synonyms.
with open("../../data/compound_synonyms.json", 'r', encoding='utf-8') as f:
    synonyms_dict = json.load(f)

# One JSON object per line with 'pageid' and 'title' fields. Page ids are
# stored as strings so they compare equal to the keys of the JSON tables.
pageid2title = {}
with open("../../data/ENE_compound_name.txt", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        record = json.loads(line)  # parse each line once
        pageid2title[str(record['pageid'])] = record['title']

In [111]:
# Replacement tokens that must be preserved: repl_compound keeps a position's
# repl_word unchanged when that repl_word appears in this list.
ignore_words = ["[title-compound]"]

In [144]:
# Load the pickled train/test frames. The cells visible below only use
# test_df (its '_id', 'words' and 'repl_words' columns); train_df is loaded
# but not referenced in them.
# NOTE(review): unpickling can execute arbitrary code -- only read pickle
# files that come from a trusted source.
train_df = pd.read_pickle("../../data/train_IOB_repl_compound.pkl")
test_df = pd.read_pickle("../../data/test_IOB_repl_compound.pkl")

In [147]:
# Add a 'repl_pubchem' column to every article of test_df: tokens that name a
# compound linked from the article's manufacturing text are replaced by
# "[PubChem-compound]", except where the existing replacement token is listed
# in ignore_words.
#
# The per-article frames are collected in a list and concatenated once.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all accumulated rows on every iteration.
article_frames = []
for _id, g in test_df.groupby('_id'):
    article_df = g.copy()
    # pageid2cid_table is loaded from JSON, so its keys are always strings;
    # a non-string _id would silently match nothing.
    manu_compounds = get_manufacturing_compounds(str(_id))
    article_df['repl_pubchem'] = [
        repl_compound(words, repl_words, manu_compounds, ignore_words, repl="[PubChem-compound]")
        for words, repl_words in zip(article_df['words'], article_df['repl_words'])
    ]
    article_frames.append(article_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
new_df = pd.concat(article_frames) if article_frames else pd.DataFrame()

In [148]:
# Persist the test frame, now carrying the 'repl_pubchem' column, as a pickle.
new_df.to_pickle("../../data/test_IOB_repl_compound-list+pubchem.pkl")