In [166]:
import pandas as pd
import re
import pickle
import json
import requests
import time
from tqdm import tqdm

In [2]:
def flatten(l):
    """Concatenate the elements of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sub_l in l:
        flat.extend(sub_l)
    return flat

def get_CID(article: dict) -> str:
    """Return ``article['Record']['RecordNumber']`` as a string.

    Returns ``None`` when either ``Record`` or ``RecordNumber`` is missing
    or falsy.
    """
    record = article.get('Record')
    if not record:
        return None

    record_number = record.get('RecordNumber')
    return str(record_number) if record_number else None

In [175]:
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
methods_df = pd.read_pickle("../../data/manufacturing_words_using_wikidata.pkl")

# Combine the train and validation splits into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and, like append, keeps each frame's original index.
wiki_df = pd.concat([
    pd.read_csv("../../data/train_split_words.csv"),
    pd.read_csv("../../data/valid_split_words.csv"),
])
# Page IDs are used as keys into tables loaded from JSON (whose keys are
# always strings), so normalise the column to str. Item access is used because
# attribute-style assignment silently creates an attribute if the column is
# ever missing.
wiki_df["_id"] = wiki_df["_id"].astype(str)

In [4]:
# Distinct page IDs present in the combined train/valid frame.
train_ids = wiki_df["_id"].unique()

In [5]:
# Distinct page IDs that have at least one row whose label is True.
true_ids = wiki_df.loc[wiki_df["label"] == True, "_id"].unique()

In [176]:
# NOTE(review): this file is written by the cell further down that dumps
# wd_pageid2cid_table, so on a fresh kernel it must already exist from a
# previous run for this cell to succeed.
with open("../../data/pageID2CID_using_wikidata.json", 'r') as table_file:
    pageid2cid_table = json.load(table_file)

In [188]:
# Number of distinct train/valid page IDs with a truthy entry in the CID table.
len({pageid for pageid in train_ids if pageid2cid_table.get(pageid)})

453

In [189]:
# Distinct page IDs labelled True in the Wikipedia-derived frame.
labelled_ids = wiki_df.loc[wiki_df["label"] == True, "_id"].unique()
# Distinct page IDs in methods_df that also occur in the train/valid splits.
# NOTE(review): the message reads as "of which", but the filter uses all
# train_ids rather than only the True-labelled IDs - confirm this is intended.
pubchem_ids = methods_df.loc[methods_df["_id"].isin(train_ids), "_id"].unique()

print("Wikipedia記事のうち製造方法が記載されている化合物", len(labelled_ids))
print("うちPubChemに製造方法が記載されている化合物:", len(pubchem_ids))

Wikipedia記事のうち製造方法が記載されている化合物 305
うちPubChemに製造方法が記載されている化合物: 211


In [139]:
# NOTE(review): pageid2cas_table and cas2cid_table are not defined in any cell
# visible in this notebook - presumably leftovers from a removed cell; this
# cell fails on a fresh kernel unless they are restored.
# Collect the distinct items of every cas2cid_table entry reachable from a
# train/valid page ID through pageid2cas_table.
cid_set = list({
    cid
    for pageid in train_ids if pageid2cas_table.get(pageid) is not None
    for cas in pageid2cas_table.get(pageid) if cas2cid_table.get(cas) is not None
    for cid in cas2cid_table.get(cas)
})

print(len(cid_set))

669


In [8]:
with open("../../data/pageID2wikidataID.json", 'r') as table_file:
    pageid2wikidataid_table = json.load(table_file)

# Restrict the table to train/valid page IDs that have a non-null entry.
wikidata_dict = {
    pageid: pageid2wikidataid_table[pageid]
    for pageid in train_ids
    if pageid2wikidataid_table.get(pageid) is not None
}

len(wikidata_dict)

533

In [43]:
# Build {entity key: entity document} from a JSON-lines file in which each
# line is expected to hold an object with an "entities" mapping.
compound_wikidata = {}
with open("../../data/compound_wikidata.jsonl", 'r') as f:
    # Iterate the file object directly instead of a manual readline() loop.
    for line in f:
        try:
            page = json.loads(line).get('entities')
        except json.JSONDecodeError:
            print('Decode Error.')
            continue

        # A record without a (non-empty) "entities" mapping previously raised
        # an uncaught AttributeError/IndexError and aborted the whole load;
        # skip such records instead.
        if not page:
            continue

        # Only the first entry of "entities" is kept, stored under its own key.
        _id = next(iter(page))
        compound_wikidata[_id] = page[_id]

In [120]:
# Map each page ID (as str) to the list of values of its entity's P662 claims.
wd_pageid2cid_table = {}
for page_id, wikidata_id in pageid2wikidataid_table.items():
    # Missing entity / claims / P662 all collapse to "no claims".
    entity = compound_wikidata.get(wikidata_id) or {}
    cid_claims = (entity.get('claims') or {}).get('P662') or []

    cids = []
    for info in cid_claims:
        try:
            cids.append(info['mainsnak']['datavalue']['value'])
        except KeyError:
            # A claim without a value has no 'datavalue'. Skip only that
            # claim; previously one such claim discarded every CID of the page.
            continue

    # Pages that end up with no usable value are left out of the table.
    if cids:
        wd_pageid2cid_table[str(page_id)] = cids

In [170]:
# Report how many page IDs received at least one CID.
n_pages_with_cid = len(wd_pageid2cid_table)
print("Number of pages having CID:", n_pages_with_cid)

Number of pages having CID: 4485


In [137]:
# Distinct CIDs belonging to the train/valid page IDs.
wd_train_cids = list({
    cid
    for page_id in train_ids if wd_pageid2cid_table.get(page_id)
    for cid in wd_pageid2cid_table.get(page_id)
})

In [142]:
# Persist the page ID -> CID list mapping as JSON.
with open("../../data/pageID2CID_using_wikidata.json", 'w') as out_file:
    out_file.write(json.dumps(wd_pageid2cid_table))

In [150]:
def get_manufacturing(
    article: dict,
    headings=("Methods of Manufacturing", "Sampling Procedures", "Formulations/Preparations"),
):
    """Collect the ``StringValue`` entries of an article's manufacturing sections.

    Walks ``article['Record']['Section']``, selects the top-level sections
    whose ``TOCHeading`` is "Use and Manufacturing", then their sub-sections
    whose ``TOCHeading`` is one of ``headings``, and returns the
    ``StringValue`` of every item in those sub-sections' ``Information`` lists.

    Args:
        article: Parsed article document.
        headings: Sub-section headings to collect from. Defaults to the three
            headings this notebook has always used.

    Returns:
        A list with one entry per ``Information`` item, in document order. An
        item without ``StringValue`` contributes ``None``. A missing or null
        ``Record``, ``Section`` or ``Information`` yields no entries instead
        of raising.
    """
    record = article.get('Record') or {}

    methods = []
    for content in record.get('Section') or []:
        if content.get('TOCHeading') != "Use and Manufacturing":
            continue
        # Sections are not guaranteed to carry sub-sections or information.
        for section in content.get('Section') or []:
            if section.get('TOCHeading') not in headings:
                continue
            for method in section.get('Information') or []:
                methods.append(method.get('StringValue'))

    return methods

def load_articles(filepath, cid_set):
    """Load the articles of a JSON-lines file whose CID is in ``cid_set``.

    Each line of ``filepath`` is parsed as JSON. An article is kept when
    ``get_CID`` returns a non-None value contained in ``cid_set``. The number
    of kept articles is printed, and the articles are returned in file order.
    """
    def is_wanted(article):
        cid = get_CID(article)
        return cid is not None and cid in cid_set

    with open(filepath, 'r') as f:
        articles = [article for article in map(json.loads, f) if is_wanted(article)]

    print("Number of Articles:", len(articles))

    return articles

In [168]:
# Every CID listed for any page, then the stored articles for those CIDs.
wd_cid_set = {cid for cids in wd_pageid2cid_table.values() for cid in cids}
articles = load_articles("../../data/pubchem_articles.jsonl", wd_cid_set)

Number of Articles: 4490


In [169]:
# CIDs in wd_cid_set for which no article was loaded.
loaded_cids = {get_CID(article) for article in articles}
none_article_cids = wd_cid_set.difference(loaded_cids)
len(none_article_cids)

0

In [None]:
PUG_VIEW_URL = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON'


def _fetch_pubchem_article(cid, max_attempts=6, timeout=30, retry_wait=10, request_interval=0.3):
    """Download the PUG View JSON document for one CID.

    Args:
        cid: Compound identifier substituted into ``PUG_VIEW_URL``.
        max_attempts: Maximum number of requests; only timeouts are retried.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the loop forever.
        retry_wait: Seconds to wait after a timeout before retrying.
        request_interval: Seconds to pause after every completed request.

    Returns:
        The parsed JSON document, or ``None`` when the request failed, the
        status was not 200, or the body was not valid JSON.
    """
    url = PUG_VIEW_URL.format(cid=cid)
    for _ in range(max_attempts):
        try:
            resp = requests.get(url, timeout=timeout)
        except requests.Timeout:
            # Must come before ConnectionError: ConnectTimeout derives from
            # both, so the original ordering made the retry branch unreachable.
            print("Connection timeout.\nRetry...")
            time.sleep(retry_wait)
            continue
        except requests.ConnectionError:
            print("Connection error.")
            return None

        time.sleep(request_interval)

        if resp.status_code == 404:
            return None
        if resp.status_code != 200:
            # requests.get() does not raise for error statuses; without this
            # check an error payload would be stored as if it were an article.
            print("HTTP error.", resp.status_code)
            return None

        try:
            return resp.json()
        except ValueError:
            # Every JSON decode error raised by resp.json() is a ValueError.
            return None

    # Every attempt timed out.
    return None


# Append the articles that are still missing to the local JSON-lines store.
# Each CID gets its own fetch result, so a failed request can no longer reuse
# (or find unbound) the response of a previous iteration.
with open("../../data/pubchem_articles.jsonl", 'a') as f:
    for cid in tqdm(none_article_cids):
        article = _fetch_pubchem_article(cid)
        if article is None:
            continue

        print(cid)
        json.dump(article, f)
        f.write('\n')

In [171]:
# CID -> list of manufacturing-related strings for every loaded article.
manufacturings = dict(
    (get_CID(article), get_manufacturing(article))
    for article in articles
)

In [174]:
# Persist the CID -> manufacturing strings mapping as JSON.
with open("../../data/raw_manufacturings_using_wikidata.json", 'w') as out_file:
    out_file.write(json.dumps(manufacturings))