In [1]:
import sys
# Add the sibling util directory to the import search path so local helper
# modules can be imported (presumably where pubchem_util lives -- confirm).
sys.path.append('../util/')

In [2]:
import pandas as pd
import requests
import json
import time
from tqdm import tqdm

import pubchem_util as pil

In [3]:
def flatten(item):
    """Flatten one level of nesting.

    Args:
        item: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding every element of every sub-iterable, in order.
    """
    return [i for sub in item for i in sub]

In [3]:
# Load the article -> CAS-number mapping. JSON is UTF-8 by specification, so
# the encoding is pinned instead of relying on the platform's locale default.
with open("../../data/cas_number.json", encoding="utf-8") as f:
    cas_json = json.load(f)

In [4]:
# Collect every CAS number into one flat list, skipping articles whose entry
# is empty or missing.
CAS_list = [
    cas_number
    for cas_numbers in cas_json.values()
    if cas_numbers
    for cas_number in cas_numbers
]

In [5]:
# Summary of the input data. The labels read, in order: total number of
# compound articles, number of Wikipedia articles that have a CAS number,
# and number of CAS numbers.
print("化合物全記事数:", len(cas_json))
print(
    "CAS番号が存在するWikipedia記事数:",
    sum(1 for cas_numbers in cas_json.values() if cas_numbers),
)
print("CAS番号の数:", len(CAS_list))

化合物全記事数: 5820
CAS番号が存在するWikipedia記事数: 4780
CAS番号の数: 5342


In [6]:
# Resolve every CAS number to its PubChem compound IDs (CIDs) through the
# PUG REST name lookup. CAS numbers that cannot be resolved are left out.
CAS_CID_dict = {}
for CAS in tqdm(CAS_list):
    # Reset for every CAS number: otherwise a failed request would silently
    # reuse the previous iteration's response and record the wrong CIDs.
    resp = None
    for attempt in range(6):  # same budget as before: at most six attempts
        try:
            resp = requests.get(
                f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{CAS}/cids/JSON',
                # Without a timeout requests waits forever and the retry
                # branch below could never fire.
                timeout=30,
            )
            break
        except requests.Timeout:
            # Must come before ConnectionError: ConnectTimeout derives from
            # both, so catching ConnectionError first made this unreachable.
            print("Connection timeout.\nRetry...")
            time.sleep(10)
        except requests.HTTPError:
            print("HTTP error.")
            break
        except requests.ConnectionError:
            print("Connection error.")
            break

    if resp is None or resp.status_code == 404:
        # Request failed, or PubChem does not know this CAS number.
        continue
    if resp.status_code != 200:
        # Other error responses carry no IdentifierList; skip them instead
        # of crashing while parsing the body.
        print(f"Unexpected HTTP status {resp.status_code} for CAS {CAS}.")
        continue

    # Either level may be absent, so fall back to an empty mapping.
    cids = (resp.json().get('IdentifierList') or {}).get('CID')
    if cids:
        CAS_CID_dict[CAS] = cids

100%|██████████| 5342/5342 [1:15:53<00:00,  1.17it/s]


In [7]:
# Summary of the lookup. The labels read: number of CAS numbers that map to
# at least one CID, and total number of CIDs.
print("CIDと対応しているCAS番号の数:", len(CAS_CID_dict))
print("CID数:", sum(len(cids) for cids in CAS_CID_dict.values()))

CIDと対応しているCAS番号の数: 4843
CID数: 6726


In [None]:
# Download the PUG View record of every CID and group the records by the
# CAS number the CID was resolved from.
compound_articles = {}
for CAS, CIDs in tqdm(CAS_CID_dict.items()):
    articles = []
    for CID in CIDs:
        # Reset for every CID: otherwise a failed request would append the
        # previous CID's record a second time.
        resp = None
        for attempt in range(6):  # same budget as before: at most six attempts
            try:
                resp = requests.get(
                    f'https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{CID}/JSON',
                    # Without a timeout requests waits forever and the retry
                    # branch below could never fire.
                    timeout=30,
                )
                # Short pause after each request (presumably to stay under
                # PubChem's request-rate limit -- confirm).
                time.sleep(0.3)
                break
            except requests.Timeout:
                # Must come before ConnectionError: ConnectTimeout derives
                # from both, so catching ConnectionError first made this
                # retry branch unreachable.
                print("Connection timeout.\nRetry...")
                time.sleep(10)
            except requests.HTTPError:
                print("HTTP error.")
                break
            except requests.ConnectionError:
                print("Connection error.")
                break

        if resp is None or resp.status_code == 404:
            # Request failed, or PubChem has no record for this CID.
            continue
        if resp.status_code != 200:
            # Do not store an error body as if it were a compound record.
            print(f"Unexpected HTTP status {resp.status_code} for CID {CID}.")
            continue

        articles.append(resp.json())

    compound_articles[CAS] = articles

In [11]:
# Write the downloaded records to disk as a single JSON document.
# NOTE(review): the input was read from ../../data/ but this writes to
# ../data/ -- confirm the intended output directory.
with open("../data/pubchem_articles.json", mode='w') as f:
    json.dump(obj=compound_articles, fp=f)