In [60]:
import json
import pickle
import re
from typing import Optional

import pandas as pd

In [61]:
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: An iterable whose elements are themselves iterable.

    Returns:
        A new list holding every item of every sub-iterable, in order.
    """
    return [item for sub_l in l for item in sub_l]

In [62]:
def get_CID(article: dict) -> Optional[str]:
    """Return the record number of a PubChem article as a string.

    Args:
        article: Parsed article; the number is read from
            ``article['Record']['RecordNumber']``.

    Returns:
        ``str(RecordNumber)``, or ``None`` when ``'Record'`` or
        ``'RecordNumber'`` is missing or falsy.
    """
    record = article.get('Record')
    if not record:
        return None

    record_number = record.get('RecordNumber')
    if not record_number:
        return None

    return str(record_number)

In [63]:
def get_manufacturing(
    article: dict,
    headings=("Methods of Manufacturing", "Sampling Procedures", "Formulations/Preparations"),
):
    """Collect the ``StringValue`` entries of selected "Use and Manufacturing" subsections.

    Walks ``article['Record']['Section']``, keeps the top-level sections whose
    ``TOCHeading`` is "Use and Manufacturing", then keeps their subsections
    whose ``TOCHeading`` is in ``headings`` and gathers the ``StringValue`` of
    every item in each subsection's ``Information`` list.

    Args:
        article: Parsed article dict.
        headings: Subsection headings to extract from. Defaults to the three
            headings this notebook treats as manufacturing methods.

    Returns:
        A list with one element per ``Information`` item, in document order.
        Items without a ``'StringValue'`` key contribute ``None``. The list is
        empty when the article has no matching section.
    """
    # A missing or None 'Record' / 'Section' / 'Information' is treated as empty
    # instead of raising AttributeError / TypeError.
    record = article.get('Record') or {}

    methods = []
    for content in record.get('Section') or []:
        if content.get('TOCHeading') != "Use and Manufacturing":
            continue
        for section in content.get('Section') or []:
            if section.get('TOCHeading') not in headings:
                continue
            for method in section.get('Information') or []:
                methods.append(method.get('StringValue'))

    return methods

In [64]:
def load_articles(filepath, cids=None):
    """Load the articles of a JSON Lines file whose CID is in an allowed set.

    Args:
        filepath: Path to a file with one JSON object per line.
        cids: Collection of CID strings to keep. When ``None`` (the default),
            the notebook-level global ``cid_set`` is used, so ``cid_set`` must
            be defined before the call.

    Returns:
        A list of the parsed articles, in file order, whose ``get_CID`` value
        is not ``None`` and is contained in ``cids``. The count is also printed.
    """
    if cids is None:
        cids = cid_set

    articles = []
    # Explicit UTF-8: without it the decoding depends on the platform default.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) rather than letting
            # json.loads raise on an empty string.
            if not line.strip():
                continue

            article = json.loads(line)
            cid = get_CID(article)

            if cid is None or cid not in cids:
                continue

            articles.append(article)

    print("Number of Articles:", len(articles))

    return articles

## データ読み込み

In [65]:
# Load the CSV of wiki text into a DataFrame.
# NOTE(review): wiki_sentence_df is not referenced again in the visible part of this notebook.
wiki_sentence_df = pd.read_csv(
    filepath_or_buffer="../../data/wikitext_split_sentence.csv",
)

In [66]:
# Lookup tables stored as JSON. Opened with an explicit UTF-8 encoding so the
# result does not depend on the platform's default text encoding.
# Presumably maps a wiki page id to CAS number(s) -- TODO confirm the schema.
with open("../../data/cas_number.json", 'r', encoding='utf-8') as f:
    pageid2cas_table = json.load(f)

# Each value is an iterable of CIDs (it is flattened into cid_set below).
with open("../../data/cas2cid.json", 'r', encoding='utf-8') as f:
    cas2cid_table = json.load(f)

In [67]:
# Every distinct CID that appears in any value of the CAS -> CID table.
cid_set = {cid for cid_group in cas2cid_table.values() for cid in cid_group}
print(len(cid_set))

6624


In [68]:
# Read the article dump, keeping only the records whose CID is in cid_set.
articles = load_articles(filepath="../../data/pubchem_articles.jsonl")

Number of Articles: 6624


## 「Use and Manufacturing」に存在する見出し一覧

In [69]:
# Distinct TOCHeading values of the subsections found under every
# "Use and Manufacturing" section, across all loaded articles.
heading_set = {
    subsection.get('TOCHeading')
    for record in articles
    for top_section in record.get('Record').get('Section')
    if top_section.get('TOCHeading') == "Use and Manufacturing"
    for subsection in top_section.get('Section')
}

heading_set

{'Consumption',
 'Formulations/Preparations',
 'Impurities',
 'Methods of Manufacturing',
 'Sampling Procedures',
 'U.S. Exports',
 'U.S. Imports',
 'U.S. Production',
 'Uses'}

- 製造方法として扱う見出し
    - Formulations/Preparations
    - Methods of Manufacturing
    - Sampling Procedures

## PubChemのデータから製造方法を抽出

In [70]:
# Map each article's CID to the list returned by get_manufacturing for it.
manufacturings = dict(
    (get_CID(record), get_manufacturing(record))
    for record in articles
)

In [72]:
# Persist the CID -> extracted strings mapping as a pickle file.
with open("../../data/manufacturings.pkl", mode='wb') as pkl_file:
    pickle.dump(manufacturings, pkl_file)