In [11]:
import json
import pickle
import re
from typing import Optional

import pandas as pd
from gensim.parsing.preprocessing import strip_tags, split_alphanum, remove_stopwords, strip_multiple_whitespaces, strip_punctuation
from gensim.summarization.textcleaner import clean_text_by_sentences

In [19]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    Any iterable of iterables is accepted (e.g. ``dict.values()`` whose values
    are lists). An empty input yields an empty list.
    """
    return [item for sub_iterable in l for item in sub_iterable]

def get_CID(article: dict) -> Optional[str]:
    """Return the article's ``Record.RecordNumber`` as a string.

    Args:
        article: Parsed article dict; only ``article['Record']['RecordNumber']``
            is read.

    Returns:
        ``str(RecordNumber)`` when both ``Record`` and ``RecordNumber`` are
        present and truthy, otherwise ``None``.
    """
    record = article.get('Record')
    if not record:
        return None

    record_number = record.get('RecordNumber')
    # Falsy values (missing key, None, 0, "") are all treated as "no CID".
    return str(record_number) if record_number else None

def get_manufacturing(article: dict, headings=("Methods of Manufacturing",)) -> list:
    """Collect the ``StringValue`` entries under "Use and Manufacturing".

    The function walks
    ``article['Record']['Section'][*]['Section'][*]['Information'][*]`` and
    keeps the entries whose top-level ``TOCHeading`` is
    "Use and Manufacturing" and whose sub-section ``TOCHeading`` is one of
    ``headings``.

    Args:
        article: Parsed article dict.
        headings: Sub-section ``TOCHeading`` values to extract. Defaults to
            only "Methods of Manufacturing"; pass e.g.
            ``("Methods of Manufacturing", "Sampling Procedures",
            "Formulations/Preparations")`` to widen the selection. A single
            string is accepted as well.

    Returns:
        The ``StringValue`` of every matching entry, in document order. An
        entry without a ``StringValue`` key contributes ``None``. Missing
        ``Record`` / ``Section`` / ``Information`` levels yield no entries
        instead of raising.
    """
    if isinstance(headings, str):
        # Avoid substring matching when a bare string is passed.
        headings = (headings,)

    record = article.get('Record') or {}
    return [
        method.get('StringValue')
        for content in record.get('Section') or []
        if content.get('TOCHeading') == "Use and Manufacturing"
        for section in content.get('Section') or []
        if section.get('TOCHeading') in headings
        for method in section.get('Information') or []
    ]

def load_articles(filepath, allowed_cids=None):
    """Read a JSON Lines file and keep the articles whose CID is allowed.

    Args:
        filepath: Path to a text file holding one JSON document per line.
        allowed_cids: Optional collection of CID strings to keep. When
            omitted, the notebook-level ``cid_set`` is used (looked up at
            call time), which is the original behaviour.

    Returns:
        List of the parsed article dicts that passed the filter, in file
        order. Articles for which ``get_CID`` returns ``None`` are dropped.

    Side effects:
        Prints the number of articles kept.
    """
    if allowed_cids is None:
        allowed_cids = cid_set

    articles = []
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            article = json.loads(line)
            cid = get_CID(article)

            if cid is not None and cid in allowed_cids:
                articles.append(article)

    print("Number of Articles:", len(articles))

    return articles

## データ読み込み

In [65]:
# Load the sentence-split wikitext CSV.
# NOTE(review): wiki_sentence_df is not referenced by any other cell visible here -- confirm it is still needed.
wiki_sentence_df = pd.read_csv("../../data/wikitext_split_sentence.csv")

In [3]:
# Load the page-id -> CIDs table; later cells iterate it as (page_id, iterable of CIDs).
# The encoding is given explicitly so the read does not depend on the platform default.
with open("../../data/pageID2CID_using_wikidata.json", 'r', encoding='utf-8') as f:
    pageid2cid_table = json.load(f)

In [4]:
# Every distinct CID that appears in any value of the page-id -> CIDs table.
cid_set = {cid for cid_list in pageid2cid_table.values() for cid in cid_list}
print(len(cid_set))

4490


In [5]:
# Read the PubChem JSONL file; load_articles keeps only the articles whose CID is in cid_set.
articles = load_articles("../../data/pubchem_articles.jsonl")

Number of Articles: 4490


## 「Use and Manufacturing」セクション内に存在する見出し一覧

In [69]:
# Distinct TOCHeading values of the sub-sections nested under each article's
# "Use and Manufacturing" section.
heading_set = {
    sub_section.get('TOCHeading')
    for doc in articles
    for top_section in doc.get('Record').get('Section')
    if top_section.get('TOCHeading') == "Use and Manufacturing"
    for sub_section in top_section.get('Section')
}

heading_set

{'Consumption',
 'Formulations/Preparations',
 'Impurities',
 'Methods of Manufacturing',
 'Sampling Procedures',
 'U.S. Exports',
 'U.S. Imports',
 'U.S. Production',
 'Uses'}

- 製造方法に関連する見出し（現状の `get_manufacturing` が抽出するのは Methods of Manufacturing のみ）
    - Formulations/Preparations
    - Methods of Manufacturing
    - Sampling Procedures

In [47]:
# Invert pageid2cid_table: map each CID to the list of page ids that reference it.
cid2pageid_table = {}
for page_id, cids in pageid2cid_table.items():
    for cid in cids:
        # setdefault creates the list on the first occurrence of a CID.
        cid2pageid_table.setdefault(cid, []).append(page_id)

## PubChemのデータから製造方法を抽出

In [6]:
# Map each article's CID (as returned by get_CID) to the list returned by get_manufacturing.
manufacturings = {get_CID(article): get_manufacturing(article) for article in articles}

In [9]:
# For now, dump the raw (uncleaned) extraction result as JSON.
with open("../../data/raw_manufacturings_using_wikidata.json", 'w') as f:
    json.dump(manufacturings, f)

In [45]:
def clean_sentence(s):
    """Normalise one sentence for tokenisation.

    The input is lower-cased and passed, in this order, through
    ``strip_tags``, ``remove_stopwords``, ``strip_punctuation`` and
    ``strip_multiple_whitespaces``; the result of the last step is returned.
    """
    cleaned = strip_tags(s.lower())
    # The order matters: stopwords are removed before punctuation is stripped.
    for step in (remove_stopwords, strip_punctuation, strip_multiple_whitespaces):
        cleaned = step(cleaned)

    return cleaned

def remove_triple_period(s):
    """Delete every run of two or more consecutive periods from ``s``.

    Despite the name, a run of exactly two periods is removed as well. The
    run is replaced by the empty string, so the text on both sides is joined
    without a separator. Single periods are left untouched.
    """
    period_run = re.compile(r'\.{2,}')
    return period_run.sub('', s)

In [46]:
# Split each English text into sentences and clean every sentence.
m_sentences = {
    cid: [
        clean_sentence(sentence.text)
        for doc in methods
        for sentence in clean_text_by_sentences(remove_triple_period(doc))
    ]
    for cid, methods in manufacturings.items()
}

In [49]:
# One CID can be linked to several Wikipedia articles, so emit one row per
# (page_id, CID, sentence) combination -- effectively a cross join of the
# page ids and the sentences of that CID. CIDs with no sentences or no
# linked page id contribute no rows.
#
# The rows are collected first and the DataFrame is built once:
# DataFrame.append inside a loop copies the whole frame on every iteration
# and is not available in pandas >= 2.0.
manufacturing_rows = [
    (page_id, cid, sentence)
    for cid, methods in m_sentences.items()
    if methods
    for page_id in cid2pageid_table.get(cid) or []
    for sentence in methods
]

# Explicit columns keep the frame well-formed even when there are no rows.
manufacturing_df = pd.DataFrame(manufacturing_rows, columns=['_id', 'CID', 'manufacturing'])

# Whitespace-tokenise each cleaned sentence.
manufacturing_df['words'] = manufacturing_df['manufacturing'].str.split()

In [51]:
# Persist the final frame (columns: _id, CID, manufacturing, words) as a pickle file.
manufacturing_df.to_pickle("../../data/manufacturing_words_using_wikidata.pkl")