In [45]:
import numpy as np
import pandas as pd
import re
import json

import shinra_util as util

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
flatten = lambda x: [j for i in x if i for j in i]

In [3]:
def get_manufacturing(articles: dict):
    methods = \
    [[get_CID_from_article(article), get_manufacturing_from_article(article)]
        for article in list(articles.values())[0]
    ]
    
    manufacturing_df = pd.DataFrame()
    for m in methods:
        manufacturing_df = manufacturing_df.append(pd.DataFrame({'CID': m[0], 'method': m[1]}))
    
    return manufacturing_df.assign(CAS = get_key(articles))

def get_manufacturing_from_article(article: dict):
    methods = \
    [method.get('StringValue') 
        for content in article.get('Record').get('Section') if content.get('TOCHeading') == "Use and Manufacturing"
        for section in content.get('Section') if section.get('TOCHeading') in ["Methods of Manufacturing", "Sampling Procedures"]
        for method in section.get('Information')
    ]
    
    return methods
    
def get_link_CID(method: str):
    r = re.compile(r'<a class=.+?CID\-([\d]+)?.*?>')
    m = re.findall(r, method)
    
    return m

def get_CID(articles: dict):
    return [str(article.get('Record').get('RecordNumber')) for article in list(articles.values())[0]]

def get_CID_from_article(article: dict):
    return str(article.get('Record').get('RecordNumber'))

def direct_sum(CIDs: list):
    return list(set(CIDs))

def cid2pageid(CIDs: list):
    return flatten([cid2cas_table.get(cid) for cid in CIDs if cid2cas_table.get(cid)])

def cas2pageid(CASes: list):
    return flatten([cas2pageid_table.get(cas) for cas in CASes if cas2pageid_table.get(cas)])

def pageid2name(ids: list):
    return [pageid2name_table.get(_id) for _id in ids if pageid2name_table.get(_id)]

def cid2name(CIDs: list):
    return pageid2name(cas2pageid(cid2pageid(CIDs)))

def get_key(d: dict, index=0):
    return list(d.keys())[0]

In [4]:
def extract_CID_from_methods(methods: pd.DataFrame):
    CIDs = \
    methods.assign(
        link_CIDs = methods.method.apply(
            lambda x: get_link_CID(x)
        )
    ).groupby('CID').apply(
        lambda x: list(set(direct_sum(x.link_CIDs.sum())) - set(x.CID.tolist()))
    ).tolist()
    
    return CIDs

In [5]:
with open("../data/cid2cas.json", 'r') as f:
    cid2cas_table = json.load(f)

with open("../data/cas2pageID.json", 'r') as f:
    cas2pageid_table = json.load(f)    

with open("../data/ENE_compound_name.txt", 'r') as f:
    pageid2name_table = {}
    line = f.readline()
    while line:
        ENE = json.loads(line)
        pageid2name_table[str(ENE['pageid'])] = re.sub(r'_(.+)$', '', ENE['title']) # 曖昧さ回避ワードを除去
        line = f.readline()

In [6]:
# CAS -> 製造に用いられる化合物名　のDataFrameを作成
with open("../data/pubchem_articles.jsonl", 'r') as f:
    compounds_in_method = pd.DataFrame()
    line = f.readline()
    while line:
        articles = json.loads(line)
        
        methods_df = get_manufacturing(articles)
        if methods_df.empty:
            line = f.readline()
            continue
        
        CAS = get_key(articles)
        link_cids = extract_CID_from_methods(methods_df)
        compounds = [cid2name(cids) for cids in link_cids]
        
        compounds_in_method = \
        compounds_in_method.append(
            pd.DataFrame({'CAS': CAS, 'compounds': compounds})
        )
        
        line = f.readline()

In [7]:
# CAS番号でグルーピング
compounds_in_method_each_CAS = \
pd.DataFrame(
    compounds_in_method.groupby('CAS').compounds.sum().apply(
        lambda x: direct_sum(x)
    )
).reset_index()

In [8]:
# CAS番号 -> wikipedia_id のDataFrameを作成
cas2pageid_df = pd.DataFrame()
for cas, pageid in cas2pageid_table.items():
    cas2pageid_df = cas2pageid_df.append(pd.DataFrame({'CAS': cas, 'page_id': pageid}))

In [48]:
# pageid -> 製造に用いられる化合物名 の辞書を作成
method_using_compounds = \
pd.DataFrame(
    pd.merge(
        cas2pageid_df
        , compounds_in_method_each_CAS
        , on='CAS'
    ).groupby('page_id').compounds.sum()
).to_dict()['compounds']

for pageid, compounds in method_using_compounds.items():
    method_using_compounds[pageid] = list(set(compounds) - set(pageid2name_table[pageid]) - set("水"))

In [51]:
# トレーニングデータとWikipediaデータの読み込み
wiki_sentence_df = pd.read_csv("../data/wikitext_split_sentence.csv", dtype={'_id': str})

with open("../data/compound_train.json", 'r') as f:
    train_raw = json.load(f)['entry']

production_dict = util.train2dict(train_raw, '製造方法')
train_production_df = wiki_sentence_df.loc[wiki_sentence_df._id.isin(production_dict.keys())]
train_production_df = util.labeling(train_production_df, production_dict)

In [52]:
def is_contains_compounds(x: pd.Series):
    if not method_using_compounds.get(x._id):
        return False
    if re.search(util.contains_patt(method_using_compounds.get(x._id)), x.sentence):
        return True
    else:
        return False

In [53]:
extraction_production_df = \
train_production_df.loc[
train_production_df.apply(
    lambda x: is_contains_compounds(x)
    , axis=1
)]

In [54]:
n_positive = len(train_production_df.loc[train_production_df.label == True])
n_extraction = len(extraction_production_df)
TP = len(extraction_production_df.loc[extraction_production_df.label == True])
FP = len(extraction_production_df.loc[extraction_production_df.label == False])
precision = TP / n_extraction
recall = TP / n_positive
f1 = 2 * precision * recall / (precision + recall)

print("Num of Positive:", n_positive)
print("Extraction:", n_extraction)
print("TP:", TP)
print("FP:", FP)
print("Precision:", precision, "\tRecall:", recall, "F1:", f1)

Num of Positive: 593
Extraction: 501
TP: 138
FP: 363
Precision: 0.2754491017964072 	Recall: 0.2327150084317032 F1: 0.2522851919561243


In [55]:
extraction_production_df.to_csv("../output/extracted_manufacturing_by_pattern_matching.csv", index=False)

In [56]:
method_using_compounds

{'10014': ['二酸化窒素', 'トリエタノールアミン'],
 '1001720': ['エチレングリコール', 'エチレンオキシド'],
 '1015774': ['メタノール', '水酸化カリウム'],
 '101640': ['酢酸エチル',
  'アクリロニトリル',
  'シアン化水素',
  '硫酸',
  'アセトアルデヒド',
  'アクロレイン',
  'ベンゼン',
  'プロピレン',
  'アセチレン',
  '一酸化炭素',
  'エチレンオキシド'],
 '10166': ['ポリエチレン', '水酸化ナトリウム', '二酸化炭素', '硫酸', 'エチレン', 'メタン', '一酸化炭素'],
 '1020032': ['チオシアン酸', 'イソチオシアン酸メチル'],
 '1021184': ['塩酸', 'メタノール', '塩化水素', 'ホルムアルデヒド'],
 '1022955': ['アセト酢酸エチル', 'アンモニア', 'ホルムアルデヒド'],
 '1023103': ['ケテン',
  '樟脳',
  'アセトン',
  'N-ブチルリチウム',
  'ジエチルアミン',
  'ゲラニオール',
  'リナロール',
  'ジケテン',
  'ピネン',
  'イソプレン',
  'ビニルエーテル',
  '2-メトキシプロペン',
  'メチルヘプテノン',
  'ネロール',
  '過酸化水素',
  'アセチレン',
  'パラジウム炭素',
  'ミルセン'],
 '1027018': ['クロシン', 'エタノール'],
 '1029897': ['クロロベンゼン',
  'メタノール',
  'テトラクロロエチレン',
  '硝酸',
  'パラジクロロベンゼン',
  'ベンゼン',
  '酢酸',
  'イソオクタン',
  'カルバミン酸エチル'],
 '1030002': ['イソシアン酸メチル'],
 '1033011': ['グリセリン', 'アントラキノン', 'アントラセン', '硫酸'],
 '1035924': ['塩酸', 'アセチレン', '三塩化ヒ素', '塩化水素'],
 '1036011': ['アセトン'],
 '1036383': ['コデイン', 'テバイン', '

In [60]:
len(cid2cas_table)

6624