In [19]:
import pandas as pd
import json
import re
import pickle
import MeCab

from gensim.parsing.preprocessing import strip_tags, split_alphanum, remove_stopwords, strip_multiple_whitespaces, strip_punctuation
from gensim.summarization.textcleaner import clean_text_by_sentences

In [20]:
def flatten(multi_list):
    """Flatten one level of nesting, discarding empty strings.

    Args:
        multi_list: Iterable whose elements are themselves iterable.

    Returns:
        list: Every item of every sub-iterable, in order, except items that
        are ``str`` instances of length zero. Non-string items are always kept.
    """
    # Compare by value (`!= 0`), not identity (`is not 0`): the identity test
    # only works through CPython's small-int cache and emits a SyntaxWarning
    # on Python 3.8+.
    return [
        item
        for sublist in multi_list
        for item in sublist
        if not isinstance(item, str) or len(item) != 0
    ]
# Module-level MeCab tagger created with the "-Owakati" option; wakati() calls
# its parse() method and splits the returned text on whitespace.
mecab = MeCab.Tagger("-Owakati")

def labeling(sentence_df: pd.DataFrame, train_dict: dict):
    """Add a ``label`` column marking sentences that contain an annotated span.

    Args:
        sentence_df: Frame with an ``_id`` column and a string ``sentence``
            column.
        train_dict: Maps an ``_id`` value to the list of annotated text spans
            for that ID.

    Returns:
        pd.DataFrame: A new frame (the input is not modified) with a ``label``
        column. The column starts as ``False`` everywhere; for every ``_id``
        that has at least one span, the rows of that ``_id`` are set to whether
        ``sentence`` contains any of the spans (literal match).
    """
    _sentence_df = sentence_df.assign(label=False)
    for _id, train_values in train_dict.items():
        # Compare by value (`== 0`), not identity (`is 0`): the identity test
        # relies on CPython's small-int cache and warns on Python 3.8+.
        if len(train_values) == 0:
            continue

        # Build the row mask once and reuse it for both the read and the write.
        rows = _sentence_df['_id'] == _id
        _sentence_df.loc[rows, 'label'] = \
            _sentence_df.loc[rows, 'sentence'].str.contains(isin_pat(train_values))

    return _sentence_df

def get_annotation(annotation_data: list, attribute: str):
    """Collect the annotated spans of one attribute for every entry.

    Args:
        annotation_data: Entries, each with a ``'WikipediaID'`` key and an
            ``'Attributes'`` mapping from attribute name to annotated texts.
        attribute: Name of the attribute whose annotations are collected.

    Returns:
        dict: ``str(entry['WikipediaID'])`` mapped to a flat list of the
        attribute's annotated texts, each split at the Japanese full stop
        ``。``.
    """
    def _spans(entry):
        # One findall() result per annotated text; flatten() merges them.
        pieces = [re.findall(r'([^。]+)', text) for text in entry['Attributes'][attribute]]
        return flatten(pieces)

    return {str(entry['WikipediaID']): _spans(entry) for entry in annotation_data}

def isin_pat(matching: "str | list"):
    """Build a regex pattern that matches the given text(s) literally.

    Args:
        matching: A single string, or a list/tuple of strings.

    Returns:
        str: ``matching`` with regex metacharacters escaped when it is a
        string; otherwise every item escaped and joined with ``|`` so the
        pattern matches any one of them.

    Raises:
        TypeError: If ``matching`` is neither a string nor a list/tuple.
    """
    if isinstance(matching, str):
        # Escape the argument itself. The earlier code formatted the builtin
        # `str` type here, yielding a pattern for "<class 'str'>".
        return re.escape(matching)
    if isinstance(matching, (list, tuple)):
        return "|".join(re.escape(t) for t in matching)
    # Fail loudly instead of silently returning None, which would only surface
    # later as an obscure error inside the regex call.
    raise TypeError(
        "matching must be a str or a list of str, got %s" % type(matching).__name__
    )

def wakati(s: str):
    """Tokenize ``s`` with the module-level MeCab tagger.

    Args:
        s: Text to tokenize.

    Returns:
        list: The whitespace-separated tokens of the tagger's output.
    """
    tagged = mecab.parse(s)
    return tagged.strip().split()

In [21]:
def clean_sentence(s):
    """Lower-case ``s`` and run it through the gensim preprocessing filters.

    The filters are applied in this order: ``strip_tags``, ``split_alphanum``,
    ``remove_stopwords``, ``strip_punctuation``, ``strip_multiple_whitespaces``.

    Args:
        s: Raw sentence text.

    Returns:
        str: The cleaned sentence.
    """
    pipeline = (
        strip_tags,
        split_alphanum,
        remove_stopwords,
        strip_punctuation,
        strip_multiple_whitespaces,
    )
    cleaned = s.lower()
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [22]:
# Load data. `_id` is read as a string so that it compares equal to the string
# keys built from the annotation and lookup tables.
train_df = pd.read_csv("../../data/train.csv", dtype={'_id': str})
valid_df = pd.read_csv("../../data/valid.csv", dtype={'_id': str})

# Decode every JSON file explicitly as UTF-8 instead of relying on the
# platform's default encoding, so all three files load the same way everywhere.
with open("../../data/compound_train.json", 'r', encoding='utf-8') as f:
    train_raw = json.load(f)['entry']

with open("../../data/cas_number.json", 'r', encoding='utf-8') as f:
    pageid2cas_table = json.load(f)

with open("../../data/cas2cid.json", 'r', encoding='utf-8') as f:
    cas2cid_table = json.load(f)

## 製造方法

In [23]:
# Label every sentence against the '製造方法' (manufacturing method) annotations.
train_manufacturing_dict = get_annotation(train_raw, '製造方法')
train_df = labeling(train_df, train_manufacturing_dict)
valid_df = labeling(valid_df, train_manufacturing_dict)

# Report the size and class balance of each split.
train_true_count = int((train_df['label'] == True).sum())
train_false_count = int((train_df['label'] == False).sum())
valid_true_count = int((valid_df['label'] == True).sum())
valid_false_count = int((valid_df['label'] == False).sum())

print("Number of train sentences:", len(train_df))
print("True:", train_true_count, "\tFalse:", train_false_count)
print("Number of valid sentences:", len(valid_df))
print("True:", valid_true_count, "\tFalse:", valid_false_count)

Number of train sentences: 7435
True: 508 	False: 6927
Number of valid sentences: 1564
True: 88 	False: 1476


In [24]:
# Load the pickled manufacturing texts. The next cell iterates the result with
# .items() and treats each value as a collection of documents.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load a file produced by a trusted step of this project.
with open("../../data/manufacturings.pkl", 'rb') as f:
    manufacturings = pickle.load(f)

In [25]:
# Split each English document into sentences and clean every sentence,
# keeping the result keyed by the same key as `manufacturings`.
m_sentences = {
    cid: [
        clean_sentence(unit.text)
        for document in documents
        for unit in clean_text_by_sentences(document)
    ]
    for cid, documents in manufacturings.items()
}

In [42]:
# Spot-check: display the cleaned sentences stored under key '25352'.
m_sentences['25352']

['personal sampler gases air adapted measurement nitrogen dioxide ',
 ' nitrogen dioxide ',
 'analyte nitrogen dioxide matrix air collection tea triethanolamine coated molecular sieve desorption tea ',
 ' nitrogen dioxide ',
 'sampler passive palmes tube triethanolamine treated screens sampling time min 15 minutes 5 ppm max 8 hr 10 ppm ',
 'sample stability use sampler 1 mo preparation analyze 1 month sampling ',
 'range studied 1 2 80 ppm hr 0 13 8 5 ug nitrogen dioxide sample overall precision 0 06 ',
 ' nitrogen dioxide ']

In [43]:
# Map each page ID to cleaned manufacturing sentences by following
# page ID -> CAS numbers -> CIDs -> sentences. CAS numbers with no CID entry
# and CIDs with no (or empty) sentence list are skipped.
# NOTE(review): when a page resolves to several CIDs that have sentences, each
# later match overwrites the earlier one, so only the last CID's sentences are
# kept. Confirm this is intended.
pageid2manufacturing_table = {
    pageid: m_sentences[str(cid)]
    for pageid, cas_list in pageid2cas_table.items()
    for cas in cas_list
    for cid in (cas2cid_table.get(cas) or [])
    if m_sentences.get(str(cid))
}

# Build the long-format frame (one row per page ID and sentence) in a single
# constructor call. Growing a frame with DataFrame.append inside a loop is
# quadratic, and that method was removed in pandas 2.0. Naming the columns
# up front keeps '_id' available for the later merge even when no page matched.
manufacturing_df = pd.DataFrame(
    [
        (pageid, sentence)
        for pageid, methods in pageid2manufacturing_table.items()
        for sentence in methods
    ],
    columns=['_id', 'manufacturing'],
)

In [44]:
# Join each sentence row with every manufacturing sentence sharing its `_id`
# (default inner join on the `_id` column).
train_df = train_df.merge(manufacturing_df, on='_id')
valid_df = valid_df.merge(manufacturing_df, on='_id')

In [46]:
# Tokenize the English manufacturing sentences by splitting on whitespace.
for frame in (train_df, valid_df):
    frame['manufacturing_words'] = frame['manufacturing'].str.split()

In [47]:
# Tokenize the Japanese sentences with wakati().
for frame in (train_df, valid_df):
    frame['words'] = frame['sentence'].apply(wakati)

In [50]:
# Write both splits, including the two token-list columns, to CSV without the
# index column.
for frame, out_path in (
    (train_df, "../../data/train_split_words.csv"),
    (valid_df, "../../data/valid_split_words.csv"),
):
    frame.to_csv(out_path, index=False)