In [144]:
import numpy as np
import pandas as pd
import json
import pickle
import re
import lightgbm as lgb
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from sklearn.model_selection import cross_validate, StratifiedKFold
import matplotlib.pyplot as plt
from collections import Counter

import shinra_util as util
import word_entropy
import feature

import warnings
# NOTE(review): this suppresses every warning for the whole session, including
# deprecation warnings from pandas / scikit-learn / LightGBM. Consider
# narrowing the filter to specific categories or modules.
warnings.filterwarnings('ignore')

# Plot font; presumably chosen so matplotlib can render Japanese glyphs — the
# IPAGothic font must be installed on the machine (TODO confirm).
plt.rcParams['font.family'] = 'IPAGothic'

In [145]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Load the annotated training entries. The encoding is given explicitly
# because the file holds Japanese text and the platform default encoding is
# not UTF-8 on every OS.
with open("../data/compound_train.json", 'r', encoding='utf-8') as f:
    raw_train = json.load(f)['entry']

# Gold annotations for the "用途" attribute, built by shinra_util.train2dict.
# Later cells use it as a mapping whose keys are article ids.
train_dict = util.train2dict(raw_train, "用途")

In [10]:
# Sentence-level Wikipedia corpus; later cells read its `_id`, `sentence` and
# `heading` columns.
# NOTE(review): the frame displayed further down carries an "Unnamed: 0"
# column, which suggests the CSV was written together with its index. Passing
# index_col=0 would drop it — confirm no helper relies on that column first.
all_wiki_df = pd.read_csv("../data/wikitext_split_sentence_with_subtitle.csv")

## Trainデータ作成

In [12]:
# Restrict the corpus to sentences of annotated articles, normalise `_id` to
# str, then let shinra_util.labeling attach the per-sentence label.
is_train_article = all_wiki_df._id.isin(train_dict.keys())
train_df = all_wiki_df.loc[is_train_article].reset_index(drop=True)
train_df = train_df.assign(_id=train_df._id.astype(str))
train_df = util.labeling(train_df, train_dict)
train_df.head()

Unnamed: 0,_id,sentence,heading,label
0,2662912,ハロン (halon) は、炭化水素の水素原子（一部または全て）がハロゲン原子で置換されたハ...,NO_SUBTITLE,False
1,2662912,ハロゲン化炭化水素 (halogenated hydrocarbon) が語源で、アメリカ陸...,NO_SUBTITLE,False
2,2662912,ハロン類 (halons)、ハロン化合物 (halon compounds) ともいう。,NO_SUBTITLE,False
3,2662912,ハロンに対し、臭素を含まず、ハロゲンがフッ素と塩素のみの化合物を、フロン（クロロフルオロカー...,NO_SUBTITLE,False
4,2662912,ただし、フロンが日本特有の語であるのに対し、ハロンは国際的に通用する名である。,NO_SUBTITLE,False


In [13]:
# Number of rows per label value, i.e. the class balance of the
# sentence-level training set.
train_df.groupby('label').count()

Unnamed: 0_level_0,_id,sentence,heading
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,6701,6701,6701
True,1524,1524,1524


In [36]:
# Filter the training sentences down to those containing at least one clue
# word obtained with the bootstrap method.

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load dumps produced by this project.
with open("../dump/clue_words.pickle", 'br') as f:
    clue_word_by_BS = pickle.load(f)

# Class balance before filtering.
print("train length :", len(train_df))
print("label 1 :", (train_df.label == 1).sum(), "\tlabel 0 :", (train_df.label == 0).sum())

# The clue words are joined into a single regex alternation.
bs_clue_pattern = "|".join(clue_word_by_BS)
has_bs_clue = train_df.sentence.str.contains(bs_clue_pattern)
train_df = train_df.loc[has_bs_clue].reset_index(drop=True)

# Class balance after filtering.
print("train length (filtering) :", len(train_df))
print("label 1 :", (train_df.label == 1).sum(), "\tlabel 0 :", (train_df.label == 0).sum())


train length : 8225
label 1 : 1524 	label 0 : 6701
train length (filtering) : 4107
label 1 : 1051 	label 0 : 3056


## Entropyを用いた手がかり語抽出

In [82]:
def is_clue_word(hinshi):
    """Return True when a part-of-speech record marks a clue-word candidate.

    A record qualifies when its first field is "名詞" and its second is
    "サ変接続", or when its first field is "動詞" and its second is "自立".
    The second field is only read when the first one matches.

    `hinshi` is any indexable sequence; presumably the POS fields handed over
    by util.get_word_list — confirm against that helper.
    """
    pos = hinshi[0]
    if pos == "名詞":
        return hinshi[1] == "サ変接続"
    if pos == "動詞":
        return hinshi[1] == "自立"
    return False

In [83]:
# For every training sentence, collect the words accepted by is_clue_word
# (extraction is done by shinra_util.get_word_list), then keep only the word
# list and the label.
clue_words_per_sentence = train_df.apply(
    lambda row: util.get_word_list(row.sentence, is_clue_word),
    axis=1,
)
clue_word_df = train_df.assign(clue_word=clue_words_per_sentence)[["clue_word", "label"]]

In [84]:
# Per-word statistics computed by the project-local word_entropy module.
# The following cell reads its `clue_word`, `entropy_positive` and
# `entropy_negative` columns.
clue_word_entropy = word_entropy.word_entropy(clue_word_df)

In [106]:
# Keep the words whose positive entropy exceeds `alpha` times their negative
# entropy, requiring the negative entropy to be strictly positive.
alpha = 1.3
positive_dominates = clue_word_entropy.entropy_positive > alpha * clue_word_entropy.entropy_negative
negative_is_positive = clue_word_entropy.entropy_negative > 0
entropy_clue_words = clue_word_entropy.loc[
    positive_dominates & negative_is_positive, "clue_word"
].tolist()
entropy_clue_words

['代替',
 '利用',
 '添加',
 '染色',
 '保護',
 '増強',
 '応用',
 '鎮静',
 '硬化',
 '吸着',
 '組み合わせ',
 '混ぜ',
 '便秘',
 '検出',
 '改善',
 '使用',
 '解熱',
 '呈する',
 '通過',
 '助ける',
 '内服',
 '補助',
 '調整',
 '軽減',
 '予防',
 '嘔吐',
 '固定',
 '成人',
 '測定',
 '出血',
 '保存',
 '認め',
 '感染',
 '殺菌',
 '呼吸',
 '使わ',
 '治療',
 '湿',
 '用い',
 '抑える',
 '洗浄',
 '発揮',
 '承認']

## 特徴量作成

In [107]:
# Hand-picked words added to the entropy-selected clue words.
extra_clue_words = ['用途', '効果', '目的']

# Feature: whether the sentence contains the clue words.
train_X = feature.contains_clue_word(train_df, entropy_clue_words + extra_clue_words)

# Feature: whether the section heading (subtitle) contains the clue words.
train_X["subtitle_cat"] = feature.subtitle_cat(train_df, entropy_clue_words + extra_clue_words)

# Feature: whether the sentence contains a noun that matches a category name
# or an article title.
noun_list = pd.read_csv("../data/noun_list_in_category_and_title.csv").noun.tolist()
noun_pattern = util.contains_patt(noun_list)
train_X["is_noun_cat"] = train_df.sentence.str.contains(noun_pattern).tolist()
# Disabled alternative feature: number of matching nouns per sentence.
#train_X["n_noun"] = train_df.sentence.str.findall(util.contains_patt(noun_list)).apply(lambda x: len(x))


In [104]:
# Target vector: the per-sentence label column of the filtered training frame.
# Assumes train_X rows are in the same order as train_df — TODO confirm
# against the feature helpers.
train_y = train_df.label

In [108]:
# 5-fold cross-validation of a default LightGBM classifier on the
# sentence-level labels; the mean of each metric over the folds is printed.
model = lgb.LGBMClassifier()
metric_names = ['f1', 'precision', 'recall']

scores = cross_validate(model, train_X, train_y, scoring=metric_names, cv=5)

for metric in metric_names:
    print(metric + ": ", scores['test_' + metric].mean())

f1:  0.6496185734913474
precision:  0.7224633100732283
recall:  0.5927781539155947


## 用途の抽出

In [126]:
def remove(df):
    """Keep only the candidate nouns in ``df.use`` that look like a use (用途).

    Filtering steps, applied in order:

    1. keep rows whose ``use`` contains a noun from the category/title noun
       list (each noun is also tried with its parenthesised part removed);
    2. drop rows whose ``use`` exactly equals an element name, a compound
       name, or a 特性 / 種類 value from the training annotations;
    3. drop rows whose ``use`` ends with one or more compound/element names
       (each optionally followed by 化), optionally followed by 化物, 化合物,
       イオン, 塩 or 酸;
    4. drop rows whose ``use`` ends with 化合物 (removes "〜の化合物" style nouns).

    All names are inserted into the regular expressions through ``re.escape``
    so that characters such as ``(``, ``+``, ``[`` or ``.`` in a name are
    matched literally. Empty names (which can result from stripping a
    parenthesised part) are discarded, because an empty alternative would
    match every string.

    Reads three CSV files under ``../data`` and the notebook-level
    ``raw_train`` on every call.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a string column ``use``.

    Returns
    -------
    pd.DataFrame
        The surviving rows of ``df`` with their original index. An empty
        ``df`` is returned unchanged.
    """
    # Nothing to filter; also covers a column-less empty frame, on which
    # ``df.use`` would raise AttributeError.
    if df.empty:
        return df

    paren_patt = re.compile(r'[\(（].+[\)）]')

    def _with_paren_stripped(names):
        """Return the unique names plus each name without its parenthesised part."""
        variants = set()
        for name in names:
            variants.add(name)
            variants.add(paren_patt.sub('', name))
        variants.discard('')
        # Sorted only to make the generated patterns reproducible.
        return sorted(variants)

    # Element names (2)
    element_list = pd.read_csv("../data/element_list.csv").name.tolist()

    # Nouns collected by crawling the category pages (3)
    noun_list = _with_paren_stripped(
        pd.read_csv("../data/noun_list_in_category_and_title.csv").noun.values
    )

    # Names of the title compounds (4)
    compound_list = _with_paren_stripped(
        pd.read_csv("../data/compound_list.csv")['compound'].tolist()
    )

    # 特性 / 種類 values from the training data (5)
    train_character_list = util.flatten([entry['Attributes']['特性'] for entry in raw_train])
    train_type_list = util.flatten([entry['Attributes']['種類'] for entry in raw_train])

    # Keep only nouns that contain one of the (3) nouns.
    noun_patt = '|'.join(re.escape(noun) for noun in noun_list)
    use_df = df[df.use.str.contains(noun_patt)]

    # Drop nouns that exactly match (2), (4) or (5). The exclusion list is
    # built once instead of once per row.
    excluded = train_character_list + train_type_list + compound_list + element_list
    use_df = use_df[~use_df.use.isin(excluded)]

    # Drop nouns that end with a compound/element name. Skipped when there
    # are no names, because an empty group would match every string.
    chemical_names = compound_list + element_list
    if chemical_names:
        names_patt = '|'.join(re.escape(name) for name in chemical_names)
        patt = r'.*(\w{1,2}化)?物?((' + names_patt + r')化?)+(化物|化合物|イオン|塩|酸)*$'
        use_df = use_df[~use_df.use.str.match(patt)]

    # Drop nouns that end with "化合物".
    patt = r'.*化合物$'
    use_df = use_df[~use_df.use.str.match(patt)]

    return use_df

In [128]:
def get_use_list(_id: str, sentence: str):
    """Return the distinct candidate nouns found in ``sentence``.

    Extraction is delegated to ``util.get_noun_list(sentence, condition=3)``;
    the meaning of ``condition=3`` is defined in shinra_util.

    Duplicates are dropped while keeping first-occurrence order, so the
    result is the same on every run (the order of ``list(set(...))`` depends
    on string hash randomisation).

    Parameters
    ----------
    _id : str
        Article id. Not used; kept for interface compatibility.
    sentence : str
        Sentence to extract nouns from.

    Returns
    -------
    list
        Unique nouns in order of first appearance.
    """
    return list(dict.fromkeys(util.get_noun_list(sentence, condition=3)))

def get_use_df(predicted: pd.DataFrame):
    """Build the (``_id``, ``use``) table for sentences predicted to describe a use.

    Every row of ``predicted`` contributes one output row per candidate noun
    returned by ``get_use_list``; the combined table is then reduced by
    ``remove`` to the nouns that look like a use.

    The per-sentence frames are concatenated once with ``pd.concat`` instead
    of growing a frame with ``DataFrame.append`` inside the loop, which copies
    the accumulated frame on every iteration and no longer exists in
    pandas 2.0.

    Parameters
    ----------
    predicted : pd.DataFrame
        Rows with ``_id`` and ``sentence`` columns.

    Returns
    -------
    pd.DataFrame
        Frame with ``_id`` and ``use`` columns. When no candidate noun is
        found at all, an empty frame with those two columns is returned and
        ``remove`` is not called.
    """
    frames = []
    for _, row in predicted.iterrows():
        uses = get_use_list(row._id, row.sentence)
        if uses:
            frames.append(pd.DataFrame({"_id": row._id, "use": uses}))

    if not frames:
        return pd.DataFrame(columns=["_id", "use"])

    use_df = pd.concat(frames)

    # Keep only the nouns that look like a use.
    return remove(use_df)

In [152]:
# Article-level evaluation of the extracted uses with 5-fold cross-validation.
# NOTE(review): folds are formed over sentences, so sentences of one article
# can fall into both the training and the test part of a fold; consider a
# split grouped by `_id`.
precision = []
recall = []
f1 = []
for train_index, test_index in StratifiedKFold(n_splits=5).split(train_X, train_y):
    # split() yields positional indices, so rows are selected with .iloc;
    # label-based selection only works while the index is exactly 0..n-1.
    X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
    y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]

    model.fit(X_train, y_train)
    predict = model.predict(X_test)

    # Force a boolean mask so the prediction always selects rows.
    test_df = train_df.iloc[test_index]
    predict_true_df = test_df[predict.astype(bool)]
    use_df = get_use_df(predict_true_df)
    result = util.df2dict(use_df, 'use')

    # Score only against the gold annotations of articles present in this fold.
    test_ids = test_df['_id'].unique()
    score = util.validation(result, util.extract_from_dict(train_dict, test_ids))

    precision.append(score['precision'])
    recall.append(score['recall'])
    f1.append(score['f1'])

print("f1: ", np.mean(f1))
print("precision: ", np.mean(precision))
print("recall: ", np.mean(recall))

f1:  0.3898330531005896
precision:  0.5671941190056339
recall:  0.2986401907658104


In [155]:
# Fit on every filtered training sentence and score the extraction on that
# same data. This is an in-sample score, so it is expected to be more
# optimistic than the cross-validated figures.
model.fit(train_X, train_y)
predict = model.predict(train_X)
predict_true_df = train_df[predict]
use_df = get_use_df(predict_true_df)
result = util.df2dict(use_df, 'use')

# The former `test_ids = train_df.loc[test_index, ...]` line was removed: its
# value was never used and it depended on `test_index` left over from the
# last iteration of the cross-validation loop.
score = util.validation(result, train_dict)

print(score)

{'precision': 0.5828144458281445, 'recall': 0.3959587274290628, 'f1': 0.47155045234723336}


## 出力

In [165]:
# Create a fresh classifier with default hyper-parameters and fit it on the
# complete training feature matrix; this model produces the final output.
model = lgb.LGBMClassifier()
model.fit(train_X, train_y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [173]:
# Apply the same bootstrap clue-word filter used for the training data to the
# whole corpus: keep sentences that contain at least one clue word.
corpus_clue_pattern = "|".join(clue_word_by_BS)
corpus_has_clue = all_wiki_df.sentence.str.contains(corpus_clue_pattern)
all_wiki_filtering_df = all_wiki_df.loc[corpus_has_clue].reset_index(drop=True)

In [175]:
# Hand-picked words added to the entropy-selected clue words.
inference_extra_clue_words = ['用途', '効果', '目的']

# Feature: whether the sentence contains the clue words.
X = feature.contains_clue_word(all_wiki_filtering_df, entropy_clue_words + inference_extra_clue_words)

# Feature: whether the section heading (subtitle) contains the clue words.
X["subtitle_cat"] = feature.subtitle_cat(all_wiki_filtering_df, entropy_clue_words + inference_extra_clue_words)

# Feature: whether the sentence contains a noun that matches a category name
# or an article title.
noun_list = pd.read_csv("../data/noun_list_in_category_and_title.csv").noun.tolist()
inference_noun_pattern = util.contains_patt(noun_list)
X["is_noun_cat"] = all_wiki_filtering_df.sentence.str.contains(inference_noun_pattern).tolist()

In [178]:
# Classify every filtered corpus sentence, keep the ones predicted positive,
# extract their candidate uses and convert the table with util.df2dict.
# `predict` is used directly as a row mask, which assumes the model returns
# boolean labels (the training labels are displayed as True/False).
predict = model.predict(X)
predict_true_df = all_wiki_filtering_df[predict]
use_df = get_use_df(predict_true_df)
result = util.df2dict(use_df, 'use')

In [179]:
# Write the extraction result as JSON. json.dump is called with its default
# ensure_ascii=True, so non-ASCII characters are written as \uXXXX escapes.
with open("../output/use.json", 'w') as f:
    json.dump(result, f)