In [1]:
import numpy as np
import pandas as pd
import re
import json
from sklearn.model_selection import StratifiedKFold
import keras.backend as K
from keras.layers import Input, Lambda, Dense, Embedding, TimeDistributed, Bidirectional, LSTM, merge, concatenate, Dropout
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras import metrics
from gensim.models.poincare import PoincareModel
from wikipedia2vec import Wikipedia2Vec

Using TensorFlow backend.


In [2]:
# Fix the random seeds of NumPy and TensorFlow so repeated runs start from the same state.
from numpy.random import seed
seed(1)  # seeds NumPy's global random generator
# NOTE(review): a top-level `tensorflow.set_random_seed` is presumably the TensorFlow 1.x
# spelling — confirm against the pinned TensorFlow version.
from tensorflow import set_random_seed
set_random_seed(1)

In [3]:
# Load the Production sentence data (train and validation splits).
# `_id` is parsed as a string in both files.
train_df, valid_df = (
    pd.read_csv(csv_path, dtype={'_id': str})
    for csv_path in ("../data/train_split_words.csv", "../data/valid_split_words.csv")
)

# Report the row count and the True/False label balance of each split.
for heading, frame in (
    ("Number of train rows:", train_df),
    ("Number of validete rows:", valid_df),
):
    n_true = len(frame[frame.label == True])
    n_false = len(frame[frame.label == False])
    print(heading, len(frame))
    print("True:", n_true, "\tFalse:", n_false)
    if frame is train_df:
        # Keep the blank-line-free output layout of the original cell.
        continue

Number of train rows: 7435
True: 508 	False: 6927
Number of validete rows: 1564
True: 88 	False: 1476


In [4]:
# Load the page-id -> list-of-ChEBI-ids mapping from JSON.
with open("../data/pageid2ChEBI.json", 'r') as f:
    pageid2ChEBI_table = json.load(f)

# Flatten the mapping into a long table with one (_id, ChEBI) row per pair.
# The rows are collected first and the frame is built in a single constructor call:
# growing a DataFrame with `append` inside a loop copies the whole frame on every
# iteration, and `DataFrame.append` no longer exists in pandas 2.x.
# Passing `columns=` keeps both columns present even when the mapping is empty,
# so the later merge on `_id` still works.
ChEBI_df = pd.DataFrame(
    [
        (page_id, chebi_id)
        for page_id, chebi_ids in pageid2ChEBI_table.items()
        for chebi_id in chebi_ids
    ],
    columns=['_id', 'ChEBI'],
)

In [5]:
# Left-join the ChEBI ids onto both splits by `_id`.
# Rows without a matching `_id` keep a missing ChEBI value; an `_id` with several
# ChEBI ids is repeated once per id.
# NOTE(review): the cell reassigns `train_df` / `valid_df` from themselves, so running
# it twice merges twice — re-run the loading cell first.
train_df = train_df.merge(ChEBI_df, on='_id', how='left')
valid_df = valid_df.merge(ChEBI_df, on='_id', how='left')

# Report the row count and the True/False label balance after the join.
for heading, frame in (
    ("Number of train rows:", train_df),
    ("Number of validete rows:", valid_df),
):
    n_true = len(frame[frame.label == True])
    n_false = len(frame[frame.label == False])
    print(heading, len(frame))
    print("True:", n_true, "\tFalse:", n_false)

Number of train rows: 8672
True: 605 	False: 8067
Number of validete rows: 1918
True: 103 	False: 1815


In [6]:
# Pretrained embedding models: Wikipedia2Vec supplies the word vectors and the
# Poincaré model supplies the ontology (ChEBI) vectors used by the helpers below.
# NOTE(review): the `.pkl` suffix suggests a pickle-based format — only load model
# files that come from a trusted source.
wiki2vec = Wikipedia2Vec.load('../model/jawiki_20180420_300d.pkl')
poincare_model = PoincareModel.load("../model/poincare.model")

In [7]:
def sentence2vec(s: str):
    '''
    Map every element of `s` to its word vector via `_w2v` and return them as a list.

    NOTE(review): `s` is annotated as `str`, and iterating a string yields single
    characters. Callers pass values of the `words` column — confirm whether those
    are strings or already-tokenised sequences.
    '''
    return list(map(_w2v, s))

def _w2v(w):
    '''
    Return the Wikipedia2Vec vector for `w`, or a zero vector when the lookup fails.

    A found vector is round-tripped through `tolist()` into a fresh NumPy array.
    When `get_word_vector` raises KeyError the result is `np.zeros(WORD_EMBEDDING_DIM)`.
    '''
    try:
        vector = wiki2vec.get_word_vector(w)
    except KeyError:
        return np.zeros(WORD_EMBEDDING_DIM)
    return np.array(vector.tolist())

def ontology2vec(ChEBI: str):
    '''
    Look up the Poincaré embedding of a ChEBI identifier.

    Parameters:
        ChEBI: key to look up in `poincare_model.kv`, or a missing-value marker
            (the left merge leaves NaN where a row has no ChEBI id).

    Returns:
        The vector stored under `ChEBI`, or `np.zeros(ONTOLOGY_EMBEDDING_DIM)` when
        the value is missing or the lookup raises KeyError.
    '''
    # `pd.isna` recognises every scalar missing-value marker (NaN, None, pd.NA).
    # The previous identity test `ChEBI is np.nan` only matched the one NaN singleton,
    # so any other NaN object was passed on to the model lookup.
    if pd.isna(ChEBI):
        return np.zeros(ONTOLOGY_EMBEDDING_DIM)

    try:
        return poincare_model.kv[ChEBI]
    except KeyError:
        # Identifier not present in the trained ontology embedding.
        return np.zeros(ONTOLOGY_EMBEDDING_DIM)

In [9]:
# Word features: one vector per element of `words`, forced to a fixed length of 50.
# Shorter sequences are padded at the end ('post'); longer ones lose their leading
# elements ('pre').
X_train_words = pad_sequences(
    train_df.words.apply(sentence2vec).tolist(),
    dtype='float32',
    padding='post',
    truncating='pre',
    maxlen=50,
)

# Ontology features: one vector per row, stacked into a 2-D array.
X_train_ontology = np.array(train_df.ChEBI.apply(ontology2vec).tolist())

# Targets.
y_train = train_df.label.values

In [10]:
# Word features for the validation split, built with the same settings as the
# training split: fixed length 50, padded at the end, truncated from the front.
X_valid_words = pad_sequences(
    valid_df.words.apply(sentence2vec).tolist(),
    dtype='float32',
    padding='post',
    truncating='pre',
    maxlen=50,
)

# Ontology features: one vector per row, stacked into a 2-D array.
X_valid_ontology = np.array(valid_df.ChEBI.apply(ontology2vec).tolist())

# Targets.
y_valid = valid_df.label.values

In [11]:
# Sanity-check the shapes of the three training arrays (words, ontology, labels).
for train_array in (X_train_words, X_train_ontology, y_train):
    print(train_array.shape)

(8672, 50, 300)
(8672, 10)
(8672,)


In [12]:
def f1(y_true, y_pred):
    """F1 metric computed from Keras backend ops.

    Only a batch-wise value is produced: the counts are summed over whatever
    tensors are passed in.

    Both inputs are clipped to [0, 1] and rounded before counting, so `y_pred`
    probabilities act as hard 0/1 decisions. `K.epsilon()` is added to each
    denominator to avoid division by zero.

    Returns 2 * precision * recall / (precision + recall + epsilon).
    """
    eps = K.epsilon()

    # Counts of hits, actual positives and predicted positives.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    # Precision: share of predicted positives that are hits.
    prec = hits / (predicted_positives + eps)
    # Recall: share of actual positives that are hits.
    rec = hits / (actual_positives + eps)

    return 2*((prec*rec)/(prec+rec+eps))

In [8]:
# Hyper-parameters shared by the vectorisation helpers and the model definition.
# NOTE(review): `_w2v` and `ontology2vec` read the two *_EMBEDDING_DIM names at call
# time, so this cell has to be executed before the cells that build the X_* arrays.
WORD_EMBEDDING_DIM = 300  # length of one word vector (last axis of the word input)
WORD_LSTM_UNIT = 512  # units of the LSTM wrapped by the Bidirectional layer
ONTOLOGY_EMBEDDING_DIM = 10  # length of one ontology vector
FC_DIM = 128  # width of each hidden Dense layer
DROPOUT_RATE = 0.5  # rate passed to every Dropout layer

In [40]:
# Inputs: a variable-length sequence of word vectors and one ontology vector per sample.
word_embeddings = Input(shape=(None, WORD_EMBEDDING_DIM), dtype='float32')
ontology_embeddings = Input(shape=(ONTOLOGY_EMBEDDING_DIM,), dtype='float32')

# Sentence encoder: bidirectional LSTM returning every time step, then a max over
# the time axis (axis=1) to get one fixed-size vector per sample.
l_lstm = Bidirectional(LSTM(WORD_LSTM_UNIT, return_sequences=True))(word_embeddings)
l_max = Lambda(lambda seq: K.max(seq, axis=1))(l_lstm)

# Join the sentence vector with the ontology vector.
x = concatenate([l_max, ontology_embeddings])

# Classifier head: three identical Dropout -> Dense(relu) stages.
for fc_stage in range(3):
    x = Dropout(DROPOUT_RATE)(x)
    x = Dense(FC_DIM, activation='relu')(x)

# Single sigmoid unit for the binary label.
pred = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[word_embeddings, ontology_embeddings], outputs=pred)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[metrics.binary_accuracy, f1],
)

In [48]:
# Render the model graph, including layer shapes, to `model3.png`.
from keras.utils import plot_model
import pydot  # not referenced in this cell; presumably imported to fail early if pydot is missing — TODO confirm
plot_model(model, show_shapes=True, to_file='model3.png')

In [41]:
# Train on both inputs. Class 1 is weighted 10x relative to class 0, since True
# labels are far rarer than False labels in the training data.
model.fit(
    x=[X_train_words, X_train_ontology],
    y=y_train,
    class_weight={0: 1, 1: 10},
    epochs=100,
    batch_size=512,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f64cdfec630>

In [15]:
def evaluation(pred_true, pred_false):
    """Print confusion-matrix counts and precision / recall / F1.

    Parameters
    ----------
    pred_true : pandas.DataFrame
        Rows that were predicted positive; needs a ``label`` column.
    pred_false : pandas.DataFrame
        Rows that were predicted negative; needs a ``label`` column.

    Returns
    -------
    None
        The two result lines are printed, not returned.

    Notes
    -----
    Rows are counted by summing boolean masks over ``label``. The earlier
    ``frame[mask].count()[0]`` counted the non-null cells of whichever column
    happened to be first, so a missing value in that column lowered the count.
    It also relied on positional ``[0]`` access to a label-indexed Series.

    A ratio whose denominator is zero (no predicted positives, no actual
    positives, or precision + recall == 0) is reported as 0.0 instead of NaN.
    """
    def _ratio(numerator, denominator):
        # Guarded division: an undefined ratio is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    TP = int((pred_true.label == True).sum())
    FP = int((pred_true.label == False).sum())
    TN = int((pred_false.label == False).sum())
    FN = int((pred_false.label == True).sum())

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    F1 = _ratio(2 * precision * recall, precision + recall)

    print("TP:", TP, "\tFP:", FP, "\tTN:", TN, "\tFN:", FN)
    print("Precision:", precision, "\tRecall:", recall, "\tF1:", F1)

In [42]:
# Model outputs for the validation inputs (values of the final sigmoid layer).
predict = model.predict([X_valid_words, X_valid_ontology])

In [50]:
# Split the validation rows at the 0.5 threshold on the model output.
# `np.where(...)[0]` yields row *positions*, so the rows are selected with `.iloc`.
# Label-based `.loc` only returns the same rows while `valid_df` still carries its
# default 0..n-1 index, and would silently pick wrong rows after any filter or sort.
positive_positions = np.where(predict >= 0.5)[0]
negative_positions = np.where(predict < 0.5)[0]
pred_true = valid_df.iloc[positive_positions]
pred_false = valid_df.iloc[negative_positions]

# One row per (_id, sentence) pair. These are not passed to `evaluation` below.
pred_true_uniq = pred_true.drop_duplicates(['_id', 'sentence'])
pred_false_uniq = pred_false.drop_duplicates(['_id', 'sentence'])

evaluation(pred_true, pred_false)

TP: 54 	FP: 50 	TN: 1765 	FN: 49
Precision: 0.5192307692307693 	Recall: 0.5242718446601942 	F1: 0.5217391304347825


In [51]:
# Save every row predicted as positive (selected columns only, no index column).
pred_true[['title', 'sentence', 'ChEBI', 'label']].to_csv(
    "../dump/pred_true.csv",
    index=False,
)

In [52]:
# Save the false negatives: rows predicted negative whose label is True.
pred_false.loc[
    pred_false.label == True,
    ['title', 'sentence', 'ChEBI', 'label'],
].to_csv("../dump/pred_false_filter_label_true.csv", index=False)

In [53]:
# Show title and sentence of the false negatives (predicted negative, label True).
pred_false.loc[pred_false.label == True, ['title', 'sentence']].values

array([['N-メチルピロリドン',
        'N-メチル-2-ピロリドンは、γ-ブチロラクトンとメチルアミンとを縮合させて得る 高い溶解性を持つため、特に高分子化学の分野を中心に様々な物質に対する溶媒として用いられる。'],
       ['スチレン',
        'かつては、エチルベンゼンを塩素化したのちに脱塩化水素でオレフィンとする方法やエチルベンゼンを酸化したアセトフェノン、還元したフェニルカルビノールを経由して脱水反応オレフィンとする方法なども存在したが、今日では経済的な理由で触媒により脱水素する方法以外は利用されない。'],
       ['フルオレセイン', '反応触媒としては、塩化亜鉛の他にスルホン酸も用いられる。'],
       ['フルオレセイン', '反応触媒としては、塩化亜鉛の他にスルホン酸も用いられる。'],
       ['炭酸ベリリウム',
        'Be ( OH ) 2 + CO 2 + 3 H 2 O ⟶ BeCO 3 ⋅ 4 H 2 O 水酸化ベリリウムをアンモニア水に懸濁させて二酸化炭素を通じて飽和させ、放置すると塩基性塩Be2CO3(OH)2が沈殿する。'],
       ['塩化ウラン(VI)', '酸化ウラン(VI)はまず塩化ウラン(V)となり、さらに塩素と反応して塩化ウラン(VI)となる。'],
       ['塩化ウラン(VI)', '反応に伴って圧力が変化するため、グローブボックスなどの気密容器中で反応させる。'],
       ['エーテル (化学)', 'アルコールの共存下、オレフィンに求電子剤を作用させると求電子的付加反応によりエーテルが得られる。'],
       ['イノシトールトリスリン酸',
        '細胞膜に存在するリン脂質であるホスファチジルイノシトール4,5-ビスリン酸がホスホリパーゼCによって加水分解されると、 IP3とジアシルグリセロールが生成する。'],
       ['デカカルボニルジヒドリド三オスミウム',
        'Os3(CO)12 のオクタン溶液（または似た沸点をもつ不活性溶媒）を H2 でパージすることによって準備される。'],
       ['サッカリン', '元はトルエンから合成されたが、収率は低かった。']