In [1]:
import ast
import json
import pickle
import re

import numpy as np
import pandas as pd
from wikipedia2vec import Wikipedia2Vec

import keras.backend as K
from keras import metrics
from keras.engine.topology import Layer
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Bidirectional, Dropout, concatenate, multiply, Lambda, Reshape
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# Fix the random seeds of both NumPy and TensorFlow (each library keeps its own
# generator, so each one is seeded separately).
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

## Preprocessing

In [3]:
# Read the train / validation splits. `_id` is forced to str so that it is
# compared as text when the frames are later joined on it.
train_df = pd.read_csv("../data/train_split_words.csv", dtype={'_id': str})
valid_df = pd.read_csv("../data/valid_split_words.csv", dtype={'_id': str})

# Report the size and the class balance of each split.
print("Number of train sentences:", train_df.shape[0])
print("True:", int((train_df.label == True).sum()), "\tFalse:", int((train_df.label == False).sum()))
print("Number of valid sentences:", valid_df.shape[0])
print("True:", int((valid_df.label == True).sum()), "\tFalse:", int((valid_df.label == False).sum()))

Number of train sentences: 7435
True: 508 	False: 6927
Number of valid sentences: 1564
True: 88 	False: 1476


In [4]:
# Table whose `manufacturing_words` column is embedded with the English model
# further down; `_id` is read as str because it is the join key.
manufacturing_df = pd.read_csv("../data/manufacturing_words.csv", dtype={'_id': str})

In [5]:
# Load the pretrained Wikipedia2Vec models for Japanese and English
# (300-dimensional, going by the file names -- TODO confirm).
ja_w2v = Wikipedia2Vec.load("../model/jawiki_20180420_300d.pkl")
en_w2v = Wikipedia2Vec.load("../model/enwiki_20180420_300d.pkl")

# Unpickle the object whose .predict() is applied to English vectors below.
# Judging by the file name it maps English vectors into the Japanese space --
# TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("../model/wikipedia2vec_en2ja_mapping.pkl", 'rb') as f:
    transformer = pickle.load(f)

In [6]:
def transfer_vectors(words, lang='ja'):
    """Embed a sequence of words, one vector per word.

    Each word is lower-cased and passed to ``w2v`` together with ``lang``.
    Returns a list with one entry per input word, in the same order.
    """
    vectors = []
    for word in words:
        vectors.append(w2v(word.lower(), lang=lang))
    return vectors
        
def w2v(w, embedding_dim=300, lang='ja'):
    """Look up the embedding of a single word.

    Args:
        w: the word to look up.
        embedding_dim: length of the all-zero fallback vector.
        lang: 'ja' reads from ``ja_w2v``; 'en' reads from ``en_w2v`` and passes
            the vector through ``transformer.predict``.

    Returns:
        The word vector. A zero vector of length ``embedding_dim`` is returned
        when the lookup raises KeyError or when ``lang`` is not supported
        (in which case a message is also printed).
    """
    # Unsupported language: report it and fall back to zeros.
    if lang != 'ja' and lang != 'en':
        print("Undefined language.")
        return [0.0] * embedding_dim

    try:
        if lang == 'en':
            english_vector = en_w2v.get_word_vector(w)
            return transformer.predict([english_vector])[0]
        return ja_w2v.get_word_vector(w).tolist()
    except KeyError:
        # The lookup raised KeyError: use the zero vector instead.
        return [0.0] * embedding_dim
    
def entry2vec(title, embedding_dim=300, lang='ja'):
    """Look up the embedding of a Wikipedia entity by its title.

    Args:
        title: entity title to look up.
        embedding_dim: length of the all-zero fallback vector.
        lang: 'ja' reads from ``ja_w2v``; 'en' reads from ``en_w2v`` and passes
            the vector through ``transformer.predict``.

    Returns:
        The entity vector, or a zero vector of length ``embedding_dim`` when
        the lookup raises KeyError or ``lang`` is not supported (a message is
        printed in the latter case).
    """
    if lang not in ('ja', 'en'):
        print("Undefined language.")
        return [0.0] * embedding_dim

    try:
        if lang == 'ja':
            japanese_vector = ja_w2v.get_entity_vector(title)
            return japanese_vector.tolist()
        mapped_rows = transformer.predict([en_w2v.get_entity_vector(title)])
        return mapped_rows[0]
    except KeyError:
        return [0.0] * embedding_dim

In [9]:
# Every word sequence is padded / truncated to this many word vectors.
MAX_SEQ_LEN = 50


def _parse_word_list(raw):
    """Turn the CSV text of a Python list literal into a real list.

    ast.literal_eval only accepts Python literals, so unlike eval() it cannot
    execute code embedded in the data file. Values that are no longer strings
    (the cell was already run once) are returned unchanged, which keeps the
    cell safe to re-run.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


def _embed_word_lists(word_lists, lang='ja'):
    """Embed every word list of a Series and pad the result to MAX_SEQ_LEN.

    Sequences are padded at the end ('post') and, when too long, truncated
    from the front ('pre'); the result is a float32 array with one entry per
    row of ``word_lists``.
    """
    vectors = word_lists.apply(lambda words: transfer_vectors(words, lang=lang)).tolist()
    return pad_sequences(vectors, dtype='float32', padding='post', truncating='pre', maxlen=MAX_SEQ_LEN)


# Japanese sentences of the train split.
train_df.words = train_df.words.apply(_parse_word_list)
ja_train_seq = _embed_word_lists(train_df.words)
# Passing the frame's own index keeps each vector on its row even if the
# frame does not carry a default 0..n-1 index.
train_df['ja_vec'] = pd.Series(list(ja_train_seq), index=train_df.index)

# Japanese sentences of the validation split.
valid_df.words = valid_df.words.apply(_parse_word_list)
ja_valid_seq = _embed_word_lists(valid_df.words)
valid_df['ja_vec'] = pd.Series(list(ja_valid_seq), index=valid_df.index)

# English manufacturing descriptions (auxiliary input).
manufacturing_df.manufacturing_words = manufacturing_df.manufacturing_words.apply(_parse_word_list)
auxiliary_seq = _embed_word_lists(manufacturing_df.manufacturing_words, lang='en')
manufacturing_df['en_vec'] = pd.Series(list(auxiliary_seq), index=manufacturing_df.index)

In [10]:
# Inner-join every sentence with the manufacturing rows that share its `_id`
# (pandas' default join type). A sentence is repeated once per matching
# manufacturing row, so the frames grow.
train_df = train_df.merge(manufacturing_df, on='_id')
valid_df = valid_df.merge(manufacturing_df, on='_id')

In [67]:
# Persist the merged frames (including the embedded vectors) as pickles;
# the following cell reloads them from these files.
train_df.to_pickle("../dump/train.pkl")
valid_df.to_pickle("../dump/valid.pkl")

In [11]:
# load data
train_df = pd.read_pickle("../dump/train.pkl")
valid_df = pd.read_pickle("../dump/valid.pkl")

print("Number of train sentences:", len(train_df))
print("True:", len(train_df[train_df.label == True]), "\tFalse:", len(train_df[train_df.label == False]))
print("Number of valid sentences:", len(valid_df))
print("True:", len(valid_df[valid_df.label == True]), "\tFalse:", len(valid_df[valid_df.label == False]))

Number of train sentences: 57641
True: 3593 	False: 54048
Number of valid sentences: 15735
True: 557 	False: 15178


In [12]:
# Stack the per-row vector sequences into dense arrays for the two model inputs:
# the Japanese sentence (`ja_vec`) and the English manufacturing text (`en_vec`).
X_train = np.array(train_df.ja_vec.tolist())
X_train_auxiliary = np.array(train_df.en_vec.tolist())
X_valid = np.array(valid_df.ja_vec.tolist())
# The validation auxiliary input must be taken from valid_df. It used to be
# built from train_df, which paired validation sentences with training-set
# manufacturing vectors (and with a different number of rows).
X_valid_auxiliary = np.array(valid_df.en_vec.tolist())

# Labels as column vectors, matching the single sigmoid output of the model.
y_train = train_df.label.values.reshape((-1, 1))
y_valid = valid_df.label.values.reshape((-1, 1))

# Both inputs and the labels of a split must describe the same rows.
assert len(X_train) == len(X_train_auxiliary) == len(y_train)
assert len(X_valid) == len(X_valid_auxiliary) == len(y_valid)

In [13]:
# Sanity check: one line per array, in the order inputs -> auxiliary -> labels.
print(X_train.shape, X_train_auxiliary.shape, y_train.shape, sep="\n")

(57641, 50, 300)
(57641, 50, 300)
(57641, 1)


## Modeling

In [14]:
# Size of the last axis of both model inputs (one word vector per timestep).
WORD_EMBEDDING_DIM = 300
# Number of units in each hidden Dense layer of the classifier head.
FC_DIM = 128
# Units of each LSTM; wrapped in Bidirectional, so the encoders output 2x this.
LSTM_UNITS = 512
# Rate passed to every Dropout layer.
DROPOUT_RATE = 0.2

In [15]:
def f1(y_true, y_pred):
    """Batch-wise F1 score usable as a Keras metric.

    Predictions and labels are clipped to [0, 1] and rounded, so a prediction
    counts as positive once it rounds to 1. Precision and recall are computed
    over the tensors passed in (i.e. per batch when used as a metric), with
    ``K.epsilon()`` added to every denominator to avoid division by zero.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor ``2 * P * R / (P + R + epsilon)``.
    """
    eps = K.epsilon()

    # Counts of true positives, predicted positives and actual positives.
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    relevant = K.sum(K.round(K.clip(y_true, 0, 1)))

    prec = hits / (flagged + eps)
    rec = hits / (relevant + eps)
    return 2*((prec*rec)/(prec+rec+eps))

In [22]:
# Two variable-length inputs of word vectors. At fit time the first receives
# the Japanese sentence and the second the English manufacturing text.
premise_input = Input(shape=(None, WORD_EMBEDDING_DIM))
hypothesis_input = Input(shape=(None, WORD_EMBEDDING_DIM))

# Each input gets its own bidirectional LSTM encoder (weights are not shared);
# return_sequences=True keeps one output per timestep for the pooling below.
# NOTE(review): there is no Masking layer, so the zero vectors added by
# pad_sequences are fed through the LSTMs like real words -- confirm intended.
l_lstm1 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(premise_input)
l_lstm2 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(hypothesis_input)

# Max-pool over axis 1 (the timestep axis) to get one fixed-size vector per input.
l_max1 = Lambda(lambda x: K.max(x, axis=1))(l_lstm1)
l_max2 = Lambda(lambda x: K.max(x, axis=1))(l_lstm2)

# Interaction features between the two pooled vectors: |a - b| and a * b.
l_abssub = Lambda(lambda x: K.abs(x[0] - x[1]))([l_max1, l_max2])
l_mul = multiply([l_max1, l_max2])

# Classifier input: both pooled vectors plus the two interaction features.
x = concatenate([l_max1, l_max2, l_abssub, l_mul])

# Two hidden ReLU layers with dropout before each and before the output.
x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu')(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu')(x)
x = Dropout(DROPOUT_RATE)(x)
# Single sigmoid unit: score of the pair being a positive example.
pred = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[premise_input, hypothesis_input], outputs=pred)

# Binary cross-entropy objective; accuracy and the batch-wise f1 are reported.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[metrics.binary_accuracy, f1])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, None, 1024)   3330048     input_3[0][0]                    
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, None, 1024)   3330048     input_4[0][0]                    
__________________________________________________________________________________________________
lambda_4 (

In [23]:
# Train on (Japanese sentence, English manufacturing text) pairs.
# NOTE(review): no validation_data is passed, so nothing is reported on the
# validation split during training.
model.fit([X_train, X_train_auxiliary], y_train, batch_size=128, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f93ef3c6a20>

In [24]:
# Sigmoid scores of the model for the validation inputs, one row per pair.
predict = model.predict([X_valid, X_valid_auxiliary])

In [25]:
def evaluation(pred_true, pred_false):
    """Print confusion-matrix counts and precision / recall / F1.

    Args:
        pred_true: DataFrame of the rows predicted as positive; must have a
            ``label`` column holding the ground truth.
        pred_false: DataFrame of the rows predicted as negative, with the same
            ``label`` column.

    Returns:
        None. Counts and scores are written to stdout. A score whose
        denominator is zero is reported as 0.0.
    """
    def _count(frame, expected):
        # Count rows by their label. The previous ``count()[0]`` counted the
        # non-null values of whichever column came first, so missing values in
        # that column silently lowered the totals.
        return int((frame['label'] == expected).sum())

    TP = _count(pred_true, True)
    FP = _count(pred_true, False)
    TN = _count(pred_false, False)
    FN = _count(pred_false, True)

    # Guard the divisions: with no predicted (or no actual) positives the
    # score is undefined, so report 0.0 instead of dividing by zero.
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0
    if precision + recall:
        F1 = 2 * precision * recall / (precision + recall)
    else:
        F1 = 0.0

    print("TP:", TP, "\tFP:", FP, "\tTN:", TN, "\tFN:", FN)
    print("Precision:", precision, "\tRecall:", recall, "\tF1:", F1)

In [26]:
# Scores at or above this value are treated as positive predictions.
DECISION_THRESHOLD = 0.5

# np.where returns row *positions* of `predict`, so the frame must be indexed
# positionally with .iloc; .loc would look the numbers up as index labels and
# only gives the same rows while the index happens to be 0..n-1.
pred_true = valid_df.iloc[np.where(predict >= DECISION_THRESHOLD)[0]]
pred_false = valid_df.iloc[np.where(predict < DECISION_THRESHOLD)[0]]

# Keep one row per (_id, sentence) within each group.
# NOTE(review): a sentence predicted positive for one manufacturing text and
# negative for another appears in both de-duplicated frames -- confirm intended.
pred_true_uniq = pred_true.drop_duplicates(['_id', 'sentence'])
pred_false_uniq = pred_false.drop_duplicates(['_id', 'sentence'])

# Printed heading means "with duplicates": every sentence/manufacturing pair.
print("重複あり")
evaluation(pred_true, pred_false)

# Printed heading means "without duplicates": one row per sentence.
print("重複なし")
evaluation(pred_true_uniq, pred_false_uniq)

重複あり
TP: 233 	FP: 55 	TN: 15123 	FN: 324
Precision: 0.8090277777777778 	Recall: 0.41831238779174146 	F1: 0.5514792899408284
重複なし
TP: 13 	FP: 6 	TN: 737 	FN: 20
Precision: 0.6842105263157895 	Recall: 0.3939393939393939 	F1: 0.5


In [68]:
# De-duplicated sentences predicted as positive, with their ground-truth label.
# (The attribute was previously spelled with a full-width "ｓ"; it is now the
# plain ASCII name.)
pred_true_uniq[['title', 'sentence', 'manufacturing', 'label']].values

array([['シアン化水素',
        '工業的にはソハイオ法によるアクリロニトリル製造の際の副産物として得られるほか、メタン、アンモニア、空気の混合ガスを高温下白金触媒に通すことによって作られる（アンドルソフ法）。',
        'common methods manufacturing hydrogen cyanide ', True],
       ['スチレン', '工業的にはエチルベンゼンを鉄触媒等で脱水素してスチレンが製造される。',
        'direct dehydrogenation ethylbenzene styrene accounts 85 commercial production ',
        True],
       ['スチレン', '次世代のスチレン製造法として、トルエンとメタノールに塩基性ゼオライト触媒を作用させる方法が研究されている。',
        'direct dehydrogenation ethylbenzene styrene accounts 85 commercial production ',
        False],
       ['ベンジルアミン', 'ベンジルアミンはベンゾニトリルの水素化によって得られる。',
        'benzylamine produced reaction benzyl chloride ammonia aqueous solution ',
        True],
       ['イサト酸無水物',
        'ベンズイソオキサゾールまたはアントラニル酸にクロロギ酸エチルを反応させるか、アントラニル酸ナトリウムにホスゲンを反応させることによって得られる。',
        'passing phosgene solution anthranilic acid aqueous hydrochloric acid',
        True],
       ['サッカリン', '1950年にアントラニル酸に亜硝酸・二酸化硫黄・塩素・アンモニアを順次作用させる改良合成法が報告された。',
        'countries commercial saccharin produced remsen fahl

In [64]:
# False negatives: de-duplicated sentences predicted negative whose label is True.
pred_false_uniq.loc[pred_false_uniq.label == True, ['_id', 'sentence', 'manufacturing', 'label']].values

array([['497499',
        'N-メチル-2-ピロリドンは、γ-ブチロラクトンとメチルアミンとを縮合させて得る 高い溶解性を持つため、特に高分子化学の分野を中心に様々な物質に対する溶媒として用いられる。',
        'large scale production nmp n methyl 2 pyrrolidone predominantly carried reacting gamma butyrolactone excess pure aqueous methylamine high pressure tube reactor 6 12 mpa ',
        True],
       ['19566', '燻蒸等の目的ではシアン化ナトリウムに酸を加える方法が一般的である。',
        'common methods manufacturing hydrogen cyanide ', True],
       ['160786',
        'かつては、エチルベンゼンを塩素化したのちに脱塩化水素でオレフィンとする方法やエチルベンゼンを酸化したアセトフェノン、還元したフェニルカルビノールを経由して脱水反応オレフィンとする方法なども存在したが、今日では経済的な理由で触媒により脱水素する方法以外は利用されない。',
        'direct dehydrogenation ethylbenzene styrene accounts 85 commercial production ',
        True],
       ['160786', '植物・細菌・菌類の一部の種において、ケイ皮酸脱炭酸酵素によってケイ皮酸から合成される。',
        'direct dehydrogenation ethylbenzene styrene accounts 85 commercial production ',
        True],
       ['62444', '元はトルエンから合成されたが、収率は低かった。',
        'countries commercial saccharin produced remsen fahlberg process ',
        Tru