In [1]:
import numpy as np

import keras.backend as K
from keras.engine.topology import Layer
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Bidirectional, Dropout, concatenate, multiply, Lambda, Reshape

Using TensorFlow backend.


In [15]:
from wikipedia2vec import Wikipedia2Vec
import pandas as pd
import pickle
import json
import re

In [2]:
# Fix the random seeds (NumPy's global RNG, then TensorFlow's) to make runs
# repeatable.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

## Preprocessing

In [51]:
def flatten(multi_list):
    """Flatten one level of nesting, dropping empty strings.

    Args:
        multi_list: An iterable of iterables.

    Returns:
        A flat list with the items of every sub-iterable, in order. Items
        that are empty strings are skipped; non-string items are always
        kept, whatever their value.
    """
    # Compare with `!=`, not `is not`: identity against an int literal only
    # works through CPython's small-int cache and warns on Python 3.8+.
    return [
        item
        for sublist in multi_list
        for item in sublist
        if not isinstance(item, str) or len(item) != 0
    ]

def labeling(sentence_df: pd.DataFrame, train_dict: dict):
    """Return a copy of ``sentence_df`` with a boolean ``label`` column.

    A row is labelled ``True`` when its ``sentence`` contains at least one of
    the annotated strings registered for the row's ``_id``.

    Args:
        sentence_df: Frame with an ``_id`` column and a ``sentence`` column.
            It is not modified.
        train_dict: Maps an ``_id`` value to the list of annotated strings
            for that id.

    Returns:
        A new frame equal to ``sentence_df`` plus the ``label`` column. Rows
        whose id is absent from ``train_dict``, or mapped to an empty list,
        stay ``False``.
    """
    _sentence_df = sentence_df.assign(label=False)
    for _id, train_values in train_dict.items():
        # Skip ids without annotations: joining an empty list yields the
        # empty pattern, which would match every sentence.
        if len(train_values) == 0:
            continue

        # Build the row mask once and reuse it for both the read and the write.
        id_mask = _sentence_df['_id'] == _id
        # na=False keeps the column boolean when a sentence is missing.
        _sentence_df.loc[id_mask, 'label'] = (
            _sentence_df.loc[id_mask, 'sentence']
            .str.contains(isin_pat(train_values), na=False)
        )

    return _sentence_df

def get_annotation(annotation_data: list, attribute: str):
    """Collect the annotated text pieces of one attribute for every entry.

    Each annotated string is cut at the '。' character, and the pieces of all
    strings belonging to an entry are gathered into one flat list.

    Args:
        annotation_data: List of entries; each entry is indexed with
            ``'WikipediaID'`` and ``'Attributes'``.
        attribute: Key looked up inside ``entry['Attributes']``.

    Returns:
        A dict mapping ``str(entry['WikipediaID'])`` to the flat list of
        pieces. If an id occurs more than once, the last entry wins.
    """
    annotations = {}
    for record in annotation_data:
        # One list of '。'-free pieces per annotated string.
        pieces_per_text = [
            re.findall(r'([^。]+)', text)
            for text in record['Attributes'][attribute]
        ]
        annotations[str(record['WikipediaID'])] = flatten(pieces_per_text)

    return annotations

def isin_pat(matching: [str, list]):
    """Build a regex pattern that matches the given text(s) literally.

    Args:
        matching: Either a single string or a list of strings.

    Returns:
        For a string, that string with regex metacharacters escaped. For a
        list, the escaped items joined with ``|`` so the pattern matches any
        one of them (an empty list gives the empty pattern). ``None`` for
        any other argument type.
    """
    if isinstance(matching, str):
        # Escape the argument itself, not the builtin `str` type.
        return re.escape(matching)
    if isinstance(matching, list):
        return "|".join(re.escape(t) for t in matching)
    return None

In [52]:
# Load the sentence tables. `_id` is read as a string so it can be compared
# with the string ids that key the annotation dict.
train_df = pd.read_csv("../data/train.csv", dtype={'_id': str})
valid_df = pd.read_csv("../data/valid.csv", dtype={'_id': str})

# Load the raw annotation entries.
with open("../data/compound_train.json", mode='r', encoding='utf-8') as f:
    train_raw = json.load(f)['entry']

# Label both tables with the '製造方法' (manufacturing method) annotations.
train_manufacturing_dict = get_annotation(train_raw, '製造方法')
train_df = labeling(sentence_df=train_df, train_dict=train_manufacturing_dict)
valid_df = labeling(sentence_df=valid_df, train_dict=train_manufacturing_dict)

# Report the size and class balance of each table.
print("Number of train sentences:", train_df.shape[0])
print("True:", int((train_df['label'] == True).sum()), "\tFalse:", int((train_df['label'] == False).sum()))
print("Number of valid sentences:", valid_df.shape[0])
print("True:", int((valid_df['label'] == True).sum()), "\tFalse:", int((valid_df['label'] == False).sum()))

Number of train sentences: 7435
True: 508 	False: 6927
Number of valid sentences: 1564
True: 88 	False: 1476


In [3]:
# Size of the last axis of both model inputs (presumably one word-embedding
# vector per token -- confirm against the embedding lookup).
WORD_EMBEDDING_DIM = 300
# Number of units in each of the two hidden Dense layers.
FC_DIM = 128
# Number of units per LSTM direction; the bidirectional encoders output
# 2 * LSTM_UNITS features.
LSTM_UNITS = 512

In [4]:
# Two variable-length inputs, each shaped (batch, time, WORD_EMBEDDING_DIM).
premise_input = Input(shape=(None, WORD_EMBEDDING_DIM))
hypothesis_input = Input(shape=(None, WORD_EMBEDDING_DIM))

# Each input has its own bidirectional LSTM encoder (weights are not shared).
# return_sequences=True keeps one 2 * LSTM_UNITS vector per time step.
l_lstm1 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(premise_input)
l_lstm2 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(hypothesis_input)

# Max-pool over the time axis. The encoder output is (batch, time, features),
# so time is axis 1. Axis 0 is the batch axis: reducing over it would mix
# different samples and leave the time axis in place.
l_max1 = Lambda(lambda x: K.max(x, axis=1))(l_lstm1)
l_max2 = Lambda(lambda x: K.max(x, axis=1))(l_lstm2)
# Identity on the pooled (batch, 2 * LSTM_UNITS) tensors; kept so the feature
# width is stated explicitly for the layers below.
l_max1 = Reshape((2 * LSTM_UNITS,))(l_max1)
l_max2 = Reshape((2 * LSTM_UNITS,))(l_max2)

# Pairwise interaction features: element-wise |u - v| and u * v.
l_abssub = Lambda(lambda x: K.abs(x[0] - x[1]))([l_max1, l_max2])
l_mul = multiply([l_max1, l_max2])

# Final pair representation: [u, v, |u - v|, u * v].
x = concatenate([l_max1, l_max2, l_abssub, l_mul])

# Classifier head: two ReLU layers with dropout, then a single sigmoid unit.
x = Dropout(0.2)(x)
x = Dense(FC_DIM, activation='relu')(x)
x = Dropout(0.2)(x)
x = Dense(FC_DIM, activation='relu')(x)
x = Dropout(0.2)(x)
pred = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[premise_input, hypothesis_input], outputs=pred)

model.compile(optimizer='adam', loss='binary_crossentropy')

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, None, 1024)   3330048     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, None, 1024)   3330048     input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (