In [57]:
import numpy as np
import pandas as pd
import pickle
import json
import re
from wikipedia2vec import Wikipedia2Vec

import keras.backend as K
from keras import metrics
from keras.engine.topology import Layer
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Bidirectional, Dropout, concatenate, multiply, Lambda, Reshape
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Fix ramdom seed.
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(1)

## Preprocessing

In [8]:
train_df = pd.read_csv("../data/train_split_words.csv", dtype={'_id': str})
valid_df = pd.read_csv("../data/valid_split_words.csv", dtype={'_id': str})

print("Number of train sentences:", len(train_df))
print("True:", len(train_df[train_df.label == True]), "\tFalse:", len(train_df[train_df.label == False]))
print("Number of valid sentences:", len(valid_df))
print("True:", len(valid_df[valid_df.label == True]), "\tFalse:", len(valid_df[valid_df.label == False]))

Number of train sentences: 7435
True: 508 	False: 6927
Number of valid sentences: 1564
True: 88 	False: 1476


In [45]:
manufacturing_df = pd.read_csv("../data/manufacturing_words.csv", dtype={'_id': str})

In [5]:
# load model
ja_w2v = Wikipedia2Vec.load("../model/jawiki_20180420_300d.pkl")
en_w2v = Wikipedia2Vec.load("../model/enwiki_20180420_300d.pkl")

with open("../model/wikipedia2vec_en2ja_mapping.pkl", 'rb') as f:
    transformer = pickle.load(f)

In [6]:
def transfer_vectors(words, lang='ja'):
    return [w2v(w.lower(), lang=lang) for w in words]
        
def w2v(w, embedding_dim=300, lang='ja'):
    try:
        if lang == 'ja':
            return ja_w2v.get_word_vector(w).tolist()
        elif lang == 'en':
            return transformer.predict([en_w2v.get_word_vector(w)])[0]
        else:
            print("Undefined language.")
            return [0.0] * embedding_dim
    
    except KeyError:
        return [0.0] * embedding_dim

In [34]:
ja_train_seq = pad_sequences(train_df.words.apply(lambda x: transfer_vectors(eval(x))).tolist(), dtype='float32', padding='post', truncating='pre', maxlen=50)
train_df['ja_vec'] = pd.Series([vec for vec in ja_train_seq])

ja_valid_seq = pad_sequences(valid_df.words.apply(lambda x: transfer_vectors(eval(x))).tolist(), dtype='float32', padding='post', truncating='pre', maxlen=50)
valid_df['ja_vec'] = pd.Series([vec for vec in ja_valid_seq])

auxiliary_seq = pad_sequences(valid_df.words.apply(lambda x: transfer_vectors(eval(x))).tolist(), dtype='float32', padding='post', truncating='pre', maxlen=50)
manufacturing_df['en_vec'] = pd.Series([vec for vec in auxiliary_seq])

In [47]:
# Afeter cross join
train_df = pd.merge(train_df, manufacturing_df, on='_id')
valid_df = pd.merge(valid_df, manufacturing_df, on='_id')

print("Number of train sentences:", len(train_df))
print("True:", len(train_df[train_df.label == True]), "\tFalse:", len(train_df[train_df.label == False]))
print("Number of valid sentences:", len(valid_df))
print("True:", len(valid_df[valid_df.label == True]), "\tFalse:", len(valid_df[valid_df.label == False]))

Number of train sentences: 57641
True: 3593 	False: 54048
Number of valid sentences: 15735
True: 557 	False: 15178


In [48]:
X_train = np.array(train_df.ja_vec.tolist())
X_train_auxiliary = np.array(train_df.en_vec.tolist())
X_valid = np.array(valid_df.ja_vec.tolist())
X_valid_auxiliary = np.array(train_df.en_vec.tolist())

y_train = train_df.label.values.reshape((-1, 1))
y_valid = valid_df.label.values.reshape((-1, 1))

In [50]:
print(X_train.shape)
print(X_train_auxiliary.shape)
print(y_train.shape)

(57641, 50, 300)
(57641, 50, 300)
(57641, 1)


In [67]:
train_df.to_pickle("../dump/train.pkl")
valid_df.to_pickle("../dump/valid.pkl")

## Modeling

In [53]:
WORD_EMBEDDING_DIM = 300
FC_DIM = 128
LSTM_UNITS = 512
DROPOUT_RATE = 0.2

In [62]:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [65]:
premise_input = Input(shape=(None, WORD_EMBEDDING_DIM,))
hypothesis_input = Input(shape=(None, WORD_EMBEDDING_DIM,))

l_lstm1 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(premise_input)
l_lstm2 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(hypothesis_input)

l_max1 = Lambda(lambda x: K.max(x, axis=0))(l_lstm1)
l_max2 = Lambda(lambda x: K.max(x, axis=0))(l_lstm2)
l_max1 = Reshape((2 * LSTM_UNITS,))(l_max1)
l_max2 = Reshape((2 * LSTM_UNITS,))(l_max2)

l_abssub = Lambda(lambda x: K.abs(x[0] - x[1]))([l_max1, l_max2])
l_mul = multiply([l_max1, l_max2])

x = concatenate([l_max1, l_max2, l_abssub, l_mul])

x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu')(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu')(x)
x = Dropout(DROPOUT_RATE)(x)
pred = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[premise_input, hypothesis_input], outputs=pred)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[metrics.binary_accuracy, f1])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, None, 1024)   3330048     input_5[0][0]                    
__________________________________________________________________________________________________
bidirectional_6 (Bidirectional) (None, None, 1024)   3330048     input_6[0][0]                    
__________________________________________________________________________________________________
lambda_7 (

In [66]:
model.fit([X_train, X_train_auxiliary], y_train, batch_size=50, epochs=50)

Epoch 1/50

InvalidArgumentError: Incompatible shapes: [41,1] vs. [50,1]
	 [[Node: metrics_2/f1/mul = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](_arg_dense_9_target_0_2/_397, dense_9/Sigmoid)]]
	 [[Node: loss_2/mul/_433 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_6570_loss_2/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]