In [6]:
import numpy as np
import pandas as pd
import pickle
import json
import re
from wikipedia2vec import Wikipedia2Vec

import keras.backend as K
from keras.engine.topology import Layer
from keras.models import Model
from keras.layers import Input, Dense, LSTM, Bidirectional, Dropout, concatenate, multiply, Lambda, Reshape

In [7]:
# Fix the random seeds so that repeated runs start from the same RNG state.
# NumPy's global generator is seeded first, then TensorFlow's; both use the
# same constant (1).
# NOTE(review): Python's built-in `random` module is not seeded here, and the
# keras import in the cell above has already executed by this point --
# confirm this is sufficient for the reproducibility you need.
from numpy.random import seed
seed(1)
# Top-level `tensorflow.set_random_seed` is the TF 1.x spelling of this call.
from tensorflow import set_random_seed
set_random_seed(1)

## Preprocessing

In [4]:
# Read the pre-split train / validation tables. `_id` is forced to `str` so
# pandas does not re-interpret the identifiers as numbers.
train_df = pd.read_csv("../data/train_split_words.csv", dtype={'_id': str})
valid_df = pd.read_csv("../data/valid_split_words.csv", dtype={'_id': str})

# Report the size of each split followed by its label balance. Summing a
# boolean mask counts the rows where the comparison holds.
print("Number of train sentences:", len(train_df))
print(
    "True:", (train_df.label == True).sum(),
    "\tFalse:", (train_df.label == False).sum(),
)
print("Number of valid sentences:", len(valid_df))
print(
    "True:", (valid_df.label == True).sum(),
    "\tFalse:", (valid_df.label == False).sum(),
)

Number of train sentences: 57641
True: 3593 	False: 54048
Number of valid sentences: 15735
True: 557 	False: 15178


In [None]:
# Load the two Wikipedia2Vec embedding models (one used for lang='ja', one for
# lang='en' by `w2v` below) and the pickled object whose `.predict` is applied
# to English vectors.
# NOTE(review): `Wikipedia2Vec.load` is documented as taking the path of the
# model file; the argument-less calls below look like the paths were removed.
# Supply the Japanese / English model paths before running this cell.
ja_w2v = Wikipedia2Vec.load()
en_w2v = Wikipedia2Vec.load()

# NOTE(security): `pickle.load` can execute arbitrary code embedded in the
# file -- only load a mapping file that comes from a trusted source.
with open("../model/wikipedia2vec_en2ja_mapping.pkl", 'rb') as f:
    transformer = pickle.load(f)

def transfer_vectors(words, lang='ja'):
    """Embed every item of ``words`` with ``w2v`` and return the vectors.

    Args:
        words: Iterable of words; each item is passed to ``w2v`` unchanged.
        lang: Language code forwarded to ``w2v`` for every word.

    Returns:
        A list holding one ``w2v`` result per input word, in input order.
    """
    vectors = []
    for word in words:
        vectors.append(w2v(word, lang=lang))
    return vectors

def w2v(w, embedding_dim=100, lang='ja'):
    """Look up the embedding of a single word and return it as a plain list.

    Args:
        w: Word to look up.
        embedding_dim: Length of the all-zero fallback vector returned when
            the lookup raises ``KeyError`` or ``lang`` is not recognised.
            NOTE(review): the default is 100 while the model cell feeds
            ``WORD_EMBEDDING_DIM = 300``-wide inputs -- confirm this matches
            the dimensionality of the loaded embeddings.
        lang: ``'ja'`` reads the vector straight from ``ja_w2v``; ``'en'``
            reads it from ``en_w2v`` and passes it through
            ``transformer.predict``.

    Returns:
        list: The word vector as a Python list for every branch, so a batch
        of results never mixes lists and arrays.
    """
    zero_vector = [0.0] * embedding_dim
    try:
        if lang == 'ja':
            return ja_w2v.get_word_vector(w).tolist()
        elif lang == 'en':
            mapped = transformer.predict([en_w2v.get_word_vector(w)])[0]
            # Normalise to a list so the 'en' branch matches the 'ja' branch
            # and the zero-vector fallback.
            return np.asarray(mapped).tolist()
        else:
            print("Undefined language.")
            return zero_vector

    except KeyError:
        # Lookup failed (raised by the code inside the try block): fall back
        # to an all-zero vector.
        return zero_vector
    
def padding(X, padding='post', truncating='pre', maxlen=50):
    """Pad or truncate every sequence in ``X`` to ``maxlen`` steps.

    Thin wrapper around Keras ``pad_sequences`` that always produces
    ``float32`` output.

    Args:
        X: Sequences to pad, passed to ``pad_sequences`` unchanged.
        padding: ``'pre'`` or ``'post'`` -- side on which padding is added.
        truncating: ``'pre'`` or ``'post'`` -- side from which over-long
            sequences are cut.
        maxlen: Target length of every sequence.

    Returns:
        The array returned by ``pad_sequences``.
    """
    # The notebook's import cell never brings `pad_sequences` into scope, so
    # import it here; otherwise the call fails with NameError on a fresh kernel.
    from keras.preprocessing.sequence import pad_sequences

    return pad_sequences(
        X,
        dtype='float32',
        padding=padding,
        truncating=truncating,
        maxlen=maxlen,
    )

## Modeling

In [11]:
# Hyper-parameters consumed by the model-definition cell below.

# Encoder side: width of each input word vector and LSTM units per direction
# (the bidirectional wrapper yields 2 * LSTM_UNITS features per step).
WORD_EMBEDDING_DIM = 300
LSTM_UNITS = 512

# Classifier head: width of the fully-connected layers and the dropout rate
# applied around them.
FC_DIM = 128
DROPOUT_RATE = 0.2

In [4]:
# Sentence-pair classifier: each sentence is encoded by its own BiLSTM,
# max-pooled over time, and the two encodings are combined as
# [u, v, |u - v|, u * v] before a small fully-connected head with a sigmoid
# output.

# Variable-length sequences of WORD_EMBEDDING_DIM-wide word vectors.
premise_input = Input(shape=(None, WORD_EMBEDDING_DIM))
hypothesis_input = Input(shape=(None, WORD_EMBEDDING_DIM))

# return_sequences=True keeps one output per time step:
# (batch, steps, 2 * LSTM_UNITS).
l_lstm1 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(premise_input)
l_lstm2 = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(hypothesis_input)

# Max-pool over the time-step axis (axis=1). Axis 0 is the batch axis inside
# a Lambda, so reducing over it would mix different samples together and
# leave the step dimension standing in for the batch dimension.
l_max1 = Lambda(lambda x: K.max(x, axis=1))(l_lstm1)
l_max2 = Lambda(lambda x: K.max(x, axis=1))(l_lstm2)
# Pin the static feature size; the shape is already (batch, 2 * LSTM_UNITS).
l_max1 = Reshape((2 * LSTM_UNITS,))(l_max1)
l_max2 = Reshape((2 * LSTM_UNITS,))(l_max2)

# Pairwise interaction features between the two sentence encodings.
l_abssub = Lambda(lambda x: K.abs(x[0] - x[1]))([l_max1, l_max2])
l_mul = multiply([l_max1, l_max2])

x = concatenate([l_max1, l_max2, l_abssub, l_mul])

# Two dropout-regularised ReLU layers, then a single sigmoid unit.
x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu')(x)
x = Dropout(DROPOUT_RATE)(x)
x = Dense(FC_DIM, activation='relu')(x)
x = Dropout(DROPOUT_RATE)(x)
pred = Dense(1, activation='sigmoid')(x)

model = Model(inputs=[premise_input, hypothesis_input], outputs=pred)

model.compile(optimizer='adam', loss='binary_crossentropy')

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 300)    0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, None, 1024)   3330048     input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, None, 1024)   3330048     input_2[0][0]                    
__________________________________________________________________________________________________
lambda_1 (