In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
import itertools
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import keras.layers as lyr
from keras.models import Model

In [3]:
BASE_DIR = './dataset/'
train = pd.read_csv(f'{BASE_DIR}train.csv')
test = pd.read_csv(f'{BASE_DIR}test.csv')

In [5]:
train['id'] = train['id'].apply(str)
test['test_id'] = test['test_id'].apply(str)

In [6]:
df = pd.concat((train, test))
df['question1'].fillna('', inplace=True)
df['question2'].fillna('', inplace=True)

### Vocab

In [8]:
counts_vectorizer = CountVectorizer(max_features=10000-1).fit(
    itertools.chain(
        df['question1'], 
        df['question2']
        )
    )
other_index = len(counts_vectorizer.vocabulary_)

In [13]:
words_tokenizer = re.compile(counts_vectorizer.token_pattern)

In [14]:
def create_padded_seqs(texts, max_len=10):
    seqs = texts.apply(
        lambda s: 
            [
                counts_vectorizer.vocabulary_[w] if w in counts_vectorizer.vocabulary_ else other_index
                for w in words_tokenizer.findall(s.lower())
            ]
        )
    return pad_sequences(seqs, maxlen=max_len)

In [15]:
X1_train, X1_val, X2_train, X2_val, y_train, y_val = \
    train_test_split(
        create_padded_seqs(df[df['id'].notnull()]['question1']), 
        create_padded_seqs(df[df['id'].notnull()]['question2']),
        df[df['id'].notnull()]['is_duplicate'].values,
        stratify=df[df['id'].notnull()]['is_duplicate'].values,
        test_size=0.3, random_state=1989
    )

In [19]:
X1_train[0]

array([   0, 9789, 4792, 8985, 9999, 3645, 3202, 6308, 5987, 4766])

In [20]:
input1_tensor = lyr.Input(X1_train.shape[1:])
input2_tensor = lyr.Input(X2_train.shape[1:])

words_embedding_layer = lyr.Embedding(X1_train.max() + 1, 100)
seq_embedding_layer = lyr.LSTM(256, activation='tanh')

seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor))

merge_layer = lyr.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])

dense1_layer = lyr.Dense(16, activation='sigmoid')(merge_layer)
ouput_layer = lyr.Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)

model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 10, 100)      1000000     ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 256)          365568      ['embedding[0][0]',          

In [21]:
model.fit([X1_train, X2_train], y_train, 
          validation_data=([X1_val, X2_val], y_val), 
          batch_size=128, epochs=6, verbose=2)

Epoch 1/6
2211/2211 - 366s - loss: 0.5237 - val_loss: 0.4810 - 366s/epoch - 165ms/step
Epoch 2/6


In [None]:
features_model = Model([input1_tensor, input2_tensor], merge_layer)
features_model.compile(loss='mse', optimizer='adam')
F_train = features_model.predict([X1_train, X2_train], batch_size=128)
F_val = features_model.predict([X1_val, X2_val], batch_size=128)