In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
import itertools
import re
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import keras.layers as lyr
from keras.models import Model

import xgboost as xgb

In [2]:
BASE_DIR = './dataset/'
train = pd.read_csv(f'{BASE_DIR}train.csv')
test = pd.read_csv(f'{BASE_DIR}test.csv')

In [3]:
train['id'] = train['id'].apply(str)
test['test_id'] = test['test_id'].apply(str)

In [4]:
df = pd.concat((train, test))
df['question1'].fillna('', inplace=True)
df['question2'].fillna('', inplace=True)

### Vocab

In [5]:
counts_vectorizer = CountVectorizer(max_features=10000-1).fit(
    itertools.chain(
        df['question1'], 
        df['question2']
        )
    )
other_index = len(counts_vectorizer.vocabulary_)

In [6]:
words_tokenizer = re.compile(counts_vectorizer.token_pattern)

In [7]:
def create_padded_seqs(texts, max_len=10):
    seqs = texts.apply(
        lambda s: 
            [
                counts_vectorizer.vocabulary_[w] if w in counts_vectorizer.vocabulary_ else other_index
                for w in words_tokenizer.findall(s.lower())
            ]
        )
    return pad_sequences(seqs, maxlen=max_len)

In [8]:
X1_train, X1_val, X2_train, X2_val, y_train, y_val = \
    train_test_split(
        create_padded_seqs(df[df['id'].notnull()]['question1']), 
        create_padded_seqs(df[df['id'].notnull()]['question2']),
        df[df['id'].notnull()]['is_duplicate'].values,
        stratify=df[df['id'].notnull()]['is_duplicate'].values,
        test_size=0.3, random_state=1989
    )

In [None]:
X1_train[0]

In [9]:
input1_tensor = lyr.Input(X1_train.shape[1:])
input2_tensor = lyr.Input(X2_train.shape[1:])

words_embedding_layer = lyr.Embedding(X1_train.max() + 1, 100)
seq_embedding_layer = lyr.LSTM(256, activation='tanh')

seq_embedding = lambda tensor: seq_embedding_layer(words_embedding_layer(tensor))

merge_layer = lyr.multiply([seq_embedding(input1_tensor), seq_embedding(input2_tensor)])

dense1_layer = lyr.Dense(16, activation='sigmoid')(merge_layer)
ouput_layer = lyr.Dense(1, activation='sigmoid')(dense1_layer)

model = Model([input1_tensor, input2_tensor], ouput_layer)

model.compile(loss='binary_crossentropy', optimizer='adam')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 10, 100)      1000000     ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 256)          365568      ['embedding[0][0]',          

In [10]:
model.fit([X1_train, X2_train], y_train, 
          validation_data=([X1_val, X2_val], y_val), 
          batch_size=128, epochs=6, verbose=2)

Epoch 1/6
2211/2211 - 336s - loss: 0.5174 - val_loss: 0.4728 - 336s/epoch - 152ms/step
Epoch 2/6
2211/2211 - 316s - loss: 0.4375 - val_loss: 0.4426 - 316s/epoch - 143ms/step
Epoch 3/6
2211/2211 - 270s - loss: 0.3874 - val_loss: 0.4347 - 270s/epoch - 122ms/step
Epoch 4/6
2211/2211 - 304s - loss: 0.3437 - val_loss: 0.4295 - 304s/epoch - 137ms/step
Epoch 5/6
2211/2211 - 315s - loss: 0.3019 - val_loss: 0.4397 - 315s/epoch - 143ms/step
Epoch 6/6
2211/2211 - 289s - loss: 0.2625 - val_loss: 0.4547 - 289s/epoch - 131ms/step


<keras.callbacks.History at 0x2dc83f6fb80>

In [None]:
# ## testing the neural network
# # df = pd.read_csv(f'{BASE_DIR}test_1.csv')
# # df['test_id'] = df['test_id'].apply(str)
# # df.fillna('', inplace=True)

# _, X1_test, _, X2_test, _, y_test = \
#     train_test_split(
#         create_padded_seqs(df[df['test_id'].notnull()]['question1']), 
#         create_padded_seqs(df[df['test_id'].notnull()]['question2']),
#         df[df['test_id'].notnull()]['is_duplicate'].values,
#         stratify=df[df['test_id'].notnull()]['is_duplicate'].values,
#         test_size=0.999, random_state=1989
#     )


In [None]:
# predictions = model.predict([X1_test, X2_test])

In [11]:
features_model = Model([input1_tensor, input2_tensor], merge_layer)
features_model.compile(loss='mse', optimizer='adam')
F_train = features_model.predict([X1_train, X2_train], batch_size=128)
F_val = features_model.predict([X1_val, X2_val], batch_size=128)

### XgBoost on top

In [12]:
dTrain = xgb.DMatrix(F_train, label=y_train)
dVal = xgb.DMatrix(F_val, label=y_val)
xgb_params = {
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'logloss',
    'eta': 0.1, 
    'max_depth': 9,
    'subsample': 0.9,
    'colsample_bytree': 1 / F_train.shape[1]**0.5,
    'min_child_weight': 5,
    'silent': 1
}


In [13]:
bst = xgb.train(xgb_params, dTrain, 1000,  [(dTrain,'train'), (dVal,'val')], 
                verbose_eval=10, early_stopping_rounds=10)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-logloss:0.65421	val-logloss:0.65995
[10]	train-logloss:0.44260	val-logloss:0.49366
[20]	train-logloss:0.34823	val-logloss:0.44103
[30]	train-logloss:0.30364	val-logloss:0.42468
[40]	train-logloss:0.27322	val-logloss:0.41931
[49]	train-logloss:0.25369	val-logloss:0.41959


In [None]:
X1_test = create_padded_seqs(df[df['test_id'].notnull()]['question1'])
X2_test = create_padded_seqs(df[df['test_id'].notnull()]['question2'])

In [None]:
F_test = features_model.predict([X1_test, X2_test], batch_size=128)

In [None]:
dTest = xgb.DMatrix(F_test)

In [None]:
df_sub = pd.DataFrame({
        'test_id': df[df['test_id'].notnull()]['test_id'].values,
        'is_duplicate': bst.predict(dTest, ntree_limit=bst.best_ntree_limit)
    }).set_index('test_id')

df_sub.head()

In [None]:
df_sub['is_duplicate'].hist(bins=100)