# Quora Question Pair - Train Model with RNN + Attension + LSTM

In [71]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pwd

/content


In [3]:
% cd /content/drive/MyDrive/QuoraQuestionPair/

/content/drive/MyDrive/QuoraQuestionPair


## Load Training Set

In [4]:
from __future__ import print_function
%matplotlib inline
import numpy as np
import pandas as pd
import datetime, time, json
import keras
from keras.models import Model, Sequential
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from keras import regularizers
from sklearn.model_selection import train_test_split

from importlib import reload
import dev_layers

In [64]:
# const
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
HIDDEN_DIM = 150
FEAT_DENSE_DIM = 50
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 30
DROPOUT_RNN = 0.25
DROPOUT_POOL = 0.1
DROPOUT_DENSE = 0.3
DROPOUT_FEATURE = 0.2
BATCH_SIZE = 1024
L2_WEIGHT_DECAY = 1e-4
# files
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
Q1_TEST_DATA_FILE = 'q1_test.npy'
Q2_TEST_DATA_FILE = 'q2_test.npy'
TEST_ID_FILE = 'test_ids.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
TRAIN_FEAT_NPY_FILE = 'train_feat_array.npy'
TEST_FEAT_NPY_FILE = 'test_feat_array.npy'
# save params in learning
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'

In [6]:
q1_data = np.load(open(Q1_TRAINING_DATA_FILE, 'rb'))
q2_data = np.load(open(Q2_TRAINING_DATA_FILE, 'rb'))
labels = np.load(open(LABEL_TRAINING_DATA_FILE, 'rb'))
word_embedding_matrix = np.load(open(WORD_EMBEDDING_MATRIX_FILE, 'rb'))
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']

In [7]:
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404290, 30)
Shape of question2 data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [10]:
train_feat_array = np.load(open(TRAIN_FEAT_NPY_FILE, 'rb'))
test_feat_array = np.load(open(TEST_FEAT_NPY_FILE, 'rb'))
train_feat_array.shape

(404290, 17)

In [12]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
X_train, X_test, y_train, y_test, X_train_feat, X_test_feat = train_test_split(X, y, train_feat_array, test_size=TEST_SPLIT, random_state=RNG_SEED)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

In [13]:
Q1_train.shape, Q2_train.shape, Q1_test.shape, Q2_test.shape, X_train_feat.shape, X_test_feat.shape

((363861, 30),
 (363861, 30),
 (40429, 30),
 (40429, 30),
 (363861, 17),
 (40429, 17))

## Model

In [65]:
reload(dev_layers)

<module 'dev_layers' from '/content/drive/My Drive/QuoraQuestionPair/dev_layers.py'>

In [66]:
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))
feat_input = Input(shape=(X_train_feat.shape[1],))

feat_layer = Dense(FEAT_DENSE_DIM, activation='relu')(feat_input)
feat_layer = Dropout(DROPOUT_FEATURE)(feat_layer)

q1 = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question1)
q1 = TimeDistributed(Dense(HIDDEN_DIM, activation='relu'))(q1)

q2 = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question2)
q2 = TimeDistributed(Dense(HIDDEN_DIM, activation='relu'))(q2)

q1 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 1, DROPOUT_RNN)(q1)
q2 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 1, DROPOUT_RNN)(q2)

q1, q2 = dev_layers.Attention_Layer()(q1, q2)

## LSTM return sequence, then pooling
# q1 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 4, DROPOUT_RNN)(q1)
# q2 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 4, DROPOUT_RNN)(q2)
# merged = dev_layers.Pooling_Layer(HIDDEN_DIM, 250, 4, DROPOUT_POOL, l2_weight_decay=L2_WEIGHT_DECAY)(q1, q2)
# merged = concatenate([merged, feat_layer])

## the LSTM only returns final state, then concat with features
q1 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 4, DROPOUT_RNN, ret_seq=False)(q1)
q2 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 4, DROPOUT_RNN, ret_seq=False)(q2)
merged = concatenate([q1, q2])
merged = Dense(150, activation='relu')(merged)
merged = Dropout(DROPOUT_DENSE)(merged)
merged = BatchNormalization()(merged)
merged = concatenate([merged, feat_layer])

merged = Dense(150, activation='relu', kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY))(merged)
merged = Dropout(DROPOUT_DENSE)(merged)
merged = BatchNormalization()(merged)

merged = Dense(150, activation='relu', kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY))(merged)
merged = Dropout(DROPOUT_DENSE)(merged)
merged = BatchNormalization()(merged)

merged = Dense(100, activation='relu', kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY))(merged)
merged = Dropout(DROPOUT_DENSE)(merged)
merged = BatchNormalization()(merged)

merged = Dense(10, activation='relu', kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY))(merged)
merged = Dropout(DROPOUT_DENSE)(merged)
merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2,feat_input], outputs=is_duplicate)

opt = keras.optimizers.Nadam(learning_rate=0.0004)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [67]:
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_38 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 30, 300)      36150000    input_37[0][0]                   
__________________________________________________________________________________________________
embedding_25 (Embedding)        (None, 30, 300)      36150000    input_38[0][0]                   
___________________________________________________________________________________________

In [68]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()
# callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_accuracy', save_best_only=True)]
# callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_loss', save_best_only=True)]
history = model.fit([Q1_train, Q2_train, X_train_feat],
                    y_train,
                    epochs=NB_EPOCHS,
                    validation_split=VALIDATION_SPLIT,
                    verbose=2,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2021-05-17 12:06:17.413068
Epoch 1/30
320/320 - 50s - loss: 0.6212 - accuracy: 0.6987 - val_loss: 0.4905 - val_accuracy: 0.7696
Epoch 2/30
320/320 - 32s - loss: 0.4927 - accuracy: 0.7703 - val_loss: 0.4556 - val_accuracy: 0.7830
Epoch 3/30
320/320 - 32s - loss: 0.4560 - accuracy: 0.7873 - val_loss: 0.4185 - val_accuracy: 0.8055
Epoch 4/30
320/320 - 32s - loss: 0.4357 - accuracy: 0.7991 - val_loss: 0.4105 - val_accuracy: 0.8080
Epoch 5/30
320/320 - 33s - loss: 0.4200 - accuracy: 0.8056 - val_loss: 0.4020 - val_accuracy: 0.8083
Epoch 6/30
320/320 - 32s - loss: 0.4069 - accuracy: 0.8115 - val_loss: 0.3883 - val_accuracy: 0.8161
Epoch 7/30
320/320 - 32s - loss: 0.3954 - accuracy: 0.8175 - val_loss: 0.3809 - val_accuracy: 0.8220
Epoch 8/30
320/320 - 32s - loss: 0.3858 - accuracy: 0.8211 - val_loss: 0.3703 - val_accuracy: 0.8255
Epoch 9/30
320/320 - 32s - loss: 0.3760 - accuracy: 0.8260 - val_loss: 0.3708 - val_accuracy: 0.8219
Epoch 10/30
320/320 - 32s - loss: 0.3666 - 

In [60]:
acc = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                    'training': history.history['accuracy'],
                    'validation': history.history['val_accuracy']})
print(acc)

   epoch  training  validation
0      1  0.893408    0.852090
1      2  0.894611    0.850551
2      3  0.895439    0.850578
3      4  0.897308    0.852200
4      5  0.898993    0.850469
5      6  0.900395    0.850414
6      7  0.901131    0.850716
7      8  0.901681    0.852832
8      9  0.903070    0.848710
9     10  0.905229    0.851980


In [69]:
max_val_acc, idx = max((val, idx) for (idx, val) in enumerate(history.history['val_accuracy']))
print('Maximum accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))

Maximum accuracy at epoch 27 = 0.8476


## Test Performance 

In [70]:
# load weights with minimum loss, not maximum accuracy
model.load_weights(MODEL_WEIGHTS_FILE)
loss, accuracy = model.evaluate([Q1_test, Q2_test, X_test_feat], y_test, verbose=0)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.3249, accuracy = 0.8449


## Load Test Set

In [62]:
q1_test_data = np.load(open(Q1_TEST_DATA_FILE, 'rb'))
q2_test_data = np.load(open(Q2_TEST_DATA_FILE, 'rb'))
test_ids = np.load(open(TEST_ID_FILE, 'rb'))
print('Shape of question1 test data tensor:', q1_test_data.shape)
print('Shape of question2 test data tensor:', q2_test_data.shape)
print('Shape of label tensor:', test_ids.shape)

Shape of question1 test data tensor: (2345796, 30)
Shape of question2 test data tensor: (2345796, 30)
Shape of label tensor: (2345796,)


In [63]:
test_predict = model.predict((q1_test_data, q2_test_data, test_feat_array), batch_size=BATCH_SIZE).reshape(-1,)
test_predict += model.predict((q2_test_data, q1_test_data, test_feat_array), batch_size=BATCH_SIZE).reshape(-1,)
test_predict /= 2

res_df=pd.DataFrame({'test_id':test_ids, 'is_duplicate':test_predict.ravel()})
res_df.to_csv('./out_all_RNN_feature.csv', index=False)