# Quora Question Pair - Train Model with RNN + Attension + LSTM

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pwd

/content


In [3]:
% cd /content/drive/MyDrive/QuoraQuestionPair/

/content/drive/MyDrive/QuoraQuestionPair


## Load Training Set

In [55]:
from __future__ import print_function
%matplotlib inline
import numpy as np
import pandas as pd
import datetime, time, json
import keras
from keras.models import Model, Sequential
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from keras import regularizers
from sklearn.model_selection import train_test_split

from importlib import reload
import dev_layers

In [87]:
# const
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
HIDDEN_DIM = 150
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 25
DROPOUT_RNN = 0.2
DROPOUT_POOL = 0.1
DROPOUT_DENSE = 0.25
BATCH_SIZE = 1024
L2_WEIGHT_DECAY = 1e-4
# files
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
Q1_TEST_DATA_FILE = 'q1_test.npy'
Q2_TEST_DATA_FILE = 'q2_test.npy'
TEST_ID_FILE = 'test_ids.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
# save params in learning
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'

In [7]:
q1_data = np.load(open(Q1_TRAINING_DATA_FILE, 'rb'))
q2_data = np.load(open(Q2_TRAINING_DATA_FILE, 'rb'))
labels = np.load(open(LABEL_TRAINING_DATA_FILE, 'rb'))
word_embedding_matrix = np.load(open(WORD_EMBEDDING_MATRIX_FILE, 'rb'))
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']

In [8]:
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
print('Shape of label tensor:', labels.shape)

Shape of question1 data tensor: (404290, 30)
Shape of question2 data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [9]:
X = np.stack((q1_data, q2_data), axis=1)
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

## Model

In [88]:
reload(dev_layers)

<module 'dev_layers' from '/content/drive/My Drive/QuoraQuestionPair/dev_layers.py'>

In [89]:
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))

q1 = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question1)
q1 = TimeDistributed(Dense(HIDDEN_DIM, activation='relu'))(q1)

q2 = Embedding(nb_words + 1, 
                 EMBEDDING_DIM, 
                 weights=[word_embedding_matrix], 
                 input_length=MAX_SEQUENCE_LENGTH, 
                 trainable=False)(question2)
q2 = TimeDistributed(Dense(HIDDEN_DIM, activation='relu'))(q2)

q1 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 1, DROPOUT_RNN)(q1)
q2 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 1, DROPOUT_RNN)(q2)

q1, q2 = dev_layers.Attention_Layer()(q1, q2)

q1 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 4, DROPOUT_RNN)(q1)
q2 = dev_layers.BiLSTM_Layer(MAX_SEQUENCE_LENGTH, HIDDEN_DIM, 4, DROPOUT_RNN)(q2)

merged = dev_layers.Pooling_Layer(HIDDEN_DIM, 300, 4, DROPOUT_POOL, l2_weight_decay=L2_WEIGHT_DECAY)(q1, q2)

merged = Dense(100, activation='relu', kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY))(merged)
merged = Dropout(DROPOUT_DENSE)(merged)
merged = BatchNormalization()(merged)

merged = Dense(10, activation='relu', kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY))(merged)
merged = Dropout(DROPOUT_DENSE)(merged)
merged = BatchNormalization()(merged)

is_duplicate = Dense(1, activation='sigmoid')(merged)

model = Model(inputs=[question1,question2], outputs=is_duplicate)

opt = keras.optimizers.Adam(learning_rate=0.0004)

model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

In [90]:
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_24 (InputLayer)           [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, 30, 300)      36150000    input_23[0][0]                   
__________________________________________________________________________________________________
embedding_23 (Embedding)        (None, 30, 300)      36150000    input_24[0][0]                   
____________________________________________________________________________________________

In [91]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_accuracy', save_best_only=True)]
# callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]
history = model.fit([Q1_train, Q2_train],
                    y_train,
                    epochs=NB_EPOCHS,
                    validation_split=VALIDATION_SPLIT,
                    verbose=2,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2021-05-16 15:26:42.472151
Epoch 1/25
320/320 - 73s - loss: 0.6440 - accuracy: 0.6667 - val_loss: 0.5825 - val_accuracy: 0.7415
Epoch 2/25
320/320 - 57s - loss: 0.5506 - accuracy: 0.7451 - val_loss: 0.5140 - val_accuracy: 0.7599
Epoch 3/25
320/320 - 57s - loss: 0.5133 - accuracy: 0.7607 - val_loss: 0.4860 - val_accuracy: 0.7691
Epoch 4/25
320/320 - 57s - loss: 0.4900 - accuracy: 0.7714 - val_loss: 0.4896 - val_accuracy: 0.7616
Epoch 5/25
320/320 - 57s - loss: 0.4711 - accuracy: 0.7800 - val_loss: 0.4676 - val_accuracy: 0.7756
Epoch 6/25
320/320 - 57s - loss: 0.4554 - accuracy: 0.7892 - val_loss: 0.4577 - val_accuracy: 0.7722
Epoch 7/25
320/320 - 57s - loss: 0.4429 - accuracy: 0.7952 - val_loss: 0.4434 - val_accuracy: 0.7865
Epoch 8/25
320/320 - 57s - loss: 0.4300 - accuracy: 0.8025 - val_loss: 0.4341 - val_accuracy: 0.7915
Epoch 9/25
320/320 - 57s - loss: 0.4206 - accuracy: 0.8061 - val_loss: 0.4322 - val_accuracy: 0.7928
Epoch 10/25
320/320 - 57s - loss: 0.4106 - 

In [92]:
acc = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                    'training': history.history['accuracy'],
                    'validation': history.history['val_accuracy']})
print(acc)

    epoch  training  validation
0       1  0.666712    0.741474
1       2  0.745088    0.759887
2       3  0.760662    0.769066
3       4  0.771441    0.761646
4       5  0.779958    0.775552
5       6  0.789232    0.772199
6       7  0.795208    0.786517
7       8  0.802531    0.791519
8       9  0.806085    0.792783
9      10  0.811414    0.794075
10     11  0.816468    0.798390
11     12  0.820935    0.795696
12     13  0.824823    0.799764
13     14  0.828554    0.813010
14     15  0.832228    0.811718
15     16  0.835544    0.803501
16     17  0.839584    0.813670
17     18  0.842229    0.810702
18     19  0.845585    0.810619
19     20  0.849112    0.816803
20     21  0.851139    0.819001
21     22  0.855222    0.810812
22     23  0.857183    0.815786
23     24  0.861134    0.812131
24     25  0.864145    0.821887


In [93]:
max_val_acc, idx = max((val, idx) for (idx, val) in enumerate(history.history['val_accuracy']))
print('Maximum accuracy at epoch', '{:d}'.format(idx+1), '=', '{:.4f}'.format(max_val_acc))

Maximum accuracy at epoch 25 = 0.8219


## Test Performance 

In [94]:
model.load_weights(MODEL_WEIGHTS_FILE)
loss, accuracy = model.evaluate([Q1_test, Q2_test], y_test, verbose=0)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.3883, accuracy = 0.8220


## Load Test Set

In [95]:
q1_test_data = np.load(open(Q1_TEST_DATA_FILE, 'rb'))
q2_test_data = np.load(open(Q2_TEST_DATA_FILE, 'rb'))
test_ids = np.load(open(TEST_ID_FILE, 'rb'))
print('Shape of question1 test data tensor:', q1_test_data.shape)
print('Shape of question2 test data tensor:', q2_test_data.shape)
print('Shape of label tensor:', test_ids.shape)

Shape of question1 test data tensor: (2345796, 30)
Shape of question2 test data tensor: (2345796, 30)
Shape of label tensor: (2345796,)


In [96]:
test_predict = model.predict((q1_test_data, q2_test_data), batch_size=BATCH_SIZE).reshape(-1,)
test_predict += model.predict((q2_test_data, q1_test_data), batch_size=BATCH_SIZE).reshape(-1,)
test_predict /= 2

res_df=pd.DataFrame({'test_id':test_ids, 'is_duplicate':test_predict.ravel()})
res_df.to_csv('./out_all_RNN.csv', index=False)