# Model Merge

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive so that files kept
# on Drive can be reached through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Print the shell's current working directory.
!pwd

/content


In [3]:
% cd /content/drive/MyDrive/QuoraQuestionPair

/content/drive/MyDrive/QuoraQuestionPair


In [4]:
import numpy as np
import pandas as pd
import datetime, time, json
import keras
from keras.models import Model, Sequential
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras import backend as K
from keras import regularizers
from sklearn.model_selection import train_test_split

from importlib import reload
import dev_layers
import build_MLP
import build_CNN
import build_RNN

In [5]:
# --- Training / evaluation settings ---
# Batch size passed to both fit() and predict().
BATCH_SIZE = 1024
# Passed to fit() as validation_split.
VALIDATION_SPLIT = 0.1
# Passed to train_test_split as test_size.
TEST_SPLIT = 0.1
# Passed to train_test_split as random_state so the split is repeatable.
RNG_SEED = 13371447
# Passed to fit() as epochs.
NB_EPOCHS = 25
# --- Input files (relative to the current working directory) ---
# Question arrays for the labelled set; presumably fixed-length encoded
# sequences (the model inputs are length 30) — TODO confirm against preprocessing.
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
# Question arrays and ids for the submission set.
Q1_TEST_DATA_FILE = 'q1_test.npy'
Q2_TEST_DATA_FILE = 'q2_test.npy'
TEST_ID_FILE = 'test_ids.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
# JSON file holding an 'nb_words' entry.
NB_WORDS_DATA_FILE = 'nb_words.json'
# Additional feature arrays for the labelled set and the submission set.
TRAIN_FEAT_NPY_FILE = 'train_feat_array.npy'
TEST_FEAT_NPY_FILE = 'test_feat_array.npy'
# --- Checkpoint files ---
# Written by ModelCheckpoint during training and read back with load_weights()
# before each model is evaluated.
MODEL_WEIGHTS_FILE_RNN = 'RNN_weights.h5'
MODEL_WEIGHTS_FILE_CNN = 'CNN_weights.h5'
MODEL_WEIGHTS_FILE_MLP = 'MLP_weights.h5'

In [6]:
# Load the preprocessed training inputs.
# Paths are passed straight to np.load so NumPy opens and closes each file
# itself; wrapping them in a bare open() would leave the handles unclosed.
q1_data = np.load(Q1_TRAINING_DATA_FILE)
q2_data = np.load(Q2_TRAINING_DATA_FILE)
labels = np.load(LABEL_TRAINING_DATA_FILE)
word_embedding_matrix = np.load(WORD_EMBEDDING_MATRIX_FILE)
# nb_words is read from JSON and later handed to the build_* model constructors.
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']
# Additional feature arrays: one for the labelled set, one for the submission set.
train_feat_array = np.load(TRAIN_FEAT_NPY_FILE)
test_feat_array = np.load(TEST_FEAT_NPY_FILE)

In [7]:
# Stack the two question arrays along a new axis 1 so a single split keeps each
# question pair, its label and its feature row aligned.
X = np.stack([q1_data, q2_data], axis=1)
y = labels
(
    X_train, X_test,
    y_train, y_test,
    X_train_feat, X_test_feat,
) = train_test_split(
    X,
    y,
    train_feat_array,
    test_size=TEST_SPLIT,
    random_state=RNG_SEED,
)
# Undo the stacking: index 0 on axis 1 is question 1, index 1 is question 2.
Q1_train, Q2_train = X_train[:, 0], X_train[:, 1]
Q1_test, Q2_test = X_test[:, 0], X_test[:, 1]

In [8]:
# Display the shapes of the two feature arrays (labelled set, submission set).
train_feat_array.shape, test_feat_array.shape

((404290, 90), (2345796, 90))

## Run Models Individually

### RNN

In [9]:
# Build the RNN model with the project-local build_RNN module, passing the
# embedding matrix, the training feature array and nb_words.
model_RNN = build_RNN.get_RNN(word_embedding_matrix, X_train_feat, nb_words)

In [10]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model_RNN.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 30, 300)      36150000    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 300)      36150000    input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
# Fit the RNN on the training split. The checkpoint callback monitors val_loss
# with save_best_only=True and writes to MODEL_WEIGHTS_FILE_RNN; wall-clock time
# for the whole run is reported at the end.
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [
    ModelCheckpoint(
        MODEL_WEIGHTS_FILE_RNN,
        monitor='val_loss',
        save_best_only=True,
    ),
]
history = model_RNN.fit(
    x=[Q1_train, Q2_train, X_train_feat],
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
    verbose=2,
    callbacks=callbacks,
    validation_split=VALIDATION_SPLIT,
)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2021-05-18 19:24:02.614070
Epoch 1/25
320/320 - 54s - loss: 0.2101 - accuracy: 0.9104 - val_loss: 0.2409 - val_accuracy: 0.8932
Epoch 2/25
320/320 - 54s - loss: 0.2077 - accuracy: 0.9118 - val_loss: 0.2358 - val_accuracy: 0.8952
Epoch 3/25
320/320 - 54s - loss: 0.2047 - accuracy: 0.9128 - val_loss: 0.2367 - val_accuracy: 0.8912
Epoch 4/25
320/320 - 54s - loss: 0.2030 - accuracy: 0.9141 - val_loss: 0.2418 - val_accuracy: 0.8937
Epoch 5/25
320/320 - 54s - loss: 0.2006 - accuracy: 0.9155 - val_loss: 0.2408 - val_accuracy: 0.8917
Epoch 6/25
320/320 - 54s - loss: 0.1990 - accuracy: 0.9156 - val_loss: 0.2407 - val_accuracy: 0.8936
Epoch 7/25
320/320 - 54s - loss: 0.1969 - accuracy: 0.9171 - val_loss: 0.2408 - val_accuracy: 0.8889
Epoch 8/25
320/320 - 54s - loss: 0.1947 - accuracy: 0.9183 - val_loss: 0.2382 - val_accuracy: 0.8923
Epoch 9/25
320/320 - 54s - loss: 0.1912 - accuracy: 0.9197 - val_loss: 0.2473 - val_accuracy: 0.8892
Epoch 10/25
320/320 - 54s - loss: 0.1901 - 

In [13]:
# Reload the checkpoint file written during training, then score the RNN on the
# held-out split produced by train_test_split.
model_RNN.load_weights(MODEL_WEIGHTS_FILE_RNN)
loss, accuracy = model_RNN.evaluate(
    x=[Q1_test, Q2_test, X_test_feat],
    y=y_test,
    verbose=0,
)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.2355, accuracy = 0.8969


### CNN

In [11]:
# Build the CNN model with the project-local build_CNN module, passing the
# embedding matrix, the training feature array and nb_words, then print its
# layer-by-layer summary.
model_CNN = build_CNN.get_CNN(word_embedding_matrix, X_train_feat, nb_words)
model_CNN.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 30, 300)      36150000    input_4[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 30, 300)      36150000    input_5[0][0]                    
____________________________________________________________________________________________

In [None]:
# Fit the CNN on the training split. The checkpoint callback monitors val_loss
# with save_best_only=True and writes to MODEL_WEIGHTS_FILE_CNN; wall-clock time
# for the whole run is reported at the end.
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [
    ModelCheckpoint(
        MODEL_WEIGHTS_FILE_CNN,
        monitor='val_loss',
        save_best_only=True,
    ),
]
history = model_CNN.fit(
    x=[Q1_train, Q2_train, X_train_feat],
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
    verbose=2,
    callbacks=callbacks,
    validation_split=VALIDATION_SPLIT,
)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2021-05-18 20:01:00.968299
Epoch 1/25
320/320 - 28s - loss: 0.4483 - accuracy: 0.7832 - val_loss: 0.3213 - val_accuracy: 0.8600
Epoch 2/25
320/320 - 22s - loss: 0.3334 - accuracy: 0.8554 - val_loss: 0.2896 - val_accuracy: 0.8693
Epoch 3/25
320/320 - 22s - loss: 0.3072 - accuracy: 0.8640 - val_loss: 0.2754 - val_accuracy: 0.8756
Epoch 4/25
320/320 - 22s - loss: 0.2916 - accuracy: 0.8698 - val_loss: 0.2657 - val_accuracy: 0.8789
Epoch 5/25
320/320 - 22s - loss: 0.2806 - accuracy: 0.8750 - val_loss: 0.2589 - val_accuracy: 0.8813
Epoch 6/25
320/320 - 22s - loss: 0.2695 - accuracy: 0.8801 - val_loss: 0.2524 - val_accuracy: 0.8853
Epoch 7/25
320/320 - 22s - loss: 0.2603 - accuracy: 0.8854 - val_loss: 0.2479 - val_accuracy: 0.8875
Epoch 8/25
320/320 - 22s - loss: 0.2507 - accuracy: 0.8899 - val_loss: 0.2464 - val_accuracy: 0.8872
Epoch 9/25
320/320 - 22s - loss: 0.2413 - accuracy: 0.8949 - val_loss: 0.2455 - val_accuracy: 0.8893
Epoch 10/25
320/320 - 22s - loss: 0.2329 - 

In [14]:
# Reload the checkpoint file written during training, then score the CNN on the
# held-out split produced by train_test_split.
model_CNN.load_weights(MODEL_WEIGHTS_FILE_CNN)
loss, accuracy = model_CNN.evaluate(
    x=[Q1_test, Q2_test, X_test_feat],
    y=y_test,
    verbose=0,
)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.2411, accuracy = 0.8917


### MLP

In [12]:
# Build the MLP model with the project-local build_MLP module, passing the
# embedding matrix, the training feature array and nb_words, then print its
# layer-by-layer summary.
model_MLP = build_MLP.get_MLP(word_embedding_matrix, X_train_feat, nb_words)
model_MLP.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 30, 300)      36150000    input_7[0][0]                    
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 30, 300)      36150000    input_8[0][0]                    
____________________________________________________________________________________________

In [None]:
# Fit the MLP on the training split. The checkpoint callback monitors val_loss
# with save_best_only=True and writes to MODEL_WEIGHTS_FILE_MLP; wall-clock time
# for the whole run is reported at the end.
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [
    ModelCheckpoint(
        MODEL_WEIGHTS_FILE_MLP,
        monitor='val_loss',
        save_best_only=True,
    ),
]
history = model_MLP.fit(
    x=[Q1_train, Q2_train, X_train_feat],
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
    verbose=2,
    callbacks=callbacks,
    validation_split=VALIDATION_SPLIT,
)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2021-05-18 19:48:29.469683
Epoch 1/25
320/320 - 8s - loss: 0.3181 - accuracy: 0.8450 - val_loss: 0.2651 - val_accuracy: 0.8720
Epoch 2/25
320/320 - 6s - loss: 0.2664 - accuracy: 0.8718 - val_loss: 0.2570 - val_accuracy: 0.8756
Epoch 3/25
320/320 - 6s - loss: 0.2502 - accuracy: 0.8807 - val_loss: 0.2453 - val_accuracy: 0.8832
Epoch 4/25
320/320 - 6s - loss: 0.2380 - accuracy: 0.8883 - val_loss: 0.2583 - val_accuracy: 0.8748
Epoch 5/25
320/320 - 6s - loss: 0.2272 - accuracy: 0.8945 - val_loss: 0.2508 - val_accuracy: 0.8819
Epoch 6/25
320/320 - 6s - loss: 0.2168 - accuracy: 0.8999 - val_loss: 0.2334 - val_accuracy: 0.8889
Epoch 7/25
320/320 - 6s - loss: 0.2074 - accuracy: 0.9054 - val_loss: 0.2426 - val_accuracy: 0.8858
Epoch 8/25
320/320 - 6s - loss: 0.1978 - accuracy: 0.9105 - val_loss: 0.2442 - val_accuracy: 0.8836
Epoch 9/25
320/320 - 6s - loss: 0.1887 - accuracy: 0.9151 - val_loss: 0.2430 - val_accuracy: 0.8878
Epoch 10/25
320/320 - 6s - loss: 0.1792 - accuracy: 

In [15]:
# Reload the checkpoint file written during training, then score the MLP on the
# held-out split produced by train_test_split.
model_MLP.load_weights(MODEL_WEIGHTS_FILE_MLP)
loss, accuracy = model_MLP.evaluate(
    x=[Q1_test, Q2_test, X_test_feat],
    y=y_test,
    verbose=0,
)
print('loss = {0:.4f}, accuracy = {1:.4f}'.format(loss, accuracy))

loss = 0.2353, accuracy = 0.8884


## Make Submissions

In [None]:
# Load the submission-set question arrays and their ids.
# Paths are passed straight to np.load so NumPy opens and closes each file
# itself; wrapping them in a bare open() would leave the handles unclosed.
q1_test_data = np.load(Q1_TEST_DATA_FILE)
q2_test_data = np.load(Q2_TEST_DATA_FILE)
test_ids = np.load(TEST_ID_FILE)

In [None]:
# Score every submission pair twice with the RNN — once as (q1, q2) and once
# with the questions swapped — and average the two flattened predictions.
test_predict = (
    model_RNN.predict((q1_test_data, q2_test_data, test_feat_array), batch_size=BATCH_SIZE).reshape(-1)
    + model_RNN.predict((q2_test_data, q1_test_data, test_feat_array), batch_size=BATCH_SIZE).reshape(-1)
) / 2

# test_predict is already one-dimensional, so it can be used as a column directly.
res_df = pd.DataFrame({'test_id': test_ids, 'is_duplicate': test_predict})
res_df.to_csv('./out_all_RNN_feature.csv', index=False)

In [None]:
# Score every submission pair twice with the CNN — once as (q1, q2) and once
# with the questions swapped — and average the two flattened predictions.
test_predict = (
    model_CNN.predict((q1_test_data, q2_test_data, test_feat_array), batch_size=BATCH_SIZE).reshape(-1)
    + model_CNN.predict((q2_test_data, q1_test_data, test_feat_array), batch_size=BATCH_SIZE).reshape(-1)
) / 2

# test_predict is already one-dimensional, so it can be used as a column directly.
res_df = pd.DataFrame({'test_id': test_ids, 'is_duplicate': test_predict})
res_df.to_csv('./out_all_CNN_feature.csv', index=False)

In [None]:
# Score every submission pair twice with the MLP — once as (q1, q2) and once
# with the questions swapped — and average the two flattened predictions.
test_predict = (
    model_MLP.predict((q1_test_data, q2_test_data, test_feat_array), batch_size=BATCH_SIZE).reshape(-1)
    + model_MLP.predict((q2_test_data, q1_test_data, test_feat_array), batch_size=BATCH_SIZE).reshape(-1)
) / 2

# test_predict is already one-dimensional, so it can be used as a column directly.
res_df = pd.DataFrame({'test_id': test_ids, 'is_duplicate': test_predict})
res_df.to_csv('./out_all_MLP_feature.csv', index=False)

## Save Model

In [17]:
# Write each model to its own .h5 file in the current working directory.
# NOTE(review): each model is saved with whatever weights it currently holds in
# this session — confirm the evaluation cells (which reload the checkpoint
# files) were run beforehand if the checkpointed weights are the ones intended.
model_RNN.save("RNN_model.h5")
model_MLP.save("MLP_model.h5")
model_CNN.save("CNN_model.h5")