### Test predictions

In [None]:
# Import libraries
import _pickle as cPickle
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from keras import optimizers
from keras import backend as K

from keras.callbacks import ModelCheckpoint

from keras.layers import Embedding, Input
from keras.layers import TimeDistributed, Lambda
from keras.layers import Convolution1D, GlobalMaxPooling1D

from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU

from keras.models import Model
from keras.preprocessing import sequence, text

from keras.utils import np_utils


In [None]:
# Training feature table; the tokenization cell reads its question1, question2
# and is_duplicate columns.
data = pd.read_csv(filepath_or_buffer='quora_features.csv')

In [None]:
# Test feature table; its question1 and question2 columns are tokenized for
# prediction further down.
test = pd.read_csv(filepath_or_buffer='quora_test_features.csv')

In [54]:
# Define train X and y, and fit the tokenizer on both training question columns.

y = data.is_duplicate.values

tk = text.Tokenizer(num_words=200000)

max_len = 40
tk.fit_on_texts(list(data.question1.values.astype(str)) + list(data.question2.values.astype(str)))

# Question 1: token-id sequences padded to a fixed length of max_len.
x1 = tk.texts_to_sequences(data.question1.values.astype(str))
x1 = sequence.pad_sequences(x1, maxlen=max_len)

# Question 2: pad the training sequences in `x2` itself. Padding `x2_t` here
# would be wrong: that is the test-set array and it is only defined in a
# later cell, so a fresh kernel would raise NameError.
x2 = tk.texts_to_sequences(data.question2.values.astype(str))
x2 = sequence.pad_sequences(x2, maxlen=max_len)

# Word -> integer id mapping; used to size and fill the embedding matrix.
word_index = tk.word_index

ytrain_enc = np_utils.to_categorical(y)

In [73]:
# Encode the test questions with the tokenizer `tk` fitted above and pad each
# sequence to max_len, mirroring the treatment of the training questions.

x1_t = sequence.pad_sequences(
    tk.texts_to_sequences(test.question1.values.astype(str)),
    maxlen=max_len,
)

x2_t = sequence.pad_sequences(
    tk.texts_to_sequences(test.question2.values.astype(str)),
    maxlen=max_len,
)

In [11]:
# Load the pickled embedding lookup (queried with .get(word) when the
# embedding matrix is filled). The context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load "embeddings.p" if it comes from a trusted source.
with open("embeddings.p", "rb") as embeddings_file:
    embeddings_index_large = pickle.load(embeddings_file)

In [None]:
# Build the (len(word_index) + 1) x 300 weight matrix for the Embedding layers.
# Row i receives the pretrained vector of the word whose tokenizer id is i;
# words missing from embeddings_index_large keep their all-zero row.
embedding_matrix = np.zeros(shape=(len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index_large.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Hyperparameters. filter_length and num_filter configure the convolutional
# branches below; max_features and pool_length are not referenced by the
# model code visible in this cell.
max_features = 200000
filter_length = 5
num_filter = 64
pool_length = 4

# Branch 1 (fed with question 1 at predict time): frozen pretrained embedding
# -> per-timestep Dense(300, relu) -> sum over the time axis (axis=1).
input_1 = Input(shape=(40,))
embedding_1 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                        weights=[embedding_matrix],
                        input_length=40,
                        trainable=False)(input_1)

timedistributed_1 = TimeDistributed(Dense(300, activation='relu'))(embedding_1)
lambda_1 = Lambda(lambda x: K.sum(x, axis=1), output_shape=(300,))(timedistributed_1)


# Branch 2 (fed with question 2 at predict time): same structure as branch 1.
input_2 = Input(shape=(40,))
embedding_2 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                        weights=[embedding_matrix],
                        input_length=40,
                        trainable=False)(input_2)

# Branch 2 must consume its own tensors. Wiring these two layers to
# embedding_1 / timedistributed_1 would make lambda_2 a duplicate of lambda_1
# and leave input_2 disconnected from the output.
# NOTE(review): this adds weighted layers to the graph, so "weights.h5" must
# have been saved from a model with this same wiring; re-train / re-export the
# weights if load_weights reports a layer-count mismatch.
timedistributed_2 = TimeDistributed(Dense(300, activation='relu'))(embedding_2)
lambda_2 = Lambda(lambda x: K.sum(x, axis=1), output_shape=(300,))(timedistributed_2)



# Branch 3 (fed with question 1 at predict time): frozen pretrained embedding
# -> Conv1D -> dropout -> Conv1D -> global max pooling -> dropout
# -> Dense(300) -> dropout -> batch normalization.
# All convolutions use the Keras 2 argument names (filters / kernel_size /
# padding / strides) consistently instead of a mix of Keras 1 and Keras 2
# spellings; the values are unchanged.
input_3 = Input(shape=(40,))
embedding_3 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                        weights=[embedding_matrix],
                        input_length=40,
                        trainable=False)(input_3)

convolution_3 = Convolution1D(filters=num_filter,
                              kernel_size=filter_length,
                              padding='valid',
                              activation='relu',
                              strides=1)(embedding_3)

dropout_3 = Dropout(0.2)(convolution_3)
convolution_3_2 = Convolution1D(filters=num_filter,
                                kernel_size=filter_length,
                                padding='valid',
                                activation='relu',
                                strides=1)(dropout_3)


globalmaxpooling1d_3 = GlobalMaxPooling1D()(convolution_3_2)

dropout_3_2 = Dropout(0.2)(globalmaxpooling1d_3)

# Dense consumes the dropout output; feeding it globalmaxpooling1d_3 directly
# would leave dropout_3_2 as dead code. Dropout has no weights and is inactive
# at predict time, so this does not change the predictions.
dense_3 = Dense(300)(dropout_3_2)

dropout_3_3 = Dropout(0.2)(dense_3)
batchnormalization_3 = BatchNormalization()(dropout_3_3)


# Branch 4 (fed with question 2 at predict time): same structure as branch 3.
input_4 = Input(shape=(40,))
embedding_4 = Embedding(input_dim=len(word_index) + 1,
                        output_dim=300,
                        weights=[embedding_matrix],
                        input_length=40,
                        trainable=False)(input_4)

convolution_4 = Convolution1D(filters=num_filter,
                              kernel_size=filter_length,
                              padding='valid',
                              activation='relu',
                              strides=1)(embedding_4)

dropout_4 = Dropout(0.2)(convolution_4)
convolution_4_2 = Convolution1D(filters=num_filter,
                                kernel_size=filter_length,
                                padding='valid',
                                activation='relu',
                                strides=1)(dropout_4)


globalmaxpooling1d_4 = GlobalMaxPooling1D()(convolution_4_2)

dropout_4_2 = Dropout(0.2)(globalmaxpooling1d_4)

# Same wiring as branch 3: Dense follows the dropout layer.
dense_4 = Dense(300)(dropout_4_2)

dropout_4_3 = Dropout(0.2)(dense_4)
batchnormalization_4 = BatchNormalization()(dropout_4_3)

# Branch 5 (fed with question 1 at predict time): an embedding without
# pretrained weights -> dropout -> LSTM(300) -> dropout.
input_5 = Input(shape=(40,))
embedding_5 = Embedding(len(word_index) + 1, 300, input_length=40)(input_5)

dropout_5 = Dropout(rate=0.2)(embedding_5)
ltsm_5 = LSTM(units=300)(dropout_5)
dropout_5_2 = Dropout(rate=0.2)(ltsm_5)

# Branch 6 (fed with question 2 at predict time): same structure as branch 5.
input_6 = Input(shape=(40,))
embedding_6 = Embedding(len(word_index) + 1, 300, input_length=40)(input_6)

dropout_6 = Dropout(rate=0.2)(embedding_6)
ltsm_6 = LSTM(units=300)(dropout_6)
dropout_6_2 = Dropout(rate=0.2)(ltsm_6)


# Concatenate the outputs of the six branches into a single feature vector.
merged = concatenate([lambda_1,
                      lambda_2,
                      batchnormalization_3,
                      batchnormalization_4,
                      dropout_5_2,
                      dropout_6_2])

# Five stacked blocks of Dense(300) -> PReLU -> Dropout(0.2) -> BatchNorm.
# Each block consumes the output of the block before it.
m_dense = Dense(300)(merged)
m_relu = PReLU()(m_dense)
m_dropout = Dropout(0.2)(m_relu)
m_batch = BatchNormalization()(m_dropout)

m2_dense = Dense(300)(m_batch)
m2_relu = PReLU()(m2_dense)
m2_dropout = Dropout(0.2)(m2_relu)
m2_batch = BatchNormalization()(m2_dropout)


m3_dense = Dense(300)(m2_batch)
m3_relu = PReLU()(m3_dense)
m3_dropout = Dropout(0.2)(m3_relu)
m3_batch = BatchNormalization()(m3_dropout)

m4_dense = Dense(300)(m3_batch)
m4_relu = PReLU()(m4_dense)
m4_dropout = Dropout(0.2)(m4_relu)
m4_batch = BatchNormalization()(m4_dropout)

# The fifth block must chain from m5_dense / m5_relu. Wiring it to m_dense /
# m_relu would short-circuit the head: the output would depend only on the
# first block and blocks 2-5 would be dead code.
# NOTE(review): this puts blocks 2-5 back into the graph, so "weights.h5" must
# have been saved from a model with the same five-block head; re-train /
# re-export the weights if load_weights reports a layer-count mismatch.
m5_dense = Dense(300)(m4_batch)
m5_relu = PReLU()(m5_dense)
m5_dropout = Dropout(0.2)(m5_relu)
m5_batch = BatchNormalization()(m5_dropout)


# Single sigmoid unit: one score in [0, 1] per question pair.
dense_out = Dense(1, activation='sigmoid')(m5_batch)

# Assemble the six-input / one-output functional model, restore the saved
# weights from disk, then compile it with Adam and binary cross-entropy.
model = Model(
    inputs=[input_1, input_2, input_3, input_4, input_5, input_6],
    outputs=[dense_out],
)

model.load_weights("weights.h5")

model.compile(optimizer=optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [75]:
# Score the test set. The model has six inputs that alternate question 1 /
# question 2, so the (x1_t, x2_t) pair is repeated three times.
predict = model.predict([x1_t, x2_t] * 3, batch_size=384, verbose=2)

In [77]:
# Raw test file; its test_id column supplies the ids for the submission frame.
raw_test = pd.read_csv(filepath_or_buffer='test.csv')

In [78]:
# Pair every test_id with its predicted score; ravel() flattens the model
# output to one dimension so it fits in a single column.
submission = pd.DataFrame(
    data={
        "test_id": raw_test["test_id"],
        "is_duplicate": predict.ravel(),
    }
)

In [88]:
# Turn the scores into hard 0/1 labels: 1 when the score is strictly greater
# than .5, otherwise 0. `submission` itself is left untouched.
submission_50 = submission.assign(
    is_duplicate=[int(score > .5) for score in submission['is_duplicate']]
)

In [94]:
# Hard 0/1 labels at the stricter .80, .95 and .99 cut-offs (1 only when the
# score is strictly greater than the cut-off). Each variant is derived from
# the untouched `submission` frame and written to its own CSV.

submission_80 = submission.assign(
    is_duplicate=[int(score > .80) for score in submission['is_duplicate']]
)
submission_80.to_csv("predictions_80.csv", index=False)

submission_95 = submission.assign(
    is_duplicate=[int(score > .95) for score in submission['is_duplicate']]
)
submission_95.to_csv("predictions_95.csv", index=False)

submission_99 = submission.assign(
    is_duplicate=[int(score > .99) for score in submission['is_duplicate']]
)
submission_99.to_csv("predictions_99.csv", index=False)

In [96]:
# Hard 0/1 labels at the .9999 cut-off (1 only when the score is strictly
# greater than .9999), written to its own CSV.
submission_9999 = submission.assign(
    is_duplicate=[int(score > .9999) for score in submission['is_duplicate']]
)
submission_9999.to_csv("predictions_9999.csv", index=False)