In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.layers import Dense, Input, GlobalMaxPooling1D,LSTM
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers
from sklearn.model_selection import train_test_split

In [None]:
# Load the Quora question-pair train and test files with identical options:
# only file columns 1-6 are read, the first of those becomes the index, and
# every row containing a missing value is dropped.
csv_options = {'index_col': 0, 'usecols': [1, 2, 3, 4, 5, 6]}
train = pd.read_csv('../input/quora-record-linkage/quora_train.csv', **csv_options).dropna()
test_data = pd.read_csv('../input/quora-record-linkage/quora_test.csv', **csv_options).dropna()

In [None]:
# One seed drives both NumPy's global RNG and the train/validation split.
seed = 7
np.random.seed(seed)

# Positional selection: columns 2-3 are the model inputs and column 4 is the
# target. NOTE(review): presumably question1/question2 and is_duplicate, going
# by how later cells access these frames - verify against the CSV header.
X, Y = train.iloc[:, 2:4], train.iloc[:, 4]

# Hold out 20% of the training rows for validation.
train_data, val_data, y_train, y_val = train_test_split(
    X, Y, random_state=seed, test_size=0.2
)

X_test, y_test = test_data.iloc[:, 2:4], test_data.iloc[:, 4]

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
# Upper bound on the vocabulary the tokenizer will emit indices for.
NUM_WORDS = 30522

tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
    lower=True,
)

# The vocabulary is fitted on the training split only, using both questions of
# every pair joined into a single text.
tokenizer.fit_on_texts(train_data.question1 + ' ' + train_data.question2)

# Encode each question column of the training and validation splits.
sequences1_train, sequences2_train = (
    tokenizer.texts_to_sequences(questions)
    for questions in (train_data.question1, train_data.question2)
)
sequences1_valid, sequences2_valid = (
    tokenizer.texts_to_sequences(questions)
    for questions in (val_data.question1, val_data.question2)
)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Bring every encoded question to a common length of 32 token ids.
PAD_LENGTH = 32
X1_train, X2_train, X1_val, X2_val = (
    pad_sequences(encoded, maxlen=PAD_LENGTH)
    for encoded in (sequences1_train, sequences2_train, sequences1_valid, sequences2_valid)
)

# Sanity-check the resulting tensor and label shapes.
print('Shape of X train and X validation tensor:', X1_train.shape, X1_val.shape)
print('Shape of label train and validation tensor:', y_train.shape, y_val.shape)

In [None]:
# Fetch the pretrained GoogleNews word2vec archive; the next cell loads it
# from the current working directory.
# NOTE(review): `brew` is the Homebrew package manager, so this install step
# presumably only succeeds on machines that have Homebrew - confirm for the
# environment this notebook is meant to run in.
!brew install wget

# -c lets wget continue a partially downloaded file instead of starting over.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

# Load the pretrained word2vec vectors from the archive in the working directory.
word_vectors = KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin.gz', binary=True
)

EMBEDDING_DIM = 300

# One embedding row per token index, capped at NUM_WORDS. Rows that the loop
# below never assigns stay all-zero.
vocabulary_size = min(len(word_index) + 1, NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    # Indices at or beyond the cap have no row in the matrix.
    if i < NUM_WORDS:
        try:
            embedding_vector = word_vectors[word]
        except KeyError:
            # No pretrained vector for this word: draw one from a zero-mean
            # Gaussian with standard deviation sqrt(0.25).
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)
        else:
            embedding_matrix[i] = embedding_vector

# The pretrained table is no longer needed once the matrix is filled.
del word_vectors

from keras.layers import Embedding

# Frozen embedding layer initialised from the matrix built above.
embedding_layer = Embedding(
    input_dim=vocabulary_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False,
)
sequence_length = X1_train.shape[1]

In [None]:
def build_lstm_model(embedding_layer,MAX_SEQUENCE_LENGTH):
    """Build and compile a two-input LSTM classifier for question pairs.

    Both inputs go through the same embedding layer and the same LSTM
    instance, so the two branches share their weights. The two LSTM outputs
    are concatenated and passed through dropout and batch normalisation, a
    128-unit ReLU layer followed by dropout and batch normalisation again,
    and finally a single sigmoid unit.

    Args:
        embedding_layer: Layer applied to each integer token-id input.
        MAX_SEQUENCE_LENGTH: Number of token ids in each input sequence.

    Returns:
        The model, compiled with binary cross-entropy loss, an Adam optimizer
        (lr=1e-3) and an AUC metric.
    """
    # A single LSTM instance is reused for both questions.
    shared_encoder = LSTM(256, dropout=0.1, recurrent_dropout=0.1)

    question_inputs = []
    encoded_questions = []
    for _ in range(2):
        token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded = embedding_layer(token_ids)
        question_inputs.append(token_ids)
        encoded_questions.append(shared_encoder(embedded))

    # Joint representation of the pair, regularised and normalised.
    features = concatenate(encoded_questions)
    features = Dropout(0.1)(features)
    features = BatchNormalization()(features)

    # Hidden classification layer.
    features = Dense(128, activation='relu')(features)
    features = Dropout(0.1)(features)
    features = BatchNormalization()(features)

    # Single sigmoid output unit.
    preds = Dense(1, activation='sigmoid')(features)

    model = Model(inputs=question_inputs, outputs=preds)
    optimizer = Adam(lr=1e-3)
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=[tf.keras.metrics.AUC()],
    )
    return model

In [None]:
# Instantiate the classifier for the padded sequence length and print its
# layer-by-layer summary.
model = build_lstm_model(
    embedding_layer=embedding_layer,
    MAX_SEQUENCE_LENGTH=sequence_length,
)
model.summary()

In [None]:
# Early stopping watches the validation loss; every other EarlyStopping
# setting is left at its default.
early_stopping = EarlyStopping(monitor='val_loss')
callbacks = [early_stopping]

In [None]:
# Train on the padded question pairs, validating on the held-out split after
# each epoch and applying the callbacks configured above.
model.fit(
    x=[X1_train, X2_train],
    y=y_train,
    batch_size=1000,
    epochs=5,
    verbose=1,
    validation_data=([X1_val, X2_val], y_val),
    callbacks=callbacks,
)

In [None]:
# Encode the test questions with the tokenizer fitted on the training split.
# Only question2 is cast to str before encoding.
sequences1_test, sequences2_test = (
    tokenizer.texts_to_sequences(questions)
    for questions in (test_data.question1, test_data.question2.astype(str))
)

# Pad to the same width as the training tensors, then score every pair.
X1_test, X2_test = (
    pad_sequences(encoded, maxlen=X1_train.shape[1])
    for encoded in (sequences1_test, sequences2_test)
)
y_score = model.predict([X1_test, X2_test])

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

import matplotlib.pyplot as plt

In [None]:
# Ground-truth test labels as a plain array, then the model's loss and
# compiled metric on the test pairs.
y_test = test_data['is_duplicate'].values
model.evaluate(x=[X1_test, X2_test], y=y_test)

In [None]:
# Precision-recall analysis of the test-set scores.
# precision_recall_curve returns (precision, recall, thresholds): the third
# array holds the decision thresholds, not F1 values, so it is named accordingly.
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
average_precision = average_precision_score(y_test, y_score, average="micro")

# Start from one fresh figure. Clearing the current figure first would only
# leave an extra empty figure behind for the notebook to render.
plt.figure()

# Iso-F1 reference curves: for a fixed F1 value f and recall x, the matching
# precision is f * x / (2 * x - f). Only the non-negative part is drawn.
f_scores = np.linspace(0.2, 0.8, num=4)
lines = []
labels = []
for f_score in f_scores:
    x = np.linspace(0.01, 1)
    y = f_score * x / (2 * x - f_score)
    l, = plt.plot(x[y >= 0], y[y >= 0], color='gray', alpha=0.2)
    # Label each curve near the right edge, at the height of its 46th sample.
    plt.annotate('f1={0:0.1f}'.format(f_score), xy=(0.9, y[45] + 0.02))

# A single legend entry stands in for all of the iso-F1 curves.
lines.append(l)
labels.append('iso-f1 curves')

# The model's precision-recall curve, drawn as a step function.
l, = plt.step(recall, precision, where='post')
lines.append(l)
labels.append('Precision-recall (AP={0:0.2f})'.format(average_precision))

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(
    'Precision-Recall curve: AP={0:0.2f}'
    .format(average_precision))
# Draw the legend that the collected handles and labels were built for.
plt.legend(lines, labels, loc='lower left')
plt.show()