# Question Duplicates

In [9]:
import os
# Set TensorFlow's C++ log level to '3' before `import tensorflow` runs below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import os
import numpy as np
import pandas as pd
import random as rnd
import tensorflow as tf

# Set random seeds
# Only Python's `random` module (imported as `rnd`) is seeded in this cell.
rnd.seed(34)

In [10]:
# Display the installed TensorFlow version as the cell output.
tf.__version__


'2.13.0'

In [11]:
# Read the question-pairs table and preview its first rows.
data = pd.read_csv("questions.csv")
N = data.shape[0]  # one row per question pair
print('Number of question pairs: ', N)
data.head()

Number of question pairs:  404351


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [12]:
# The first N_train rows form the training set; the next N_test rows form the test set.
N_train = 300_000
N_test = 10_240
data_train = data.iloc[:N_train]
data_test = data.iloc[N_train:N_train + N_test]
print("Train set:", len(data_train), "Test set:", len(data_test))
del data  # the full frame is not referenced again after this cell

Train set: 300000 Test set: 10240


### Selecting duplicate questions
Select only the question pairs that are duplicate to train the model. <br>
We need to build two sets of questions as input for the Siamese network, assuming that question $q1_i$ (question $i$ in the first set) is a duplicate of $q2_i$ (question $i$ in the second set), but all other questions in the second set are not duplicates of $q1_i$.  
The test set uses the original pairs of questions and the status describing if the questions are duplicates.

We will start by identifying the indexes in the training set which correspond to duplicate questions. For this we will first define a boolean variable `td_index`, which has value `True` if the index corresponds to duplicate questions and `False` otherwise, and then convert it into the list of row indexes at which it is `True`.


In [13]:
# Boolean mask of duplicate rows, then the list of positions where it is True.
is_duplicate_mask = (data_train['is_duplicate'] == 1).to_numpy()
td_index = np.flatnonzero(is_duplicate_mask).tolist()
print('Number of duplicate questions: ', len(td_index))
print('Indexes of first ten duplicate questions:', td_index[:10])

Number of duplicate questions:  111486
Indexes of first ten duplicate questions: [5, 7, 11, 12, 13, 15, 16, 18, 20, 29]


In [14]:
# Show the pair stored at row label 18 together with its label.
example_idx = 18
for column in ('question1', 'question2'):
    print(data_train.loc[example_idx, column])
print('is_duplicate: ', data_train.loc[example_idx, 'is_duplicate'])

Why are so many Quora users posting questions that are readily answered on Google?
Why do people ask Quora questions which can be answered easily by Google?
is_duplicate:  1


Next, keep only the rows in the original training set that correspond to the rows where `td_index` is `True`

In [15]:
# Training questions: only the rows listed in `td_index` (the duplicate pairs).
Q1_train = np.array(data_train.loc[td_index, 'question1'])
Q2_train = np.array(data_train.loc[td_index, 'question2'])

# Test questions: every test row, together with its duplicate label.
Q1_test, Q2_test, y_test = (
    np.array(data_test[column])
    for column in ('question1', 'question2', 'is_duplicate')
)

In [16]:
# Print two training pairs and the first test pair with its label.
print('TRAINING QUESTIONS:\n')
for i in (0, 5):
    print('Question 1: ', Q1_train[i])
    print('Question 2: ', Q2_train[i], '\n')

print('TESTING QUESTIONS:\n')
print('Question 1: ', Q1_test[0])
print('Question 2: ', Q2_test[0], '\n')
print('is_duplicate =', y_test[0], '\n')

TRAINING QUESTIONS:

Question 1:  Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
Question 2:  I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me? 

Question 1:  What would a Trump presidency mean for current international master’s students on an F1 visa?
Question 2:  How will a Trump presidency affect the students presently in US or planning to study in US? 

TESTING QUESTIONS:

Question 1:  How do I prepare for interviews for cse?
Question 2:  What is the best way to prepare for cse? 

is_duplicate = 0 



In [17]:
# The first 80% of the duplicate pairs are used for training, the remainder for validation.
cut_off = int(len(Q1_train) * 0.8)
train_Q1, val_Q1 = Q1_train[:cut_off], Q1_train[cut_off:]
train_Q2, val_Q2 = Q2_train[:cut_off], Q2_train[cut_off:]
print('Number of duplicate questions: ', len(Q1_train))
print("The length of the training set is:  ", len(train_Q1))
print("The length of the validation set is: ", len(val_Q1))

Number of duplicate questions:  111486
The length of the training set is:   89188
The length of the validation set is:  22298


### Encoding the questions

The next step is to learn how to encode each of the questions as a list of numbers (integers). We will encode each word of the selected duplicate pairs with an index. 

We will start by learning a word dictionary, or vocabulary, containing all the words in your training dataset, which you will use to encode each word of the selected duplicate pairs with an index. 

For this we will be using the [`TextVectorization`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization) layer from Keras, which will take care of everything.

In [18]:
tf.random.set_seed(0)
text_vectorization = tf.keras.layers.TextVectorization(
    standardize='strip_punctuation',
    split='whitespace',
    output_mode='int',
)
# The vocabulary is learned from all duplicate pairs (Q1_train and Q2_train),
# which includes the pairs that are later held out for validation.
text_vectorization.adapt(np.concatenate([Q1_train, Q2_train]))

In [19]:
# Report the size of the vocabulary held by the adapted `text_vectorization` layer
print(f'Vocabulary size: {text_vectorization.vocabulary_size()}')


Vocabulary size: 36224


In [20]:
# Show a raw training question followed by the integer sequence the
# `text_vectorization` layer produces for it.
print('first question in the train set:\n')
print(Q1_train[0], '\n') 
print('encoded version:')
print(text_vectorization(Q1_train[0]),'\n')

# Same demonstration for the first test question.
print('first question in the test set:\n')
print(Q1_test[0], '\n')
print('encoded version:')
print(text_vectorization(Q1_test[0]) )


first question in the train set:

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me? 

encoded version:
tf.Tensor(
[ 6984     6   178    10  8988  2442 35393   761    13  6636 28205    31
    28   483    45    98], shape=(16,), dtype=int64) 

first question in the test set:

How do I prepare for interviews for cse? 

encoded version:
tf.Tensor([    4     8     6   160    17  2079    17 11775], shape=(8,), dtype=int64)


### Defining the Siamese Model

A Siamese network is a neural network which uses the same weights while working in tandem on two different input vectors to compute comparable output vectors

We will get the question as input, get it vectorized and embedded, run it through an LSTM layer, normalize $v_1$ and $v_2$, and finally get the corresponding cosine similarity for each pair of questions. Because of the implementation of the loss function we will see in the next section, we are not going to have the cosine similarity as output of our Siamese network, but rather $v_1$ and $v_2$. We will add the cosine distance step once we reach the classification step. 

To train the model, we will use the triplet loss. This loss makes use of a baseline (anchor) input that is compared to a positive (truthy) input and a negative (falsy) input. The (cosine) distance from the baseline input to the positive input is minimized, and the distance from the baseline input to the negative input is maximized. Mathematically, we are trying to minimize the following.

$$\mathcal{L}(A, P, N)=\max \left(\|\mathrm{f}(A)-\mathrm{f}(P)\|^{2}-\|\mathrm{f}(A)-\mathrm{f}(N)\|^{2}+\alpha, 0\right),$$

where $A$ is the anchor input, for example $q1_1$, $P$ is the duplicate input, for example, $q2_1$, and $N$ is the negative input (the non duplicate question), for example $q2_2$.<br>
$\alpha$ is a margin; we can think about it as a safety net, or by how much we want to push the duplicates from the non duplicates. This is the essence of the triplet loss. However, as we will see in the next section, we will be using a pretty smart trick to improve our training, known as hard negative mining. 
<br>


In [21]:
# Returns a tensorflow Siamese model
def Siamese(text_vectorizer, vocab_size=36224, d_feature=128):
    """Build a Siamese network that embeds two questions with shared weights.

    A single `Sequential` branch (vectorize -> embed -> LSTM -> mean over the
    sequence -> L2-normalize) is applied to both string inputs, and the two
    resulting vectors are concatenated column-wise.

    Args:
        text_vectorizer: layer added as the first layer of the branch; it
            receives the raw string input.
        vocab_size: input dimension of the Embedding layer.
        d_feature: embedding width and number of LSTM units.

    Returns:
        A `tf.keras.Model` named "SiameseModel" with inputs `input_1` and
        `input_2` and one output: the two branch outputs concatenated along
        axis 1 (width `2 * d_feature`).
    """
    branch = tf.keras.models.Sequential(name='sequential')
    # Raw strings -> integer token ids.
    branch.add(text_vectorizer)
    # Token ids -> dense vectors.
    branch.add(tf.keras.layers.Embedding(vocab_size, d_feature, name='embedding'))
    # Keep the full output sequence so it can be averaged below.
    branch.add(tf.keras.layers.LSTM(d_feature, return_sequences=True, name='LSTM'))
    # Average over the sequence axis to get one vector per question.
    branch.add(tf.keras.layers.GlobalAveragePooling1D(name='mean'))
    # Normalize each row to unit length. `axis=1` is required: with the default
    # `axis=None` the norm is taken over the whole batch tensor, so individual
    # rows would not be unit vectors and dot products between them would not be
    # cosine similarities (and would depend on the batch size).
    branch.add(tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1), name='out'))

    # One string per example for each of the two questions.
    input1 = tf.keras.layers.Input((1,), dtype=tf.string, name='input_1')
    input2 = tf.keras.layers.Input((1,), dtype=tf.string, name='input_2')
    # The same `branch` object is called twice, so both sides share weights.
    branch1 = branch(input1)
    branch2 = branch(input2)
    # Concatenate along columns: the first half of the output is the first
    # question's vector, the second half is the second question's vector.
    conc = tf.keras.layers.Concatenate(axis=1, name='conc_1_2')([branch1, branch2])

    return tf.keras.models.Model(inputs=[input1, input2], outputs=conc, name="SiameseModel")

In [22]:
# check the model
# Construct a new Siamese model whose embedding is sized to the adapted vocabulary,
# then print the summary of the full model and of its shared 'sequential' branch.
model = Siamese(text_vectorization, vocab_size=text_vectorization.vocabulary_size())
model.build(input_shape=None)
model.summary()
model.get_layer(name='sequential').summary()

Model: "SiameseModel"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, 128)                  4768256   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 conc_1_2 (Concatenate)      (None, 256)                  0         ['sequential[0][0]'

### Hard Negative Mining


You will now implement the `TripletLoss` with hard negative mining.<br>
As explained in the lecture, you will be using all the questions from each batch to compute this loss. Positive examples are questions $q1_i$, and $q2_i$, while all the other combinations $q1_i$, $q2_j$ ($i\neq j$), are considered negative examples. The loss will be composed of two terms. One term utilizes the mean of all the non duplicates, the second utilizes the *closest negative*. Our loss expression is then:
 
\begin{align}
 \mathcal{Loss_1(A,P,N)} &=\max \left( -cos(A,P)  + mean_{neg} +\alpha, 0\right) \\
 \mathcal{Loss_2(A,P,N)} &=\max \left( -cos(A,P)  + closest_{neg} +\alpha, 0\right) \\
\mathcal{Loss(A,P,N)} &= sum(Loss_1 + Loss_2) \\
\end{align}

In [23]:
# Defining the custom loss function
def TripletLossFn(v1, v2, margin=0.25):
    """Triplet loss with hard negative mining over one batch of vector pairs.

    Row i of `v1` and row i of `v2` form the positive pair; every other
    combination of rows in the batch is treated as a negative. Two hinge
    terms are computed per row, one against the mean of the negatives and one
    against the closest negative, and their sum over the batch is returned.

    Args:
        v1: 2-D array or tensor with one vector per row. The rows are expected
            to be L2-normalized so that dot products are cosine similarities;
            this function does not normalize them.
        v2: 2-D array or tensor with the same shape as `v1`.
        margin: margin added inside both hinge terms.

    Returns:
        Scalar tensor holding the summed loss for the batch.
    """
    # scores[i, j] is the dot product of v2[i] and v1[j].
    scores = tf.linalg.matmul(v2, v1, transpose_b=True)

    # Number of pairs, as an integer (for `tf.eye`) and in the dtype of
    # `scores` (for arithmetic).
    num_rows = tf.shape(v1)[0]
    batch_size = tf.cast(num_rows, scores.dtype)

    # The diagonal holds the similarity of each positive pair.
    positive = tf.linalg.diag_part(scores)
    # Zero out the diagonal so that only negatives contribute to the row sums.
    negative_zero_on_duplicate = scores - tf.linalg.diag(positive)

    # Mean similarity of the negatives in each row. The divisor is clamped to 1
    # so that a batch holding a single pair (no negatives) yields 0 instead of
    # a 0/0 NaN.
    num_negatives = tf.maximum(batch_size - 1.0, 1.0)
    mean_negative = tf.math.reduce_sum(negative_zero_on_duplicate, axis=1) / num_negatives

    # Entries that must not be chosen as "closest negative": the diagonal
    # itself, and any negative that scores higher than its row's positive.
    is_diagonal = tf.eye(num_rows) == 1
    above_positive = negative_zero_on_duplicate > tf.expand_dims(positive, 1)
    mask_exclude_positives = tf.cast(is_diagonal | above_positive, scores.dtype)

    # Subtracting 2.0 pushes the masked entries down so the row-wise max
    # ignores them.
    negative_without_positive = negative_zero_on_duplicate - 2.0 * mask_exclude_positives
    closest_negative = tf.math.reduce_max(negative_without_positive, axis=1)

    # Hinge against the closest negative.
    triplet_loss1 = tf.maximum(margin - positive + closest_negative, 0.0)
    # Hinge against the mean of the negatives.
    triplet_loss2 = tf.maximum(margin - positive + mean_negative, 0.0)

    # Sum both terms over the batch.
    triplet_loss = tf.math.reduce_sum(triplet_loss1 + triplet_loss2)

    return triplet_loss

In [24]:
# Checking the function
# Two pairs of 3-d vectors: the first row of v2 equals the first row of v1,
# and the second row of v2 is the negation of the second row of v1.
v1 = np.array([[0.26726124, 0.53452248, 0.80178373],[0.5178918 , 0.57543534, 0.63297887]])
v2 = np.array([[ 0.26726124,  0.53452248,  0.80178373],[-0.5178918 , -0.57543534, -0.63297887]])
print("Triplet Loss:", TripletLossFn(v1,v2).numpy())

Triplet Loss: 0.7035076825158911


To recognize it as a loss function, keras needs it to have two inputs: true labels, and output labels. We will not be using the true labels, but we still need to pass some dummy variable with size `(batch_size,)` for TensorFlow to accept it as a valid loss.

Additionally, the `out` parameter must coincide with the output of your Siamese network, which is the concatenation of the processing of each of the inputs, so we need to extract $v_1$ and $v_2$ from there.

In [25]:
def TripletLoss(labels, out, margin=0.25):
    """Keras-compatible wrapper around `TripletLossFn`.

    Args:
        labels: ignored; present only so the function has the
            (true labels, model output) signature of a Keras loss.
        out: 2-D model output whose first half of columns is v1 and whose
            second half is v2.
        margin: forwarded to `TripletLossFn`.

    Returns:
        The value returned by `TripletLossFn(v1, v2, margin=margin)`.
    """
    _, total_width = out.shape
    half = int(total_width / 2)
    # Split the concatenated output back into the two vectors.
    v1, v2 = out[:, :half], out[:, half:]
    return TripletLossFn(v1, v2, margin=margin)

### Model Training
Now it's time to finally train our model. As usual, we have to define the cost function and the optimizer. We also have to build the actual model we will be training. 

To pass the input questions for training and validation we will use the iterator produced by [`tensorflow.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset).

We will now write a function that takes in our model to train it. To train our model we have to decide how many times we want to iterate over the entire data set; each iteration is defined as an `epoch`. For each epoch, we have to go over all the data, using our `Dataset` iterator.

In [26]:
# Each element is ((question1, question2), 1). The constant label is a dummy:
# the loss ignores it, but a label is still supplied for every example.
train_dataset = tf.data.Dataset.from_tensor_slices(
    ((train_Q1, train_Q2), tf.constant(1, shape=(len(train_Q1),)))
)
val_dataset = tf.data.Dataset.from_tensor_slices(
    ((val_Q1, val_Q2), tf.constant(1, shape=(len(val_Q1),)))
)

In [27]:
def train_model(Siamese, TripletLoss, text_vectorizer, train_dataset, val_dataset, d_feature=128, lr=0.01, epochs=5):
    """Build, compile and fit a Siamese model.

    Args:
        Siamese: callable that builds the model; called with the vectorizer,
            `vocab_size` and `d_feature`.
        TripletLoss: loss function passed to `model.compile`.
        text_vectorizer: vectorization layer; its `vocabulary_size()` sets the
            model's `vocab_size`.
        train_dataset: dataset passed to `model.fit` for training.
        val_dataset: dataset passed to `model.fit` as validation data.
        d_feature: feature width forwarded to `Siamese`.
        lr: learning rate for the Adam optimizer.
        epochs: number of epochs to fit.

    Returns:
        The fitted model.
    """
    # Size the model to the vectorizer's vocabulary.
    vocab_size = text_vectorizer.vocabulary_size()
    model = Siamese(text_vectorizer, vocab_size=vocab_size, d_feature=d_feature)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(loss=TripletLoss, optimizer=optimizer)

    model.fit(train_dataset, epochs=epochs, validation_data=val_dataset)

    return model

Now call the `train_model` function. We will be using a batch size of 256. 

To create the data generators we will be using the method `batch` for `Dataset` object. We will also call the `shuffle` method, to shuffle the dataset on each iteration.

In [28]:
epochs = 3
batch_size = 256

# Shuffle with a buffer as large as the split (reshuffled on every iteration), then batch.
train_generator = (
    train_dataset
    .shuffle(buffer_size=len(train_Q1), seed=7, reshuffle_each_iteration=True)
    .batch(batch_size=batch_size)
)
val_generator = (
    val_dataset
    .shuffle(buffer_size=len(val_Q1), seed=7, reshuffle_each_iteration=True)
    .batch(batch_size=batch_size)
)

# The training call is left commented out in this notebook:
# model = train_model(Siamese, TripletLoss,text_vectorization, 
#                                             train_generator, 
#                                             val_generator, 
#                                             epochs=epochs,)

### Evaluating the model

For this, we load a pretrained model for prediction and compute the cosine similarity between each pair of questions.

In [29]:
# Load the saved model for inference only (`compile=False`: no loss/optimizer is restored).
# NOTE(review): `safe_mode=False` allows Keras to deserialize Lambda layers, which can
# execute arbitrary code stored in the file — only load model files from a trusted source.
model = tf.keras.models.load_model('model/trained_model.keras', safe_mode=False, compile=False)

# Show the model architecture
model.summary()

Model: "SiameseModel"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 sequential (Sequential)     (None, 128)                  4768256   ['input_1[0][0]',             
                                                                     'input_2[0][0]']             
                                                                                                  
 conc_1_2 (Concatenate)      (None, 256)                  0         ['sequential[0][0]'

### Classify

In [30]:
def classify(test_Q1, test_Q2, y_test, threshold, model, batch_size=64, verbose=True):
    """Score question pairs with the model and compare against the true labels.

    Args:
        test_Q1: first question of each pair.
        test_Q2: second question of each pair.
        y_test: true labels, compared element-wise with the 0/1 predictions.
        threshold: a pair is predicted duplicate when its similarity exceeds this value.
        model: model whose `predict` output holds v1 in the first half of its
            columns and v2 in the second half.
        batch_size: batch size used for prediction.
        verbose: accepted for interface compatibility; not used in the body.

    Returns:
        (accuracy, cm): the fraction of correct predictions, and the result of
        `tf.math.confusion_matrix(y_test, y_pred)`.
    """
    pairs = tf.data.Dataset.from_tensor_slices(((test_Q1, test_Q2), None))
    test_gen = pairs.batch(batch_size=batch_size)

    pred = model.predict(test_gen)

    # Split the concatenated output back into the two vectors.
    _, n_feat = pred.shape
    half = int(n_feat / 2)
    v1, v2 = pred[:, :half], pred[:, half:]

    # Row-wise dot product of v1 and v2.
    d = tf.math.reduce_sum(v1 * v2, axis=1)

    # 1 where the similarity exceeds the threshold, 0 otherwise.
    y_pred = tf.cast(d > threshold, tf.int32)

    # Number of matches divided by the number of predictions.
    n_correct = tf.reduce_sum(tf.cast(y_test == y_pred, tf.int32), 0)
    accuracy = n_correct / len(y_pred)

    cm = tf.math.confusion_matrix(y_test, y_pred)

    return accuracy, cm

In [31]:
# Evaluate on the test pairs with a similarity threshold of 0.7.
accuracy, cm = classify(Q1_test, Q2_test, y_test, threshold=0.7, model=model, batch_size=512)
print("Accuracy", accuracy.numpy())
print(f"Confusion matrix:\n{cm.numpy()}")

Accuracy 0.7259765625
Confusion matrix:
[[4876 1506]
 [1300 2558]]


### Testing with custom questions

Here we will test the model with our own questions. We will write a function `predict` which takes two questions as input and returns `True` or `False` depending on whether the question pair is a duplicate or not.   

In [32]:
def predict(question1, question2, threshold, model, verbose=False):
    """Decide whether two questions are duplicates according to the model.

    Args:
        question1: first question.
        question2: second question.
        threshold: the pair is judged duplicate when its similarity exceeds this value.
        model: model whose `predict` output holds v1 in the first half of its
            columns and v2 in the second half.
        verbose: when true, print the questions, the similarity and the decision.

    Returns:
        One-element boolean NumPy array: `True` if the similarity is greater
        than `threshold`, `False` otherwise.
    """
    # Wrap the single pair in a one-element, batch-size-1 dataset.
    pair_ds = tf.data.Dataset.from_tensor_slices((([question1], [question2]), None))
    generator = pair_ds.batch(batch_size=1)

    v1v2 = model.predict(generator)

    # Split the concatenated output back into the two vectors.
    _, n_feat = v1v2.shape
    half = int(n_feat / 2)
    v1, v2 = v1v2[:, :half], v1v2[:, half:]

    # Row-wise dot product of v1 and v2.
    d = tf.math.reduce_sum(v1 * v2, axis=1)
    res = d > threshold

    if verbose:
        print("Q1  = ", question1, "\nQ2  = ", question2)
        print("d   = ", d.numpy())
        print("res = ", res.numpy())

    return res.numpy()

In [33]:
# Example 1
question1 = "When will I see you?"
question2 = "When can I see you again?"
# `predict` returns a one-element boolean array: True if the pair is judged a duplicate, False otherwise
predict(question1 , question2, 0.7, model, verbose = True)

Q1  =  When will I see you? 
Q2  =  When can I see you again?
d   =  [0.8422111]
res =  [ True]


array([ True])

In [34]:
# Example 2
question1 = "Do you think it is monday?"
question2 = "Is it monday?"
# `predict` returns a one-element boolean array: True if the pair is judged a duplicate, False otherwise
predict(question1 , question2, 0.7, model, verbose=True)

Q1  =  Do you think it is monday? 
Q2  =  Is it monday?
d   =  [0.44877848]
res =  [False]


array([False])