# Loading the dataset & Looking at it


In [None]:
import os
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from collections import defaultdict
import random
from math import ceil

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Mount Google Drive at /content/drive so the dataset archive stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/Colab_Notebooks/train.csv.zip

Archive:  /content/drive/MyDrive/Colab_Notebooks/train.csv.zip
  inflating: train.csv               


In [None]:
# Load the extracted question-pair table into a DataFrame.
data = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
# (rows, columns) of the raw table, before rows with missing questions are dropped.
data.shape

(404290, 6)

In [None]:
# Keep only the pairs in which both question texts are present.
data = data.dropna(subset=['question1', 'question2'])

In [None]:
# Rows with a missing question1 have already been removed from `data`, so this filter
# keeps every row; the cell simply displays the cleaned frame.
data[~data['question1'].isna()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [None]:
# Non-duplicate example: the row labelled 0 among the pairs with is_duplicate == 0.
questions = data.query('is_duplicate == 0').loc[0]
print(f'Q1: {questions.question1}\nQ2: {questions.question2}')

Q1: What is the step by step guide to invest in share market in india?
Q2: What is the step by step guide to invest in share market?


In [None]:
# Duplicate example: the row labelled 7 among the pairs with is_duplicate == 1.
questions = data.query('is_duplicate == 1').loc[7]
print(f'Q1: {questions.question1}\nQ2: {questions.question2}')

Q1: How can I be a good geologist?
Q2: What should I do to be a great geologist?


In [None]:
# Seeded split stratified on is_duplicate, so both parts keep the same class ratio.
# No test_size is passed, so scikit-learn's default split proportion applies.
train, test = train_test_split(data, stratify=data['is_duplicate'], random_state=42)

In [None]:
# Class balance of the training part: count the rows of each label.
is_dup_label = train['is_duplicate']
print("Number of dublicates: ", (is_dup_label == 1).sum())
print("Number of non-dublicates: ", (is_dup_label == 0).sum())

Number of dublicates:  111947
Number of non-dublicates:  191268


In [None]:
# Every pair contributes two question slots. The same question id can recur within a column
# and across the qid1/qid2 columns, so distinct questions are counted over both columns together.
all_question_ids = pd.concat([train.qid1, train.qid2])
print("Number of questions: ", 2 * train.shape[0])
print("Number of unique questions: ", all_question_ids.nunique())

Number of questions:  606430
Number of unique questions:  142568


In [None]:
# Only duplicate pairs are used for training. train_idx holds their `id` column values;
# the next cell passes them to train.loc as row labels.
duplicate_mask = train['is_duplicate'] == 1
train_idx = train.loc[duplicate_mask, 'id'].tolist()
print(f'Number of training examples: {len(train_idx)}')

Number of training examples: 111947


In [None]:
# Raw question strings: training uses the duplicate pairs selected by train_idx,
# the test part keeps every pair together with its label.
duplicate_pairs = train.loc[train_idx, ['question1', 'question2']]
q1_train_data = np.array(duplicate_pairs['question1'])
q2_train_data = np.array(duplicate_pairs['question2'])
q1_test_data, q2_test_data = np.array(test['question1']), np.array(test['question2'])

# Same-shaped placeholder arrays; later cells fill them with token lists and then id lists.
q1_train, q2_train = np.empty_like(q1_train_data), np.empty_like(q2_train_data)
q1_test, q2_test = np.empty_like(q1_test_data), np.empty_like(q2_test_data)

y_test = np.array(test['is_duplicate'])

In [None]:
# Peek at the first five raw training pairs (question1 texts, then question2 texts).
q1_train_data[:5], q2_train_data[:5]

(array(['Can a Gemini man and a Gemini woman have a successful relationship? Or are they incompatible?',
        'How can I delete my own question from Quora?',
        'Why are there still people who think that the Earth is flat?',
        'What should I do to concentrate more on my studies?',
        'How can one stop caring too much?'], dtype=object),
 array(['What is the compatibility of a Gemini man and a Gemini woman, romantically?',
        'Can I delete all the questions I asked on Quora?',
        'Why do some people currently believe the earth is flat?',
        'How do I concentrate in study?',
        'What should I do to stop myself from caring too much?'],
       dtype=object))

In [None]:
nltk.download('punkt')

# Id layout: 0 = unknown token, 1 = padding, 2.. = words seen in the training questions.
# Both special tokens are registered explicitly so that every id is strictly below
# len(vocab), which is what an embedding table with len(vocab) rows requires.
# NOTE: vocab is a defaultdict, so indexing it with an unseen word inserts that word with
# id 0; use vocab.get(word, 0) wherever the vocabulary must stay frozen.
vocab = defaultdict(int)
vocab['<UNK>'] = 0
vocab['<PAD>'] = 1

for idx, (raw_q1, raw_q2) in enumerate(zip(q1_train_data, q2_train_data)):
    q1_train[idx] = nltk.word_tokenize(raw_q1)
    q2_train[idx] = nltk.word_tokenize(raw_q2)
    for word in q1_train[idx] + q2_train[idx]:
        if word not in vocab:
            # The next free id is the current size, because ids start at 0 and are contiguous.
            vocab[word] = len(vocab)
print('Vocabulary size is: ', len(vocab))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Vocabulary size is:  36446


In [None]:
# Tokenise the test questions; the vocabulary is not extended here.
for idx, (raw_q1, raw_q2) in enumerate(zip(q1_test_data, q2_test_data)):
    q1_test[idx] = nltk.word_tokenize(raw_q1)
    q2_test[idx] = nltk.word_tokenize(raw_q2)

In [None]:
# Inspect the tokenised first questions (each element is a list of string tokens).
q1_train

array([list(['Can', 'a', 'Gemini', 'man', 'and', 'a', 'Gemini', 'woman', 'have', 'a', 'successful', 'relationship', '?', 'Or', 'are', 'they', 'incompatible', '?']),
       list(['How', 'can', 'I', 'delete', 'my', 'own', 'question', 'from', 'Quora', '?']),
       list(['Why', 'are', 'there', 'still', 'people', 'who', 'think', 'that', 'the', 'Earth', 'is', 'flat', '?']),
       ...,
       list(['Which', 'mobile', 'phone', 'should', 'I', 'buy', 'under', 'Rs.15000', '?']),
       list(['Should', 'I', 'invest', 'in', 'Bitcoin', 'now', '?', 'Why', '?']),
       list(['Has', 'anyone', 'been', 'able', 'to', 'stop', 'masturbating', '?', 'If', 'yes', ',', 'how', '?'])],
      dtype=object)

In [None]:
# Replace every token by its vocabulary id, in place.
# NOTE: vocab is a defaultdict with default 0, so `vocab[word]` for a word that is not yet
# a key returns 0 AND inserts that word; the test lookups therefore grow len(vocab).
# The cell is not idempotent: a second run would look up the ids themselves.
for encoded in (q1_train, q2_train, q1_test, q2_test):
    for i in range(len(encoded)):
        encoded[i] = [vocab[word] for word in encoded[i]]

In [None]:
# Inspect the encoded second questions (each element is now a list of integer ids).
q2_train

array([list([16, 17, 18, 19, 20, 3, 4, 5, 6, 3, 4, 7, 21, 22, 11]),
       list([2, 25, 26, 32, 18, 33, 25, 34, 35, 31, 11]),
       list([36, 45, 46, 39, 47, 48, 18, 49, 17, 44, 11]), ...,
       list([200, 13, 114, 782, 1131, 51, 827, 472, 1292, 11]),
       list([692, 25, 4191, 55, 3922, 11]),
       list([23, 24, 25, 58, 614, 11])], dtype=object)

In [None]:
# Hold out part of the duplicate pairs for validation. The seed is fixed so the split is
# reproducible after a kernel restart.
# NOTE: q1_train/q2_train are overwritten, so re-running this cell splits them again.
q1_train, q1_val, q2_train, q2_val = train_test_split(q1_train, q2_train, random_state=42)

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def longest(lst):
    """
    Return the length of the longest element of ``lst``.

    ``lst`` must contain at least one element; ``max`` raises ValueError otherwise.
    """
    return max(map(len, lst))

In [None]:
def supplement(array, length, pad):
    """
    Right-pad every sequence in ``array`` with ``pad`` until it has ``length`` items.

    Args:
        array: Sequence of 1-D sequences (lists or arrays), e.g. lists of token ids.
        length (int): Target length of every row.
        pad: Value appended to rows shorter than ``length``.

    Returns:
        numpy.ndarray: One row per input sequence, each with ``length`` items. The dtype
        is inferred from the values, so integer ids padded with an integer stay integer.

    Raises:
        ValueError: If a row is longer than ``length``.
    """
    rows = []
    for row in array:
        missing = length - len(row)
        if missing < 0:
            raise ValueError(
                f"sequence of length {len(row)} does not fit into length {length}")
        # Plain list concatenation instead of np.hstack: stacking with an empty filler
        # (a row that is already `length` long) would promote the row to float64.
        rows.append(list(row) + [pad] * missing)
    return np.array(rows)

In [None]:
# Quick check of supplement(): three id lists of different lengths, padded with 1 to width 30.
a = [[23, 24, 25, 180, 40, 470, 27, 1442, 11, 1, 23, 344, 34, 34, 364, 6],
     [23, 17, 7793, 7794, 67, 3, 1677, 11],
     [16, 13, 18, 114, 1182, 91, 8, 4855, 35, 963, 11]
     ]
supplement(a, 30, 1)

array([[  23,   24,   25,  180,   40,  470,   27, 1442,   11,    1,   23,
         344,   34,   34,  364,    6,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1],
       [  23,   17, 7793, 7794,   67,    3, 1677,   11,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1],
       [  16,   13,   18,  114, 1182,   91,    8, 4855,   35,  963,   11,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1]])

In [None]:
class PairsDataset(torch.utils.data.Dataset):
    """Dataset whose idx-th item is the pair ``(q1[idx], q2[idx])``."""

    def __init__(self, q1, q2):
        self.q1, self.q2 = q1, q2

    def __len__(self):
        # The size is taken from q1 only; q2 is not checked against it.
        return len(self.q1)

    def __getitem__(self, idx):
        return self.q1[idx], self.q2[idx]

def iterator(q1, q2, batch_size=128, shuffle=False):
    """
    Batch and pad two aligned question arrays and wrap the batches in a PairsDataset.

    Each item of the returned dataset is one batch: a pair of arrays produced by
    data_generator, padded to a common length within that batch.
    """
    q1_batches, q2_batches = data_generator(q1, q2, batch_size, shuffle=shuffle)
    return PairsDataset(q1_batches, q2_batches)

def data_generator(q1, q2, batch_size, pad=1, shuffle=True):
    """
    Split two aligned question arrays into padded batches.

    Despite its name this is not a generator: all batches are built eagerly and returned.

    Args:
        q1: Array of encoded questions (sequences of ids). It is indexed with a list of
            positions, so it is expected to be a numpy array.
        q2: Array of encoded questions aligned with ``q1``; same requirement.
        batch_size (int): Number of pairs per batch; the last batch may be smaller.
        pad (int, optional): Padding value. Defaults to 1.
        shuffle (bool, optional): Shuffle the pair order once before batching. Defaults to True.

    Returns:
        tuple: ``(q1_batches, q2_batches)``, two lists of numpy arrays. Batch ``i`` of both
        lists is padded to the same width: the longest question in that batch, rounded up
        to a power of two. Row ``k`` of a q1 batch and row ``k`` of the matching q2 batch
        come from the same input pair.
    """
    len_q = len(q1)
    question_indexes = [*range(len_q)]
    if shuffle:
        random.shuffle(question_indexes)

    # The same permutation is applied to both sides so that pairs stay aligned.
    q1 = q1[question_indexes]
    q2 = q2[question_indexes]

    q1_batch_all = []
    q2_batch_all = []
    for start in range(0, len_q, batch_size):
        q1_batch = q1[start:start + batch_size]
        q2_batch = q2[start:start + batch_size]

        # At least 1, so a batch holding only empty questions still gets a valid width.
        max_len = max(longest(q1_batch), longest(q2_batch), 1)
        # Round up to the next power of two with exact integer arithmetic.
        max_len = 1 << (int(max_len) - 1).bit_length()

        q1_batch_all.append(supplement(q1_batch, max_len, pad))
        q2_batch_all.append(supplement(q2_batch, max_len, pad))
    return (q1_batch_all, q2_batch_all)

# Building the siamese network

In [None]:
class SiameseModel(nn.Module):
    """
    Siamese encoder: both questions pass through the same embedding -> LSTM -> linear
    stack and are returned as L2-normalised 64-dimensional vectors.

    Args:
        vocab_size (int, optional): Number of rows of the embedding table; it must be
            larger than the largest token id fed to the model. Defaults to ``None``,
            which sizes the table from the global ``vocab`` as ``max(vocab.values()) + 1``
            when the model is constructed.
        d_model (int, optional): Embedding dimension. Defaults to 512.
        hid_size (int, optional): LSTM hidden size. Defaults to 256.
        num_layers (int, optional): Number of stacked LSTM layers. Defaults to 2.
    """
    def __init__(self, vocab_size=None, d_model=512, hid_size=256, num_layers=2):
        super().__init__()
        if vocab_size is None:
            # Resolved at construction time (not at class-definition time) and from the
            # largest id rather than the number of keys, so the table always covers every id.
            vocab_size = max(vocab.values()) + 1
        self.emb = nn.Embedding(vocab_size, d_model)
        self.lstm = nn.LSTM(d_model, hidden_size=hid_size, num_layers=num_layers, batch_first=True)
        self.ll = nn.Linear(hid_size, 64)

    def forward_once(self, q):
        """Encode one batch of id sequences (batch first) into unit-length vectors."""
        x = self.emb(q)
        x, _ = self.lstm(x)          # per-step outputs; the final hidden state is discarded
        x = self.ll(x)
        # Average over the sequence dimension. Padding positions are not masked,
        # so they contribute to the mean as well.
        x = torch.mean(x, dim=1)
        return F.normalize(x)        # L2-normalise each row

    def forward(self, q1, q2):
        """Encode both questions with the shared weights and return ``(o1, o2)``."""
        o1 = self.forward_once(q1)
        o2 = self.forward_once(q2)
        return (o1, o2)


# Measuring the quality

In [None]:
import scipy.spatial as sp


class TripletLoss(torch.nn.Module):
    """
    Triplet loss with in-batch negatives.

    Row ``i`` of ``v1`` and row ``i`` of ``v2`` form the positive pair; every other row of
    ``v2`` serves as a negative for row ``i`` of ``v1``. The similarity is the dot product,
    which is the cosine similarity when the inputs are L2-normalised.

    Args:
        v1 (torch.Tensor): Shape (batch_size, model_dimension), associated with Q1.
        v2 (torch.Tensor): Shape (batch_size, model_dimension), associated with Q2.
        margin (float or torch.Tensor, optional): Desired margin. Defaults to 0.25.

    Returns:
        torch.Tensor: Scalar loss on the same device as the inputs.
    """

    def forward(self, v1, v2, margin=0.25):
        scores = torch.mm(v1, v2.T)
        batch_size = scores.shape[0]
        if torch.is_tensor(margin):
            margin = margin.to(scores.device)

        positive = scores.diag()  # similarities of the true (duplicate) pairs
        identity = torch.eye(batch_size, dtype=scores.dtype, device=scores.device)

        # Subtracting 2 pushes the diagonal below any other similarity (all lie in [-1, 1]
        # for normalised inputs), so the row-wise max picks the hardest negative.
        closest_negative = (scores - 2.0 * identity).max(dim=1)[0]

        # Mean of the off-diagonal entries. With a single example there are no negatives;
        # the denominator is clamped to 1 so the result is 0 instead of NaN.
        mean_negative = ((1.0 - identity) * scores).sum(dim=1) / max(batch_size - 1, 1)

        loss1 = torch.clamp(margin - positive + closest_negative, min=0.0)
        loss2 = torch.clamp(margin - positive + mean_negative, min=0.0)

        return torch.mean(loss1 + loss2)

In [None]:
# Sanity check of the loss on two hand-made pairs of unit vectors (float64).
v1 = torch.tensor([[0.26726124,  0.53452248,  0.80178373],
                   [-0.5178918, -0.57543534, -0.63297887]],
                  dtype=torch.float64, requires_grad=True)
v2 = torch.tensor([[0.26726124, 0.53452248, 0.80178373],
                   [0.5178918, 0.57543534, 0.63297887]],
                  dtype=torch.float64, requires_grad=True)
loss = TripletLoss().to(device)
res = loss(v1, v2)
print("Triplet Loss:", res)  # expecting 0.5
res.backward()  # confirms that the loss is differentiable w.r.t. its inputs

Triplet Loss: tensor(0.5000, device='cuda:0', dtype=torch.float64, grad_fn=<MeanBackward0>)


# Training the model

In [None]:
from tqdm.notebook import tqdm

In [None]:
BATCH_SIZE = 128

# Batches are built once, in the current order of q1_train/q2_train
# (iterator() leaves shuffle at its default of False).
train_iter = iterator(q1_train, q2_train, batch_size=BATCH_SIZE)

In [None]:
num_epoch = 10

# Model and loss live on the selected device; Adam updates every model parameter.
model = SiameseModel().to(device)
loss = TripletLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
for epoch in tqdm(range(num_epoch)):
    losses = []
    model.train()

    # The loop variable is named `batch` so that the `data` DataFrame is not overwritten.
    for batch in tqdm(train_iter):
        q1, q2 = batch
        # .to(device) instead of .cuda(), so the loop also runs when no GPU is available.
        q1 = torch.tensor(q1, dtype=torch.int64).to(device)
        q2 = torch.tensor(q2, dtype=torch.int64).to(device)

        optimizer.zero_grad()
        out1, out2 = model(q1, q2)
        res = loss(out1, out2)
        res.backward()
        optimizer.step()

        # Keep a plain float: storing the tensor would keep a device tensor alive per batch.
        losses.append(res.item())

    print(f"Epoch {epoch}\n Current loss {np.mean(losses)}\n")

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 0
 Current loss 0.12198887765407562



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 1
 Current loss 0.04186873883008957



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 2
 Current loss 0.026318058371543884



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 3
 Current loss 0.02011486142873764



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 4
 Current loss 0.017391733825206757



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 5
 Current loss 0.016015948727726936



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 6
 Current loss 0.014976991340517998



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 7
 Current loss 0.01403803937137127



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 8
 Current loss 0.013236054219305515



  0%|          | 0/656 [00:00<?, ?it/s]

Epoch 9
 Current loss 0.012619104236364365



In [None]:
# Put the model in evaluation mode; the returned module is echoed as the cell output.
model.eval()

SiameseModel(
  (emb): Embedding(76412, 512)
  (lstm): LSTM(512, 256, num_layers=2, batch_first=True)
  (ll): Linear(in_features=256, out_features=64, bias=True)
)

In [None]:
def classify(test_q1, test_q2, y, threshold, model, vocab, data_generator=data_generator, batch_size=64):
    """Compute the accuracy of the model on labelled question pairs.

    A pair is predicted to be a duplicate when the dot product of its two output vectors
    is greater than ``threshold``.

    Args:
        test_q1 (numpy.ndarray): Array of encoded Q1 questions.
        test_q2 (numpy.ndarray): Array of encoded Q2 questions.
        y (numpy.ndarray): Array of actual targets, aligned with the questions.
        threshold (float): Similarity above which a pair counts as duplicate.
        model: The Siamese model.
        vocab (collections.defaultdict): Unused; kept for interface compatibility.
        data_generator (function): Unused; kept for interface compatibility.
        batch_size (int, optional): Size of the batches. Defaults to 64.

    Returns:
        float: Accuracy of the model (0.0 when there are no pairs).
    """
    # iterator() does not shuffle by default, so batch order matches the order of y.
    test_iter = iterator(
        test_q1,
        test_q2,
        batch_size
    )

    correct = 0
    offset = 0  # position in y of the first pair of the current batch
    with torch.no_grad():  # inference only: no autograd graph is needed
        for q1, q2 in tqdm(test_iter):
            q1 = torch.tensor(q1, dtype=torch.int64).to(device)
            q2 = torch.tensor(q2, dtype=torch.int64).to(device)

            v1, v2 = model(q1, q2)
            # Row-wise dot product = similarity of each pair in the batch.
            similarity = (v1 * v2).sum(dim=1).cpu().numpy()

            # Slice by the real batch length, so a shorter last batch needs no special case.
            y_batch = np.asarray(y[offset:offset + len(similarity)])
            correct += int(np.sum((similarity > threshold) == y_batch))
            offset += len(similarity)

    if offset == 0:
        return 0.0
    return correct / len(test_q1)

In [None]:
# Accuracy on the held-out pairs, predicting "duplicate" when the similarity exceeds 0.7.
accuracy = classify(q1_test, q2_test, y_test, threshold=0.7, model=model, vocab=vocab,
                    batch_size=BATCH_SIZE)
print("Accuracy", accuracy)

  0%|          | 0/790 [00:00<?, ?it/s]

Accuracy 0.7393442298559443


With a learning rate higher than 0.001, the model does not train.