In [1]:
from time import time
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import itertools
import datetime
from collections import Counter
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [12]:
# Sanity check of the tokenising regex used throughout the notebook: split on
# whitespace, and on ',' or '.' unless the punctuation is adjacent to a digit
# (so numbers such as 3.14 or 1,000 stay in one piece).
a_string = "I am crazy.Yes"

# Raw string: "\s" and "\d" inside a plain literal are invalid escape sequences.
split_string = re.split(r"\s|(?<!\d)[,.](?!\d)", a_string)
split_string

['I', 'am', 'crazy', 'Yes']

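Download `train.csv` from the Kaggle "Quora Question Pairs" competition and place it at `data/train.csv` (the path read by the next cell):
https://www.kaggle.com/c/quora-question-pairs/data?select=train.csv.zip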

In [13]:
# Load the labelled question pairs and show (rows, columns).
data = pd.read_csv(filepath_or_buffer='data/train.csv')
data.shape

(404290, 6)

In [14]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [15]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')
data.shape

(404287, 6)

In [16]:
# Inputs are the two question texts; the target is the duplicate flag.
X = data.loc[:, ['question1', 'question2']]
Y = data.loc[:, 'is_duplicate']

In [17]:
# Hold out 20% of the pairs for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Number of training pairs.
X_train.shape[0]

323429

In [19]:
def get_vocab(contents):
    """Count how often each token occurs across several collections of text.

    Args:
        contents: iterable of iterables of strings (for example a list holding
            one pandas Series per question column).

    Returns:
        A ``defaultdict(int)`` mapping token -> number of occurrences. Lines
        are split on whitespace and on ',' or '.' that is not adjacent to a
        digit. Empty strings produced by consecutive delimiters are counted
        like any other token.
    """
    # Raw string: "\s" / "\d" in a plain literal are invalid escape sequences.
    # Compiled once here instead of being re-resolved for every line.
    splitter = re.compile(r"\s|(?<!\d)[,.](?!\d)")
    vocab = defaultdict(int)
    for content in contents:
        for line in content:
            for word in splitter.split(line):
                vocab[word] += 1
    return vocab

In [20]:
# Both question columns of the training split feed the vocabulary.
contents = [X_train[column] for column in ('question1', 'question2')]

In [21]:
# Token frequencies over both question columns of the training split only
# (`contents` is built from X_train), so validation text does not shape the
# vocabulary.
counts = get_vocab(contents)

In [22]:
#counts

In [23]:
# Remove tokens seen fewer than 5 times. The tokens to delete are collected
# first because a dict cannot be resized while it is being iterated.
for word in [token for token, freq in counts.items() if freq < 5]:
    del counts[word]

In [24]:
# Number of distinct tokens kept after pruning.
len(counts)

42884

In [25]:
# Index 0 ("") and index 1 ("UNK") are reserved; every surviving token gets
# the next free index, in the iteration order of `counts`.
# NOTE: if "" or "UNK" is itself a key of `counts`, its reserved entry in
# vocab2index is overwritten with the later index and `words` holds it twice.
words = ["", "UNK"]
vocab2index = {token: index for index, token in enumerate(words)}
vocab2index.update(
    (token, index) for index, token in enumerate(counts, start=len(words))
)
words.extend(counts)

## Dataset

In [26]:
def encode_sentence(line, N=25, vocab=None):
    """Encode a sentence as a fixed-length array of vocabulary indices.

    Args:
        line: sentence to encode.
        N: length of the returned array. Longer sentences are truncated and
            shorter ones are right-padded with 0.
        vocab: token -> index mapping that contains an "UNK" entry, used for
            tokens missing from the mapping. Defaults to the notebook-level
            ``vocab2index``.

    Returns:
        Tuple ``(enc, length)``: an int32 array of shape (N,) and the number
        of token indices actually stored in it (at most N).
    """
    if vocab is None:
        vocab = vocab2index
    unk_index = vocab["UNK"]
    # Raw string: "\s" / "\d" in a plain literal are invalid escape sequences.
    tokens = re.split(r"\s|(?<!\d)[,.](?!\d)", line)
    # Only the first N tokens can fit, so do not look up the rest.
    ids = [vocab.get(token, unk_index) for token in tokens[:N]]
    enc = np.zeros(N, dtype=np.int32)
    enc[:len(ids)] = ids
    return enc, len(ids)

In [21]:
class QuoraDataset(Dataset):
    """Question-pair dataset yielding encoded questions, lengths and label."""

    def __init__(self, X, y):
        """Pre-encode both question columns.

        Args:
            X: DataFrame with 'question1' and 'question2' string columns.
            y: labels aligned row-for-row with ``X``.
        """
        # Encode once up front so __getitem__ is a cheap lookup. Lists and
        # arrays are indexed by position, which is what DataLoader expects;
        # the pandas index of a shuffled split is not 0..n-1.
        self.x1 = [encode_sentence(question) for question in X['question1']]
        self.x2 = [encode_sentence(question) for question in X['question2']]
        self.y = np.asarray(y)

    def __len__(self):
        """Return the number of question pairs."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(x1, s1, x2, s2, y)`` for the pair at position ``idx``.

        ``x1``/``x2`` are the padded index arrays and ``s1``/``s2`` the
        lengths returned by ``encode_sentence``; ``y`` is the label.
        """
        x1, s1 = self.x1[idx]
        x2, s2 = self.x2[idx]
        return x1, s1, x2, s2, self.y[idx]
    
# Wrap the training and validation splits as datasets.
train_ds, valid_ds = (
    QuoraDataset(X_train, y_train),
    QuoraDataset(X_val, y_val),
)

In [22]:
# Inspect one training example: unpack the 5-tuple and display the two encoded
# questions.
x1, s1, x2, s2, y = train_ds[1]
x1, x2

(array([19, 20, 21, 22, 23, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0], dtype=int32),
 array([  3,  20,  47,  22,  23, 363,  44, 364, 365,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       dtype=int32))

In [23]:
# Presumably the unpadded token counts of the two questions, followed by the
# duplicate label of the example unpacked above.
s1, s2, y

(6, 9, 1)

In [24]:
# Tiny batches for smoke-testing the model; only the training loader shuffles.
batch_size = 5
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=False)

In [77]:
class SiameseCBOW(nn.Module):
    """Siamese continuous-bag-of-words classifier for sentence pairs.

    Both sentences go through the same embedding table. Each is reduced to
    the mean of its token embeddings, and a linear layer maps the absolute
    difference of the two means to a single logit.
    """

    def __init__(self, vocab_size, emb_size=100):
        """Create the embedding, dropout and linear layers.

        Args:
            vocab_size: number of rows in the embedding table.
            emb_size: dimensionality of each token embedding.
        """
        super(SiameseCBOW, self).__init__()
        # Index 0 is the padding value written by encode_sentence; pinning it
        # to a zero vector keeps padding out of the per-sentence sum.
        self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        # The exercise does not specify a rate; 0.5 is nn.Dropout's default.
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(emb_size, 1)

    def forward(self, x1, s1, x2, s2):
        """Return one logit per sentence pair.

        1) look up the embeddings of the two sentences
        2) sum and divide by length each of them to get representations
           for each sentence
        3) subtract representations and take absolute value
        4) apply a linear layer to get a logit

        Args:
            x1, x2: integer tensors of token indices, shape (batch, seq_len).
            s1, s2: sentence lengths, shape (batch, 1).

        Returns:
            Tensor of shape (batch, 1) holding the logits.
        """
        rep1 = self._mean_embedding(x1, s1)
        rep2 = self._mean_embedding(x2, s2)
        distance = torch.abs(rep1 - rep2)
        return self.linear(self.dropout(distance))

    def _mean_embedding(self, x, s):
        """Average the token embeddings of each sentence over its length."""
        summed = self.embedding(x).sum(dim=1)
        # clamp guards against a zero length producing a division by zero.
        return summed / s.float().clamp(min=1)

In [78]:
# Pull a single batch and build a small model to smoke-test the forward pass.
x1, s1, x2, s2, y = next(iter(train_dl))
vocab_size = len(words)
# The model class defined in this notebook is SiameseCBOW; no CBOW2 is defined
# in the visible cells, so the previous name raised NameError on a fresh kernel.
model = SiameseCBOW(vocab_size, emb_size=50)

In [79]:
# Reshape the lengths to (batch, 1) and cast the token indices to int64
# before calling the model.
s1, s2 = (lengths.view(lengths.shape[0], 1) for lengths in (s1, s2))
y_hat = model(x1.long(), s1, x2.long(), s2)

In [80]:
# Binary cross-entropy on raw logits; labels become float with shape (batch, 1).
loss = F.binary_cross_entropy_with_logits(
    input=y_hat,
    target=y.float().unsqueeze(1),
)
loss

tensor(0.7120, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

## Training

In [81]:
def train_epocs(model, train_dl, valid_dl, epochs=10, lr=0.001):
    """Train ``model`` with Adam and print metrics after every epoch.

    Args:
        model: module called as ``model(x1, s1, x2, s2)`` that returns logits.
        train_dl: iterable of ``(x1, s1, x2, s2, y)`` training batches.
        valid_dl: iterable of validation batches, passed to ``val_metrics``.
        epochs: number of passes over ``train_dl``.
        lr: Adam learning rate.

    Prints one line per epoch with the example-weighted mean training loss,
    the validation loss and the validation accuracy. Returns nothing.
    """
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr)
    for _ in range(epochs):
        model.train()
        running_loss, n_seen = 0.0, 0
        for x1, s1, x2, s2, y in train_dl:
            n_batch = y.shape[0]
            optimizer.zero_grad()
            logits = model(
                x1.long(),
                s1.view(s1.shape[0], 1),
                x2.long(),
                s2.view(s2.shape[0], 1),
            )
            loss = F.binary_cross_entropy_with_logits(logits, y.unsqueeze(1).float())
            loss.backward()
            optimizer.step()
            # Weight by batch size so a smaller final batch does not skew the mean.
            running_loss += loss.item() * n_batch
            n_seen += n_batch
        val_loss, val_acc = val_metrics(model, valid_dl)
        print("train loss %.3f val loss %.3f and val accuracy %.3f" % (running_loss / n_seen, val_loss, val_acc))

In [82]:
def val_metrics(model, valid_dl):
    """Compute mean loss and accuracy of ``model`` over ``valid_dl``.

    Args:
        model: module called as ``model(x1, s1, x2, s2)`` that returns logits
            of shape (batch, 1). It is switched to eval mode and left there.
        valid_dl: iterable of ``(x1, s1, x2, s2, y)`` batches.

    Returns:
        Tuple ``(loss, accuracy)``: the example-weighted mean binary
        cross-entropy as a float, and the fraction of correct predictions
        as a 0-dim tensor. A positive logit is predicted as class 1.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    # Evaluation never back-propagates; disabling autograd avoids building a
    # graph for every validation batch.
    with torch.no_grad():
        for x1, s1, x2, s2, y in valid_dl:
            x1 = x1.long()
            x2 = x2.long()
            s1 = s1.view(s1.shape[0], 1)
            s2 = s2.view(s2.shape[0], 1)
            y = y.unsqueeze(1).float()
            y_hat = model(x1, s1, x2, s2)
            loss = F.binary_cross_entropy_with_logits(y_hat, y)
            # logit > 0 is equivalent to sigmoid(logit) > 0.5
            y_pred = y_hat > 0
            correct += (y_pred.float() == y).float().sum()
            total += y.shape[0]
            sum_loss += loss.item()*y.shape[0]
    return sum_loss/total, correct/total

In [83]:
# Rebuild the loaders with a large batch size for the real training run.
batch_size = 2000
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(dataset=valid_ds, batch_size=batch_size, shuffle=False)

In [84]:
# Fresh model for the full training run; one embedding row per entry of `words`.
vocab_size = len(words)
print(vocab_size)
# The model class defined in this notebook is SiameseCBOW; no CBOW2 is defined
# in the visible cells, so the previous name raised NameError on a fresh kernel.
model = SiameseCBOW(vocab_size, emb_size=50)

42886


In [85]:
# First training stage: 10 epochs at learning rate 0.01.
train_epocs(
    model=model,
    train_dl=train_dl,
    valid_dl=valid_dl,
    epochs=10,
    lr=0.01,
)

train loss 0.613 val loss 0.558 and val accuracy 0.715
train loss 0.512 val loss 0.496 and val accuracy 0.767
train loss 0.428 val loss 0.467 and val accuracy 0.784
train loss 0.366 val loss 0.457 and val accuracy 0.794
train loss 0.320 val loss 0.455 and val accuracy 0.800
train loss 0.285 val loss 0.458 and val accuracy 0.804
train loss 0.260 val loss 0.465 and val accuracy 0.807
train loss 0.242 val loss 0.470 and val accuracy 0.809
train loss 0.225 val loss 0.475 and val accuracy 0.810
train loss 0.213 val loss 0.483 and val accuracy 0.812


In [None]:
# Second stage: keep training the same model at a 10x lower learning rate.
train_epocs(
    model=model,
    train_dl=train_dl,
    valid_dl=valid_dl,
    epochs=10,
    lr=0.001,
)

train loss 0.194 val loss 0.489 and val accuracy 0.813
train loss 0.192 val loss 0.492 and val accuracy 0.813
train loss 0.190 val loss 0.494 and val accuracy 0.813
train loss 0.186 val loss 0.496 and val accuracy 0.813
train loss 0.185 val loss 0.498 and val accuracy 0.813
train loss 0.183 val loss 0.501 and val accuracy 0.814
