In [None]:
# Dowloading the split dataset
!wget -q https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip
!unzip -q ./quora-IR-dataset.zip

In [7]:
# !pip install --quiet -U sentence-transformers

In [1]:
import pandas as pd
import torch
import numpy as np

# Data

In [2]:
train = pd.read_csv('classification/dev_pairs.tsv', sep='\t')
train.head(1)

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,188855,188856,How did Michel Foucault feel about surveillanc...,What is Michel Foucaults opinion on surveillan...,1


In [3]:
print("Rows: %d\nUnique first questions: %d\nUnique second questions: %d\nUnique questions: %d" % (len(train), train.qid1.nunique(), train.qid2.nunique(), len(np.unique([train.qid1, train.qid2]))))

Rows: 36971
Unique first questions: 26801
Unique second questions: 31540
Unique questions: 53426


In [4]:
test = pd.read_csv('classification/test_pairs.tsv', sep='\t')

# Model

**Resources**:
* [Sentence Bert Paper](https://arxiv.org/pdf/1908.10084.pdf)
* Model from [Sentence Transformers: sbert.com](https://www.sbert.net/)

In [5]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader

device = torch.device('cuda')
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
model = model.to(device)

model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

# Model Training

In [6]:
train.head(1)

Unnamed: 0,qid1,qid2,question1,question2,is_duplicate
0,188855,188856,How did Michel Foucault feel about surveillanc...,What is Michel Foucaults opinion on surveillan...,1


Choose one of the losses below

## Pairwise Ranking Loss

In [1]:
# Preprocessing
def to_input_examples(df):
    def to_example(arr):
        return InputExample(texts=arr[:2], label=np.float(arr[2]))
    
    return [*map(to_example, df[["question1", "question2", "is_duplicate"]].values)]

def to_input_examples_one(groups_df):
    return [InputExample(texts=[v.question], label=v.label) for i,v in groups_df.iterrows()]

In [10]:
train_loader = DataLoader(to_input_examples(train), batch_size=32, shuffle=True)

* Contrastive loss is based on euclidean distance with margin default
* Cosine Similarity loss is an alternative

In [11]:
# train_loss = losses.CosineSimilarityLoss(model)
train_loss = losses.ContrastiveLoss(model)

model.fit(
    train_objectives=[(train_loader, train_loss)], 
    epochs=10, 
    warmup_steps=100, 
    show_progress_bar=True
)



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

In [12]:
model.fit(
    train_objectives=[(train_loader, train_loss)], 
    epochs=10, 
    show_progress_bar=True
)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1156 [00:00<?, ?it/s]

## Triplet Loss

To train with triplets, the questions need to be split into groups.

Note that in this case negatives are formed from all the pairs that are not supplied as positives. An alternative approach is used in a different notebook, where negatives are only sampled from the ones present in the dataset. Both perform that pairwise losses on this dataset, though.

In [7]:
def create_groups(df):
    groups = {}
    
    for i, item in df.iterrows():
        if not item.is_duplicate:
            continue
        
        id1, id2 = item.qid1, item.qid2
        left_group = {id1} if id1 not in groups else set(groups[id1])
        right_group = {id2} if id2 not in groups else set(groups[id2])
        merged = tuple(sorted(left_group.union(right_group)))
        
        for id in merged:
            groups[id] = merged
        
    questions = {}
    for i, item in df.iterrows():
        id1, id2 = item.qid1, item.qid2
        questions[id1] = [id1, item.question1, groups[id1][0] if id1 in groups else -1]
        questions[id2] = [id2, item.question2, groups[id2][0] if id2 in groups else -1]
        
    return pd.DataFrame(questions.values(), columns=["qid", "question", "label"])

In [None]:
train_groups = create_groups(train)

In [None]:
train_loss = losses.BatchHardSoftMarginTripletLoss(model)

model.fit(
    train_objectives=[(train_loader, train_loss)], 
    epochs=10, 
    warmup_steps=100, 
    show_progress_bar=True
)

In [None]:
train_loader = DataLoader(to_input_examples_one(train_groups), batch_size=32, shuffle=True)

# Accuracy

In [13]:
def calc_encodings(df):
    questions = [*df[['qid1', 'question1']].values, *df[['qid2', 'question2']].values]
    questions = pd.DataFrame(questions, columns=['id', 'question'])
    questions.question = questions.question.apply(str)
    questions.drop_duplicates(inplace=True)
    questions = questions.reset_index().drop(columns=['index'])
    
    encod = model.encode(questions.question)
    encodings = {}
    
    for i, item in questions.iterrows():
        encodings[item.id] = encod[i]
    
    return np.stack(df['qid1'].apply(lambda x: encodings[x]).values), np.stack(df['qid2'].apply(lambda x: encodings[x]).values)
    
def calc_threshold(df, encod1, encod2):
    distances = np.linalg.norm(encod1 - encod2, axis=1)
    
    positive_distance = distances[df.is_duplicate].mean()
    negative_distance = distances[~df.is_duplicate].mean()
    threshold = (positive_distance + negative_distance) / 2

    print("Positive d: %.3f, Avg d: %.3f, Negative d: %.3f" % (positive_distance, threshold, negative_distance))
    
    df['dist'] = distances
    return positive_distance, threshold, negative_distance

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def loss(y, y_pred):
    return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

def calc_metrics(df, positive_dist, threshold, negative_dist):
    pred = sigmoid(-(df['dist'] - threshold) / (negative_dist - positive_dist))
    accuracy = np.mean((pred > 0.5) == df.is_duplicate)
    log_loss = loss(df.is_duplicate, pred)

    print("Accuracy: %.5f" % accuracy)
    print("Log Loss: %.5f" % log_loss)

In [14]:
encod1, encod2 = calc_encodings(train)
positive_dist, threshold, negative_dist = calc_threshold(train, encod1, encod2)

Positive d: 5.154, Avg d: 6.659, Negative d: 8.164


In [15]:
calc_metrics(train, positive_dist, threshold, negative_dist)

Accuracy: 0.99145
Log Loss: 0.22781


In [16]:
encod1, encod2 = calc_encodings(test)
calc_threshold(test, encod1, encod2);

Positive d: 8.994, Avg d: 7.373, Negative d: 5.752


In [34]:
calc_metrics(test, positive_dist, 6, negative_dist)

Accuracy: 0.78974
Log Loss: 0.46428


# Saving Model

In [27]:
import pickle

In [29]:
with open('model.pckl', 'wb') as file:
    pickle.dump(model, file)

In [32]:
!du -h model.pckl

88M	model.pckl
