In [None]:
!pip install --quiet -U sentence-transformers

In [None]:
# # Dowloading split dataset
! wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip
! unzip ./quora-IR-dataset.zip

In [None]:
import os
from datetime import datetime
import csv
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers import SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample

# Debugging aid: makes CUDA kernel launches synchronous so a device-side error
# is reported at the call that caused it (at the cost of slower training).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# Seed every random number generator used in this notebook so runs are repeatable.
SEED = 42
random.seed(SEED)
# The running interpreter fixed its hash seed at start-up; this assignment only
# affects child processes spawned from here.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Benchmark mode lets cuDNN auto-tune and pick convolution algorithms at run
# time, which can differ between runs; it must be off for reproducible results.
torch.backends.cudnn.benchmark = False

In [None]:
# Training hyper-parameters shared by the data loader, the evaluators and fit().
num_epochs, batch_size = 5, 32
# Directory handed to fit() as its output_path.
model_save_path = "output/sbert_base_ocl_10"

In [None]:
# Read the labelled training pairs: one InputExample per TSV row, carrying both
# questions as texts and the integer `is_duplicate` flag as the label.
with open('classification/train_pairs.tsv', encoding='utf8') as fIn:
    train_samples = [
        InputExample(
            texts=[row['question1'], row['question2']],
            label=int(row['is_duplicate']),
        )
        for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    ]

# Shuffled mini-batches of training pairs.
train_dataloader = DataLoader(train_samples, batch_size=batch_size, shuffle=True)

In [None]:
# Load the dev split and pull it apart into three parallel lists:
# first question, second question, and the integer duplicate label.
with open('classification/dev_pairs.tsv', encoding='utf8') as fIn:
    dev_rows = list(csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE))

dev_sentences1 = [row['question1'] for row in dev_rows]
dev_sentences2 = [row['question2'] for row in dev_rows]
dev_labels = [int(row['is_duplicate']) for row in dev_rows]

# Evaluator over the dev pairs; it is passed to fit() further down.
dev_evaluator = evaluation.BinaryClassificationEvaluator(
    dev_sentences1,
    dev_sentences2,
    dev_labels,
)

In [None]:
# Load the held-out test split into three parallel lists:
# first question, second question, and the integer duplicate label.
with open("classification/test_pairs.tsv", encoding='utf8') as fIn:
    test_rows = list(csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE))

test_sentences1 = [row['question1'] for row in test_rows]
test_sentences2 = [row['question2'] for row in test_rows]
test_labels = [int(row['is_duplicate']) for row in test_rows]

# Evaluator over the test pairs, configured to write its scores to a CSV file.
evaluator = evaluation.BinaryClassificationEvaluator(
    test_sentences1,
    test_sentences2,
    test_labels,
    batch_size=batch_size,
    write_csv=True,
)

In [None]:
# Build the bi-encoder from scratch: a DistilBERT token encoder (inputs capped
# at 256 tokens) followed by mean pooling over the token embeddings.
word_embedding_model = models.Transformer('distilbert-base-uncased', max_seq_length=256)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Alternative: resume from a previously saved model instead of building a new one.
# bi_encoder = SentenceTransformer('../input/base-sbert/output/sbert_base_ocl_5')

In [None]:
# Online contrastive loss on cosine distance between the two question
# embeddings, with a margin of 0.5.
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
margin = 0.5
train_loss = losses.OnlineContrastiveLoss(
    model=bi_encoder,
    distance_metric=distance_metric,
    margin=margin,
)

In [None]:
# Fine-tune the bi-encoder on the training pairs. The dev evaluator is handed
# to fit(), and save_best_model=True asks it to keep the best model under
# model_save_path.
train_objectives = [(train_dataloader, train_loss)]
bi_encoder.fit(
    train_objectives=train_objectives,
    evaluator=dev_evaluator,
    epochs=num_epochs,
    output_path=model_save_path,
    save_best_model=True,
)

In [None]:
# Score the bi-encoder on the test pairs and write the evaluation results to
# the current directory.
# NOTE(review): this evaluates the in-memory `bi_encoder` as left by fit(); if
# the checkpoint saved under `model_save_path` is what should be reported,
# reload it first — confirm which is intended.
bi_encoder.evaluate(
    evaluator,
    output_path='.',
)