In [1]:
!pip install --quiet -U sentence-transformers



In [2]:
# # Dowloading split dataset
! wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip
! unzip ./quora-IR-dataset.zip

--2022-04-21 04:49:52--  https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip
Resolving public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)... 130.83.167.186
Connecting to public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)|130.83.167.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93570055 (89M) [application/zip]
Saving to: ‘quora-IR-dataset.zip’


2022-04-21 04:49:54 (86.6 MB/s) - ‘quora-IR-dataset.zip’ saved [93570055/93570055]

Archive:  ./quora-IR-dataset.zip
   creating: information-retrieval/
  inflating: information-retrieval/corpus.tsv  
  inflating: information-retrieval/dev-queries.tsv  
  inflating: information-retrieval/test-queries.tsv  
  inflating: quora_duplicate_questions.tsv  
   creating: classification/
  inflating: classification/dev_pairs.tsv  
  inflating: classification/test_pairs.tsv  
  inflating: classifica

In [3]:
import os
from datetime import datetime
import csv
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers import SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# setting seed
random.seed(42)
os.environ['PYTHONHASHSEED'] = str(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True

In [4]:
num_epochs = 8
batch_size = 32
model_save_path = 'output/sbert_base_mnr_16'
margin = 0.5

In [5]:
train_samples = []
with open('classification/train_pairs.tsv', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        sample = InputExample(texts=[row['question1'], row['question2']], label=int(row['is_duplicate']))
        train_samples.append(sample)
        
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)

In [6]:
dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
with open(os.path.join('classification/dev_pairs.tsv'), encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        dev_sentences1.append(row['question1'])
        dev_sentences2.append(row['question2'])
        dev_labels.append(int(row['is_duplicate']))
        
dev_evaluator = evaluation.BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)

In [7]:
test_sentences1 = []
test_sentences2 = []
test_labels = []
with open("classification/test_pairs.tsv", encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        test_sentences1.append(row['question1'])
        test_sentences2.append(row['question2'])
        test_labels.append(int(row['is_duplicate']))
        
evaluator = evaluation.BinaryClassificationEvaluator(test_sentences1, test_sentences2, test_labels)

In [8]:
# word_embedding_model = models.Transformer('distilbert-base-uncased', max_seq_length=256)
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
#                                pooling_mode_mean_tokens=True,
#                                pooling_mode_cls_token=False,
#                                pooling_mode_max_tokens=False)
# bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

bi_encoder = SentenceTransformer('../input/qqp-sbert-base/output/sbert_base_mnr_8')

In [9]:
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
train_loss = losses.MultipleNegativesRankingLoss(bi_encoder)
# train_loss = losses.OnlineContrastiveLoss(model=bi_encoder, distance_metric=distance_metric, margin=margin)

In [10]:
#pre-training model evaluation
print('Test dataset score:', bi_encoder.evaluate(evaluator))

Test dataset score: 0.6717797070663291


In [11]:
bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)], 
               evaluator=dev_evaluator, 
               epochs=num_epochs, 
               output_path=model_save_path, 
               save_best_model=True)



Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

In [12]:
#post training model evaluation
print('Test dataset score:', bi_encoder.evaluate(evaluator))

Test dataset score: 0.6665017963893702
