In [1]:
!pip install --quiet -U sentence-transformers



In [2]:
# # Dowloading split dataset
! wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip
! unzip ./quora-IR-dataset.zip

--2022-04-21 07:18:46--  https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip
Resolving public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)... 130.83.167.186
Connecting to public.ukp.informatik.tu-darmstadt.de (public.ukp.informatik.tu-darmstadt.de)|130.83.167.186|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93570055 (89M) [application/zip]
Saving to: ‘quora-IR-dataset.zip’


2022-04-21 07:18:48 (84.8 MB/s) - ‘quora-IR-dataset.zip’ saved [93570055/93570055]

Archive:  ./quora-IR-dataset.zip
   creating: information-retrieval/
  inflating: information-retrieval/corpus.tsv  
  inflating: information-retrieval/dev-queries.tsv  
  inflating: information-retrieval/test-queries.tsv  
  inflating: quora_duplicate_questions.tsv  
   creating: classification/
  inflating: classification/dev_pairs.tsv  
  inflating: classification/test_pairs.tsv  
  inflating: classifica

In [3]:
import os
from datetime import datetime
import csv
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers import SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

# Debugging aid: ask CUDA to launch kernels synchronously so that device-side
# errors are reported at the call that triggered them. This slows GPU work.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# Seed every random number generator this notebook relies on.
SEED = 42
random.seed(SEED)
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this only
# affects child processes, not hashing inside the already-running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# cuDNN autotuning can pick a different algorithm from run to run, which
# defeats the deterministic flag above, so it has to be disabled.
torch.backends.cudnn.benchmark = False

In [4]:
# Training configuration used by the cells below.
num_epochs, batch_size = 8, 32  # epochs for model.fit; pairs per DataLoader batch
model_save_path = 'output/cross_base_mnr_8'  # passed to model.fit as output_path

In [5]:
# Training split: one InputExample per row, holding the two questions as texts
# and the integer `is_duplicate` flag as label.
with open('classification/train_pairs.tsv', encoding='utf8') as train_file:
    train_rows = csv.DictReader(train_file, delimiter='\t', quoting=csv.QUOTE_NONE)
    train_samples = [
        InputExample(
            texts=[record['question1'], record['question2']],
            label=int(record['is_duplicate']),
        )
        for record in train_rows
    ]

# Shuffled mini-batches of `batch_size` examples for model.fit.
train_dataloader = DataLoader(train_samples, batch_size=batch_size, shuffle=True)

In [6]:
# Development split: same row -> InputExample mapping as the training data.
with open('classification/dev_pairs.tsv', encoding='utf8') as dev_file:
    dev_rows = csv.DictReader(dev_file, delimiter='\t', quoting=csv.QUOTE_NONE)
    dev_samples = [
        InputExample(
            texts=[record['question1'], record['question2']],
            label=int(record['is_duplicate']),
        )
        for record in dev_rows
    ]

# Evaluator built from the dev pairs; handed to model.fit as `evaluator`.
dev_evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_samples)

In [7]:
# Held-out test split: same row -> InputExample mapping as the training data.
with open('classification/test_pairs.tsv', encoding='utf8') as test_file:
    test_rows = csv.DictReader(test_file, delimiter='\t', quoting=csv.QUOTE_NONE)
    test_samples = [
        InputExample(
            texts=[record['question1'], record['question2']],
            label=int(record['is_duplicate']),
        )
        for record in test_rows
    ]

# Evaluator built from the test pairs; used to score the model before and after training.
test_evaluator = CEBinaryClassificationEvaluator.from_input_examples(test_samples)

In [8]:
# Cross-encoder built on the distilbert-base-uncased checkpoint with a single
# output label (num_labels=1). The classification head is newly initialised,
# as the load-time warning printed below reports.
model = CrossEncoder('distilbert-base-uncased', num_labels=1)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [9]:
# Baseline: score the model on the test split before any fine-tuning.
baseline_test_score = test_evaluator(model)
print('Test dataset score:', baseline_test_score)

Test dataset score: 0.44947548033364926


In [10]:
# Fine-tune the cross-encoder on the training pairs.
model.fit(
    train_dataloader=train_dataloader,
    evaluator=dev_evaluator,      # dev-split evaluator supplied for use during training
    epochs=num_epochs,
    warmup_steps=2000,
    save_best_model=True,
    output_path=model_save_path,  # directory the model is written to
)



Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

In [11]:
# Score the fine-tuned model on the test split.
# NOTE(review): this evaluates the in-memory `model` as left by model.fit;
# presumably that is the last-epoch state rather than the best-on-dev
# checkpoint saved under model_save_path. Confirm which one should be reported.
final_test_score = test_evaluator(model)
print('Test dataset score:', final_test_score)

Test dataset score: 0.8656083527618981
