In [None]:
!pip install --quiet -U sentence-transformers

In [None]:
# # Dowloading split dataset
! wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip
! unzip ./quora-IR-dataset.zip

In [None]:
import os
from datetime import datetime
import csv
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers import SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

# Forces synchronous CUDA kernel launches. Helpful for locating the failing call
# when debugging, at the cost of slower GPU execution.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# Seed every random number generator used in this notebook so runs are repeatable.
SEED = 42
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes, not the current kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# cuDNN benchmark mode auto-tunes the algorithm choice per run, which can pick
# different kernels between runs; it must stay off for reproducible results.
torch.backends.cudnn.benchmark = False

In [None]:
# Training configuration:
#   num_epochs      - number of passes over the training pairs (handed to model.fit)
#   batch_size      - number of pairs per batch in the training DataLoader
#   model_save_path - directory handed to model.fit as output_path
num_epochs, batch_size = 5, 32
model_save_path = 'output/cross_base_10'

In [None]:
# Read the training pairs: every TSV row becomes one InputExample holding the two
# question texts and the integer is_duplicate label.
with open('classification/train_pairs.tsv', encoding='utf8') as train_tsv:
    train_samples = [
        InputExample(
            texts=[record['question1'], record['question2']],
            label=int(record['is_duplicate']),
        )
        for record in csv.DictReader(train_tsv, delimiter='\t', quoting=csv.QUOTE_NONE)
    ]

# Shuffled mini-batches of size batch_size for training.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=batch_size)

In [None]:
# Read the development pairs: every TSV row becomes one InputExample holding the two
# question texts and the integer is_duplicate label.
with open('classification/dev_pairs.tsv', encoding='utf8') as dev_tsv:
    dev_samples = [
        InputExample(
            texts=[record['question1'], record['question2']],
            label=int(record['is_duplicate']),
        )
        for record in csv.DictReader(dev_tsv, delimiter='\t', quoting=csv.QUOTE_NONE)
    ]

# Binary-classification evaluator built from the development examples.
dev_evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_samples)

In [None]:
# Read the test pairs: every TSV row becomes one InputExample holding the two
# question texts and the integer is_duplicate label.
with open('classification/test_pairs.tsv', encoding='utf8') as test_tsv:
    test_samples = [
        InputExample(
            texts=[record['question1'], record['question2']],
            label=int(record['is_duplicate']),
        )
        for record in csv.DictReader(test_tsv, delimiter='\t', quoting=csv.QUOTE_NONE)
    ]

# Binary-classification evaluator built from the test examples.
test_evaluator = CEBinaryClassificationEvaluator.from_input_examples(test_samples)

In [None]:
# Initialise the cross-encoder from the distilbert-base-uncased checkpoint with num_labels=1.
base_checkpoint = 'distilbert-base-uncased'
model = CrossEncoder(base_checkpoint, num_labels=1)

# Alternative: continue from a previously trained model instead.
# model = CrossEncoder('../input/cross-encoder/output/cross_base_8')

In [None]:
# Train on the training batches, evaluating with the dev evaluator.
# output_path plus save_best_model=True asks fit() to keep the best model at model_save_path.
fit_options = dict(
    train_dataloader=train_dataloader,
    evaluator=dev_evaluator,
    epochs=num_epochs,
    output_path=model_save_path,
    save_best_model=True,
)
model.fit(**fit_options)

In [None]:
# Evaluate on the held-out test pairs.
# fit() was called with save_best_model=True and output_path=model_save_path, so the
# checkpoint on disk is the best model by dev score, while the in-memory `model` carries
# the weights left by the final epoch. Reload the checkpoint so the reported test score
# belongs to the model that was actually selected.
best_model = CrossEncoder(model_save_path)

# No output path is passed to the evaluator here; the score is kept in a variable
# and shown as the cell output.
test_score = test_evaluator(best_model)
test_score