In [None]:
!pip install --quiet -U sentence-transformers

In [None]:
# # Dowloading split dataset
! wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/quora-IR-dataset.zip
! unzip ./quora-IR-dataset.zip

In [None]:
import os
from datetime import datetime
import csv
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, util
from sentence_transformers import SentenceTransformer, evaluation
from sentence_transformers.readers import InputExample
from transformers import T5ForConditionalGeneration,T5Tokenizer
from sentence_transformers.cross_encoder import CrossEncoder

# Make CUDA kernel launches synchronous so a failing kernel is reported at the
# call that triggered it (useful for debugging, at some cost in speed).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# Seed every random number generator used in this notebook from one constant.
SEED = 42
random.seed(SEED)
# NOTE: the hash seed of the running kernel is fixed at interpreter start-up;
# setting it here only influences subprocesses launched from this notebook.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # covers every visible GPU, not just the current one
torch.backends.cudnn.deterministic = True
# Benchmark mode lets cuDNN auto-tune and pick different kernels from run to
# run, which defeats the deterministic setting above, so it must stay off.
torch.backends.cudnn.benchmark = False

In [None]:
# Run configuration: where the fine-tuned bi-encoder is saved, how many epochs
# it is trained for, and the batch size shared by paraphrasing, cross-encoder
# labelling and training.
model_save_path = 'output/sbert_custom_10'
num_epochs, batch_size = 5, 32

In [None]:
# Gold training pairs: one InputExample per row of the training TSV, with the
# two questions as texts and the integer `is_duplicate` flag as label.
with open('classification/train_pairs.tsv', encoding='utf8') as train_file:
    train_rows = csv.DictReader(train_file, delimiter='\t', quoting=csv.QUOTE_NONE)
    gold_samples = [
        InputExample(texts=[record['question1'], record['question2']],
                     label=int(record['is_duplicate']))
        for record in train_rows
    ]

In [None]:
# Load the training pairs into a DataFrame for the augmentation steps below.
# quoting=csv.QUOTE_NONE mirrors how the csv reader parses this same file when
# building gold_samples: quote characters are plain text, so a stray '"' in a
# question cannot swallow the following tabs/newlines into a single field.
# Rows that still cannot be parsed are skipped rather than aborting the load.
df = pd.read_csv('./classification/train_pairs.tsv', delimiter='\t',
                 quoting=csv.QUOTE_NONE, on_bad_lines='skip')
df

In [None]:
# Class balance of the gold data: number of rows per `is_duplicate` value.
df.groupby('is_duplicate')['is_duplicate'].count()

In [None]:
# Generate a new (augmented) dataset by paraphrasing questions with a T5 model
# trained on duplicate questions from the Quora dataset.
# Pre-trained model available at https://huggingface.co/ramsrigouthamg/t5_paraphraser

model_name = 'ramsrigouthamg/t5_paraphraser'
num_sentences = 20000  # number of rows sampled to generate augmented data

# 256 minus the length of the prompt wrapper; used further down both as the
# generation length cap and as the character-length filter on the questions.
max_length = 256 - len("paraphrase: " + " </s>")

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(torch_device)

def get_response(input_texts, num_return_sequences=1):
    """Sample paraphrases for a batch of prompts with the T5 paraphraser.

    Relies on the notebook-level `tokenizer`, `model`, `torch_device` and
    `max_length` defined alongside this function.

    Args:
        input_texts: list of prompt strings to paraphrase.
        num_return_sequences: how many sampled outputs to request per prompt.

    Returns:
        A list with one decoded string per generated sequence; an empty list
        when `input_texts` is empty.
    """
    # Nothing to encode: return early instead of handing the tokenizer an empty batch.
    if len(input_texts) == 0:
        return []

    # Pad every prompt to 256 tokens and truncate explicitly, so the batch
    # always forms a rectangular tensor even if a prompt is longer than that.
    encoding = tokenizer(
        input_texts,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(torch_device)
    attention_mask = encoding["attention_mask"].to(torch_device)

    # Top-k / nucleus sampling. `early_stopping` is deliberately not passed:
    # it only applies to beam search and has no effect when sampling.
    sampled_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        do_sample=True,
        max_length=max_length,
        top_k=120,
        top_p=0.98,
        num_return_sequences=num_return_sequences,
    )
    return tokenizer.batch_decode(
        sampled_outputs,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )


# Only rows whose two questions are both shorter than `max_length` characters
# are candidates for paraphrasing.
eligible_idx = list(df.index[(df.question1.str.len() < max_length) & (df.question2.str.len() < max_length)])
if not eligible_idx:
    raise ValueError("No question pairs are short enough to paraphrase")

# random.sample raises ValueError when asked for more items than exist, so
# clamp the request to the number of eligible rows.
len_idx = random.sample(eligible_idx, min(num_sentences, len(eligible_idx)))

# Paraphrase both questions of each sampled row, `batch_size` rows at a time;
# rows that were not sampled keep NaN in the new aug1/aug2 columns.
for batch_idx in np.split(len_idx, np.arange(batch_size, len(len_idx), batch_size)):
    df.loc[batch_idx, 'aug1'] = get_response(list("paraphrase: " + df.loc[batch_idx, 'question1'] + " </s>"))
    df.loc[batch_idx, 'aug2'] = get_response(list("paraphrase: " + df.loc[batch_idx, 'question2'] + " </s>"))

# Keep just the rows that received paraphrases and persist them for the labelling step.
aug_df = df[df.aug1.notnull()]
aug_df.to_csv('aug_train.csv', index=False)

In [None]:
# Label the augmented dataset with a cross-encoder model trained on the Quora Question Pair dataset.
aug_df = pd.read_csv('aug_train.csv')
ce_model = CrossEncoder('cross-encoder/quora-distilroberta-base')

# label column -> (left text column, right text column) of the pair it scores
pair_columns = {
    'q1a1': ('question1', 'aug1'),
    'q1a2': ('question1', 'aug2'),
    'q2a1': ('question2', 'aug1'),
    'q2a2': ('question2', 'aug2'),
    'a1a2': ('aug1', 'aug2'),
}

# A pair is labelled as duplicate when the cross-encoder score exceeds 0.5.
for label_col, (left_col, right_col) in pair_columns.items():
    text_pairs = list(zip(aug_df[left_col], aug_df[right_col]))
    aug_df[label_col] = ce_model.predict(text_pairs, batch_size=batch_size) > 0.5

label_cols = list(pair_columns)
aug_df[label_cols] = aug_df[label_cols].astype(int)

# Stack the five pairings into one long (question1, question2, is_duplicate) table.
left_cols = [left_col for left_col, _ in pair_columns.values()]
right_cols = [right_col for _, right_col in pair_columns.values()]
flat_df = pd.DataFrame({
    'question1': pd.concat([aug_df[col] for col in left_cols], ignore_index=True),
    'question2': pd.concat([aug_df[col] for col in right_cols], ignore_index=True),
    'is_duplicate': pd.concat([aug_df[col] for col in label_cols], ignore_index=True),
})
flat_df.to_csv('flat_train.csv', index=False)

In [None]:
# Class balance of the cross-encoder labels: number of rows per `is_duplicate` value.
flat_df.groupby('is_duplicate')['is_duplicate'].count()

In [None]:
# Silver training pairs: the cross-encoder-labelled rows written to flat_train.csv.
with open('flat_train.csv', encoding='utf8') as flat_file:
    silver_samples = [
        InputExample(texts=[record['question1'], record['question2']],
                     label=int(record['is_duplicate']))
        for record in csv.DictReader(flat_file)
    ]

# Train on gold and silver pairs together, reshuffled by the loader.
all_train_samples = gold_samples + silver_samples
train_dataloader = DataLoader(all_train_samples, shuffle=True, batch_size=batch_size)

In [None]:
# Development split: read all rows once, then split them into the three
# parallel lists the binary-classification evaluator expects.
with open('classification/dev_pairs.tsv', encoding='utf8') as dev_file:
    dev_rows = list(csv.DictReader(dev_file, delimiter='\t', quoting=csv.QUOTE_NONE))

dev_sentences1 = [record['question1'] for record in dev_rows]
dev_sentences2 = [record['question2'] for record in dev_rows]
dev_labels = [int(record['is_duplicate']) for record in dev_rows]

dev_evaluator = evaluation.BinaryClassificationEvaluator(dev_sentences1, dev_sentences2, dev_labels)

In [None]:
# Test split: same three parallel lists as for the dev split, used to build
# the evaluator that scores the trained model.
with open("classification/test_pairs.tsv", encoding='utf8') as test_file:
    test_rows = list(csv.DictReader(test_file, delimiter='\t', quoting=csv.QUOTE_NONE))

test_sentences1 = [record['question1'] for record in test_rows]
test_sentences2 = [record['question2'] for record in test_rows]
test_labels = [int(record['is_duplicate']) for record in test_rows]

evaluator = evaluation.BinaryClassificationEvaluator(test_sentences1, test_sentences2, test_labels)

In [None]:
# Bi-encoder: DistilBERT token embeddings (inputs capped at 256 tokens)
# followed by mean pooling over the tokens.
word_embedding_model = models.Transformer('distilbert-base-uncased', max_seq_length=256)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# To continue from a previously saved model instead:
# bi_encoder = SentenceTransformer('../input/aug-sbert-quora/output/sbert_quora_5')

In [None]:
# Online contrastive loss on the bi-encoder, using cosine distance and a margin of 0.5.
distance_metric = losses.SiameseDistanceMetric.COSINE_DISTANCE
margin = 0.5
train_loss = losses.OnlineContrastiveLoss(
    model=bi_encoder,
    margin=margin,
    distance_metric=distance_metric,
)

In [None]:
# Fine-tune the bi-encoder on the gold + silver pairs, evaluating on the dev
# split and saving to `model_save_path` with save_best_model enabled.
train_objectives = [(train_dataloader, train_loss)]
bi_encoder.fit(
    train_objectives=train_objectives,
    evaluator=dev_evaluator,
    epochs=num_epochs,
    output_path=model_save_path,
    save_best_model=True,
)

In [None]:
# Score the test split and save the evaluation results to the current directory.
# The in-memory `bi_encoder` holds the weights from the end of the last epoch,
# whereas training was run with save_best_model=True and output_path=model_save_path;
# reload that saved checkpoint so the reported test score belongs to the model
# that was actually selected on the dev split.
best_bi_encoder = SentenceTransformer(model_save_path)
best_bi_encoder.evaluate(evaluator, output_path='.')