In [None]:
pip install -U sentence-transformers jsonlines

In [None]:
# Mount Google Drive at /content/gdrive so the data directory under MyDrive
# (see DIR below) can be read from and written to by this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Load data

In [None]:
# load data
import jsonlines
from sentence_transformers.readers import InputExample
from typing import List, Tuple

# Folder on the mounted Google Drive that holds the dataset splits; the
# trained model and the prediction CSV are also written underneath it.
DIR = "/content/gdrive/MyDrive/CUNY_Comp_Ling/advanced_nlp/term_project/data/"
# File names of the train / dev / test splits inside DIR (read with jsonlines).
TRAIN = 'train.jsonl'
DEV = 'dev.jsonl'
TEST = 'test.jsonl'

def construct_examples(filepath: str) -> List[InputExample]:
  """Read a JSON Lines file and wrap every record as a training example.

  Each record must provide the keys 'abstract', 'text' and 'label'. The
  abstract and the text become the sentence pair of the InputExample and
  the label is passed through unchanged.

  Args:
    filepath: path of the .jsonl file to read.

  Returns:
    One InputExample per record, in file order.
  """
  with jsonlines.open(filepath) as reader:
    return [
        InputExample(
            texts=[record['abstract'], record['text']],
            label=record['label'],
        )
        for record in reader.iter()
    ]

def construct_eval_examples(filepath: str) -> Tuple[List, List, List]:
  """Read a JSON Lines file into three parallel lists for evaluation.

  Args:
    filepath: path of the .jsonl file to read; every record must provide
      the keys 'abstract', 'text' and 'label'.

  Returns:
    A tuple (abstracts, texts, labels) whose i-th entries all come from
    the i-th record of the file. All three lists are empty for an empty file.
  """
  # One accumulator per field; insertion order fixes the order of the result.
  columns = {'abstract': [], 'text': [], 'label': []}
  with jsonlines.open(filepath) as reader:
    for record in reader.iter():
      for field, values in columns.items():
        values.append(record[field])
  return columns['abstract'], columns['text'], columns['label']

In [None]:
# Training split: a list of InputExample objects, ready for a DataLoader.
train_examples = construct_examples(f"{DIR}{TRAIN}")
# Dev / test splits: (abstracts, texts, labels) tuples of parallel lists.
dev_examples = construct_eval_examples(f"{DIR}{DEV}")
test_examples = construct_eval_examples(f"{DIR}{TEST}")

In [None]:
# Sanity check: number of records in the train, dev and test splits.
# dev/test are tuples of parallel lists, so measure their first list.
tuple(len(split) for split in (train_examples, dev_examples[0], test_examples[0]))

# Train

In [None]:
from sentence_transformers import SentenceTransformer, losses, evaluation
from torch.utils.data import DataLoader
from typing import Dict, Union


def regression_training(
    model_name_or_path: str, 
    batch_size: int,
    hparams: Dict[str, Union[str, int, bool]]
    ) -> "SentenceTransformer":
    """Fine-tune a SentenceTransformer with a cosine-similarity regression loss.

    Training data comes from the notebook-level `train_examples`; the
    evaluator is built from the notebook-level `dev_examples` tuple
    (abstracts, texts, labels). Both must exist before this is called.

    Args:
        model_name_or_path: model name or path handed to SentenceTransformer.
        batch_size: batch size of the training DataLoader.
        hparams: settings forwarded to `fit`. Required keys: 'epochs',
            'warmup_steps', 'scheduler', 'eval_steps', 'output_path',
            'save_best_model'.

    Returns:
        The model object after training has finished. This is the in-memory
        state at the end of training; when 'save_best_model' is true, the
        checkpoint that scored best on the evaluator is the one stored at
        hparams['output_path'].

    Raises:
        KeyError: if a required key is missing from `hparams`.
    """
    regression_model = SentenceTransformer(model_name_or_path)

    # define dataloader and loss
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.CosineSimilarityLoss(regression_model)

    # evaluator: compares embedding similarity of each dev pair with its label
    dev_abstracts, dev_texts, dev_labels = dev_examples
    evaluator = evaluation.EmbeddingSimilarityEvaluator(
        dev_abstracts,
        dev_texts,
        dev_labels
    )

    # train
    regression_model.fit(
        train_objectives=[(train_dataloader, train_loss)], 
        epochs=hparams['epochs'], 
        warmup_steps=hparams['warmup_steps'],
        scheduler=hparams['scheduler'],
        evaluator=evaluator,
        evaluation_steps=hparams['eval_steps'],
        output_path=hparams['output_path'],
        save_best_model=hparams['save_best_model'])

    # Hand the trained model back so callers do not depend on hidden state.
    return regression_model


In [None]:
# train

# hparams from the paper
EPOCHS = 1
BATCH_SIZE = 8

# fit() counts warmup in optimizer steps (one per batch), not in examples,
# so the 10% warmup has to be derived from the number of batches that will
# actually run. Using len(train_examples) // 10 directly would, at batch
# size 8, stretch the warmup over roughly 80% of training.
STEPS_PER_EPOCH = -(-len(train_examples) // BATCH_SIZE)  # ceiling division
TOTAL_STEPS = STEPS_PER_EPOCH * EPOCHS

HYPERPARAMS = {
    'epochs': EPOCHS,
    'scheduler': 'WarmupLinear',
    'warmup_steps': TOTAL_STEPS // 10,
    # optimizer = Adam by default
    # optimizer_params = {'lr': 2e-05} by default
    'eval_steps': 500,
    'output_path': DIR + "sbert_base/",
    'save_best_model': True
}

regression_training(
    model_name_or_path='roberta-base', # Training the RoBERTa-based model
    batch_size=BATCH_SIZE,
    hparams=HYPERPARAMS,
    )

# Test

In [None]:
# save test predictions

import numpy as np
import pandas as pd
import time
from sentence_transformers import util

def sts_similarity(sent_1: str, sent_2: str, model: SentenceTransformer):
  """Score how similar two sentences are under `model`.

  Each sentence is encoded separately with `model.encode`, and the cosine
  similarity of the two embeddings is returned.

  Args:
    sent_1: first sentence.
    sent_2: second sentence.
    model: encoder used to embed both sentences.

  Returns:
    The single entry of the similarity matrix, as a NumPy scalar.
  """
  first_embedding = model.encode(sent_1)
  second_embedding = model.encode(sent_2)
  similarity_matrix = np.array(
      util.pytorch_cos_sim(first_embedding, second_embedding))
  # One sentence on each side, so the only entry sits at [0, 0].
  return similarity_matrix[0, 0]

def save_predictions(gold_data_path: str, pred_save_path: str,
                     model: SentenceTransformer):
  """Score every record of a JSON Lines file and write the results to CSV.

  For each record, the similarity between its 'abstract' and 'text' is
  computed with `sts_similarity` and stored next to the record's 'label'.
  Progress is printed every 100 records together with the CPU time spent
  since the previous report.

  Args:
    gold_data_path: .jsonl file with 'abstract', 'text' and 'label' keys.
    pred_save_path: destination of the CSV, which has the columns
      'gold_labels' and 'predictions'.
    model: encoder handed to `sts_similarity`.
  """
  # Collect plain records and build the frame once at the end:
  # DataFrame.append copied the whole frame on every row and no longer
  # exists in pandas >= 2.0.
  records = []
  start_time = time.process_time()
  with jsonlines.open(gold_data_path) as f:
    for count, line in enumerate(f.iter()):
      abstract = line['abstract']
      text = line['text']
      label = line['label']
      records.append({
          'gold_labels': label,
          'predictions': sts_similarity(abstract, text, model),
      })
      if not count % 100:
        print(f"processed {count} texts in {time.process_time() - start_time}")
        start_time = time.process_time()
  # Naming the columns keeps the header intact even when the input is empty.
  df = pd.DataFrame(records, columns=['gold_labels', 'predictions'])
  df.to_csv(pred_save_path)

In [None]:
PRED_SAVE_PATH = 'sbert_base_predictions.csv'

# Reload the checkpoint that training stored at the configured output path
# (with save_best_model=True this is the best-scoring model on the dev set).
# Loading it here defines `best_regression_model` explicitly, so the cell
# also works after a kernel restart.
best_regression_model = SentenceTransformer(HYPERPARAMS['output_path'])

save_predictions(DIR + TEST, DIR + PRED_SAVE_PATH, best_regression_model)