In [None]:
# Shell escape: print the NVIDIA driver/GPU status table for this runtime.
!nvidia-smi

In [None]:
# Install third-party packages for this notebook; -U upgrades any that are already present.
!pip install -U sentence-transformers jsonlines datasets ipywidgets

In [3]:
!pip install tqdm>=4.55.0

In [None]:
# install nvidia apex to use mixed precision

try:
  import apex
except:
  !git clone https://github.com/NVIDIA/apex
  %cd apex
  !pip install -v --no-cache-dir ./

In [None]:
# Mount Google Drive at /content/gdrive; the data directory used by later cells
# lives under this mount point, so this cell has to run before any file access.
from google.colab import drive
drive.mount('/content/gdrive')

# Load data

In [6]:
# load data
import jsonlines
from sentence_transformers.readers import InputExample
from typing import List, Tuple

# Folder on the mounted Google Drive that holds the dataset splits. It ends with
# "/" because file names are appended to it with plain string concatenation.
DIR = "/content/gdrive/MyDrive/CUNY_Comp_Ling/advanced_nlp/term_project/data/"
# File names of the JSON Lines splits inside DIR.
TRAIN = 'train.jsonl'
DEV = 'dev.jsonl'
TEST = 'test.jsonl'

def construct_examples(filepath: str) -> List[InputExample]:
  """Read a JSON Lines file into a list of sentence-transformers InputExamples.

  Each record must provide the keys 'abstract', 'text' and 'label'. The
  abstract and the text become the two entries of `texts`, and 'label' is
  passed through unchanged.

  Args:
    filepath: path of the .jsonl file to read.

  Returns:
    One InputExample per record, in file order.
  """
  with jsonlines.open(filepath) as reader:
    return [
        InputExample(texts=[record['abstract'], record['text']],
                     label=record['label'])
        for record in reader.iter()
    ]

def construct_eval_examples(filepath: str) -> Tuple[List, List, List]:
  """Read a JSON Lines file into three parallel lists for evaluation.

  Args:
    filepath: path of the .jsonl file to read; every record must provide the
      keys 'abstract', 'text' and 'label'.

  Returns:
    A tuple (abstracts, texts, labels); index i of each list comes from the
    i-th record of the file.
  """
  field_names = ('abstract', 'text', 'label')
  columns = ([], [], [])
  with jsonlines.open(filepath) as reader:
    for record in reader.iter():
      # Fill one slot per column, reading the fields in a fixed order.
      for column, field in zip(columns, field_names):
        column.append(record[field])
  return columns

In [12]:
# Training split: a list of InputExample objects.
train_examples = construct_examples(DIR + TRAIN)
# Dev/test splits: despite the names, each is a 3-tuple of parallel lists
# (abstracts, texts, labels), not a list of examples.
dev_examples = construct_eval_examples(DIR + DEV)
test_examples = construct_eval_examples(DIR + TEST)

In [None]:
# Number of records per split, in the order (train, test, dev). Index [0] picks
# the abstracts list out of each evaluation tuple.
len(train_examples), len(test_examples[0]), len(dev_examples[0])

# Train

In [None]:
import datasets
import numpy as np
import os
import pandas as pd
from pathlib import Path
import random
import tqdm

import torch
from torch import nn
from torch import functional as F
from torch.utils.data import (
    TensorDataset,
    random_split,
    RandomSampler,
    DataLoader
)

from transformers import (
    LongformerForSequenceClassification,
    LongformerModel,
    LongformerConfig,
    Trainer, 
    TrainingArguments,
    AdamW,
)


In [9]:
from sentence_transformers import SentenceTransformer, losses, evaluation, models
from torch.utils.data import DataLoader
from typing import Dict, Union


def regression_training(
    model_name_or_path: str,
    batch_size: int,
    hparams: Dict[str, Union[str, int, bool]],
    embedding_model_args: Dict[str, Union[str, int, bool]],
    train_examples, dev_examples
    ):
    """Fine-tune a sentence-embedding model with a cosine-similarity loss.

    The model is a three-stage SentenceTransformer: a transformer encoder, a
    pooling layer built with the library's default settings, and a dense
    projection to 256 dimensions followed by tanh.

    Args:
        model_name_or_path: checkpoint name or path for the transformer encoder.
        batch_size: batch size of the training DataLoader.
        hparams: training options forwarded to `fit`. Required keys: 'epochs',
            'warmup_steps', 'scheduler', 'eval_steps', 'output_path',
            'save_best_model', 'checkpoint_path'. Optional key: 'use_amp'
            (treated as False when absent).
        embedding_model_args: passed as `model_args` to `models.Transformer`.
        train_examples: training items handed directly to the DataLoader.
        dev_examples: indexable whose items 0, 1 and 2 are the first sentences,
            second sentences and gold scores given to the evaluator.

    Returns:
        None. The model object is not returned; `fit` receives 'output_path'
        and 'checkpoint_path' for persisting it.
    """
    # Encoder stack: transformer -> pooling -> 256-d tanh projection.
    encoder = models.Transformer(model_name_or_path, model_args=embedding_model_args)
    pooler = models.Pooling(encoder.get_word_embedding_dimension())
    projection = models.Dense(
        in_features=pooler.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh(),
    )
    regression_model = SentenceTransformer(modules=[encoder, pooler, projection])

    # Training batches and the objective optimised over them.
    batches = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    objective = losses.CosineSimilarityLoss(regression_model)

    # Dev-set evaluator, run periodically during training.
    dev_first = dev_examples[0]
    dev_second = dev_examples[1]
    dev_scores = dev_examples[2]
    dev_evaluator = evaluation.EmbeddingSimilarityEvaluator(dev_first, dev_second, dev_scores)

    regression_model.fit(
        train_objectives=[(batches, objective)],
        epochs=hparams['epochs'],
        warmup_steps=hparams['warmup_steps'],
        scheduler=hparams['scheduler'],
        evaluator=dev_evaluator,
        evaluation_steps=hparams['eval_steps'],
        output_path=hparams['output_path'],
        save_best_model=hparams['save_best_model'],
        use_amp=hparams.get('use_amp', False),
        checkpoint_path=hparams['checkpoint_path'],
    )



In [None]:
MODEL_SAVE_PATH = DIR + "sbert_long/"

BATCH_SIZE = 4
EPOCHS = 1

# `fit` counts warm-up in optimizer steps (one per batch), not in examples, so
# the 10% warm-up has to be derived from the number of batches. Using
# len(train_examples) // 10 directly would warm up for roughly
# BATCH_SIZE * 10% (= 40%) of the run.
steps_per_epoch = (len(train_examples) + BATCH_SIZE - 1) // BATCH_SIZE  # ceil division
total_train_steps = steps_per_epoch * EPOCHS

# hparams from the paper
hyperparams = {
    'epochs': EPOCHS,
    'scheduler': 'WarmupLinear',
    'warmup_steps': total_train_steps // 10,  # linear warm-up over 10% of training
    # optimizer_class = AdamW by default
    # optimizer_params = {'lr': 2e-05} by default
    'eval_steps': 100,
    'output_path': MODEL_SAVE_PATH,
    'save_best_model': True,
    'use_amp': True, # mixed precision
    'checkpoint_path': MODEL_SAVE_PATH,
}

# longformer args
# NOTE(review): these are forwarded as `model_args` of models.Transformer.
# Confirm that "max_length" really caps the tokenized input length there;
# models.Transformer has a separate `max_seq_length` argument for truncation.
embedding_model_args = {
    'gradient_checkpointing':True,
    "num_labels": 1,
    "max_length": 1024,
}

# train
regression_training(
    model_name_or_path='allenai/longformer-base-4096', 
    batch_size=BATCH_SIZE,
    hparams=hyperparams,
    embedding_model_args=embedding_model_args,
    train_examples=train_examples,
    dev_examples=dev_examples
    )


In [15]:
# save test predictions

import numpy as np
import pandas as pd
import time
from sentence_transformers import util

def sts_similarity(sent_1: str, sent_2: str, model: SentenceTransformer):
  """Return the cosine similarity between the embeddings of two texts.

  Args:
    sent_1: first text to embed.
    sent_2: second text to embed.
    model: model whose `encode` method produces the embeddings.

  Returns:
    The [0][0] entry of the similarity matrix computed by
    `util.pytorch_cos_sim`, after converting that matrix to a NumPy array.
  """
  embeddings = [model.encode(sentence) for sentence in (sent_1, sent_2)]
  similarity_matrix = np.array(util.pytorch_cos_sim(*embeddings))
  return similarity_matrix[0][0]

def save_predictions(gold_data_path: str, pred_save_path: str,
                     model: SentenceTransformer):
  """Score every abstract/text pair in a JSON Lines file and save the results.

  Each record must provide the keys 'abstract', 'text' and 'label'. The model's
  cosine similarity for the pair is stored next to the gold label, and the
  table is written as CSV with the columns 'gold_labels' and 'predictions'.

  Args:
    gold_data_path: path of the .jsonl file with the gold data.
    pred_save_path: destination path of the CSV file.
    model: model used by `sts_similarity` to embed both texts.

  Returns:
    None; the result is the CSV file written to `pred_save_path`.
  """
  # Collect plain records and build the frame once at the end. Growing a
  # DataFrame row by row with DataFrame.append is quadratic, and that method
  # was removed in pandas 2.0.
  records = []
  start_time = time.process_time()
  with jsonlines.open(gold_data_path) as f:
    for count, line in enumerate(f.iter()):
      abstract = line['abstract']
      text = line['text']
      label = line['label']
      records.append({
          'gold_labels': label,
          'predictions': sts_similarity(abstract, text, model),
      })
      # Progress report on records 0, 100, 200, ...; the timer restarts after
      # each report, so the figure is CPU time since the previous report.
      if not count % 100:
        print(f"processed {count} texts in {time.process_time() - start_time}")
        start_time = time.process_time()

  # Explicit columns keep the header stable even when the input file is empty.
  # The input file is already closed at this point.
  df = pd.DataFrame(records, columns=['gold_labels', 'predictions'])
  df.to_csv(pred_save_path)

In [None]:
# File name (not a full path) of the predictions CSV; it is joined onto DIR below.
PRED_SAVE_PATH = 'sbert_long_predictions.csv'

# Load the model stored under MODEL_SAVE_PATH (the output_path given to training).
best_regression_model = SentenceTransformer(MODEL_SAVE_PATH)

# Score every test pair and write gold labels and predictions to DIR + PRED_SAVE_PATH.
save_predictions(DIR + TEST, DIR + PRED_SAVE_PATH, best_regression_model)