In [None]:
!python --version
!pip install transformers ray

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW

import math
import pandas as pd
import numpy as np
import random
import json
import os

from transformers import LongformerTokenizer, LongformerForSequenceClassification
from transformers import get_linear_schedule_with_warmup

from ray import tune, put, get
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from functools import partial

In [None]:
# Pick the compute device once; every later cell moves tensors to `device`.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable: run on the GPU and report what was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Seed every random number generator used by the notebook (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value for reproducibility.
seed_val = 42

for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

## Data

In [None]:
# Locations of the preprocessed train/validation splits (JSON readable by pandas).
# Edit these two paths to point at your own copies of the data.
path_to_train_df = '/path/to/preprocessed/train_df.json'
path_to_validation_df = '/path/to/preprocessed/val_df.json'

train_df = pd.read_json(path_to_train_df)
# Load the validation split from the configured path, mirroring the train split,
# instead of a machine-specific hard-coded location.
val_df = pd.read_json(path_to_validation_df)

# Quick sanity check: number of examples in each split.
print(len(train_df))
print(len(val_df))

In [None]:
# The tokenizer and the sequence-classification model are loaded from the same
# pretrained checkpoint; the model is then moved to the selected device.
pretrained_name = "yikuan8/Clinical-Longformer"
tokenizer = LongformerTokenizer.from_pretrained(pretrained_name)
model = LongformerForSequenceClassification.from_pretrained(pretrained_name)
model.to(device)

In [None]:
def get_dataset(df):
  """Tokenize every (statement, premise) pair of `df` into a padded TensorDataset.

  Reads the 'Statement', 'Focused_premise' and 'Label' columns. Each premise is
  joined with single spaces before tokenization (assumes every premise entry is
  a list of strings -- TODO confirm against the preprocessing step). Labels are
  mapped as Entailment -> 0 and Contradiction -> 1; any other label raises
  KeyError.

  Args:
    df: dataframe holding the three columns named above.

  Returns:
    TensorDataset of (input_ids, attention_mask, labels). Sequences are
    truncated to MAX_LEN tokens and right-padded to the longest sequence in
    `df`; padded positions hold the tokenizer's pad id and attention mask 0.
  """
  MAX_LEN = 4096
  label_dict = {'Entailment': 0, 'Contradiction': 1}

  input_token_ids = []
  attention_mask_ids = []
  label_ids = []

  rows = zip(df['Focused_premise'].to_list(), df['Statement'].to_list(), df['Label'].to_list())
  for (premise, hypothesis, label) in rows:
    # No per-example padding here: sequences are padded together below.
    tokenization_output = tokenizer.encode_plus(text=hypothesis,
                                                text_pair=' '.join(premise),
                                                add_special_tokens=True,
                                                truncation=True,
                                                max_length=MAX_LEN,
                                                return_tensors="pt",
                                                return_attention_mask=True)

    # return_tensors="pt" yields a leading batch dimension of size 1; drop it.
    input_token_ids.append(tokenization_output['input_ids'][0])
    attention_mask_ids.append(tokenization_output['attention_mask'][0])
    label_ids.append(label_dict[label])

  # Pad token ids with the tokenizer's own pad id. pad_sequence defaults to 0,
  # which is not guaranteed to be the pad id (in RoBERTa-style vocabularies 0 is
  # typically the <s> token). Fall back to 0 only if no pad id is defined.
  pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
  input_token_ids = pad_sequence(input_token_ids, batch_first=True, padding_value=pad_id)
  # The attention mask must be 0 on padded positions, which is the default.
  attention_mask_ids = pad_sequence(attention_mask_ids, batch_first=True)
  label_ids = torch.tensor(label_ids)

  return TensorDataset(input_token_ids, attention_mask_ids, label_ids)

# Single access point for the train/validation dataframes. For now it simply
# hands back the module-level frames loaded earlier in the notebook.
def get_train_val_dfs():
  """Return the (train, validation) dataframes as a tuple."""
  dataframes = (train_df, val_df)
  return dataframes

def get_dataloaders(batchSize):
  """Build the train and validation DataLoaders.

  Args:
    batchSize: number of examples per batch for both loaders.

  Returns:
    (train_dataloader, validation_dataloader). The train loader draws examples
    through a RandomSampler; the validation loader through a SequentialSampler.
  """
  train_frame, validation_frame = get_train_val_dfs()

  train_set = get_dataset(train_frame)
  validation_set = get_dataset(validation_frame)

  train_loader = DataLoader(train_set,
                            batch_size=batchSize,
                            sampler=RandomSampler(train_set))
  validation_loader = DataLoader(validation_set,
                                 batch_size=batchSize,
                                 sampler=SequentialSampler(validation_set))
  return train_loader, validation_loader

In [None]:
# len(train_df["Premise"])

In [None]:
# get_dataset(val_df)

In [None]:
# t = tokenizer.encode("Inclusion Criteria:   Pathologically confirmed breast cancer, determined to be a candidate for primary systemic (neoadjuvant) therapy and for surgical resection of residual primary tumor following completion of neoadjuvant therapy   Locally advanced breast cancer, not stage IV, and with a tumor size >= 2 cm (as measured on imaging or estimated by physical exam)   No obvious contraindications for primary chemotherapy   Residual tumor planned to be removed surgically following completion of neoadjuvant therapy   Able to lie still for 1.5 hours for PET scanning   Eastern Cooperative Oncology Group (ECOG) performance status =< 2 (Karnofsky >= 60%)   Leukocytes >= 3,000/ul   Absolute neutrophil count >= 1,500/ul   Platelets >= 100,000/ul   Total bilirubin within normal institutional limits   Aspartate aminotransferase (AST) (serum glutamic oxaloacetic transaminase [SGOT])/alanine aminotransferase (ALT) (serum glutamate pyruvate transaminase [SGPT]) =< 2.5 times the institutional upper limit of normal   Creatinine within normal institutional limits OR creatinine clearance >= 30 mL/min/1.73 m^2 for patients with creatinine levels above institutional normal   If female, postmenopausal for a minimum of one year, OR surgically sterile, OR not pregnant, confirmed by institutional standard of care (SOC) pregnancy test, and willing to use adequate contraception (hormonal or barrier method of birth control; abstinence) for the duration of study participation   Able to understand and willing to sign a written informed consent document and a Health Insurance Portability and Accountability Act (HIPAA) authorization in accordance with institutional guidelines Exclusion Criteria:   Previous treatment (chemotherapy, radiation, or surgery) to involved breast; including hormone therapy   Uncontrolled intercurrent illness including, but not limited to, ongoing or active infection, symptomatic congestive heart failure, unstable angina pectoris, cardiac arrhythmia, 
or psychiatric illness/social situations that would limit compliance with study requirements   Medically unstable   Condition requiring anesthesia for PET scanning and/or unable to lie still for 1.5 hours   History of allergic reactions attributed to compounds of similar chemical or biologic composition to F-18 fluorothymidine   Pregnant or nursing   Previous malignancy, other than basal cell or squamous cell carcinoma of the skin or in situ carcinoma of the cervix, from which the patient has been disease free for less than 5 years   Currently on hormone therapy as the primary systemic neoadjuvant therapy")
# len(t)

In [None]:
# Number of full passes over the training data. Also read by the training loop
# and by the Ray Tune scheduler configuration.
epochs = 5

def get_scheduler_and_optimizer(model, learningRate, batches, adamEpsilon = 1e-8, warmupStepsPercentage=0):
  """Create the AdamW optimizer and its linear warmup/decay LR schedule.

  Args:
    model: module whose parameters are optimized.
    learningRate: AdamW learning rate.
    batches: multiplied by the module-level `epochs` to obtain the schedule
      length, so callers should pass the number of batches in ONE epoch.
    adamEpsilon: AdamW epsilon.
    warmupStepsPercentage: fraction of the total steps spent warming up; the
      resulting step count is floored.

  Returns:
    (scheduler, optimizer) -- note the order.
  """
  total_steps = epochs * batches
  warmup_steps = math.floor(total_steps * warmupStepsPercentage)

  optimizer = AdamW(model.parameters(), lr=learningRate, eps=adamEpsilon)
  lr_scheduler = get_linear_schedule_with_warmup(optimizer,
                                                 num_warmup_steps=warmup_steps,
                                                 num_training_steps=total_steps)
  return lr_scheduler, optimizer

In [None]:
# Fraction of examples whose highest-scoring class equals the reference label.
def flat_accuracy(preds, labels):
    """Return the accuracy of `preds` (one row of class scores per example) against `labels`.

    The predicted class of each row is the arg-max along axis 1; `labels` is
    flattened before the element-wise comparison.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = (predicted == expected)
    return np.sum(hits) / len(expected)

In [None]:
def fine_tune(config, model, checkpoint_dir=None):
  """Ray Tune trainable: fine-tune the classifier and report validation metrics.

  After every epoch a checkpoint holding (model state, optimizer state) is
  written through `tune.checkpoint_dir`, then `loss` and `accuracy` on the
  validation set are reported to Tune as plain Python floats (per-example
  means, so a shorter final batch is not over-weighted).

  Args:
    config: hyperparameter dict with the keys "batchSize", "learningRate",
      "adamEpsilon" and "warmupStepsPercentage".
    model: Ray object reference (created with `put`) to the model to train.
    checkpoint_dir: optional directory containing a "checkpoint" file with a
      (model_state, optimizer_state) tuple to restore before training. Only
      those two states are restored; the epoch counter and the LR scheduler
      start from the beginning again.
  """
  evaluated_model_ref = get(model)
  # Batches are moved to `device` below, so the weights must live there too
  # (a no-op when the model is already on that device).
  evaluated_model_ref.to(device)

  train_dataloader, validation_dataloader = get_dataloaders(config["batchSize"])
  # get_scheduler_and_optimizer multiplies its `batches` argument by `epochs`
  # itself, so pass the per-epoch batch count only. Passing
  # `len(train_dataloader) * epochs` would stretch the schedule by a factor of
  # `epochs` and the learning rate would never decay to zero.
  scheduler, optimizer = get_scheduler_and_optimizer(evaluated_model_ref,
                                                     config["learningRate"],
                                                     len(train_dataloader),
                                                     config["adamEpsilon"],
                                                     config["warmupStepsPercentage"])
  criterion = nn.CrossEntropyLoss()

  if checkpoint_dir:
    model_state, optimizer_state = torch.load(os.path.join(checkpoint_dir, "checkpoint"))
    evaluated_model_ref.load_state_dict(model_state)
    optimizer.load_state_dict(optimizer_state)

  for epoch_i in range(epochs):

    # ========================================
    #               Training
    # ========================================

    # `train()` only switches the mode (e.g. enables dropout); it does not
    # perform any training by itself.
    evaluated_model_ref.train()

    for pair_token_ids, mask_ids, labels in train_dataloader:
      # Gradients accumulate across backward passes unless cleared.
      evaluated_model_ref.zero_grad()

      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      labels = labels.to(device)

      # Because `labels` is supplied, the model output carries the loss.
      # Read it by name instead of relying on the order/number of fields.
      outputs = evaluated_model_ref(input_ids=pair_token_ids,
                                    attention_mask=mask_ids,
                                    labels=labels)
      loss = outputs.loss

      loss.backward()

      # Clip the gradient norm to 1.0 to guard against exploding gradients.
      torch.nn.utils.clip_grad_norm_(evaluated_model_ref.parameters(), 1.0)

      optimizer.step()
      # Advance the learning-rate schedule once per optimizer step.
      scheduler.step()

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    # Evaluation mode: dropout layers behave differently than in training.
    evaluated_model_ref.eval()

    # Sums weighted by batch size, so dividing by the example count at the end
    # yields true per-example means.
    eval_loss, eval_accuracy = 0.0, 0.0
    nb_eval_examples = 0

    for pair_token_ids, mask_ids, labels in validation_dataloader:
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      labels = labels.to(device)

      # No gradients are needed for validation; this saves memory and time.
      with torch.no_grad():
        # No labels are passed, so only the logits are of interest here.
        logits = evaluated_model_ref(input_ids=pair_token_ids,
                                     attention_mask=mask_ids).logits
        # `.item()` turns the 0-dim loss tensor into a Python float so the
        # value reported to Tune is a plain number, not a tensor.
        batch_loss = criterion(logits, labels).item()

      # Move logits and labels to the CPU for the NumPy accuracy helper.
      logits = logits.detach().cpu().numpy()
      label_ids = labels.to('cpu').numpy()
      batch_examples = len(label_ids)

      eval_accuracy += flat_accuracy(logits, label_ids) * batch_examples
      eval_loss += batch_loss * batch_examples
      nb_eval_examples += batch_examples

    # Save before reporting, so the checkpoint for this epoch exists when the
    # metrics are reported.
    with tune.checkpoint_dir(epoch_i) as checkpoint_dir:
      path = os.path.join(checkpoint_dir, "checkpoint")
      torch.save((evaluated_model_ref.state_dict(), optimizer.state_dict()), path)

    tune.report(loss=eval_loss / nb_eval_examples,
                accuracy=float(eval_accuracy / nb_eval_examples))

In [None]:
# driver
# Configures the hyperparameter search, runs `fine_tune` through Ray Tune and
# prints the configuration and final validation metrics of the best trial.

# Search space. Every entry is a `tune.choice` over a single value, so together
# with `num_samples=1` below exactly one configuration is evaluated.
config = {
    "learningRate": tune.choice([2e-5]),
    "batchSize": tune.choice([8]),
    "adamEpsilon": tune.choice([1e-8]),
    "warmupStepsPercentage": tune.choice([0])
    }
    #   # "learningRate": tune.loguniform(1e-5, 5e-5),
    #   # "learningRate": tune.choice([2e-5, 3e-4, 1e-4, 5e-5, 3e-5, 2e-3, 2e-2, 2e-1]),

# Alternative (currently disabled) wider search space, kept for reference.
# config = {
#       "windowSize": tune.choice([100, 150]),
#       "windowOverlapRate": tune.choice([1.25, 2, 3]),
#       "learningRate": tune.loguniform(2e-5, 3e-5),
#       # "learningRate": tune.choice([2e-5, 3e-4, 1e-4, 5e-5, 3e-5, 2e-3, 2e-2, 2e-1]),
#       "batchSize": tune.choice([64]),
#       "adamEpsilon": tune.choice([1e-8, 1e-6]),
#       "warmupStepsPercentage": tune.choice([0, 0.02, 0.05, 0.1, 0.2, 0.35])
#     }

# ASHA scheduler maximizing the reported "accuracy". `max_t` is tied to `epochs`
# because `fine_tune` reports once per epoch.
scheduler = ASHAScheduler(
    metric="accuracy",
    mode="max",
    max_t=epochs,
    grace_period=2,
    reduction_factor=2)

# Console progress table showing the metrics that `fine_tune` reports.
reporter = CLIReporter(
    metric_columns=["loss", "accuracy", "training_iteration"])

# Create a ref to the model to pass to the fine_tune function.
# The model is ~600 MiB. If it is directly referenced in our fine_tune method, Ray will throw a 'worker function too big > 95MiB' error.
modelRef = put(model)

# NOTE(review): one GPU is requested per trial unconditionally, although the
# device cell allows a CPU fallback -- confirm trials can be scheduled on a
# CPU-only machine.
result = tune.run(
    partial(fine_tune, model=modelRef),
    resources_per_trial={"gpu": 1},
    config=config,
    num_samples=1,
    scheduler=scheduler,
    progress_reporter=reporter)

# Best trial = highest "accuracy" in its last reported result.
best_trial = result.get_best_trial("accuracy", "max", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(
    best_trial.last_result["accuracy"]))

# best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
# device = "cpu"
# if torch.cuda.is_available():
#     device = "cuda:0"
#     if gpus_per_trial > 1:
#         best_trained_model = nn.DataParallel(best_trained_model)
# best_trained_model.to(device)

# best_checkpoint_dir = best_trial.checkpoint.value
# model_state, optimizer_state = torch.load(os.path.join(
#     best_checkpoint_dir, "checkpoint"))
# best_trained_model.load_state_dict(model_state)

# test_acc = test_accuracy(best_trained_model, device)
# print("Best trial test set accuracy: {}".format(test_acc))

In [None]:
# Look up the best trial again (highest last-reported "accuracy") and display
# its best checkpoint as the cell output.
trial = result.get_best_trial("accuracy", "max", "last")
# NOTE(review): the fourth positional argument "last" mirrors the `scope`
# argument of get_best_trial; confirm that get_best_checkpoint in the installed
# Ray version accepts a fourth positional argument and what it means there.
result.get_best_checkpoint(trial, "accuracy", "max", "last")