## Load Dependencies, Model and Data

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install wandb

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoModelForMultipleChoice
from transformers import TrainingArguments, Trainer
from sklearn.metrics import classification_report
from datasets import load_dataset, DatasetDict
import textwrap
import pandas as pd
import numpy as np
import wandb
import os

In [None]:
def stratify(dataset, yes_max, no_max):
  """Cap the number of "yes" and "no" examples, then shuffle.

  The 'answers' column is scanned in order. The first `yes_max` rows whose
  answer is "yes" and the first `no_max` rows whose answer is "no" are kept;
  later rows of either class are dropped. Rows with any other answer value
  are always kept.

  Args:
    dataset: object exposing `dataset['answers']` (iterable column),
      `.select(indices)` and `.shuffle(seed=...)`.
    yes_max: maximum number of "yes" rows to keep.
    no_max: maximum number of "no" rows to keep.

  Returns:
    The result of `dataset.select(kept_indices).shuffle(seed=42)`.
  """

  yes_count = 0
  no_count = 0
  keep_ids = []

  # Read the single column once instead of materialising every full row.
  for idx, answer in enumerate(dataset['answers']):

    if answer == "yes":
      yes_count += 1
      if yes_count > yes_max:
        continue

    elif answer == "no":
      no_count += 1
      if no_count > no_max:
        continue

    keep_ids.append(idx)

  # Fixed seed keeps the downstream train/val/test slices reproducible.
  return dataset.select(keep_ids).shuffle(seed=42)

In [None]:
# Load the labeled BioASQ dataset

dataset = load_dataset("reginaboateng/Bioasq7b")['train']

# Balance classes (at most 883 rows per answer), then carve the shuffled
# result into consecutive train / validation / test slices.

dataset_balanced = stratify(dataset, 883, 883)
train_dataset = dataset_balanced.select(range(0, 1500))
val_dataset = dataset_balanced.select(range(1500, 1600))
test_dataset = dataset_balanced.select(range(1600, 1766))

# Add numeric label column for all datasets

d = {'yes' : 0, 'no': 1}


def _with_numeric_label(split, label_ids):
  """Return `split` with an extra "label" column holding `label_ids[answer]` for each row."""
  numeric_labels = [label_ids[answer] for answer in split['answers']]
  return split.add_column("label", numeric_labels)


train_dataset = _with_numeric_label(train_dataset, d)
val_dataset = _with_numeric_label(val_dataset, d)
test_dataset = _with_numeric_label(test_dataset, d)


In [None]:
## Check label distributions

label_series = [
    pd.Series(split['label'])
    for split in (train_dataset, val_dataset, test_dataset)
]

# Absolute class counts: train, validation, test.
for labels in label_series:
  print(labels.value_counts())

# Class proportions in the same order; the first table is set off by a blank line.
for position, labels in enumerate(label_series):
  proportions = labels.value_counts(normalize=True)
  if position == 0:
    print("\n", proportions)
  else:
    print(proportions)

In [None]:
# Download BioBERT/PubMedBERT model

# Alternative checkpoints that can be swapped in here:
#   'dmis-lab/biobert-v1.1'
#   'prajjwal1/bert-tiny'
#   'bert-base-uncased'
checkpoint = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

# Tokenizer and multiple-choice model are loaded from the same checkpoint.
model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# View model properties via config file
#
# Only the configuration object is fetched here. The commented-out lines are
# alternative checkpoints whose configuration can be inspected instead.

config = AutoConfig.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
#config = AutoConfig.from_pretrained('dmis-lab/biobert-v1.1')
#config = AutoConfig.from_pretrained('bert-base-uncased')
#config = AutoConfig.from_pretrained('prajjwal1/bert-tiny')

# Bare expression so the notebook displays the configuration as the cell output.
config

## Preprocess & Tokenize Data

In [None]:
def preprocess(example):

  ''' Basic preprocessing & tokenizer function; adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice

  Expects a batch (dict of lists) with "context" and "question" entries. Each
  context is paired with "<question> yes" and "<question> no", the pairs are
  tokenized with the notebook-level `tokenizer`, and every tokenizer output is
  regrouped so that each example owns one list with one entry per answer choice.

  Returns a dict mapping each tokenizer output key to a list (one item per
  example) of per-choice lists.
  '''

  answers = ["yes", "no"]
  # Group width and stride are both derived from the answer list, so the
  # regrouping below stays correct if the set of choices ever changes.
  num_choices = len(answers)

  # Flat, choice-major ordering: [ex0/yes, ex0/no, ex1/yes, ex1/no, ...]
  context = [c for c in example["context"] for _ in answers]
  question_answer = [
      f"{header} {a}" for header in example["question"] for a in answers
  ]

  # 'only_first' truncates the context (first sequence) only, so the
  # question + answer suffix is never cut off.
  tokenized_examples = tokenizer(context, question_answer, truncation='only_first', max_length=512)

  # Un-flatten: consecutive runs of `num_choices` entries belong to one example.
  return {
      k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
      for k, v in tokenized_examples.items()
  }


In [None]:
# Tokenize the data 

# Apply `preprocess` in batched mode to each split, in train / val / test order.
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    split.map(preprocess, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
# Inspect first element of training 

first_example = tokenized_train_dataset[0]

# Token count of the first answer choice, followed by the raw fields.
print(len(first_example["input_ids"][0]))
for field in ("question", "context", "label"):
  print(first_example[field])

In [None]:
# Check number of input_id tokens per example 

for example in tokenized_train_dataset:
  # Whitespace-split word count of context plus question, for comparison
  # against the tokenized length of each of the two answer choices.
  word_total = sum(len(example[field].split()) for field in ('context', 'question'))
  print("\nNumber of words in context + question: ", word_total)
  for choice in (0, 1):
    print("Number of input id tokens: ", len(example["input_ids"][choice]))

## Finetune model on the BioASQ task

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received. Adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice

    Each incoming feature is a dict whose non-label values are lists with one
    entry per answer choice. The collator flattens all choices of all examples
    into one list, pads them together with `tokenizer.pad`, reshapes every
    padded tensor to (batch_size, num_choices, -1) and attaches the labels
    under the key "labels".
    """

    # Object whose `.pad(...)` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch dict.

        Note: the label entry is popped from each feature dict, so the dicts in
        `features` are modified in place.
        """
        # The label may be stored under "label" or "labels"; accept either.
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice) pair, in example-major order, so the
        # later `.view(batch_size, num_choices, -1)` restores the grouping.
        flattened_features = [
            {k: v[i] for k, v in feature.items()}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)

        return batch

In [None]:
# Load evaluation metrics

# NOTE(review): `load_metric` is not referenced in any cell visible here, and
# recent `datasets` releases appear to have dropped it — confirm, and consider
# removing this import so the cell still runs on a fresh install.
from datasets import load_metric
import evaluate

# Accuracy metric object; `compute_metrics` calls its `.compute(...)` method.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level `accuracy` metric.

    `eval_pred` unpacks into (per-choice scores, reference labels). The
    predicted choice is the arg-max along axis 1 of the scores.
    """
    scores, references = eval_pred
    predicted_choice = np.argmax(scores, axis=1)
    return accuracy.compute(references=references, predictions=predicted_choice)
    

In [None]:
# Vanilla training setup

training_args = TrainingArguments(
    output_dir="PubmedBert-QA",
    push_to_hub=False,
    # Optimisation hyperparameters.
    learning_rate=2.86229819276255e-05,
    weight_decay=0.05,
    num_train_epochs=2,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    # Log every 10 steps, evaluate every 20, and restore the best checkpoint
    # once training finishes.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=20,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

trainer.train()

# Score the held-out test split and display the prediction output.
test_results = trainer.predict(test_dataset=tokenized_test_dataset)
test_results

## Hyperparameter Search

In [None]:
# Authenticate the Weights & Biases CLI before creating or running the sweep.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mhk4cd[0m ([33mtheotherkhan[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# W&B hyperparameter specification

# hyperparameters: discrete choices, except the learning rate, which is drawn
# from a log-uniform distribution between the given bounds
parameters_dict = {
    'epochs': {'values': [1, 2]},
    'batch_size': {'values': [4, 8, 12]},
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-6,
        'max': 1e-4,
    },
    'weight_decay': {'values': [0.05, 0.1, 0.15]},
}

# objective reported for the sweep
metric = {'name': 'loss', 'goal': 'minimize'}

# method + objective + search space
sweep_config = {
    'method': 'random',
    'metric': metric,
    'parameters': parameters_dict,
}

sweep_id = wandb.sweep(sweep_config, project='teacher-biobert-bioasq-narrow-2')


In [None]:
# W&B trainer setup

preds = []
test_accs = []

def train(config=None):
  """Run one W&B sweep trial: build a Trainer from the sampled hyperparameters and train.

  Args:
    config: optional configuration forwarded to `wandb.init`; inside the run
      the effective values are read back from `wandb.config`
      (`learning_rate`, `batch_size`, `epochs`, `weight_decay`).

  Every trial starts from freshly loaded pretrained weights (via `model_init`)
  instead of reusing the notebook-level `model` object. Passing the shared
  object would make each trial continue training from wherever the previous
  trial (or an earlier training cell) left off, which makes the trials
  incomparable and silently alters the notebook-level model.
  """

  def model_init():
    # Reload the same checkpoint the notebook-level model was created from.
    return AutoModelForMultipleChoice.from_pretrained(model.config.name_or_path)

  with wandb.init(config=config):
    # set sweep configuration
    config = wandb.config

    training_args = TrainingArguments(
      output_dir="/content/wandb/outputs",
      evaluation_strategy="steps",
      save_strategy="steps",
      eval_steps=20,
      load_best_model_at_end=True,
      learning_rate=config.learning_rate,
      per_device_train_batch_size=config.batch_size,
      per_device_eval_batch_size=config.batch_size,
      num_train_epochs=config.epochs,
      weight_decay=config.weight_decay,
      logging_steps=10,
      push_to_hub=False,
      report_to="wandb"
    )

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    trainer.train()

In [None]:
# Start a sweep agent for `sweep_id` that invokes `train` once per trial;
# `count=10` is passed as the number of trials to run.
wandb.agent(sweep_id, train, count=10)

## Save Model

In [None]:
model_name = "teacher_model" # student_model
trainer.save_model("/content/" + model_name)
model.save_pretrained("/content/" + model_name)
!zip -r /content/teacher_model.zip /content/teacher_model

## Evaluate model on the test set


In [None]:
# Evaluation using trainer.predict()
# Relies on the notebook-level `trainer` and `tokenized_test_dataset` from earlier cells.
test_results = trainer.predict(test_dataset=tokenized_test_dataset)
# Bare expression so the notebook displays the prediction output.
test_results

In [None]:
# Evaluation using trainer.evaluate()
# `evaluate` is a method of the Trainer, not of the model object, so the call
# has to go through the notebook-level `trainer`.
eval_result = trainer.evaluate(eval_dataset=tokenized_test_dataset)
eval_result

In [None]:
# Manual results framework
#
# NOTE: the snippet below sits inside a string literal, so none of it executes;
# the cell merely evaluates to that string. It refers to `y_pred`, which is not
# assigned in any cell visible here, so it needs a definition before it can be
# enabled.

'''
  results = pd.DataFrame()
  results['y_true'] = tokenized_test_dataset['label']
  results['y_pred'] = y_pred

  acc = results[results['y_true'] == results['y_pred']].shape[0]/results.shape[0]
  acc
'''