# Install and import necessary libraries


In [None]:
!pip install transformers

In [None]:
from transformers import BertModel, BertTokenizerFast, DistilBertTokenizerFast, DistilBertModel, RobertaModel, RobertaTokenizerFast
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
import torch 
import copy
import math
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap 
from pylab import rcParams
from matplotlib import rc
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch import nn, optim
from torch.utils import data 
from collections import defaultdict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold


# Seed NumPy's and PyTorch's global random number generators with one fixed value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Check for GPU

Tesla T4 and P100 should work

In [None]:
!nvidia-smi

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
device

# Prepare Data
This works a bit differently since we need single sentences as input, but still want to preserve the same datasplits.
*   open labeled dataset as dataframe
*   add an abstract id so we know to which abstract each sentence belongs
*   create datasplits with random seed
*   select a split by changing the idx
*   explode the entries in the dataframe to have an entry for each sentence 









In [None]:
# Read the labeled abstracts from the JSON file into a DataFrame.
df = pd.read_json("cleaned1000.json")
# Display the last rows as a quick look at the loaded data.
df.tail()

In [None]:
# Store each row's index as an explicit column; it is kept on every sentence
# row after exploding, so sentences can be grouped by abstract again later.
df['Abstract_index'] = df.index
# Keep only the three columns that the cells below use.
df = df[["Sentences", "Extractive", "Abstract_index"]]
df.tail()

In [None]:
# Build the 5 cross-validation folds over the rows of `df` (one row per abstract
# at this point). The notebook-wide RANDOM_SEED is used instead of a second
# hard-coded 42 so that the split always follows the seed configured above.
kf = KFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True)
# kf.split yields one (train positions, test positions) pair per fold.
fold_splits = list(kf.split(df))
train_idx = [train_index for train_index, _ in fold_splits]
test_idx = [test_index for _, test_index in fold_splits]

In [None]:
# Which of the cross-validation folds built above to use (position in train_idx/test_idx).
idx = 2
# train_idx/test_idx hold positional row indices, hence .iloc.
df_train = df.iloc[train_idx[idx]]
df_test = df.iloc[test_idx[idx]]
# NOTE(review): the validation frame is the very same object as the test frame,
# so there is no separate validation split.
df_val = df_test

In [None]:
# This function, which explodes several list columns at the same time, is adapted from: https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
def explode(df, lst_cols, fill_value='', preserve_index=False):
    """Expand list-valued columns so that every list element gets its own row.

    All columns named in ``lst_cols`` are exploded in parallel. The number of
    output rows per input row is taken from the first column in ``lst_cols``,
    so the lists of all ``lst_cols`` must have the same length within a row.
    The remaining columns are repeated for every produced row.

    Args:
        df: input DataFrame.
        lst_cols: name, or list-like of names, of the list-valued columns.
        fill_value: value written into the exploded columns for input rows
            whose list is empty (such rows are kept as a single output row).
        preserve_index: if True, output rows keep the index label of the row
            they came from; otherwise the index is reset to 0..n-1.

    Returns:
        A new DataFrame with one row per list element, ordered by the
        original index.
    """
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    non_empty = lens > 0
    # preserve original index values
    idx = np.repeat(df.index.values, lens)

    # Flatten the lists; np.concatenate refuses an empty sequence, so frames
    # in which every list is empty get an empty column instead of an error.
    exploded = {
        col: (np.concatenate(df.loc[non_empty, col].values)
              if non_empty.any() else np.array([], dtype=object))
        for col in lst_cols
    }
    # create "exploded" DF
    res = (pd.DataFrame({col: np.repeat(df[col].values, lens) for col in idx_cols},
                        index=idx)
             .assign(**exploded))
    # append those rows that have empty lists
    if (lens == 0).any():
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent and also works on older versions.
        res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                 .fillna(fill_value))
    # revert the original index order; a stable sort keeps the elements of one
    # list in their original order (their index labels are duplicates).
    res = res.sort_index(kind="mergesort")
    # reset index if requested
    if not preserve_index:
        res = res.reset_index(drop=True)
    return res

In [None]:
# Turn the per-abstract rows into one row per sentence; "Sentences" and
# "Extractive" are exploded together so each sentence keeps its own label.
# NOTE(review): df_val was bound to the un-exploded test frame earlier and is not updated here.
df_train = explode(df_train, ["Sentences", "Extractive"], fill_value='')
df_test = explode(df_test, ["Sentences", "Extractive"], fill_value='')

# Choose model and tokenizer
Replace the model name with one of the commented names to select one of the following models and its tokenizer from Hugging Face or from local data:


*   BERT-cased/uncased
*   DistilBERT-cased/uncased
*   RoBERTa
*   BioBERT
*   SciBERT-scivocab-cased/uncased (basevocab versions must be downloaded manually from the repository: https://github.com/allenai/scibert)
*   Bert-cased DAPT
*   DistilBERT-uncased DAPT


For the DAPT models, the tokenizer of the base model needs to be selected. Note that for single-sentence input I only made one notebook for all models because there aren't as many changes necessary when switching models. However, when switching between BERT, DistilBERT, and RoBERTa, some entries in the code need to be adjusted. These entries are marked with comments.









In [None]:
# "bert-base-cased"
# "bert-base-uncased"
# "distilbert-base-cased"
# "distilbert-base-uncased"
# "roberta-base"
# "dmis-lab/biobert-base-cased-v1.1"
# "bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12"
# "allenai/scibert_scivocab_uncased"
# "allenai/scibert_scivocab_cased"  
# "/content/drive/MyDrive/Master Thesis/DAPT/distilbert_DAPT"


# Name (or local path) of the pre-trained checkpoint; the classifier defined
# further down loads its encoder from the same name.
PRE_TRAINED_MODEL_NAME = "bert-base-uncased"
# Load the tokenizer for that checkpoint. When switching model family, swap the
# tokenizer class for one of the alternatives named in the trailing comment.
tokenizer = BertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME) #DistilBertTokenizerFast / RobertaTokenizerFast
#tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Tokenizer and DataLoaders

The tokenizer creates the input for the pre-trained LMs and does the padding/truncation. This is simpler for single sentence input. The necessary outputs are:

*   Labels
*   Standard inputs: input_ids, attention_mask (segment_ids aren't necessary for BERT here because each input has only one sentence)

The dataloaders use this function to create batches from the dataset which can be used by the LMs.



In [None]:
# Token length of every encoded sentence: the dataset asks the tokenizer to
# pad/truncate each input to this length.
MAX_LEN = 64
# Number of sentences per batch produced by the data loaders.
BATCH_SIZE = 96

In [None]:
class SentenceDataset(data.Dataset):
  """Map-style dataset that tokenizes one sentence per item.

  Args:
    sentences: indexable collection of sentence strings.
    targets: indexable collection of labels, one per sentence.
    abstract_ids: indexable collection with the id of the abstract each
      sentence belongs to.
    tokenizer: callable invoked as
      ``tokenizer(sentence, padding=..., truncation=..., max_length=..., return_tensors="pt")``
      that returns a mapping with "input_ids" and "attention_mask" tensors
      whose first dimension has size 1.
    max_len: length every sentence is padded/truncated to.
  """

  def __init__(self, sentences, targets, abstract_ids, tokenizer, max_len):
    self.sentences = sentences
    self.targets = targets
    self.abstract_ids = abstract_ids
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of sentences in the dataset."""
    return len(self.sentences)

  def __getitem__(self, item):
    """Return the encoded sentence, its label and its abstract id as tensors."""
    sentence = self.sentences[item]
    target = self.targets[item]
    abstract_id = self.abstract_ids[item]

    # Use the tokenizer handed to the constructor, not whatever global
    # `tokenizer` happens to exist in the notebook namespace.
    encoding = self.tokenizer(
      sentence,
      padding="max_length",
      truncation=True,
      max_length=self.max_len,
      return_tensors="pt"
    )

    return {
      # squeeze(0) drops only the leading size-1 dimension added by
      # return_tensors="pt"; a bare squeeze() would also drop the sequence
      # dimension when max_len == 1.
      "input_ids": encoding["input_ids"].squeeze(0),
      "labels": torch.tensor(target, dtype=torch.long),
      "attention_mask": encoding["attention_mask"].squeeze(0),
      "abstract_ids": torch.tensor(abstract_id, dtype=torch.long)
    }

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap the sentence rows of `df` in a batched DataLoader.

  Args:
    df: DataFrame with the columns "Sentences", "Extractive" and
      "Abstract_index" (one row per sentence).
    tokenizer: tokenizer forwarded to SentenceDataset.
    max_len: padded/truncated token length forwarded to SentenceDataset.
    batch_size: number of sentences per batch.
    shuffle: whether the loader reshuffles the rows every epoch. Defaults to
      False, which keeps the previous behaviour (batches follow the row order
      of `df`); pass True for a training loader if shuffling is wanted.

  Returns:
    A torch DataLoader yielding dict batches with the keys produced by
    SentenceDataset.
  """
  dataset = SentenceDataset(
      sentences=df.Sentences.to_numpy(),
      targets=df.Extractive.to_numpy(),
      abstract_ids=df.Abstract_index.to_numpy(),
      tokenizer=tokenizer,
      max_len=max_len,
  )

  return data.DataLoader(
      dataset,
      batch_size=batch_size,
      shuffle=shuffle,
  )

In [None]:
# Build the batch iterators over the training and the test sentences.
# NOTE(review): no shuffling is requested, so batches presumably follow the row order of each frame — confirm this is intended for training.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# Abstract Summarizer
The single-sentence classifier defines the configuration of the model. Here you can define how the sentence representations are formed (CLS token or pooling) and from which layer or combination of layers the representations are taken (the default is the last hidden layer). The classifier is the only decoder option here.



In [None]:
class SingleSentClassifier(nn.Module):
  """Sentence scorer: pre-trained encoder -> sentence vector -> linear layer -> sigmoid.

  The encoder is loaded from PRE_TRAINED_MODEL_NAME. `forward` returns one
  value per input sentence, squashed by a sigmoid.
  """

  def __init__(self):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    # Single output unit on top of the encoder's hidden size.
    self.linear1 = nn.Linear(self.bert.config.hidden_size, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask):
    """Return the sigmoid score of every sentence in the batch."""
    encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

    # Choice of token representations (keep exactly one active):
    #   combination of different layers:
    #     token_states = (1/3) * (encoder_out.hidden_states[1] + encoder_out.hidden_states[11] + encoder_out.hidden_states[12])
    #   single layer:
    #     token_states = encoder_out.hidden_states[1]
    #   last hidden layer (active):
    token_states = encoder_out.last_hidden_state

    # Choice of sentence representation (keep exactly one active):
    #   cls token:
    #     sentence_repr = token_states[:, 0]
    #   pooling (active):
    sentence_repr = pool_sents(token_states, attention_mask)

    # The linear layer yields a trailing dimension of size 1; drop it so the
    # result has one entry per sentence.
    raw_score = self.linear1(sentence_repr).squeeze(-1)
    return self.sigmoid(raw_score)

In [None]:
def pool_sents(top_vec, attention_mask):
  """Mean-pool the token vectors of every sentence, leaving out the special tokens.

  Changes compared with the previous loop-based version:
    * The previous slice ``top_vec[s, :sent_len]`` with
      ``sent_len = mask.sum() - 2`` started at position 0, so it averaged
      the leading special token together with the words and dropped the last
      real token. The window now covers positions ``1 .. length-2``, i.e.
      everything between the first and the last attended token.
    * Sentences without any token between the two special tokens used to
      produce NaN (mean over an empty slice); they now fall back to the mean
      over all attended positions (zeros if nothing is attended).
    * The result lives on ``top_vec``'s device instead of depending on a
      global ``device`` variable, and the computation is vectorized (no
      per-sample ``.item()`` synchronisation).

  Assumes right-padded inputs whose attention mask is a contiguous run of
  ones starting at position 0, with one special token at each end (the same
  assumption the ``- 2`` of the previous version made).

  Args:
    top_vec: tensor of shape (batch, seq_len, hidden) with token vectors.
    attention_mask: tensor of shape (batch, seq_len); non-zero marks attended tokens.

  Returns:
    Tensor of shape (batch, hidden) with one pooled vector per sentence.
  """
  attended = attention_mask.to(top_vec.device) != 0
  lengths = attended.sum(dim=1)  # attended tokens per sentence, special tokens included
  positions = torch.arange(top_vec.size(1), device=top_vec.device).unsqueeze(0)

  # Positions strictly between the first and the last attended token.
  content = (positions >= 1) & (positions < (lengths - 1).unsqueeze(1))

  # Fall back to every attended position when a sentence has no content tokens.
  no_content = ~content.any(dim=1, keepdim=True)
  pooled_positions = torch.where(no_content, attended, content)

  weights = pooled_positions.to(top_vec.dtype).unsqueeze(-1)
  token_count = weights.sum(dim=1).clamp(min=1)  # avoid 0/0 for fully masked rows
  return (top_vec * weights).sum(dim=1) / token_count

In [None]:
# Instantiate the classifier and move its parameters to the selected device.
model = SingleSentClassifier()
# As the last expression of the cell, this also displays the module returned by .to().
model.to(device)

In [None]:
def length_loss(preds, labels):
  """Squared difference between predicted and gold length, doubled when the prediction is too short.

  Args:
    preds: predicted number of selected sentences (scalar).
    labels: gold number of selected sentences (scalar).

  Returns:
    ``(preds - labels)**2``, multiplied by 2 if ``preds < labels``.
  """
  squared_error = (preds - labels)**2
  # Under-prediction is penalised twice as much as over-prediction.
  return squared_error * 2 if preds < labels else squared_error

# Sentence Selection

Here, the sentences for the summary of each abstract are selected based on the selection strategy:


*   Best-3: select 3 highest scoring sentences
*   Dynamic: select sentences based on a score threshold, then add the highest-scoring sentence or drop the lowest-scoring sentences if the number of selected sentences falls outside the allowed range (1-4)

To change the strategy, uncomment the indicated lines and comment the lines for the other strategy (a bit messy, sorry). Because the abstracts are split into sentences, we need to put the sentences back together to form abstracts again.


In [None]:
def _as_numpy(values):
  """Return `values` as a NumPy array; torch tensors are detached and moved to the CPU first."""
  if hasattr(values, "detach"):
    return values.detach().cpu().numpy()
  return np.asarray(values)


def select_sents(sent_scores, labels, abstract_ids, threshold=0.5, max_sents=4):
  """Select summary sentences per abstract and compute the evaluation metrics.

  The flat per-sentence inputs are grouped by abstract id. For each abstract
  the sentences are selected (dynamic strategy by default, see the marked
  lines to switch to Best-3) and compared with the gold labels.

  Changes compared with the previous version:
    * precision/recall are 0.0 instead of raising ZeroDivisionError when an
      abstract has no selected or no gold sentence;
    * lengths outside 1..max_sents are counted in the nearest histogram bin
      instead of wrapping to the last bin (length 0) or raising IndexError;
    * everything runs on CPU NumPy arrays, so the function no longer depends
      on a global `device` and avoids per-element tensor comparisons.

  Args:
    sent_scores: 1-D tensor/array with one score per sentence.
    labels: 1-D tensor/array with the gold label (0/1) per sentence.
    abstract_ids: 1-D tensor/array with the abstract id per sentence.
    threshold: score above which a sentence is selected (dynamic strategy).
    max_sents: maximum number of selected sentences per abstract; also the
      number of bins of the two length histograms.

  Returns:
    Tuple ``(accuracy, precision, recall, length_loss, preds_len, gold_len)``.
    The first three are means in which every abstract is weighted by its
    number of sentences, the fourth is the mean length loss per abstract, and
    the last two are integer arrays where entry ``i`` counts the abstracts
    with ``i + 1`` selected / gold sentences.
  """
  scores_np = _as_numpy(sent_scores)
  labels_np = _as_numpy(labels).astype(int)
  ids_np = _as_numpy(abstract_ids)

  abs_accs = []
  abs_prec = []
  abs_rec = []
  len_losses = []
  preds_len = np.zeros(max_sents, dtype=int)
  gold_len = np.zeros(max_sents, dtype=int)

  for abs_id in np.unique(ids_np):
    in_abstract = ids_np == abs_id
    abs_scores = scores_np[in_abstract]
    abs_labels = labels_np[in_abstract]
    # Sentence positions ordered from highest to lowest score.
    ranking = np.argsort(-abs_scores)

    # select best 3
    # selected_sents = np.zeros(abs_scores.shape, dtype=int)
    # selected_sents[ranking[:3]] = 1
    # until here

    # dynamic selection
    selected_sents = (abs_scores > threshold).astype(int)
    if selected_sents.sum() == 0:
      # nothing passed the threshold: take the single best sentence
      selected_sents[ranking[0]] = 1
    elif selected_sents.sum() > max_sents:
      # too many passed the threshold: keep only the best max_sents
      selected_sents = np.zeros(abs_scores.shape, dtype=int)
      selected_sents[ranking[:max_sents]] = 1
    # until here

    n_sents = selected_sents.size
    n_selected = int(selected_sents.sum())
    n_gold = int(abs_labels.sum())

    matches = selected_sents == abs_labels
    correct = int(matches.sum())
    correct_pos = int((matches & (abs_labels == 1)).sum())

    acc = correct / n_sents
    prec = correct_pos / n_selected if n_selected > 0 else 0.0
    rec = correct_pos / n_gold if n_gold > 0 else 0.0

    len_losses.append(length_loss(float(n_selected), float(n_gold)))
    # Clamp into the 1..max_sents range so unexpected lengths never index
    # outside the histogram (or silently land in the last bin via index -1).
    preds_len[min(max(n_selected, 1), max_sents) - 1] += 1
    gold_len[min(max(n_gold, 1), max_sents) - 1] += 1

    # One entry per sentence: longer abstracts weigh more in the means.
    abs_accs.extend([acc] * n_sents)
    abs_prec.extend([prec] * n_sents)
    abs_rec.extend([rec] * n_sents)

  return np.mean(abs_accs), np.mean(abs_prec), np.mean(abs_rec), np.mean(len_losses), preds_len, gold_len

# Training and validation

Here the actual fine-tuning happens. First, the hyperparameters and some metrics are defined. Then the training and evaluation functions are defined and called. During the training process, multiple stats are reported at the end of every epoch, including: loss, accuracy, precision, recall, F1 score, length loss, and the length distributions.

In [None]:
# Number of passes over the training data.
EPOCHS = 7

# AdamW over all model parameters with a fixed base learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch in train_epoch, so the schedule
# spans (batches per epoch) * (epochs) steps.
total_steps = len(train_data_loader) * EPOCHS

# Learning-rate schedule from transformers, configured with zero warm-up steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# Binary cross-entropy with reduction="none", i.e. one loss value per sentence;
# the training and evaluation functions sum these values per batch themselves.
loss_fn = nn.BCELoss(reduction="none").to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
  """Run one training pass over `data_loader` and return the mean batch loss.

  The previous version iterated over the global `train_data_loader` and
  silently ignored the `data_loader` argument; the argument is now used.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` that
      returns one score per sentence.
    data_loader: iterable of dict batches with the keys "input_ids",
      "attention_mask" and "labels".
    loss_fn: element-wise loss called as ``loss_fn(scores, labels.float())``;
      its output is summed over the batch.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.

  Returns:
    Mean of the summed per-batch losses.
  """
  model = model.train()

  losses = []

  for batch in data_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    sent_scores = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )

    loss = loss_fn(sent_scores, labels.float()).sum()
    losses.append(loss.item())

    loss.backward()
    # Clip the global gradient norm to 1.0 before the update.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    # Clear the gradients so they do not accumulate into the next batch.
    optimizer.zero_grad()

  return np.mean(losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device):
  """Score every sentence of `data_loader` and compute the abstract-level metrics.

  The per-batch outputs are collected in lists and concatenated once at the
  end. The previous version grew float32 tensors with `torch.cat` in every
  iteration, which copied all earlier results each time and mixed the integer
  labels / abstract ids into float accumulators.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` that
      returns one score per sentence.
    data_loader: iterable of dict batches with the keys "input_ids",
      "attention_mask", "labels" and "abstract_ids".
    loss_fn: element-wise loss called as ``loss_fn(scores, labels.float())``;
      its output is summed over the batch.
    device: device the batch tensors are moved to.

  Returns:
    Tuple ``(mean batch loss, accuracy, precision, recall, length loss,
    predicted length distribution, gold length distribution)``; everything
    after the loss is what `select_sents` returns for the collected outputs.
  """
  model = model.eval()

  id_chunks = []
  label_chunks = []
  score_chunks = []
  losses = []

  def _join(chunks, dtype):
    # torch.cat rejects an empty list; an empty loader yields empty tensors instead.
    if chunks:
      return torch.cat(chunks)
    return torch.empty([0], dtype=dtype, device=device)

  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)
      abstract_ids = batch["abstract_ids"].to(device)

      sent_scores = model(
          input_ids=input_ids,
          attention_mask=attention_mask
      )

      loss = loss_fn(sent_scores, labels.float()).sum()
      losses.append(loss.item())

      id_chunks.append(abstract_ids)
      label_chunks.append(labels)
      score_chunks.append(sent_scores)

    all_abstract_ids = _join(id_chunks, torch.long)
    all_labels = _join(label_chunks, torch.long)
    all_sent_scores = _join(score_chunks, torch.float32)

    # Sentences of one abstract may be spread over several batches, so the
    # selection is done once on the complete set of outputs.
    acc, prec, rec, len_losses, preds_len, gold_len = select_sents(all_sent_scores, all_labels, all_abstract_ids)

  return np.mean(losses), acc, prec, rec, len_losses, preds_len, gold_len

In [None]:
def f1_score(prec, rec):
  """Harmonic mean of precision and recall.

  Returns 0.0 when both precision and recall are 0. The plain formula divides
  by zero in that case, which raises ZeroDivisionError for Python floats and
  yields NaN for NumPy floats.

  Args:
    prec: precision.
    rec: recall.

  Returns:
    ``2 * prec * rec / (prec + rec)``, or 0.0 if ``prec + rec == 0``.
  """
  denominator = prec + rec
  if denominator == 0:
    return 0.0
  f1 = 2*((prec*rec)/denominator)
  # "balanced f1:"  2*((prec*rec)/(prec+rec)) - difference(prec, rec)*0.2
  return f1

In [None]:
%%time

# Best F1 score seen so far across epochs.
best_f1 = 0

# Fine-tuning loop: one training pass followed by one evaluation pass per epoch.
for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_loss = train_epoch(
      model,
      train_data_loader,
      loss_fn,
      optimizer,
      device,
      scheduler,
  )

  print(f'Train loss {round(train_loss, 4)}')


  # NOTE(review): the metrics labelled "Val" below are computed on test_data_loader,
  # i.e. on the held-out test fold; there is no separate validation loader.
  val_loss, val_acc, val_prec, val_rec, val_len_loss, preds_len, gold_len = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device
  )

  f1 = f1_score(val_prec, val_rec)

  print(f'Val loss {round(val_loss, 4)} accuracy {round(val_acc, 4)}')
  print(f"Val len loss {round(val_len_loss, 4)}")
  print(f"Precision: {round(val_prec, 4)} Recall: {round(val_rec, 4)}")
  print(f"F1-score: {round(f1, 4)}")
  print(f"Predictions distribution: {preds_len}")
  print(f"Labels distribution: {gold_len}")
  print()

  # Track the best F1; saving the corresponding weights is currently disabled
  # (uncomment the torch.save line to write a checkpoint).
  if f1 > best_f1:
    #torch.save(model.state_dict(), 'best_base_summarizer.bin')
    best_f1 = f1