# Install and import necessary libraries


In [None]:
!pip install transformers

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertModel
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
import torch 
import copy
import math
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap 
from pylab import rcParams
from matplotlib import rc
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch import nn, optim
from torch.utils import data 
from collections import defaultdict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold

# One seed shared by NumPy and PyTorch so repeated runs draw the same random numbers.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Check for GPU

Tesla T4 and P100 should work

In [None]:
!nvidia-smi

In [None]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
device

# Prepare Data


*   open labeled dataset as dataframe
*   create datasplits with random seed
*   select a split by changing the idx







In [None]:
# Load the labeled abstracts into a dataframe (path is relative to the working directory).
df = pd.read_json("cleaned1000.json")
# Display the last rows as a quick sanity check of what was loaded.
df.tail()

In [None]:
# 5-fold split over the rows of df. The shuffle is seeded with the notebook-wide
# RANDOM_SEED (instead of a second hard-coded 42) so that changing the seed in
# one place changes the folds as well.
kf = KFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True)
# train_idx[k] / test_idx[k] hold the positional row indices of fold k.
train_idx = []
test_idx = []
for train_index, test_index in kf.split(df):
  train_idx.append(train_index)
  test_idx.append(test_index)

In [None]:
# Index of the cross-validation fold used for this run.
idx = 0 # testing was executed with splits 0,1 and 2
# The fold indices are positional, hence iloc.
df_train = df.iloc[train_idx[idx]]
df_test = df.iloc[test_idx[idx]]
# NOTE(review): validation reuses the test fold (same object, no separate
# hold-out set) — confirm this is intended.
df_val = df_test

# Choose model and tokenizer
Replace the model name with one of the commented names to select one of the following models and its tokenizer from Hugging Face or local data:


*   DistilBERT-cased/uncased
*   DistilBERT-uncased DAPT

For the DAPT models, the tokenizer of the base model needs to be selected.






In [None]:
# Checkpoint names that can be assigned below:
# "distilbert-base-cased"
# "distilbert-base-uncased"

# The same name is used here for the tokenizer and by AbstractSummarizer for the
# DistilBERT weights.
PRE_TRAINED_MODEL_NAME = "distilbert-base-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Tokenizer and DataLoaders

The tokenizer creates the input for the pre-trained LMs and does the padding. The necessary outputs are:


*   Sentences as text (technically not needed but nice for testing)
*   Labels
*   Standard inputs: input_ids, attention_mask, segment_ids (BERT-only)
*   Additional inputs: cls_ids, cls_mask

The dataloaders use this function to create batches from the dataset which can be used by the LMs.



In [None]:
# Sizes passed to create_data_loader for both loaders.
MAX_LEN = 512  # length the joint token sequence of one abstract is padded to
MAX_SENTS = 15  # sentence slots per abstract (labels, cls_ids and cls_mask are padded to this)
BATCH_SIZE = 12  # abstracts per batch

In [None]:
def tokenize_abstract(sentences, targets, max_len, max_sents, tokenizer):
  """Encode one abstract as a single padded token sequence plus per-sentence tensors.

  Every sentence is tokenized on its own and the token lists are concatenated,
  so the joint sequence contains one CLS token per sentence.

  Args:
    sentences: list of the sentence strings of one abstract.
    targets: one label per sentence.
    max_len: length the joint token sequence is zero-padded to.
    max_sents: number of sentence slots the per-sentence tensors are padded to.
    tokenizer: callable that returns "input_ids" and "attention_mask" (one list
      per sentence) for a list of sentences and exposes `cls_token_id`.

  Returns:
    dict with
      "sentences": the sentences joined with "<>",
      "labels": labels zero-padded to max_sents (long),
      "input_ids" / "attention_mask": joint sequence zero-padded to max_len,
      "cls_ids": positions of the CLS tokens in the joint sequence, zero-padded
        to max_sents,
      "cls_mask": 1 for real sentence slots, 0 for padding slots (long).

  Raises:
    ValueError: if the abstract has more than max_sents sentences or more than
      max_len tokens. Without this check the returned tensors would not have
      the promised fixed size and batching would fail later with an unrelated
      error.
  """
  n_sents = len(sentences)
  if n_sents > max_sents:
    raise ValueError(f"abstract has {n_sents} sentences, more than max_sents={max_sents}")

  encoding = tokenizer(sentences)

  # Flatten the per-sentence encodings into one sequence.
  joint_input_ids = [tok for ids in encoding["input_ids"] for tok in ids]
  joint_attention_mask = [m for mask in encoding["attention_mask"] for m in mask]

  if len(joint_input_ids) > max_len:
    raise ValueError(f"abstract has {len(joint_input_ids)} tokens, more than max_len={max_len}")

  token_padding = [0] * (max_len - len(joint_input_ids))
  n_sent_padding = max_sents - n_sents

  joint_input_ids.extend(token_padding)
  joint_attention_mask.extend(token_padding)

  # Locate the CLS tokens on the plain list; iterating over a tensor would
  # create one tensor object per token.
  cls_token_id = tokenizer.cls_token_id
  cls_ids = [pos for pos, tok in enumerate(joint_input_ids) if tok == cls_token_id]
  cls_ids.extend([0] * n_sent_padding)

  labels = torch.cat((torch.tensor(targets).long(), torch.zeros(n_sent_padding, dtype=torch.long)), 0)
  cls_mask = torch.cat((torch.ones(n_sents, dtype=torch.long), torch.zeros(n_sent_padding, dtype=torch.long)), 0)

  return {
      "sentences": "<>".join(sentences),
      "labels": labels,
      "input_ids": torch.tensor(joint_input_ids, dtype=torch.long),
      "attention_mask": torch.tensor(joint_attention_mask, dtype=torch.long),
      "cls_ids": torch.tensor(cls_ids, dtype=torch.long),
      "cls_mask": cls_mask
  }

In [None]:
class AbsClassDataset(data.Dataset):
  """Dataset that yields one tokenized abstract per index.

  abstracts and targets are indexable in parallel: entry i of abstracts holds
  the sentences of abstract i, entry i of targets the matching labels.
  """

  def __init__(self, abstracts, targets, tokenizer, max_len, max_sents):
    self.abstracts, self.targets = abstracts, targets
    self.tokenizer = tokenizer
    self.max_len, self.max_sents = max_len, max_sents

  def __len__(self):
    # One sample per abstract.
    return len(self.abstracts)

  def __getitem__(self, item):
    encoded = tokenize_abstract(
        self.abstracts[item],
        self.targets[item],
        max_len=self.max_len,
        max_sents=self.max_sents,
        tokenizer=self.tokenizer,
    )

    # Same tensors as returned by tokenize_abstract; only the text entry is renamed.
    sample = {"abstract_text": encoded["sentences"]}
    for key in ("labels", "input_ids", "attention_mask", "cls_ids", "cls_mask"):
      sample[key] = encoded[key]
    return sample

In [None]:
def create_data_loader(df, tokenizer, max_len, max_sents, batch_size, shuffle=False):
  """Wrap a dataframe of abstracts in a batched DataLoader.

  Args:
    df: dataframe with a `Sentences` column (sentences per abstract) and an
      `Extractive` column (labels per abstract).
    tokenizer: tokenizer handed to AbsClassDataset.
    max_len: padded token length of each abstract.
    max_sents: padded number of sentence slots of each abstract.
    batch_size: number of abstracts per batch.
    shuffle: reshuffle the abstracts every epoch. Defaults to False, which keeps
      the fixed dataframe order the function always used; pass True for
      training loaders.

  Returns:
    A DataLoader over an AbsClassDataset built from df.
  """
  ds = AbsClassDataset(
      abstracts=df.Sentences.to_numpy(),
      targets=df.Extractive.to_numpy(),
      tokenizer=tokenizer,
      max_len=max_len,
      max_sents=max_sents
  )

  return data.DataLoader(
      ds,
      batch_size=batch_size,
      shuffle=shuffle,
  )

In [None]:
# Loaders for the training and test folds of the selected split.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, MAX_SENTS, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, MAX_SENTS, BATCH_SIZE)

# Summarization Layers

Here, the two different options for summarization layers (or encoders as they are called in the code) are defined:


*   The Classifier simply applies a linear transformation followed by a sigmoid
*   The TransformerInterEncoder from the BertSumExt implementation; the code is modified from the original repository (https://github.com/nlpyang/PreSumm). The transformer layers from PyTorch do not include positional encoding, so it needs to be defined manually



In [None]:
class Classifier(nn.Module):
  """Scores every sentence vector with one linear unit followed by a sigmoid."""

  def __init__(self, hidden_size):
    super().__init__()
    self.linear1 = nn.Linear(hidden_size, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x, cls_mask):
    """Return one score per sentence slot; slots with cls_mask == 0 get score 0."""
    logits = self.linear1(x).squeeze(-1)
    probabilities = self.sigmoid(logits)
    return probabilities * cls_mask.float()

In [None]:
class PositionalEncoding(nn.Module):
  """Fixed sinusoidal position table stored as the buffer `pe`, shape (1, max_len, dim)."""

  def __init__(self, dropout, dim, max_len=5000):
    super().__init__()
    positions = torch.arange(0, max_len).unsqueeze(1).float()
    frequencies = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) *
                             -(math.log(10000.0) / dim)))
    angles = positions * frequencies
    table = torch.zeros(max_len, dim)
    table[:, 0::2] = torch.sin(angles)  # even feature indices
    table[:, 1::2] = torch.cos(angles)  # odd feature indices
    self.register_buffer('pe', table.unsqueeze(0))
    self.dropout = nn.Dropout(p=dropout)
    self.dim = dim

  def forward(self, emb, step=None):
    """Scale emb by sqrt(dim), add the position table and apply dropout.

    With a truthy step, the single table row at that position is added to every
    position of emb; otherwise the first emb.size(1) rows are added (this also
    covers step == 0, which is falsy).
    """
    scaled = emb * math.sqrt(self.dim)
    if step:
      offset = self.pe[:, step][:, None, :]
    else:
      offset = self.pe[:, :emb.size(1)]
    return self.dropout(scaled + offset)

  def get_emb(self, emb):
    """Return the table rows covering the first emb.size(1) positions."""
    n_positions = emb.size(1)
    return self.pe[:, :n_positions]

In [None]:
class TransformerInterEncoder(nn.Module):
  """Transformer encoder over the sentence vectors of an abstract, then a sigmoid scorer."""

  def __init__(self, d_model, d_ff, heads, dropout, num_layers=0):
    super().__init__()
    self.pos_emb = PositionalEncoding(dropout, d_model)
    self.encoder_layer = nn.TransformerEncoderLayer(
        d_model=d_model, nhead=heads, dim_feedforward=d_ff, dropout=dropout
    )
    final_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.transformer_encoder = nn.TransformerEncoder(
        encoder_layer=self.encoder_layer, num_layers=num_layers, norm=final_norm
    )
    self.linear1 = nn.Linear(d_model, 1, bias=True)
    self.sigmoid = nn.Sigmoid()

  def forward(self, sents_vec, mask):
    """Score every sentence slot; slots with mask == 0 get score 0.

    sents_vec is indexed as (batch, sentence, feature) and mask as
    (batch, sentence). Only the raw position table is added here; the scaling
    and dropout of PositionalEncoding.forward are not applied.
    """
    n_sents = sents_vec.size(1)
    keep = mask.float()
    hidden = sents_vec * keep[:, :, None] + self.pos_emb.pe[:, :n_sents]
    # The encoder is called with the sentence axis first; padding slots are
    # excluded as attention keys.
    is_padding = (1 - mask).bool()
    hidden = self.transformer_encoder(hidden.transpose(0, 1), src_key_padding_mask=is_padding)
    logits = self.linear1(hidden.transpose(0, 1)).squeeze(-1)
    return self.sigmoid(logits) * keep

# Abstract Summarizer
The abstract summarizer defines the configuration of the model. Here you can define how the sentence representations are formed (CLS token or pooling) and from which layer or combination of layers the representations are taken (the default is the last hidden layer). Which encoder is used can be chosen when the model is initialized.



In [None]:
def pool_sents(top_vec, cls_ids, attention_mask):
  """Mean-pool token vectors into one vector per sentence slot.

  A sentence spans from its CLS position up to (not including) two tokens
  before the next sentence's CLS position; the last sentence ends two tokens
  before the end of the attended sequence. (Presumably this drops the closing
  SEP and the token in front of it — TODO confirm the intent.)

  Args:
    top_vec: token representations, indexed (batch, token, feature).
    cls_ids: CLS position per sentence slot, indexed (batch, slot). Slot 0 is
      the first sentence; a 0 in any later slot marks padding.
    attention_mask: 1 for real tokens, 0 for padding, indexed (batch, token).

  Returns:
    float32 tensor (batch, slots, feature) on the device of top_vec. Padding
    slots and sentences with an empty token span stay all-zero.
  """
  batch_size, n_slots = top_vec.size(0), cls_ids.size(1)
  sents_vec = torch.zeros([batch_size, n_slots, top_vec.size(2)], dtype=torch.float32, device=top_vec.device)

  # Pull positions and lengths to the host once instead of one .item() per sentence.
  all_starts = cls_ids.tolist()
  all_n_tokens = attention_mask.sum(dim=1).tolist()

  for s in range(batch_size):
    starts = all_starts[s]
    for i, sent_start in enumerate(starts):
      # Only slot 0 may legitimately start at position 0; afterwards 0 is padding.
      if i > 0 and sent_start == 0:
        break
      # The last real sentence is either in the final slot or followed by padding.
      is_last = i + 1 == n_slots or starts[i + 1] == 0
      sent_end = (all_n_tokens[s] if is_last else starts[i + 1]) - 2
      # Skip empty spans: the mean of zero rows would be NaN.
      if sent_end > sent_start:
        sents_vec[s, i] = top_vec[s, sent_start:sent_end].mean(dim=0)
  return sents_vec

In [None]:
class AbstractSummarizer(nn.Module):
  """DistilBERT followed by a summarization layer that scores every sentence.

  encoder == "transformer" selects the TransformerInterEncoder; any other value
  selects the linear Classifier.
  """

  def __init__(self, encoder):
    super().__init__()
    self.bert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, output_hidden_states=True)
    hidden_size = self.bert.config.hidden_size
    if encoder != "transformer":
      self.encoder = Classifier(hidden_size)
    else:
      self.encoder = TransformerInterEncoder(hidden_size, 2048, 8, 0.1, num_layers=2)

  def forward(self, input_ids, attention_mask, cls_ids, cls_mask):
    """Return (sent_scores, cls_mask) for a batch of tokenized abstracts."""
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

    # Token representations; swap in one of the commented lines to use another layer.
    top_vec = outputs.last_hidden_state # last hidden layer
    #top_vec = outputs.hidden_states[1] # single layer
    #top_vec = (1/3) * (outputs.hidden_states[1] + outputs.hidden_states[2] + outputs.hidden_states[6]) # combination of different layers

    # Sentence representations; swap the two lines to use the CLS tokens instead of pooling.
    #sents_vec = top_vec[torch.arange(top_vec.size(0)).unsqueeze(1), cls_ids]  # sent representations = CLS tokens
    sents_vec = pool_sents(top_vec, cls_ids, attention_mask)  # sent representations = pooled over all tokens

    # Zero the padding slots before scoring.
    masked_sents = sents_vec * cls_mask[:, :, None].float()
    sent_scores = self.encoder(masked_sents, cls_mask).squeeze(-1)
    return sent_scores, cls_mask

# Sentence Selection

Here, the sentences for the summary of each abstract are selected based on the selection strategy:


*   Best-3: select 3 highest scoring sentences
*   Dynamic: select sentence based on threshold, then add/remove highest/lowest scoring sentences if the prediction was outside of the allowed sentence range (1-4)

To change the strategy, uncomment the indicated lines and comment the lines for the other strategy (a bit messy, sorry)


In [None]:
def select_sents(sent_scores, labels, cls_mask, test_mode=False):
  """Select the summary sentences of every abstract in a batch and score the selection.

  Args:
    sent_scores: sentence scores, indexed (abstract, sentence slot).
    labels: gold 0/1 labels with the same layout.
    cls_mask: 1 for real sentence slots, 0 for padding slots.
    test_mode: also compute precision, recall and the length histograms.

  Returns:
    test_mode False: (selected_sents, accuracy)
    test_mode True: (selected_sents, accuracy, precision, recall, preds_len, gold_len)

    selected_sents is a 0/1 int64 tensor shaped like sent_scores, on the device
    of sent_scores; padding slots are never selected. accuracy, precision and
    recall are averaged with every abstract weighted by its number of
    sentences. Precision (recall) of an abstract without selected (gold)
    sentences counts as 0. preds_len[k] / gold_len[k] count the abstracts with
    k+1 selected / gold sentences; counts outside 1..4 are not recorded.
  """
  abs_accs = []
  abs_prec = []
  abs_rec = []
  preds_len = np.array([0,0,0,0])
  gold_len = np.array([0,0,0,0])

  # Results are returned on the device the scores came from.
  target_device = sent_scores.device
  sent_scores = sent_scores.detach().cpu().numpy()

  # Best-3
  top_idx = torch.tensor(np.argsort(-sent_scores, 1)[:, :3])
  ones = torch.ones(sent_scores.shape, dtype=torch.int64)
  selected_sents = torch.zeros(sent_scores.shape, dtype=torch.int64).scatter(1, top_idx, ones).to(target_device)
  # Abstracts with fewer than three sentences would otherwise keep padding
  # slots selected, which inflates the selected-sentence count.
  selected_sents = selected_sents * cls_mask.to(device=target_device, dtype=torch.int64)

  # Dynamic
  # selected_sents = (sent_scores > 0.5).astype(int)
  # selected_sents = torch.tensor(selected_sents).to(target_device)

  # The next lines up to the creation of preds are needed by both strategies.
  abs_len = cls_mask.sum(dim=1).int().tolist()
  labels_np = labels.detach().cpu().numpy()

  for i in range(len(cls_mask)):
    if abs_len[i] == 0:
      # Nothing to score for an abstract without sentences.
      continue
    abs_labels = labels_np[i, :abs_len[i]]
    preds = selected_sents[i, :abs_len[i]].cpu().numpy()

    # Dynamic (continued): uncomment from here ...
    # if preds.sum() == 0:
    #   sentence_scores = sent_scores[i][:abs_len[i]]
    #   sorted_sents = np.argsort(-sentence_scores)
    #   preds[sorted_sents[0]] = 1

    # elif preds.sum() > 4:
    #   sentence_scores = sent_scores[i][:abs_len[i]]
    #   sorted_sents = np.argsort(-sentence_scores)
    #   preds = np.zeros(preds.size, dtype=int)
    #   for p in range(4):
    #     preds[sorted_sents[p]] = 1

    # selected_sents[i, :abs_len[i]] = torch.tensor(preds).to(target_device)
    # ... until here

    n_sents = len(abs_labels)
    correct = int(np.sum(abs_labels == preds))

    if test_mode:
      correct_pos = int(np.sum((abs_labels == 1) & (preds == 1)))
      n_pred = int(preds.sum())
      n_gold = int(abs_labels.sum())
      prec = correct_pos / n_pred if n_pred else 0.0
      rec = correct_pos / n_gold if n_gold else 0.0
      abs_prec.extend([prec] * n_sents)
      abs_rec.extend([rec] * n_sents)
      # Index n-1 would wrap around for 0 and overflow beyond the last bin.
      if 1 <= n_pred <= len(preds_len):
        preds_len[n_pred - 1] += 1
      if 1 <= n_gold <= len(gold_len):
        gold_len[n_gold - 1] += 1

    #all abstracts weighted the same
    #abs_accs.append(correct / n_sents)

    #accuracy weighted by abstract length
    abs_accs.extend([correct / n_sents] * n_sents)

  if test_mode:
    return selected_sents, np.mean(abs_accs), np.mean(abs_prec), np.mean(abs_rec), preds_len, gold_len

  else:
    return selected_sents, np.mean(abs_accs)

# Initialize the model

Choose the summarization layer by changing the encoder variable ("classifier" or "transformer").

In [None]:
# "transformer" selects the TransformerInterEncoder; any other value makes
# AbstractSummarizer fall back to the linear Classifier.
model = AbstractSummarizer(encoder="transformer")
# Move the parameters to the device chosen earlier in the notebook.
model.to(device)

# Training and validation

Here the actual fine-tuning happens. First, the hyperparameters and some metrics are defined. Then the training and evaluation functions are defined and called. During the training process, multiple stats are returned at the end of every epoch, including: loss, accuracy, precision, recall, F1 score, length loss and the length distributions.

In [None]:
# Number of passes over the training loader.
EPOCHS = 7

# NOTE(review): AdamW is imported from transformers.optimization here; newer
# transformers releases are believed to deprecate it in favor of
# torch.optim.AdamW (which has no correct_bias argument) — confirm against the
# installed version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule over total_steps with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

# reduction="none" keeps one loss value per sentence slot; train_epoch and
# eval_model sum them per batch.
loss_fn = nn.BCELoss(reduction="none").to(device)

In [None]:
def length_loss(preds, labels):
  """Mean squared difference between predicted and gold summary lengths.

  The squared error of an abstract is doubled when the prediction is shorter
  than the gold length. preds and labels are indexed in parallel, one length
  per abstract.
  """
  penalties = []

  for position, predicted in enumerate(preds):
    gold = labels[position]
    squared_error = (predicted - gold)**2
    too_short = predicted < gold
    penalty = squared_error * 2 if too_short else squared_error
    penalties.append(penalty.item())

  return np.mean(penalties)

In [None]:
def f1_score(prec, rec):
  """Return the harmonic mean of precision and recall.

  Returns 0.0 when both are 0 instead of dividing by zero.
  """
  total = prec + rec
  if total == 0:
    return 0.0
  f1 = 2*((prec*rec)/total)
  # "balanced f1:"  2*((prec*rec)/(prec+rec)) - difference(prec, rec)*0.2
  return f1

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler):
  """Run one optimization pass over data_loader.

  Returns the per-batch means of (accuracy, loss, length loss).
  """
  model = model.train()

  batch_losses, batch_len_losses, batch_accs = [], [], []

  for batch in data_loader:
    labels = batch["labels"].to(device)

    sent_scores, cls_mask = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        cls_ids=batch["cls_ids"].to(device),
        cls_mask=batch["cls_mask"].to(device)
    )

    # loss_fn yields one value per sentence slot; the batch loss is their sum.
    loss = loss_fn(sent_scores, labels.float()).sum()
    batch_losses.append(loss.item())

    selected_sents, abs_acc = select_sents(sent_scores, labels, cls_mask)
    batch_accs.append(abs_acc)

    # Compare the number of selected sentences with the number of gold sentences.
    n_selected = torch.sum(selected_sents, dim=1, dtype=float)
    n_gold = torch.sum(labels, dim=1, dtype=float)
    batch_len_losses.append(length_loss(n_selected, n_gold).item())

    # Backpropagate, clip the gradient norm, then advance optimizer and schedule.
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return np.mean(batch_accs), np.mean(batch_losses), np.mean(batch_len_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, test_mode=False):
  """Evaluate model on data_loader without gradient tracking.

  Args:
    model: called with input_ids, attention_mask, cls_ids and cls_mask; returns
      (sent_scores, cls_mask).
    data_loader: yields batches with the keys "input_ids", "attention_mask",
      "labels", "cls_ids" and "cls_mask".
    loss_fn: element-wise loss; its values are summed per batch.
    device: device the batch tensors are moved to.
    test_mode: also report length loss, precision, recall and the length
      histograms.

  Returns:
    test_mode False: (accuracy, loss), each the mean over batches.
    test_mode True: (accuracy, loss, length loss, precision, recall,
      preds_len_sum, gold_len_sum); the first five are means over batches, the
      last two are histograms summed over batches.
  """
  model = model.eval()

  losses = []
  len_losses = []
  accuracies = []
  prec = []
  rec = []
  preds_len_sum = np.array([0,0,0,0])
  gold_len_sum = np.array([0,0,0,0])

  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      # Moved to the device once (the second, redundant transfer was removed).
      labels = batch["labels"].to(device)
      cls_ids = batch["cls_ids"].to(device)
      cls_mask = batch["cls_mask"].to(device)

      sent_scores, cls_mask = model(
          input_ids=input_ids,
          attention_mask=attention_mask,
          cls_ids=cls_ids,
          cls_mask=cls_mask
      )

      loss = loss_fn(sent_scores, labels.float()).sum()
      losses.append(loss.item())

      if test_mode:
        selected_sents, abs_acc, abs_prec, abs_rec, preds_len, gold_len = select_sents(sent_scores, labels, cls_mask, test_mode=True)
        accuracies.append(abs_acc)
        prec.append(abs_prec)
        rec.append(abs_rec)
        preds_len_sum += preds_len
        gold_len_sum += gold_len
        # Number of selected sentences versus number of gold sentences per abstract.
        len_loss = length_loss(torch.sum(selected_sents, dim=1, dtype=float), torch.sum(labels, dim=1, dtype=float))
        len_losses.append(len_loss.item())

      else:
        selected_sents, abs_acc = select_sents(sent_scores, labels, cls_mask)
        accuracies.append(abs_acc)

  if test_mode:
    return np.mean(accuracies), np.mean(losses), np.mean(len_losses), np.mean(prec), np.mean(rec), preds_len_sum, gold_len_sum
  else:
    return np.mean(accuracies), np.mean(losses)

In [None]:
%%time

# Best validation F1 seen so far across the epochs.
best_f1 = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  # One optimization pass over the training fold.
  train_acc, train_loss, train_len_loss = train_epoch(
      model=model,
      data_loader=train_data_loader,
      loss_fn=loss_fn,
      optimizer=optimizer,
      device=device,
      scheduler=scheduler,
  )

  print(f'Train loss {round(train_loss, 4)} accuracy {round(train_acc, 4)}')
  print(f"Train len loss {round(train_len_loss, 4)}")

  # Full set of metrics on the test loader after every epoch.
  val_acc, val_loss, val_len_loss, pos_prec, pos_rec, preds_len, gold_len = eval_model(
      model=model,
      data_loader=test_data_loader,
      loss_fn=loss_fn,
      device=device,
      test_mode=True,
  )

  f1 = f1_score(pos_prec, pos_rec)

  print(f'Val loss {round(val_loss, 4)} accuracy {round(val_acc, 4)}')
  print(f"Val len loss {round(val_len_loss, 4)}")
  print(f"Precision: {round(pos_prec, 4)} Recall: {round(pos_rec, 4)}")
  print(f"F1-score: {round(f1, 4)}")
  print(f"Predictions distribution: {preds_len}")
  print(f"Labels distribution: {gold_len}")
  print()

  # Track the best epoch; uncomment the save to keep its weights.
  if f1 > best_f1:
    #torch.save(model.state_dict(), 'best_distilbert_summarizer.bin')
    best_f1 = f1