In [None]:
import time
import random
import numpy as np
import pandas as pd
import torch

from torch.utils.data import (
    TensorDataset, DataLoader, WeightedRandomSampler, 
    RandomSampler, SequentialSampler, SubsetRandomSampler
)
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from transformers import (
    BertForSequenceClassification, BertTokenizer, AdamW, 
    get_linear_schedule_with_warmup, AutoModel, AutoTokenizer
)

Read annotated data

In [None]:
# Labelled papers: one row per paper with its title, abstract and a
# 'relevant' column, which is exposed as 'label' from here on.
# Steps, in order:
#   1. build the combined "title abstract" text column,
#   2. drop rows where that text is missing (title or abstract was NaN),
#   3. replace every remaining NaN in the frame with 0,
#   4. rename 'relevant' -> 'label'.
df_labelled = (
    pd.read_csv("data/samples/labelled_relevant_papers.csv")
    .assign(title_abstract=lambda frame: frame["title"] + " " + frame["abstract"])
    .dropna(subset=["title_abstract"])
    .fillna(0)
    .rename(columns={'relevant': 'label'})
)

In [None]:
# Model inputs (combined title + abstract strings) and their targets,
# pulled out of the dataframe as plain arrays.
labels = df_labelled["label"].values
text = df_labelled["title_abstract"].values

First, tokenize the text (title + abstract) for the BERT model

In [None]:
# SciBERT (cased vocabulary) tokenizer; the classifier loaded later uses the
# same checkpoint name, so vocabulary and model weights match.
tokenizer = BertTokenizer.from_pretrained(
    'allenai/scibert_scivocab_cased'
)

In [None]:
# Number of token ids (special tokens included) produced for every sample,
# used to sanity-check the fixed sequence length chosen for training.
token_counts = [
    len(tokenizer.encode(sample, add_special_tokens=True))
    for sample in text
]

# default=0 keeps the original result when there are no samples at all.
max_len = max(token_counts, default=0)

print('Max sentence length: ', max_len)

In [None]:
# Tokenize all samples in a single batched call and map the tokens to ids.
#
# * padding='max_length' replaces the deprecated `pad_to_max_length=True`.
# * truncation=True makes the cut-off at `max_length` explicit instead of
#   relying on the tokenizer's implicit fallback (which emits a warning for
#   every sample).
# Every row is therefore exactly 196 token ids long: longer samples are
# truncated, shorter ones are padded.
encoded = tokenizer(
    list(text),
    add_special_tokens=True,
    max_length=196,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,   # 1 = real token, 0 = padding
    return_tensors='pt',
)

# Both tensors have one row per sample.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
labels = torch.tensor(labels)

Set up classification model

In [None]:
# Everything in the training section runs on the CPU.
device = torch.device("cpu")

# SciBERT encoder with a sequence-classification head configured for two
# labels; attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "allenai/scibert_scivocab_cased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

In [None]:
# Materialise the (name, parameter) pairs of the model so they can be inspected.
params = [*model.named_parameters()]

In [None]:
# Use PyTorch's own AdamW: the `AdamW` class shipped with `transformers` is
# deprecated in favour of `torch.optim.AdamW`.
# weight_decay is set to 0.0 explicitly because torch defaults to 0.01,
# whereas the transformers implementation used here before defaulted to 0.0.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
    weight_decay=0.0,
)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Format a duration given in seconds as a human-readable string.

    The value is rounded to the nearest whole second and rendered through
    ``datetime.timedelta``, i.e. as ``h:mm:ss`` (hours are not zero-padded,
    and durations of a day or more get a leading "N day(s), ").
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

Performance measures

In [None]:
def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring class equals the true label.

    `preds` holds one row of class scores per sample (argmax is taken over
    axis 1); `labels` holds the true class ids and is flattened before the
    comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    is_correct = predicted_classes == labels.flatten()
    return np.mean(is_correct)

def flat_sensitivity(logits, labels):
    """Sensitivity (recall) of the argmax predictions against `labels`.

    The predicted class of each row of `logits` is its argmax over axis 1;
    the score is sklearn's `recall_score` with ``average='binary'``.
    """
    predicted_classes = np.argmax(logits, axis=1).flatten()
    true_classes = labels.flatten()
    return recall_score(true_classes, predicted_classes, average='binary')

def flat_f1_score(logits, labels):
    """F1 score of the argmax predictions against `labels`.

    The predicted class of each row of `logits` is its argmax over axis 1;
    the score is sklearn's `f1_score` with ``average='binary'``.
    """
    predicted_classes = np.argmax(logits, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='binary')

Final model set up

In [None]:
# ---------------------------------------------------------------------------
# Stratified k-fold fine-tuning of the classifier.
#
# NOTE(review): `model` and `optimizer` are created once in earlier cells and
# are never re-initialised inside the fold loop, so the weights carry over
# from one fold to the next. From the second fold on, the validation split
# contains papers the model was already trained on in earlier folds, so the
# per-fold validation metrics are optimistic. Left unchanged here because the
# "Save model" cell stores whatever state this loop leaves behind; confirm
# whether true cross-validation (fresh model per fold) is what is wanted.
# ---------------------------------------------------------------------------
seed_val = 49
epochs = 4
num_folds = 5
batch_size = 32

# Seed every RNG involved so fold assignment and sampling are repeatable.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed_val)

sentences = df_labelled.title_abstract.values
labels = df_labelled.label.values

# Tokenize all samples in one batched call. `padding='max_length'` and
# `truncation=True` replace the deprecated `pad_to_max_length=True` and the
# implicit truncation fallback; every row ends up exactly 196 ids long.
encoded = tokenizer(
    list(sentences),
    add_special_tokens=True,
    max_length=196,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,   # 1 = real token, 0 = padding
    return_tensors='pt',
)
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']
# Class ids must be integer (long) tensors for the classification loss.
labels = torch.as_tensor(np.asarray(labels, dtype=np.int64))

dataset = TensorDataset(input_ids, attention_masks, labels)

# Numpy views of the tensors; the splitter works on array-likes.
text_inputs_numpy = input_ids.numpy()
attention_masks_numpy = attention_masks.numpy()
labels_numpy = labels.numpy()

training_stats = []

# Measure the total training time for the whole run
total_t0 = time.time()

for fold, (train_index, val_index) in enumerate(skf.split(text_inputs_numpy, labels_numpy)):
    print(f"Fold {fold+1}/{num_folds}")

    # Index the tensors directly instead of round-tripping through numpy.
    train_idx = torch.as_tensor(train_index, dtype=torch.long)
    val_idx = torch.as_tensor(val_index, dtype=torch.long)

    train_labels = labels[train_idx]
    val_labels = labels[val_idx]

    train_dataset = TensorDataset(input_ids[train_idx], attention_masks[train_idx], train_labels)
    val_dataset = TensorDataset(input_ids[val_idx], attention_masks[val_idx], val_labels)

    print('{:>5,} training samples'.format(len(train_dataset)))
    print('{:>5,} validation samples'.format(len(val_dataset)))

    # Inverse class-frequency weights computed on this fold's training labels,
    # so each class is drawn about equally often. Counts are indexed first and
    # inverted afterwards, so a class absent from the fold never causes a
    # division by zero.
    train_labels_arr = labels_numpy[train_index]
    fold_class_counts = np.bincount(train_labels_arr)
    sample_weights = torch.as_tensor(
        1.0 / fold_class_counts[train_labels_arr], dtype=torch.double
    )

    # Training batches: class-balanced sampling with replacement.
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=WeightedRandomSampler(
            weights=sample_weights,
            num_samples=len(train_dataset),
            replacement=True,
        ),
    )

    # Validation batches: every validation sample exactly once, in order.
    # (Resampling with replacement here would evaluate on a random, duplicated
    # subset of the fold and make the metrics noisy and non-comparable.)
    validation_dataloader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        sampler=SequentialSampler(val_dataset),
    )

    total_steps = len(train_dataloader) * epochs

    # Linear decay of the learning rate over this fold's steps, no warm-up.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)

    for epoch_i in range(epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Put the model into training mode
        model.train()

        for step, batch in enumerate(train_dataloader):

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            (loss, logits) = output[:2]

            total_train_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all batches
        avg_train_loss = total_train_loss / len(train_dataloader)

        training_time = format_time(time.time() - t0)

        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # ---------------- Validation ----------------
        print("Running Validation...")
        t0 = time.time()
        model.eval()

        total_eval_loss = 0
        fold_logits = []
        fold_label_ids = []

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            with torch.no_grad():
                output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                (loss, logits) = output[:2]

            # Weight each batch loss by its size so a short final batch does
            # not count as much as a full one.
            total_eval_loss += loss.item() * b_labels.size(0)
            fold_logits.append(logits.detach().cpu().numpy())
            fold_label_ids.append(b_labels.to('cpu').numpy())

        # Score the whole validation split at once. Averaging per-batch recall
        # and F1 is not equivalent to the split-level value and breaks down on
        # batches that contain no positive sample.
        val_logits = np.concatenate(fold_logits, axis=0)
        val_label_ids = np.concatenate(fold_label_ids, axis=0)

        avg_val_accuracy = flat_accuracy(val_logits, val_label_ids)
        avg_val_loss = total_eval_loss / len(val_dataset)
        avg_val_sensitivity = flat_sensitivity(val_logits, val_label_ids)
        avg_val_f1_score = flat_f1_score(val_logits, val_label_ids)
        validation_time = format_time(time.time() - t0)

        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Sensitivity: {0:.2f}".format(avg_val_sensitivity))
        print("  F1 Score: {0:.2f}".format(avg_val_f1_score))
        print("  Validation took: {:}".format(validation_time))

        training_stats.append({
            'fold': fold + 1,
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Valid. Sensitivity': avg_val_sensitivity,
            'Valid. F1 Score': avg_val_f1_score,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

# Total wall-clock time across all folds and epochs
total_training_time = format_time(time.time() - total_t0)
print("")
print("Total training took {:}".format(total_training_time))

Save model

In [None]:
import os

output_dir = './final_model/'

# exist_ok=True creates the directory if needed and is a no-op otherwise,
# avoiding the separate exists-then-create check.
os.makedirs(output_dir, exist_ok=True)

# If the model is wrapped (the wrapper exposes the real model as `.module`),
# save the underlying model rather than the wrapper.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Report success only after both the model and the tokenizer were written.
print("Model saved to %s" % output_dir)

In [None]:
# Directory the fine-tuned model and tokenizer were written to.
output_dir = './final_model/'

# From here on, use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Restore the tokenizer and the classifier from disk.
tokenizer = BertTokenizer.from_pretrained(output_dir)
model = BertForSequenceClassification.from_pretrained(output_dir)

model.to(device)

Assess performance

In [None]:
# Tabulate the per-epoch statistics collected during training.
# Each fold repeats the same epoch numbers, so 'epoch' alone is not a unique
# row label; index by (fold, epoch) to keep every row addressable.
df_stats = pd.DataFrame(data=training_stats).set_index(['fold', 'epoch'])

Predict relevance of unlabelled papers

In [None]:
# Unlabelled papers to score; keep only the first row for each distinct title.
df_unlabelled = (
    pd.read_csv("data/unlabelled_papers.csv")
    .drop_duplicates(subset="title")
)

In [None]:
# Rows without combined title/abstract text cannot be tokenized; drop them.
# NOTE(review): assumes the CSV already provides a 'title_abstract' column — confirm.
df_unlabelled = df_unlabelled.dropna(subset=["title_abstract"])

In [None]:
# Texts to classify (no labels are available for this set)
papers = df_unlabelled["title_abstract"].values

# Tokenize all papers in one batched call and map the tokens to their word IDs.
# `padding='max_length'` and `truncation=True` replace the deprecated
# `pad_to_max_length=True` and the implicit truncation fallback, so every row
# is exactly 196 ids long, the same length used for the training data.
encoded = tokenizer(
    list(papers),
    add_special_tokens=True,
    max_length=196,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,   # 1 = real token, 0 = padding
    return_tensors='pt',
)

# Both tensors have one row per paper.
input_ids = encoded['input_ids']
attention_masks = encoded['attention_mask']

# Set the batch size
batch_size = 32

# No sampler is passed, so the DataLoader keeps dataset order and the
# predictions line up with the rows of `papers`.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_dataloader = DataLoader(prediction_data, batch_size=batch_size)

In [None]:
# Score the unlabelled papers with the fine-tuned classifier.
# Evaluation mode: the model is used for inference only.
model.eval()

# One array of logits per batch, in dataloader order.
predictions = []

# No gradients are needed for inference.
with torch.no_grad():
    for b_input_ids, b_input_mask in prediction_dataloader:
        # Move the batch to the device the model lives on.
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)

        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

        # The first output element holds the logits; bring them back to the
        # CPU as a numpy array before storing.
        logits = outputs[0].detach().cpu().numpy()
        predictions.append(logits)

In [None]:
# Predicted class per paper, kept grouped by batch: one 1-D array of argmax
# class ids for each array of logits in `predictions`.
pred_labels = [
    np.argmax(batch_logits, axis=1).flatten()
    for batch_logits in predictions
]