In [None]:
%%capture
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import random
import re
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

In [None]:
# Import Bio_ClinicalBERT model from huggingface

from transformers import AutoTokenizer, AutoModel

# Tokenizer of the Bio_ClinicalBERT checkpoint; create_inputs() uses this
# module-level object to encode the sentence pairs.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# Stand-alone copy of the pretrained encoder.
# NOTE(review): BioClinicalBertNSP loads the same checkpoint itself and this
# variable is not referenced anywhere else in the visible notebook - confirm
# whether this second load is still needed.
BioClinicalBert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
# Import pubmed abstracts data into a dataframe.
# The following cell reads the columns 'neuroprosthetics', 'covid_19',
# 'deep_learning', 'human_connectome' and 'brain_machine_interfaces' from it.

data = pd.read_csv('/kaggle/input/pubmed/pubmed_abstracts.csv')

In [None]:
# Flatten the non-null entries of every topic column into one list,
# keeping the columns in the order listed here.
corpus = [
    entry
    for column in ('neuroprosthetics',
                   'covid_19',
                   'deep_learning',
                   'human_connectome',
                   'brain_machine_interfaces')
    for entry in data[column].dropna().values.tolist()
]

In [None]:
# Build text library
text = []

# Iterate over text snippets
for line in corpus:

  # Entries that are not strings cannot be parsed. Skip exactly those instead
  # of hiding every possible error behind a bare `except`.
  if not isinstance(line, str):
    continue

  # Remove abstracts which only contain title (empty "[]" abstract part).
  # Raw string: "\[" is not a valid escape sequence in a normal string literal.
  if re.match(r".*\[\]", line):
    continue

  # Append only abstract text: the part between the first "[" and the first
  # "]", minus its first character and its last two characters.
  text.append((line[line.find("[")+1:line.find("]")])[1:-2])

In [None]:
def create_inputs(text):
  """Build a tokenized next-sentence-prediction batch from abstracts.

  Every abstract with at least two non-empty sentences contributes one
  (sentence A, sentence B) pair. Sentence A is drawn at random from the
  abstract (never its last sentence). About half of the time sentence B is
  a sentence drawn at random from all abstracts passed in (label 0),
  otherwise it is the sentence that follows A (label 1).

  Args:
    text: list of abstract strings. A single string is accepted as well and
      is treated as one abstract.

  Returns:
    The output of the module-level `tokenizer` for the sentence pairs
    (PyTorch tensors, padded/truncated to 512 tokens) with an extra
    'labels' entry: a LongTensor of shape (number of pairs, 1).

  Raises:
    ValueError: if no abstract contains at least two sentences.
  """

  # A bare string would otherwise be iterated character by character
  if isinstance(text, str):
    text = [text]

  # Create a list of paragraphs, split by full-stop mark. Empty fragments
  # (for example the one after the final full stop) are dropped so that an
  # empty string can never be picked as a sentence.
  bag = []

  for paragraph in text:
    bag.append([piece.strip() for piece in paragraph.split('.') if piece.strip()])

  ListOfSentences = [x for sublist in bag for x in sublist]

  # Create list of sentences
  sentences_a = []
  sentences_b = []
  labels = []

  # Iterate over each abstract
  for para in bag:

    # Abstract should have more than 1 line
    if len(para) < 2:
      continue

    # Choose a random sentence that still has a successor
    start_key = random.randint(0,len(para)-2)
    sentences_a.append(para[start_key])

    # Append random sentence 50% of the time
    if random.random() > 0.5:
      sentences_b.append(random.choice(ListOfSentences))
      labels.append(0)

    # Append next sentence other 50% of the time
    else:
      sentences_b.append(para[start_key + 1])
      labels.append(1)

  # Fail with a clear message instead of handing empty lists to the tokenizer
  if not sentences_a:
    raise ValueError("create_inputs needs at least one abstract with two or more sentences")

  # BERT Tokenizer, 512 length sentences
  inputs = tokenizer(sentences_a,sentences_b,
                    return_tensors='pt',
                    max_length = 512,
                    truncation = True,
                    padding = 'max_length')
  # Add a labels section, one row per sentence pair
  inputs['labels'] = torch.LongTensor([labels]).T

  return inputs

In [None]:
# Number of usable abstracts, i.e. entries of `corpus` that were appended to
# `text` by the extraction loop above
len(text)

In [None]:
# Shuffle randomly, to intermix topics.
# A dedicated, seeded generator keeps the train/valid/test split reproducible
# without touching the global `random` state used elsewhere.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(text)

In [None]:
# split into train, valid and test.
# The slices are contiguous: each one starts where the previous one ends, so
# no abstract (indices 20000 and 25000 in particular) is left out of every split.
train = text[:20000]
valid = text[20000:25000]
test = text[25000:]

In [None]:
# Build PyTorch dataset class instance for clinical text
class MedTextDataset(torch.utils.data.Dataset):
  """Dataset that turns one abstract into one tokenized sentence pair.

  Abstracts that do not contain at least two non-empty sentences (split on
  full stops) cannot produce a pair and are dropped at construction time, so
  `len(dataset)` can be smaller than `len(text)`.
  """

  def __init__(self,text):
    # Keep only abstracts from which a sentence pair can be built
    self.text = [abstract for abstract in text if self._has_sentence_pair(abstract)]

  @staticmethod
  def _has_sentence_pair(abstract):
    """Return True if the abstract has two or more non-empty sentences."""
    return sum(1 for piece in abstract.split('.') if piece.strip()) >= 2

  def __len__(self):
    return len(self.text)

  def __getitem__(self,index):
    # create_inputs expects a list of abstracts. Passing the bare string would
    # make it iterate over single characters.
    dictionary = create_inputs([self.text[index]])

    # Drop the leading batch dimension of size 1 so the DataLoader can stack
    # the items into (batch, ...) tensors.
    return {key: value.squeeze(0) for key, value in dictionary.items()}

In [None]:
# Wrap each split in a MedTextDataset (train first, then valid, then test)
train_dataset, valid_dataset, test_dataset = (
    MedTextDataset(split) for split in (train, valid, test)
)

In [None]:
# Build one shuffling DataLoader per split, all with the same batch size

BATCH_SIZE = 32
trainloader, validloader, testloader = (
    torch.utils.data.DataLoader(split, batch_size = BATCH_SIZE, shuffle = True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

In [None]:
# Send to 'GPU' when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Build BERT Model + Linear Layer + Sigmoid
class BioClinicalBertNSP(nn.Module):
  """Bio_ClinicalBERT encoder followed by a linear layer and a sigmoid.

  The token embeddings of the last hidden layer are averaged over the
  positions marked by the attention mask, projected to one value and squashed
  to a probability.
  """

  def __init__(self):
    super().__init__()

    # Create models
    self.bioclinicalbert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
    self.linear = nn.Linear(768,1)

  def forward(self,input_ids,attention_mask,token_type_ids):
    """Return one probability per input sequence.

    Args:
      input_ids: token ids, shape (batch, sequence).
      attention_mask: 1 for real tokens and 0 for padding, same shape.
      token_type_ids: segment ids, same shape.

    Returns:
      Tensor of shape (batch, 1) with values in (0, 1).
    """

    # Pass through layers
    outputs = self.bioclinicalbert(input_ids, attention_mask = attention_mask,token_type_ids = token_type_ids)

    hidden = outputs['last_hidden_state']

    # Mean over real tokens only. The inputs are padded to a fixed length, so a
    # plain mean over dim 1 would be dominated by padding positions.
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim = 1)
    # clamp guards against a division by zero for an all-padding row
    token_count = mask.sum(dim = 1).clamp(min = 1.0)
    pooled = token_sum / token_count

    logits = self.linear(pooled)

    # torch.sigmoid replaces the deprecated F.sigmoid
    return torch.sigmoid(logits)

In [None]:
%config Completer.use_jedi = True

In [None]:
def compute_accuracy(model, data_loader, device):
    """Compute the classification accuracy of `model` over `data_loader`.

    The model is switched to evaluation mode (and left in it) and run without
    gradient tracking. Its outputs are rounded to 0/1 and compared with the
    labels element by element.

    Args:
        model: module called as
            `model(input_ids, token_type_ids=..., attention_mask=...)`.
        data_loader: iterable of batches; each batch is a mapping with the
            keys 'input_ids', 'attention_mask', 'token_type_ids' and 'labels'.
        device: device the batch tensors are moved to.

    Returns:
        0-dim float tensor holding the accuracy in percent. It is 0.0 when
        the loader yields no examples.
    """

    model.eval()

    # Tensor accumulator, so `.float()` below also works when no batch is seen
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    with torch.no_grad():

        for batch in data_loader:

            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, token_type_ids = token_type_ids, attention_mask=attention_mask)

            # Flatten both sides. Comparing (B, 1) with (B,) would broadcast
            # to (B, B) and over-count the matches.
            predicted_labels = torch.round(outputs).reshape(-1)
            true_labels = labels.reshape(-1)

            num_examples += true_labels.size(0)

            correct_pred += (predicted_labels == true_labels).sum()

    # Empty loader: avoid a division by zero
    if num_examples == 0:
        return torch.tensor(0.0, device=device)

    return correct_pred.float()/num_examples * 100

In [None]:
# Instantiate the classifier; its constructor loads the pretrained
# Bio_ClinicalBERT weights and creates a fresh linear layer
model = BioClinicalBertNSP()

In [None]:
# Freeze every parameter of the model (encoder and linear layer alike)
for param in model.parameters():
    param.requires_grad = False

In [None]:
# Re-enable gradients for the linear layer only
for head_param in (model.linear.weight, model.linear.bias):
    head_param.requires_grad = True

In [None]:
# Show which parameters are still trainable
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name,param.requires_grad)

In [None]:
%%capture
# Move the model parameters to `device`; the cell output (the module repr)
# is captured by the cell magic on the first line
model.to(device)

In [None]:
# The AdamW class of transformers is deprecated in favour of the PyTorch
# implementation and is no longer shipped with recent releases, so the
# unpinned install above can break this import. Use torch.optim.AdamW instead.
from torch.optim import AdamW

# eps and weight_decay are passed explicitly to match the defaults of the
# transformers implementation this replaces (torch defaults are 1e-8 and 1e-2).
optim = AdamW(model.parameters(), lr = 1e-5, eps = 1e-6, weight_decay = 0.0)

In [None]:
start_time = time.time()
NUM_EPOCHS = 2

# Binary cross-entropy on the probabilities returned by the model
criterion = nn.BCELoss()

for epoch in range(NUM_EPOCHS):

    # Training mode at the start of every epoch (evaluation below switches it off)
    model.train()

    for batch_idx, batch in enumerate(trainloader):

        optim.zero_grad()

        # Move the tensors of this batch to the target device
        input_ids, token_type_ids, attention_mask, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'token_type_ids', 'attention_mask', 'labels')
        )

        # Forward pass, loss and backward pass
        output = model(input_ids,
                       token_type_ids = token_type_ids,
                       attention_mask = attention_mask)
        loss = criterion(output,labels.float())
        loss.backward()

        # Progress report every 50 batches (before the weights are updated)
        if batch_idx % 50 == 0:
            print(f"epoch number = {epoch}", f"batch {batch_idx}/{len(trainloader)}", f"loss = {round(loss.item(),2)}")

        optim.step()

    # End-of-epoch evaluation: training split first, then validation split
    model.eval()

    with torch.no_grad():
        train_accuracy = compute_accuracy(model, trainloader, device)
        valid_accuracy = compute_accuracy(model, validloader, device)
        print(f'training accuracy: '
              f'{train_accuracy:.2f}%'
              f'\nvalid accuracy: '
              f'{valid_accuracy:.2f}%')

    elapsed_minutes = (time.time() - start_time)/60
    print(f'Time elapsed: {elapsed_minutes:.2f} min')

total_minutes = (time.time() - start_time)/60
print(f'Total Training Time: {total_minutes:.2f} min')

test_accuracy = compute_accuracy(model, testloader, device)
print(f'Test accuracy: {test_accuracy:.2f}%')

In [None]:
# Accuracy (in percent) on the test split, shown as the cell output.
# create_inputs samples the sentence pairs at random on every access, so the
# value can differ from the one printed after training.
compute_accuracy(model, testloader, device)