# Fine Tuning BERT for MultiLabel Medical Text Classification

In [1]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 17.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 295 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 23.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 6.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transform

In [1]:
# Importing stock ml libraries

import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn import metrics

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModelForMaskedLM

import logging
logging.basicConfig(level=logging.ERROR)

import warnings
warnings.simplefilter('ignore')

In [2]:
# # Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


# Pre-Processing data

In [3]:
data = pd.read_csv('clean_data.csv')
number_of_classes = data.shape[1]-1
classes_name = data.columns[1:]


new_df = pd.DataFrame()
new_df['text'] = data['CLEAN_TEXT']
new_df['labels'] = data.iloc[:, 1:].values.tolist()
new_df = new_df.iloc[:1000,:] # decrease size of dataset for reducing processing time

In [4]:
new_df

Unnamed: 0,text,labels
0,Admitted from rehabilitation for hypotension (...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]"
1,The patient is a 65 year-old woman with end st...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"Weakness, inability to talk. This is a 41-year...","[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]"
3,"Baby Girl is a 1,385 gram, former 30 and wee...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,2-year-old male with a past medical history si...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
995,"Bleomycin / Bactrim / IV Dye, Iodine Containin...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
996,Past medical/surgical history: Hodgkin's disea...,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
997,Patient recorded as having No Known Allergies ...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
998,Patient recorded as having No Known Allergies ...,"[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


# Preparing the Dataset and Dataloader

In [5]:
# Configuration

BERT_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
# BERT_name = 'emilyalsentzer/Bio_ClinicalBERT'
# BERT_name = 'bert-base-uncased'

MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 1
LEARNING_RATE = 1e-05
# tokenizer = BertTokenizer.from_pretrained(BERT_name)
tokenizer = AutoTokenizer.from_pretrained(BERT_name)

In [6]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [7]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_data=new_df.sample(frac=train_size,random_state=200)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (1000, 2)
TRAIN Dataset: (800, 2)
TEST Dataset: (200, 2)


In [8]:
training_set.data

Unnamed: 0,text,labels
0,Shortness of breath and fatigue. This is a 70 ...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0]"
1,Patient recorded as having No Known Allergies ...,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"Type 1 diabetes (diagnosed at the age of 24), ...","[0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
3,"Intubation, Tracheostomy, G tube placement 47 ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
4,Coronary artery disease with percutaneous tran...,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
...,...,...
795,This is a 27-1/7 week infant who is now being ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]"
796,Admission Date: Discharge Date: This...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
797,Patient recorded as having No Known Allergies ...,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
798,Patient recorded as having No Known Allergies ...,"[1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0]"


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Creating the Neural Network for Fine Tuning

In [10]:
# Creating the customized model

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        # self.l1 = BertModel.from_pretrained(BERT_name)
        self.l1 = AutoModelForMaskedLM.from_pretrained(BERT_name)
        self.pre_classifier = torch.nn.Linear(30522, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, number_of_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0, :]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (L

In [11]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [12]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

# Fine Tuning the Model

In [13]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%50==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [14]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.7901211977005005


50it [35:23, 42.43s/it]

Epoch: 0, Loss:  0.48440951108932495


100it [1:10:31, 42.32s/it]


# Validating the Model


In [15]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):

            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [16]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

25it [03:08,  7.56s/it]


In [17]:
# Calculate score for multiple label classification
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [18]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

Hamming Score = 0.07734523809523809
Hamming Loss = 0.15733333333333333


# Make a single prediction

In [19]:
def encode_predict(text):
  inputs = tokenizer.encode_plus(text,None,add_special_tokens=True,max_length= 512,pad_to_max_length=True, return_token_type_ids=True)
  ids = torch.tensor(inputs['input_ids']).reshape(1,512).to(device, dtype = torch.long)
  mask = torch.tensor(inputs['attention_mask']).reshape(1,512).to(device, dtype = torch.long)
  token_type_ids = torch.tensor(inputs["token_type_ids"]).reshape(1,512).to(device, dtype = torch.long)
  return (ids, mask, token_type_ids)

In [20]:
with torch.no_grad():
  text = train_data.iloc[0]['text']
  ids, mask, token_type_ids = encode_predict(text)
  outputs = model(ids, mask, token_type_ids)
  prediction = torch.sigmoid(outputs).cpu().detach().numpy().tolist()

In [21]:
final_outputs = np.array(prediction) >=0.5
pd.DataFrame([final_outputs[0]], columns = classes_name.to_list())

Unnamed: 0,25000,2720,2724,2859,4019,41401,42731,4280,486,51881,53081,5849,5990,V053,V290
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
