In [96]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# specify GPU
device = torch.device("cuda")

In [97]:
df = pd.read_csv("combined_data_v3.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,uniq_id,patient_id,outcome,chart_labeled_date,text
0,1,1,215,True,"Mon, 07 Nov 2022 02:44:03 GMT",SOCIAL WORK ASSESSMENT Robert Wesley is a 5...
1,2,2,215,True,"Mon, 07 Nov 2022 02:44:11 GMT","SOCIAL WORK FOLLOW UP NOTE Chart reviewed,..."
2,3,3,220,True,"Mon, 07 Nov 2022 02:49:25 GMT",New Patient Visit Ashley Busch is a 29 Y fema...
3,4,4,360,True,"Mon, 07 Nov 2022 02:57:29 GMT",SW met with pt at bedside. Present was Interpr...
4,5,5,360,True,"Mon, 07 Nov 2022 02:58:26 GMT",Received SBAR regarding pt's admission. Pt is ...


In [98]:
# removes irrevelant columns from the original data
df = df.drop(['patient_id', 'uniq_id', 'chart_labeled_date', 'Unnamed: 0'], axis = 1)

In [99]:
df.head()

Unnamed: 0,outcome,text
0,True,SOCIAL WORK ASSESSMENT Robert Wesley is a 5...
1,True,"SOCIAL WORK FOLLOW UP NOTE Chart reviewed,..."
2,True,New Patient Visit Ashley Busch is a 29 Y fema...
3,True,SW met with pt at bedside. Present was Interpr...
4,True,Received SBAR regarding pt's admission. Pt is ...


In [100]:
# replaces True and Falses with 1 and 0 to make outcome more straightforward
df = df.replace(True, 1)
df = df.replace(False, 0)
df.head()

Unnamed: 0,outcome,text
0,1,SOCIAL WORK ASSESSMENT Robert Wesley is a 5...
1,1,"SOCIAL WORK FOLLOW UP NOTE Chart reviewed,..."
2,1,New Patient Visit Ashley Busch is a 29 Y fema...
3,1,SW met with pt at bedside. Present was Interpr...
4,1,Received SBAR regarding pt's admission. Pt is ...


In [101]:
# Using train test split, we split the text into training, validation, and test sets

train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['outcome'], 
                                                                    random_state=2022, 
                                                                    test_size=0.6, 
                                                                    stratify=df['outcome'])

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2022, 
                                                                test_size=0.4, 
                                                                stratify=temp_labels)

In [102]:
# import pretrained BERT model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [103]:
# tokenize and encode sequences in the training set
max_sentence = 100

tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length = max_sentence,
    padding='max_length',
    truncation=True
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    val_text.tolist(),
    max_length = max_sentence,
    padding='max_length',
    truncation=True
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length = max_sentence,
    padding='max_length',
    truncation=True
)

In [104]:
# convert lists to tensors and adds attention mask 

train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
train_y = torch.tensor(train_labels.tolist())

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels.tolist())

test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels.tolist())

In [105]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# define a batch size
batch_size = 32

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)

# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)

# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

In [106]:
# freeze all the parameters for fine-tuning 
for param in bert.parameters():
    param.requires_grad = False

In [107]:
class BERT_Arch(nn.Module):

    def __init__(self, bert):
        super(BERT_Arch, self).__init__()
        
        self.bert = bert 
        
        # dropout layer
        self.dropout = nn.Dropout(0.1)
      
        # relu activation function
        self.relu = nn.ReLU()

        # fully connected layer 1
        self.fc1 = nn.Linear(768,512)
      
        # fully connected layer 2 (Output layer)
        self.fc2 = nn.Linear(512,2)

        #softmax activation function
        self.softmax = nn.LogSoftmax(dim=1)

    #define the forward propagation
    def forward(self, sent_id, mask):
        #pass the inputs to the model  
        _, cls_hs = self.bert(sent_id, attention_mask=mask, return_dict=False)
      
        x = self.fc1(cls_hs)

        x = self.relu(x)

        x = self.dropout(x)

        # output layer
        x = self.fc2(x)
      
        # apply softmax activation
        x = self.softmax(x)

        return x

In [108]:
# pass the pre-trained BERT to our architecture
model = BERT_Arch(bert)

In [109]:
# push the model to GPU
model = model.to(device)

In [110]:
# Adam optimizer from huggingface transformers
from transformers import AdamW

# define the optimizer
optimizer = torch.optim.AdamW(model.parameters(),lr = 1e-5) 

In [111]:
from sklearn.utils.class_weight import compute_class_weight

#compute the class weights
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)

In [112]:
# converts list of class weights to a tensor
weights= torch.tensor(class_weights,dtype=torch.float)

# pushes to GPU
weights = weights.to(device)

# define the loss function
cross_entropy = nn.NLLLoss(weight=weights) 

# number of training epochs
epochs = 20

In [113]:
# function to train the model
def train():
    
    model.train()
    total_loss, total_accuracy = 0, 0
  
    # empty list to save model predictions
    total_preds=[]
  
    # iterate over batches
    for step, batch in enumerate(train_dataloader):
        
        # progress update after every 50 batches
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        
        # push the batch to gpu
        batch = [r.to(device) for r in batch]
 
        sent_id, mask, labels = batch
        
        # clear previously calculated gradients 
        model.zero_grad()        

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward propagation to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0 to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
  
    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    #returns the loss and predictions
    return avg_loss, total_preds

In [114]:
# function for evaluating the model
def evaluate():
    
    print("\nEvaluating...")
  
    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0
    
    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(val_dataloader):
        
        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to gpu
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():
            
            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds,labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader) 

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [115]:
# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    #train model
    train_loss, _ = train()
    
    #evaluate model
    valid_loss, _ = evaluate()
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 20

Evaluating...

Training Loss: 0.700
Validation Loss: 0.693

 Epoch 2 / 20

Evaluating...

Training Loss: 0.690
Validation Loss: 0.693

 Epoch 3 / 20

Evaluating...

Training Loss: 0.690
Validation Loss: 0.692

 Epoch 4 / 20

Evaluating...

Training Loss: 0.693
Validation Loss: 0.692

 Epoch 5 / 20

Evaluating...

Training Loss: 0.698
Validation Loss: 0.691

 Epoch 6 / 20

Evaluating...

Training Loss: 0.683
Validation Loss: 0.691

 Epoch 7 / 20

Evaluating...

Training Loss: 0.688
Validation Loss: 0.691

 Epoch 8 / 20

Evaluating...

Training Loss: 0.691
Validation Loss: 0.690

 Epoch 9 / 20

Evaluating...

Training Loss: 0.685
Validation Loss: 0.690

 Epoch 10 / 20

Evaluating...

Training Loss: 0.675
Validation Loss: 0.689

 Epoch 11 / 20

Evaluating...

Training Loss: 0.694
Validation Loss: 0.689

 Epoch 12 / 20

Evaluating...

Training Loss: 0.690
Validation Loss: 0.689

 Epoch 13 / 20

Evaluating...

Training Loss: 0.691
Validation Loss: 0.688

 Epoch 14 / 20

Eval

In [116]:
# get predictions for test data
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

In [117]:
# model's performance
preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.72      0.93      0.81        42
           1       0.67      0.29      0.40        21

    accuracy                           0.71        63
   macro avg       0.69      0.61      0.61        63
weighted avg       0.70      0.71      0.68        63



We based our model on multiple sources:
1. https://skimai.com/fine-tuning-bert-for-sentiment-analysis/
2. https://huggingface.co/docs/transformers/training
3. https://www.kaggle.com/code/harshjain123/bert-for-everyone-tutorial-implementation
4. https://www.tensorflow.org/tfmodels/nlp/fine_tune_bert

We plan to use this perfomance as a baseline to compare against when we add our project novelty (applying in-context learning to BERT) to see if our addition improves performance.