In [40]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import transformers
from transformers import AutoModel, BertTokenizerFast
from transformers import AdamW
import torch
import torch.nn as nn
from torch.utils.data import RandomSampler, SequentialSampler, TensorDataset, DataLoader

In [29]:
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT encoder and the matching fast tokenizer
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Load the dataset and preview the first rows
df = pd.read_csv("combined_data_v3.csv")
df.head()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Unnamed: 0.1,Unnamed: 0,uniq_id,patient_id,outcome,chart_labeled_date,text
0,1,1,215,True,"Mon, 07 Nov 2022 02:44:03 GMT",SOCIAL WORK ASSESSMENT Robert Wesley is a 5...
1,2,2,215,True,"Mon, 07 Nov 2022 02:44:11 GMT","SOCIAL WORK FOLLOW UP NOTE Chart reviewed,..."
2,3,3,220,True,"Mon, 07 Nov 2022 02:49:25 GMT",New Patient Visit Ashley Busch is a 29 Y fema...
3,4,4,360,True,"Mon, 07 Nov 2022 02:57:29 GMT",SW met with pt at bedside. Present was Interpr...
4,5,5,360,True,"Mon, 07 Nov 2022 02:58:26 GMT",Received SBAR regarding pt's admission. Pt is ...


In [30]:
# Remove the columns from the original data that are not needed downstream
df = df.drop(columns=['patient_id', 'uniq_id', 'chart_labeled_date', 'Unnamed: 0'])
df.head()

Unnamed: 0,outcome,text
0,True,SOCIAL WORK ASSESSMENT Robert Wesley is a 5...
1,True,"SOCIAL WORK FOLLOW UP NOTE Chart reviewed,..."
2,True,New Patient Visit Ashley Busch is a 29 Y fema...
3,True,SW met with pt at bedside. Present was Interpr...
4,True,Received SBAR regarding pt's admission. Pt is ...


In [31]:
# Encode booleans as integers (True -> 1, False -> 0) so the outcome is numeric
df = df.replace(True, 1).replace(False, 0)
df.head()

Unnamed: 0,outcome,text
0,1,SOCIAL WORK ASSESSMENT Robert Wesley is a 5...
1,1,"SOCIAL WORK FOLLOW UP NOTE Chart reviewed,..."
2,1,New Patient Visit Ashley Busch is a 29 Y fema...
3,1,SW met with pt at bedside. Present was Interpr...
4,1,Received SBAR regarding pt's admission. Pt is ...


In [32]:
# Split the texts and labels into training, validation, and test sets.
# Each split is computed once and unpacked; with the fixed random_state this
# yields exactly the same partitions as repeating the call per output.
# NOTE(review): test_size = 0.8 keeps only 20% of the rows for training and
# sends 80% to the validation/test pool -- confirm this ratio is intended.
train_set, val_test_set, train_set_labels, val_test_set_labels = train_test_split(
    df['text'], df['outcome'],
    random_state = 2022, test_size = 0.8, stratify = df['outcome'],
)

# The held-out pool is divided evenly into validation and test sets.
val_set, test_set, val_set_labels, test_set_labels = train_test_split(
    val_test_set, val_test_set_labels,
    random_state = 2022, test_size = 0.5, stratify = val_test_set_labels,
)

# Append a question/answer suffix to (up to) the first 15 training texts.
# NOTE(review): the suffix contains the ground-truth label of the example, so
# the label becomes part of the model input for these rows -- confirm this is
# intended.
# Work on an explicit copy and use .loc so the assignment is unambiguous; the
# slice also avoids an IndexError when there are fewer than 15 training rows.
train_set = train_set.copy()
for ind in train_set.index[:15]:
    train_set.loc[ind] = train_set.loc[ind] + " Q: Does this person have food insecurity? A: " + str(train_set_labels.loc[ind])

# Plain Python lists are what the tokenizer and torch.tensor calls consume later
train_set = train_set.tolist()
val_set = val_set.tolist()
test_set = test_set.tolist()
train_set_labels = train_set_labels.tolist()
val_set_labels = val_set_labels.tolist()
test_set_labels = test_set_labels.tolist()

In [33]:
# Turn each split's label list into a tensor

train_labels, val_labels, test_labels = (
    torch.tensor(split_labels)
    for split_labels in (train_set_labels, val_set_labels, test_set_labels)
)

# Tokenize and encode the texts of each split; every sequence is padded or
# truncated to exactly max_sentence tokens

max_sentence = 100

tokens_train, tokens_val, tokens_test = (
    tokenizer.batch_encode_plus(split_texts, max_length = max_sentence, padding='max_length', truncation=True)
    for split_texts in (train_set, val_set, test_set)
)

In [34]:
# Convert the encoded input ids and attention masks of each split to tensors

train_tensor, train_attention_mask = (
    torch.tensor(tokens_train[field]) for field in ('input_ids', 'attention_mask')
)
val_tensor, val_attention_mask = (
    torch.tensor(tokens_val[field]) for field in ('input_ids', 'attention_mask')
)
test_tensor, test_attention_mask = (
    torch.tensor(tokens_test[field]) for field in ('input_ids', 'attention_mask')
)

In [35]:
# PyTorch datasets and dataloaders for the training and validation sets

batch_size = 64

# Training batches are drawn in random order
train_set_data = TensorDataset(train_tensor, train_attention_mask, train_labels)
train_set_dataloader = DataLoader(
    train_set_data,
    batch_size=batch_size,
    sampler=RandomSampler(train_set_data),
)

# Validation batches are read sequentially
val_set_data = TensorDataset(val_tensor, val_attention_mask, val_labels)
val_set_dataloader = DataLoader(
    val_set_data,
    batch_size=batch_size,
    sampler=SequentialSampler(val_set_data),
)

In [36]:
# Freeze every parameter of the pretrained encoder (no gradients are tracked for them)
for frozen_param in bert.parameters():
    frozen_param.requires_grad_(False)

In [37]:
# Defines the classifier: a BERT encoder followed by a feed-forward head

fc1_input_size = 768
fc1_output_size = 512
fc2_output_size = 48
fc3_output_size = 2
class BERT(nn.Module):
    """BERT encoder followed by a three-layer classification head.

    The head maps the encoder's second output (fc1_input_size features) through
    fc1 -> fc2 -> fc3 and returns log-probabilities over fc3_output_size
    classes, which is the input format expected by nn.NLLLoss.

    Args:
        bert: module that, when called as
            ``bert(sent_id, attention_mask=mask, return_dict=False)``, returns
            a 2-tuple whose second element has ``fc1_input_size`` features per
            example.
    """

    def __init__(self, bert):
        super(BERT, self).__init__()
        self.bert = bert
        self.fc1 = nn.Linear(fc1_input_size, fc1_output_size)
        self.fc2 = nn.Linear(fc1_output_size, fc2_output_size)
        self.fc3 = nn.Linear(fc2_output_size, fc3_output_size)
        self.dropout = nn.Dropout(0.08)
        self.leakyrelu = nn.LeakyReLU()
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities for a batch.

        Args:
            sent_id: token ids passed to the encoder as its first argument.
            mask: attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Tensor with one row per example and ``fc3_output_size`` columns,
            log-softmax normalised along dim 1.
        """
        # With return_dict=False the encoder returns a tuple; only the second
        # element is used by the head.
        _, pooled_output = self.bert(sent_id, attention_mask=mask, return_dict=False)

        # The non-linearity and dropout belong between the linear layers:
        # without an activation the three layers collapse into one linear map.
        hidden = self.dropout(self.leakyrelu(self.fc1(pooled_output)))
        hidden = self.dropout(self.leakyrelu(self.fc2(hidden)))

        # The final logits go straight into log-softmax; applying dropout or
        # LeakyReLU to them would randomly zero or distort the class scores.
        logits = self.fc3(hidden)
        return self.softmax(logits)
    
# Build the classifier around the loaded encoder and place it on the selected device
model = BERT(bert).to(device)

In [42]:
# Class weights, loss function, optimizer, and number of epochs

# 'balanced' class weights computed from the training labels, stored on the model's device
weights = torch.tensor(
    compute_class_weight('balanced', classes=np.unique(train_set_labels), y = train_set_labels),
    dtype = torch.float,
).to(device)

# Weighted negative log-likelihood; the model outputs log-probabilities
binary_cross_entropy = nn.NLLLoss(weight=weights)

optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5)
num_epochs = 20

In [None]:
# Function to compute loss and train the model

def train():
    """Run one training epoch over ``train_set_dataloader``.

    Relies on the notebook-level ``model``, ``optimizer``,
    ``binary_cross_entropy``, ``device`` and ``train_set_dataloader``.

    Returns:
        tuple: ``(loss_average, preds)`` where ``loss_average`` is the mean of
        the per-batch losses and ``preds`` is a numpy array holding the model
        outputs of every batch, concatenated along axis 0 in the order the
        dataloader produced them.
    """
    model.train()
    preds = []
    loss = 0

    for batch in train_set_dataloader:

        batch = [b.to(device) for b in batch]
        sent_id, mask, labels = batch

        # Clear gradients left over from the previous step
        model.zero_grad()
        curr_preds = model(sent_id, mask)
        curr_loss = binary_cross_entropy(curr_preds, labels)
        loss += curr_loss.item()
        curr_loss.backward()
        optimizer.step()

        # Collect the outputs of every batch, not only the last one
        preds.append(curr_preds.detach().cpu().numpy())

    loss_average = loss / len(train_set_dataloader)
    preds = np.concatenate(preds, axis = 0)

    return loss_average, preds

In [None]:
# Function to evaluate the model on the validation set

def evaluate():
    """Evaluate the model on ``val_set_dataloader`` without updating weights.

    Relies on the notebook-level ``model``, ``binary_cross_entropy``,
    ``device`` and ``val_set_dataloader``.

    Returns:
        tuple: ``(loss_average, preds)`` where ``loss_average`` is the mean of
        the per-batch losses and ``preds`` is a numpy array holding the model
        outputs of every batch, concatenated along axis 0.
    """
    # Evaluation mode disables dropout
    model.eval()
    preds = []
    loss = 0

    for batch in val_set_dataloader:
        batch = [b.to(device) for b in batch]
        sent_id, mask, labels = batch

        # No gradients are needed for validation
        with torch.no_grad():
            curr_preds = model(sent_id, mask)
            curr_loss = binary_cross_entropy(curr_preds, labels)
            loss += curr_loss.item()
            preds.append(curr_preds.detach().cpu().numpy())

    loss_average = loss / len(val_set_dataloader)
    preds = np.concatenate(preds, axis = 0)

    return loss_average, preds

In [None]:
# Iterate through training and validation for num_epochs epochs, print the
# losses, and keep the weights of the epoch with the lowest validation loss

min_loss = float('inf')
# CPU snapshot of the best weights seen so far (None until the first improvement)
best_model_state = None

train_set_loss_list = []
val_set_loss_list = []

for e in range(num_epochs):
    print('\n Epoch :' + str(e + 1))
    train_set_loss, _ = train()
    val_set_loss, _ = evaluate()

    print('\nTraining Loss: ' + str(train_set_loss))
    print('\nValidation Loss: ' + str(val_set_loss))

    if val_set_loss < min_loss:
        min_loss = val_set_loss
        # Tracking the best loss alone is not enough: without a snapshot the
        # model simply ends up with the last epoch's weights. Clone the
        # tensors so later optimizer steps cannot modify the snapshot.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    train_set_loss_list.append(train_set_loss)
    val_set_loss_list.append(val_set_loss)

# Restore the best-performing weights so later cells use them
if best_model_state is not None:
    model.load_state_dict(best_model_state)

In [None]:
# Evaluates model performance on the test set

# Make sure dropout is disabled regardless of which cell ran last
model.eval()

# Run the test set through the model in batches rather than one forward pass
# over the whole set, which keeps memory use bounded.
test_outputs = []
with torch.no_grad():
    for batch_ids, batch_mask in zip(test_tensor.split(batch_size), test_attention_mask.split(batch_size)):
        batch_outputs = model(batch_ids.to(device), batch_mask.to(device))
        test_outputs.append(batch_outputs.detach().cpu().numpy())

# Print the model performance (Accuracy, F1-Score, Precision, Recall, etc.)
# The predicted class is the index of the largest log-probability
test_predictions = np.argmax(np.concatenate(test_outputs, axis = 0), axis = 1)
print(classification_report(test_labels, test_predictions))