In [2]:
# import essential libraries for data analytics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import tensorflow as tf
from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel

# Report whether PyTorch can use CUDA.
print("Torch CUDA:",torch.cuda.is_available())
# Prefer the GPU when PyTorch can use one; otherwise fall back to the CPU.
# NOTE(review): the extractor class below builds its own `self.device`, so
# nothing in the visible cells reads this module-level `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')
# TensorFlow is asked separately for the GPU devices it detects.
print("GPU Available:", tf.config.list_physical_devices('GPU'))

2024-04-11 10:18:32.823759: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:18:32.823817: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 10:18:32.824374: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-11 10:18:32.828378: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Torch CUDA: True
Using device: cuda
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2024-04-11 10:18:34.328991: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-11 10:18:34.330700: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-04-11 10:18:34.331296: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


In [3]:
# Load the combined notes table; the preview below shows the columns
# id, text, los_icu and icu_death.
# NOTE(review): the preview also shows an 'Unnamed: 0' column, which usually
# means the CSV was written with its index. Confirm whether it should be
# dropped (or read with index_col=0) before it is carried into later files.
notes = pd.read_csv('../data/notes_combined.csv')
notes.head()

Unnamed: 0,id,text,los_icu,icu_death
0,20001305,INDICATION: ___ with copd in resp distress in...,2.78,1
1,20001361,CHEST X-RAY DATED ___.\n\nCOMPARISON: None.\n...,6.05,0
2,20001770,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE ...,2.87,0
3,20002506,EXAMINATION: CTA HEAD AND CTA NECK Q16 CT NEC...,6.56,0
4,20003425,ADDENDUM Findings were communicated to the EN...,4.0,0


In [4]:
# define a BioClinicalBertFeatureExtractor class
class BioClinicalBertFeatureExtractor:
    """Turn free-text notes into fixed-size Bio_ClinicalBERT embeddings.

    The pipeline is: tokenize the text, split the tokens into blocks that fit
    the model's input window, wrap each block in [CLS]/[SEP] and pad it, run
    the blocks through the model, then average everything into one vector.
    """

    def __init__(self):
        """Load the Bio_ClinicalBERT tokenizer and model onto the GPU if available."""
        self.tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT').to(self.device)
        # The model is only used for inference; be explicit that dropout is off.
        self.model.eval()

    def tokenize_and_block(self, text, block_size=510):
        """Tokenize ``text`` and split the tokens into consecutive blocks.

        Args:
            text: The note to tokenize.
            block_size: Maximum number of word-piece tokens per block. The
                default of 510 leaves room for [CLS] and [SEP] inside a
                512-token window.

        Returns:
            A list of token lists. Every token of the input appears in exactly
            one block, in order. Empty text yields a single empty block so the
            rest of the pipeline still receives one (special-tokens-only)
            sequence instead of an empty batch.

        Raises:
            ValueError: If ``block_size`` is smaller than 1.
        """
        if block_size < 1:
            raise ValueError("block_size must be a positive integer")
        tokenized_text = self.tokenizer.tokenize(text)
        if not tokenized_text:
            return [[]]
        # Slice exactly block_size tokens per step. block_size already excludes
        # the two special tokens, so subtracting 2 again would silently drop
        # two tokens at every block boundary.
        return [
            tokenized_text[start:start + block_size]
            for start in range(0, len(tokenized_text), block_size)
        ]

    def add_special_tokens_and_padding(self, blocks, max_length=512):
        """Wrap each block in [CLS]/[SEP], pad it, and build its attention mask.

        Args:
            blocks: List of token lists, as returned by ``tokenize_and_block``.
            max_length: Target sequence length after adding the special tokens.

        Returns:
            A tuple ``(input_ids, attention_masks)`` of equally long lists, one
            entry per block. The mask is 1 for the real tokens (including
            [CLS] and [SEP]) and 0 for the padding appended here.
        """
        input_ids = []
        attention_masks = []
        for block in blocks:
            tokens = ['[CLS]'] + list(block) + ['[SEP]']
            # A block longer than max_length simply gets no padding.
            padding_length = max(0, max_length - len(tokens))
            padded_block = tokens + ['[PAD]'] * padding_length
            input_ids.append(self.tokenizer.convert_tokens_to_ids(padded_block))
            # Build the mask from lengths rather than by comparing tokens to
            # '[PAD]', so a literal "[PAD]" inside the note is not masked out.
            attention_masks.append([1] * len(tokens) + [0] * padding_length)
        return input_ids, attention_masks

    def extract_features(self, input_ids, attention_masks):
        """Run the blocks through the model and return the token embeddings.

        Args:
            input_ids: List of token-id lists, one per block.
            attention_masks: Matching list of 0/1 masks.

        Returns:
            ``outputs.last_hidden_state`` from the model: the hidden states of
            the final layer only, with one row of per-token vectors per block.
        """
        input_ids = torch.as_tensor(input_ids, dtype=torch.long).to(self.device)
        attention_masks = torch.as_tensor(attention_masks, dtype=torch.long).to(self.device)
        # Inference only, so skip gradient bookkeeping.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_masks)
        # last_hidden_state holds the final layer only; the per-layer states
        # would have to be requested from the model separately.
        return outputs.last_hidden_state

    def reduce_dimension(self, all_layer_embeddings, attention_masks=None):
        """Average token embeddings into a single vector for the whole note.

        Args:
            all_layer_embeddings: Tensor returned by ``extract_features``
                (blocks x tokens x hidden). Despite the name, it holds the
                final-layer hidden states.
            attention_masks: Optional 0/1 masks matching the first two axes of
                the embeddings. When given, padded positions are left out of
                the per-block average. When omitted, every position (padding
                included) is averaged, which is the original behaviour.

        Returns:
            A NumPy array with a leading axis of size 1 followed by the hidden
            size: the mean over blocks of the per-block means.
        """
        if attention_masks is None:
            block_means = torch.mean(all_layer_embeddings, dim=1)
        else:
            mask = torch.as_tensor(attention_masks, device=all_layer_embeddings.device)
            mask = mask.unsqueeze(-1).to(all_layer_embeddings.dtype)
            # clamp avoids a division by zero for a fully masked block
            token_counts = mask.sum(dim=1).clamp(min=1)
            block_means = (all_layer_embeddings * mask).sum(dim=1) / token_counts
        # keepdim=True keeps the result two-dimensional (1 x hidden size)
        reduced_layer_embeddings = torch.mean(block_means, dim=0, keepdim=True)
        return reduced_layer_embeddings.cpu().numpy()

In [5]:
# One shared extractor, so the tokenizer and model are loaded a single time.
feature_extractor = BioClinicalBertFeatureExtractor()

def feature_extraction(text):
    """Embed one note as a single vector.

    Runs the four extractor stages in order: split the text into token
    blocks, add [CLS]/[SEP] plus padding, run the model, and average the
    result.

    Args:
        text: The note text to embed.

    Returns:
        Whatever ``reduce_dimension`` returns for the note: a NumPy array
        holding one averaged vector behind a leading axis of size 1.
    """
    # Steps 1-2: tokenize into blocks, then add special tokens and padding
    token_blocks = feature_extractor.tokenize_and_block(text)
    ids, masks = feature_extractor.add_special_tokens_and_padding(token_blocks)
    # Steps 3-4: run the model and collapse its output to one vector
    return feature_extractor.reduce_dimension(
        feature_extractor.extract_features(ids, masks)
    )

In [6]:
# Embed every note and store the result in a new 'text_embeddings' column.
# feature_extraction is called once per row, and each call runs the model,
# so this is the expensive cell of the notebook.
notes['text_embeddings'] = notes['text'].apply(feature_extraction)

In [7]:
# the text vectors are now stored in the new column 'text_embeddings'
display(notes.head())

# check the number of distinct patient ids in the dataset
print('Number of patients:', notes.id.nunique())

# check the length of the first stored embedding
# NOTE: len() reports only the first axis. reduce_dimension keeps a leading
# axis of size 1 (keepdim=True), so this prints 1 rather than the length of
# the vector itself; inspect `.shape` to see the full dimensions.
print(len(notes['text_embeddings'].iloc[0]))

Unnamed: 0,id,text,los_icu,icu_death,text_embeddings
0,20001305,INDICATION: ___ with copd in resp distress in...,2.78,1,"[[0.049554482, -0.037176017, -0.12742633, -0.0..."
1,20001361,CHEST X-RAY DATED ___.\n\nCOMPARISON: None.\n...,6.05,0,"[[0.07767977, -0.3104843, -0.00207543, 0.08938..."
2,20001770,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE ...,2.87,0,"[[0.13288823, -0.19106844, -0.029916322, 0.103..."
3,20002506,EXAMINATION: CTA HEAD AND CTA NECK Q16 CT NEC...,6.56,0,"[[0.0045207953, -0.2325671, 0.02542379, 0.1055..."
4,20003425,ADDENDUM Findings were communicated to the EN...,4.0,0,"[[0.1222434, -0.2150542, -0.06553799, 0.045977..."


Number of patients: 20403
1


In [8]:
# Write the notes together with their embeddings to CSV, without the index.
# NOTE(review): 'text_embeddings' holds NumPy arrays, which a CSV can only
# store in their text form rather than as numbers. Confirm that downstream
# code can parse that text, or consider a binary/columnar format instead.
notes.to_csv('../data/notes_embedded.csv', index=False)