# Config

In [1]:
# file_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/bq-results-20231215-074521-1702626519432.csv"
# output_folder_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/processed_set"

In [2]:
file_path = "../data/mimic-iii-clinicalnote-v4.1.csv"
output_folder_path = "../data/processed_set"

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

# import package and load the data

In [4]:
import pandas as pd
import numpy as np
import torch
import os
import string
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

2024-01-18 14:59:35.318620: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# load data

In [5]:
X_train = pd.read_csv(os.path.join(output_folder_path, 'X_train_raw.csv'))
X_test = pd.read_csv(os.path.join(output_folder_path, 'X_test_raw.csv'))
X_val = pd.read_csv(os.path.join(output_folder_path, 'X_val_raw.csv'))

X_train.drop_duplicates(subset=['icustay_id'])['icustay_id']

0         200009
12        200014
24        200016
36        200021
48        200025
           ...  
316248    299988
316260    299992
316272    299993
316284    299995
316296    299998
Name: icustay_id, Length: 26359, dtype: int64

In [6]:
X_train_biobert = X_train[['icustay_id', 'slice_start', 'processed_text']]
X_test_biobert = X_test[['icustay_id', 'slice_start', 'processed_text']]
X_val_biobert = X_val[['icustay_id', 'slice_start', 'processed_text']]
X_train_biobert

Unnamed: 0,icustay_id,slice_start,processed_text
0,200009,2189-11-30 10:34:32,missing
1,200009,2189-11-30 12:34:32,missing
2,200009,2189-11-30 14:34:32,missing
3,200009,2189-11-30 16:34:32,respiratory care note pt received intubated pl...
4,200009,2189-11-30 18:34:32,missing
...,...,...,...
316303,299998,2181-07-06 08:47:40,missing
316304,299998,2181-07-06 10:47:40,missing
316305,299998,2181-07-06 12:47:40,missing
316306,299998,2181-07-06 14:47:40,missing


In [7]:
X_train_biobert['processed_text'] = X_train_biobert['processed_text'].fillna('missing')
X_test_biobert['processed_text'] = X_test_biobert['processed_text'].fillna('missing')
X_val_biobert['processed_text'] = X_val_biobert['processed_text'].fillna('missing')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_biobert['processed_text'] = X_train_biobert['processed_text'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_biobert['processed_text'] = X_test_biobert['processed_text'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val_biobert['processed_text']

# BioBERT embedding

In [8]:
# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("You're using: ", device)

# Load pre-trained BioBERT model
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
biobert_model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

# Send the model to GPU if available
biobert_model = biobert_model.to(device)

# Function to generate embeddings with a progress bar
def get_biobert_embeddings(texts):
    embeddings = []
    for text in tqdm(texts, desc="Generating Embeddings"):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = biobert_model(**inputs)
        # Use the average of the last hidden state for sentence embedding
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.append(embedding.flatten().tolist())
    return embeddings

You're using:  cuda


In [9]:
X_train_biobert['bio_bert_embeddings'] = get_biobert_embeddings(X_train_biobert['processed_text'])
X_test_biobert['bio_bert_embeddings'] = get_biobert_embeddings(X_test_biobert['processed_text'])
X_val_biobert['bio_bert_embeddings'] = get_biobert_embeddings(X_val_biobert['processed_text'])

Generating Embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 316308/316308 [1:27:46<00:00, 60.06it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_biobert['bio_bert_embeddings'] = get_biobert_embeddings(X_train_biobert['processed_text'])
Generating Embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105444/105444 [28:22<00:00, 61.94it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

In [None]:
# Save the datasets
X_train_biobert.to_csv(os.path.join(output_folder_path, 'X_train_biobert.csv'), index=False)
X_test_biobert.to_csv(os.path.join(output_folder_path, 'X_test_biobert.csv'), index=False)
X_val_biobert.to_csv(os.path.join(output_folder_path, 'X_val_biobert.csv'), index=False)
# y_train_id.to_csv(os.path.join(output_folder_path, 'y_train_id.csv'), index=False)
# y_test_id.to_csv(os.path.join(output_folder_path, 'y_test_id.csv'), index=False)
# y_val_id.to_csv(os.path.join(output_folder_path, 'y_val_id.csv'), index=False)

In [None]:
import pickle
# Saving X_train_bert
with open(os.path.join(output_folder_path, 'X_train_biobert.pkl'), 'wb') as file:
    pickle.dump(X_train_biobert, file)

# Saving X_test_bert
with open(os.path.join(output_folder_path, 'X_test_biobert.pkl'), 'wb') as file:
    pickle.dump(X_test_biobert, file)

# Saving X_val_bert
with open(os.path.join(output_folder_path, 'X_val_biobert.pkl'), 'wb') as file:
    pickle.dump(X_val_biobert, file)

In [None]:
X_train_biobert