# Config

In [None]:
# Path to a CSV file under the Google Drive mount point (/content/drive).
# NOTE(review): the file name suggests a BigQuery results export — confirm.
# It is not referenced by any of the cells visible in this part of the notebook.
file_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/bq-results-20231215-074521-1702626519432.csv"
# Folder used for both input and output below: the X_*_raw.csv splits are read
# from it and the X_*_biobert.csv files are written back into it.
output_folder_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/processed_set"

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used in this notebook resolve. `google.colab` is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import packages and load the data

In [None]:
import pandas as pd
import numpy as np
import torch
import os
import string
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

# Load data

In [None]:
# Read the raw train / test / validation splits from output_folder_path,
# in that order, and bind them to X_train, X_test and X_val.
X_train, X_test, X_val = (
    pd.read_csv(os.path.join(output_folder_path, csv_name))
    for csv_name in ('X_train_raw.csv', 'X_test_raw.csv', 'X_val_raw.csv')
)

# Cell output: the first occurrence of every icustay_id in the training split.
X_train['icustay_id'].drop_duplicates()

0         200006
12        200009
24        200014
36        200025
48        200028
           ...  
248736    299981
248748    299984
248760    299986
248772    299988
248784    299995
Name: icustay_id, Length: 20733, dtype: int64

In [None]:
# Keep only the columns needed for text embedding.
# `.copy()` makes each result an independent DataFrame; without it the later
# column assignments on these frames raise pandas' SettingWithCopyWarning,
# because the frames are selections taken from X_train / X_test / X_val.
X_train_biobert = X_train[['icustay_id', 'slice_start', 'processed_text']].copy()
X_test_biobert = X_test[['icustay_id', 'slice_start', 'processed_text']].copy()
X_val_biobert = X_val[['icustay_id', 'slice_start', 'processed_text']].copy()
X_train_biobert

Unnamed: 0,icustay_id,slice_start,processed_text
0,200006,2159-09-03 11:28:14,missing
1,200006,2159-09-03 13:28:14,missing
2,200006,2159-09-03 15:28:14,missing
3,200006,2159-09-03 17:28:14,hospital ward name 4 icu nursing admitprogress...
4,200006,2159-09-03 19:28:14,patient admitted hospital ward name 6 ett stom...
...,...,...,...
248791,299995,2116-03-05 07:44:39,respiratory care pt extubated today 0745 post ...
248792,299995,2116-03-05 09:44:39,missing
248793,299995,2116-03-05 11:44:39,missing
248794,299995,2116-03-05 13:44:39,missing


In [None]:
# Replace missing `processed_text` values with the literal placeholder 'missing'.
# `.assign` returns a new DataFrame instead of writing a column into a frame
# that was selected from another one, so no SettingWithCopyWarning is raised
# here or on later column assignments to these frames.
X_train_biobert = X_train_biobert.assign(processed_text=X_train_biobert['processed_text'].fillna('missing'))
X_test_biobert = X_test_biobert.assign(processed_text=X_test_biobert['processed_text'].fillna('missing'))
X_val_biobert = X_val_biobert.assign(processed_text=X_val_biobert['processed_text'].fillna('missing'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_biobert['processed_text'] = X_train_biobert['processed_text'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_biobert['processed_text'] = X_test_biobert['processed_text'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val_biobert['processed_text']

# BioBERT embedding

In [None]:
# Pick the compute device: CUDA when torch reports a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("You're using: ", device)

# Load the pre-trained BioBERT tokenizer and encoder, then place the encoder
# on the selected device.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
biobert_model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2").to(device)

# Function to generate embeddings with a progress bar
def get_biobert_embeddings(texts, batch_size=32):
    """Embed each text as the mean of BioBERT's last hidden state over its tokens.

    Texts are tokenized (truncated to at most 512 tokens) and passed through
    the module-level ``biobert_model`` in mini-batches rather than one forward
    pass per text. Batching pads shorter texts, so the mean is weighted by the
    attention mask and padded positions contribute nothing; for an unpadded
    text this is the plain mean over the sequence dimension.

    Relies on the module-level ``tokenizer``, ``biobert_model`` and ``device``.

    Args:
        texts: Iterable of strings (for example a pandas Series of note text).
        batch_size: Number of texts per forward pass. Must be >= 1;
            ``batch_size=1`` processes one text at a time.

    Returns:
        list[list[float]]: One flat embedding per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = list(texts)  # materialise so the input can be sliced into batches
    embeddings = []
    with tqdm(total=len(texts), desc="Generating Embeddings") as progress:
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
            with torch.no_grad():
                hidden = biobert_model(**inputs).last_hidden_state
            # Attention-mask-weighted mean: padded positions get weight 0, so
            # they neither add to the sum nor count towards the divisor.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
            embeddings.extend(pooled.cpu().numpy().tolist())
            progress.update(len(batch))
    return embeddings

You're using:  cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# Compute one BioBERT embedding per row of `processed_text` and attach it as
# the `bio_bert_embeddings` column.
# `.assign` builds a new DataFrame instead of writing a column into a frame
# that was selected from another one, which avoids SettingWithCopyWarning.
X_train_biobert = X_train_biobert.assign(bio_bert_embeddings=get_biobert_embeddings(X_train_biobert['processed_text']))
X_test_biobert = X_test_biobert.assign(bio_bert_embeddings=get_biobert_embeddings(X_test_biobert['processed_text']))
X_val_biobert = X_val_biobert.assign(bio_bert_embeddings=get_biobert_embeddings(X_val_biobert['processed_text']))

Generating Embeddings: 100%|██████████| 248796/248796 [47:35<00:00, 87.12it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_biobert['bio_bert_embeddings'] = get_biobert_embeddings(X_train_biobert['processed_text'])
Generating Embeddings: 100%|██████████| 82944/82944 [15:51<00:00, 87.21it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_biobert['bio_bert_embeddings'] = get_biobert_embeddings(X_test_biobert['processed_text'])
Generating Embeddings: 100%|██████████| 82944/82944 [15:53<00:00, 86.99it/s]
A value is trying to be set on a copy of 

In [None]:
# Save the datasets
# Each frame is written as CSV under output_folder_path, without the index column.
biobert_exports = (
    ('X_train_biobert.csv', X_train_biobert),
    ('X_test_biobert.csv', X_test_biobert),
    ('X_val_biobert.csv', X_val_biobert),
)
for csv_name, split_frame in biobert_exports:
    split_frame.to_csv(os.path.join(output_folder_path, csv_name), index=False)
# Disabled exports, kept as they were in the original cell:
# y_train_id.to_csv(os.path.join(output_folder_path, 'y_train_id.csv'), index=False)
# y_test_id.to_csv(os.path.join(output_folder_path, 'y_test_id.csv'), index=False)
# y_val_id.to_csv(os.path.join(output_folder_path, 'y_val_id.csv'), index=False)

In [None]:
# Display the training frame; at this point it carries the `bio_bert_embeddings`
# column added in the embedding cell.
X_train_biobert