# Config

In [1]:
file_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/bq-results-20231215-074521-1702626519432.csv"
output_folder_path = "/content/drive/MyDrive/mimic-iii-clinicalnote-v4.1/processed_set"

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# import package and load the data

In [3]:
import numpy as np
import torch
import os
import ast
import pandas as pd
import string
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel

In [4]:
X_train = pd.read_csv(os.path.join(output_folder_path, 'X_train_raw.csv'))
X_test = pd.read_csv(os.path.join(output_folder_path, 'X_test_raw.csv'))
X_val = pd.read_csv(os.path.join(output_folder_path, 'X_val_raw.csv'))

X_train.drop_duplicates(subset=['icustay_id'])['icustay_id']

0         200006
12        200009
24        200014
36        200025
48        200028
           ...  
248736    299981
248748    299984
248760    299986
248772    299988
248784    299995
Name: icustay_id, Length: 20733, dtype: int64

In [5]:
X_train_bert = X_train[['icustay_id', 'slice_start', 'processed_text']]
X_test_bert = X_test[['icustay_id', 'slice_start', 'processed_text']]
X_val_bert = X_val[['icustay_id', 'slice_start', 'processed_text']]
X_train_bert

Unnamed: 0,icustay_id,slice_start,processed_text
0,200006,2159-09-03 11:28:14,missing
1,200006,2159-09-03 13:28:14,missing
2,200006,2159-09-03 15:28:14,missing
3,200006,2159-09-03 17:28:14,hospital ward name 4 icu nursing admitprogress...
4,200006,2159-09-03 19:28:14,patient admitted hospital ward name 6 ett stom...
...,...,...,...
248791,299995,2116-03-05 07:44:39,respiratory care pt extubated today 0745 post ...
248792,299995,2116-03-05 09:44:39,missing
248793,299995,2116-03-05 11:44:39,missing
248794,299995,2116-03-05 13:44:39,missing


In [6]:
X_train_bert['processed_text'] = X_train_bert['processed_text'].fillna('missing')
X_test_bert['processed_text'] = X_test_bert['processed_text'].fillna('missing')
X_val_bert['processed_text'] = X_val_bert['processed_text'].fillna('missing')
X_train_bert

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_bert['processed_text'] = X_train_bert['processed_text'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_bert['processed_text'] = X_test_bert['processed_text'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val_bert['processed_text'] = X_val_bert['

Unnamed: 0,icustay_id,slice_start,processed_text
0,200006,2159-09-03 11:28:14,missing
1,200006,2159-09-03 13:28:14,missing
2,200006,2159-09-03 15:28:14,missing
3,200006,2159-09-03 17:28:14,hospital ward name 4 icu nursing admitprogress...
4,200006,2159-09-03 19:28:14,patient admitted hospital ward name 6 ett stom...
...,...,...,...
248791,299995,2116-03-05 07:44:39,respiratory care pt extubated today 0745 post ...
248792,299995,2116-03-05 09:44:39,missing
248793,299995,2116-03-05 11:44:39,missing
248794,299995,2116-03-05 13:44:39,missing


# BERT embedding

In [7]:
# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("You're using: ", device)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Send the model to GPU if available
bert_model = bert_model.to(device)

# Function to generate embeddings with a progress bar
def get_bert_embeddings(texts):
    embeddings = []
    for text in tqdm(texts, desc="Generating Embeddings"):
        # Tokenize and prepare the inputs
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
        with torch.no_grad():
            # Get the outputs from the model
            outputs = bert_model(**inputs)
        # Use the average of the last hidden state for sentence embedding
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.append(embedding.flatten().tolist())
    return embeddings

You're using:  cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
X_train_bert['bert_embeddings'] = get_bert_embeddings(X_train_bert['processed_text'])
X_test_bert['bert_embeddings'] = get_bert_embeddings(X_test_bert['processed_text'])
X_val_bert['bert_embeddings'] = get_bert_embeddings(X_val_bert['processed_text'])

Generating Embeddings:  66%|██████▌   | 163908/248796 [36:38<15:48, 89.48it/s]

In [None]:
X_train_bert = X_train_bert.drop(['processed_text'], axis=1)
X_test_bert = X_test_bert.drop(['processed_text'], axis=1)
X_val_bert = X_val_bert.drop(['processed_text'], axis=1)

# Save training, testing and validation set to folder

In [None]:
# Save the datasets
X_train_bert.to_csv(os.path.join(output_folder_path, 'X_train_bert.csv'), index=False)
X_test_bert.to_csv(os.path.join(output_folder_path, 'X_test_bert.csv'), index=False)
X_val_bert.to_csv(os.path.join(output_folder_path, 'X_val_bert.csv'), index=False)

In [None]:
X_train_bert

# Padding

In [None]:
type(X_train_bert['bert_embeddings'][0])

In [None]:
def prepare_unstructured_sequences(df, columns_to_select, sequence_length):
    """
    Prepare padded sequences from a structured DataFrame.

    Args:
    df (pd.DataFrame): The raw DataFrame.
    columns_to_drop (list): List of column names to be dropped from the DataFrame.
    sequence_length (int): The maximum length for padding the sequences.

    Returns:
    np.array: An array of padded sequences.
    """
    # Drop specified columns
    df_structured = df[columns_to_select]

    # Group by 'icustay_id' (assumed to be the first column)
    df_grouped = df_structured.groupby('icustay_id').agg(lambda x: x.tolist())

    # Extract sequences for each group
    sequences = []
    for _, row in tqdm(df_grouped.iterrows(), total=df_grouped.shape[0], desc="Processing Groups"):
        # sequence = [ast.literal_eval(vector) for vector in row[columns_to_select[1]]]
        # sequences.append(sequence)
        sequences.append(row[columns_to_select[1]])

    # Pad sequences to ensure uniform length
    padded_sequences = pad_sequences(sequences, padding='post', dtype='float32')

    return padded_sequences

# Usage Example
columns_to_select = ['icustay_id', 'bert_embeddings']
sequence_length = 12
X_train_bert_padded = prepare_unstructured_sequences(X_train_bert, columns_to_select, sequence_length)
X_test_bert_padded = prepare_unstructured_sequences(X_test_bert, columns_to_select, sequence_length)
X_val_bert_padded = prepare_unstructured_sequences(X_val_bert, columns_to_select, sequence_length)

In [None]:
X_train_bert_padded

# Save padding set in folder

In [None]:
# Save the datasets
# np.savetxt(os.path.join(output_folder_path, 'X_train_bert_padded.csv'), X_train_bert_padded, delimiter=',')
# np.savetxt(os.path.join(output_folder_path, 'X_test_bert_padded.csv'), X_test_bert_padded, delimiter=',')
# np.savetxt(os.path.join(output_folder_path, 'X_val_bert_padded.csv'), X_val_bert_padded, delimiter=',')