##### Improving Prediction Accuracy of Sepsis Mortality using Machine Learning and Natural Language Processing
## Tyler Kelly

# Machine Learning Pipeline

## Set Up and Install Dependencies

In [None]:
#pip install datasets transformers pandas shap

## Part 0 Preprocessing (Pull Code from Author's ipynb)

The following code is adapted from the github repository 'https://github.com/yuyinglu2000/Sepsis-Mortality'

In [1]:
#AC

import warnings
warnings.filterwarnings("ignore")
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import lightgbm as lgb

### Data Mapping

Begin by creating bigquery search to get the 38 unique features

In [2]:
#df_raw = pd.read_csv('Data/data_raw.csv')

In [3]:
#df_raw.shape
# Expect to get a dataframe 808188x38 (38 old columns)

In [5]:
#AC

# regroup the race
race_mapping = {
    'WHITE': 'White',
    'HISPANIC OR LATINO': 'Hispanic or Latin',
    'BLACK/AFRICAN AMERICAN': 'Black or African American',
    'BLACK/CARIBBEAN ISLAND': 'Black or African American',
    'HISPANIC/LATINO - DOMINICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - CENTRAL AMERICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - GUATEMALAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - PUERTO RICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - SALVADORAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - HONDURAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - MEXICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - CUBAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - COLUMBIAN': 'Hispanic or Latin',
    'BLACK/CAPE VERDEAN': 'Black or African American',
    'BLACK/AFRICAN': 'Black or African American',
    'SOUTH AMERICAN': 'Hispanic or Latin',
    'WHITE - BRAZILIAN': 'Hispanic or Latin',
    'WHITE - OTHER EUROPEAN': 'White',
    'WHITE - RUSSIAN': 'White',
    'WHITE - EASTERN EUROPEAN': 'White',
    'ASIAN': 'Others race',
    'ASIAN - SOUTH EAST ASIAN': 'Others race',
    'ASIAN - CHINESE': 'Others race',
    'ASIAN - ASIAN INDIAN': 'Others race',
    'ASIAN - KOREAN': 'Others race',
    'AMERICAN INDIAN/ALASKA NATIVE': 'Others race',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'Others race',
    'MULTIPLE RACE/ETHNICITY': 'Others race',
    'PORTUGUESE': 'Others race',
    'UNKNOWN': 'Others race',
    'OTHER': 'Others race',
    'UNABLE TO OBTAIN': 'Others race',
    'PATIENT DECLINED TO ANSWER': 'Others race'
}

#df_raw['race'] = df_raw['race'].map(race_mapping)

In [6]:
#AC

import pandas as pd

# Assuming df is your DataFrame
# df = ... (your DataFrame)

# Define a mapping for antibiotics to their respective groups
antibiotic_mapping = {
    'Gentamicin Sulfate': 'Aminoglycoside',
    'Tobramycin Sulfate': 'Aminoglycoside',
    'Streptomycin Sulfate': 'Aminoglycoside',
    'Neomycin Sulfate': 'Aminoglycoside',
    'Neomycin/Polymyxin B Sulfate': 'Aminoglycoside',
    'Meropenem': 'Carbapenem',
    'Meropenem Graded Challenge': 'Carbapenem',
    'Vancomycin': 'Glycopeptide',
    'Vancomycin Oral Liquid': 'Glycopeptide',
    'Vancomycin Antibiotic Lock': 'Glycopeptide',
    'Vancomycin Enema': 'Glycopeptide',
    'Vancomycin Intrathecal': 'Glycopeptide',
    'Vancomycin Ora': 'Glycopeptide',
    'Linezolid': 'Oxazolidinone',
    'Linezolid Suspension': 'Oxazolidinone',
    'Penicillin G Benzathine': 'Penicillin',
    'Penicillin G Potassium': 'Penicillin',
    'Penicillin V Potassium': 'Penicillin',
    'Sulfameth/Trimethoprim': 'Sulfonamide',
    'Sulfameth/Trimethoprim DS': 'Sulfonamide',
    'Sulfameth/Trimethoprim SS': 'Sulfonamide',
    'Sulfamethoxazole-Trimethoprim': 'Sulfonamide',
    'Sulfameth/Trimethoprim Suspension': 'Sulfonamide',
    'Tetracycline': 'Tetracycline',
    'Tetracycline HCl': 'Tetracycline'
    # Add more mappings as needed
}

# Applying the mapping to the 'antibiotic' column
#df_raw['antibiotic'] = df_raw['antibiotic'].map(antibiotic_mapping)

In [7]:
#AC
#df_raw['antibiotic'].unique()

### Get Dummy Variables

In [None]:
#AC
df_encoded = pd.get_dummies(df_raw, columns=df_raw.select_dtypes(include=['object']).columns)
df_dropped = df_encoded.dropna()
df_dropped.info()

In [None]:
### Check empty values *for tetracycline* ###
#AC
empty_values = df_dropped['antibiotic_Tetracycline'].isnull().any()
empty_values

In [None]:
df.info

After applying get_dummy_variables there is now 53 columns

### Drop Duplications

In [None]:
#AC with minor edits

duplicated_rows_mask = df_dropped['subject_id'].duplicated(keep=False)

# Extract the duplicated rows
duplicated_rows = df_dropped[duplicated_rows_mask]
new_data  = df_dropped.drop_duplicates()
duplicated_rows_mask = new_data['subject_id'].duplicated(keep=False)

# Extract the duplicated rows
duplicated_rows = new_data[duplicated_rows_mask]
# Separate out columns based on data types
int_float_cols = new_data.select_dtypes(include=['int64', 'float64']).columns
uint8_cols = new_data.select_dtypes(include=['uint8']).columns

# Sort dataframe
# For int and float columns: sort in descending order so that larger values come first
df_raw = new_data.sort_values(by=list(int_float_cols), ascending=False)

# For uint8 columns: sort in descending order so that 1 comes before 0
df_raw = df_raw.sort_values(by=list(uint8_cols), ascending=False)

# Drop duplicates based on subject_id, keeping the first (which are the desired rows after sorting)
df_reduced = df_raw.drop_duplicates(subset='subject_id', keep='first')

# Reset index if needed
df_reduced = df_reduced.reset_index(drop=True)
pd.set_option('display.max_columns', None)
df_reduced

In [None]:
df_reduced.shape

After reducing the dataframe we get the 6401 patients reported in the paper.

In [None]:
df_reduced.info

In [None]:
df_subject = df_reduced['subject_id']

In [None]:
df_subject.to_csv("data_subject_id_ready_to_query.csv", index=False)

## Part 1 Upload data_ready_to_merge.csv to BigQuery

Upload the subject_id dataset to bigquery and merge the dataset to radiology and discharge notes separately. Save the downloaded file to downloads (or find a way to save it directly to my BIOST 2021 Thesis / Main ) as
'data_radiology_notes.csv' and 'data_discharge_notes.csv' then joing below.

Place SQL code chunks below (if time write them into the script)

# READ THIS

data_full_notes_old.csv uses an old outdated dataset but I haven't figured out the correct sql to get the correct dataset at 6401.

For now, I will use `df_old` when using the outdated dataset and `df` when I get the new corrected one.

To get the radiology and discharge notes, I uploaded the data_after_cleaning table to big query and used the below code sql query to get a table with
data_after_cleaning joined to the notes columns found in the mimic-iv discharge and radiology tables.

Above, using the correct subject_id list to get the 6401 patients, I use the same method to obtain the rad_notes and discharge_notes tables using bigquery, and join in an identical manner.

In [8]:
df = pd.read_csv('Data/Old/data_full_notes_old.csv') # df_old
# change after fixing sql

In [9]:
df.shape

(303994, 58)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303994 entries, 0 to 303993
Data columns (total 58 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   int64_field_0                          303994 non-null  int64  
 1   subject_id                             303994 non-null  int64  
 2   hospital_expire_flag                   303994 non-null  int64  
 3   max_age                                303994 non-null  int64  
 4   los_icu                                303994 non-null  float64
 5   first_hosp_stay                        303994 non-null  bool   
 6   suspected_infection                    303994 non-null  int64  
 7   sofa_score                             303994 non-null  int64  
 8   sepsis3                                303994 non-null  bool   
 9   avg_urineoutput                        303994 non-null  float64
 10  glucose_min                            303994 non-null  

## Part 2 Truncate Notes for Word2Vec

In [11]:
from joblib import Parallel, delayed
import pandas as pd
import re
import multiprocessing

In [12]:
# === Step 1: Define cleaning function - Clean individual note text ===
def clean_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'_+', '', text)    # Remove underlines
    text = re.sub(r'[^\w\s.,:;!?()\-\n]', '', text)  # Remove junk, keep clinical symbols
    return text.strip()

In [52]:
# === Step 2: Function to process one group. Process a group into (subject_id, note_type_1, combined_notes) ===
def process_group(record):
    subject_id = record['subject_id']
    note_type_1 = record['note_type_1'] #this column identifies addendums and base notes to just be 'radiology' notes
    texts = record['text']
    cleaned_notes = [clean_text(text) for text in texts]
    combined_notes = " ".join(cleaned_notes)
    return {
        'subject_id': subject_id,
        'note_type_1': note_type_1,
        'combined_notes': combined_notes
    }

In [14]:
# === Step 3: Group the notes. Load and group your data ===
# Replace this with your actual loading logic / dataframe
# Data loaded above as `df`
grouped_df = (
    df.groupby(['subject_id', 'note_type_1'])['text']
    .apply(list)
    .reset_index()
)

records = grouped_df.to_dict('records')

In [15]:
# === Step 4: Parallel processing with joblib ===

#this num_cores variable is used later for other parallel processing jobs
num_cores = multiprocessing.cpu_count() - 1

processed = Parallel(n_jobs=num_cores)(
    delayed(process_group)(record) for record in records
)

The following step creates a long dataframe for notes per patient by type (some patients have radiology, discharge, both, or none for notes), it is later converted to wide to potentially compare NLP to see if discharge notes significantly provide insight into mortality rates.

In [16]:
# === Step 5: Create DataFrame and save to csv ===
nlp_long_df = pd.DataFrame(processed).sort_values(by=['subject_id', 'note_type_1'])

nlp_long_df.to_csv("Data/Old/data_trunc_notes_old.csv", index=False) # change after fixing sql

In [17]:
nlp_long_df.shape

(10402, 3)

In [53]:
# === Step 6: Pivot to wide format for multiple columns ===
# Convert note_type_1 to columns like 'Radiology_notes', etc.
nlp_wide_df = nlp_long_df.pivot(
    index='subject_id',
    columns='note_type_1',
    values='combined_notes'
).reset_index()

nlp_wide_df.columns.name = None # Remove category label

# Rename columns to make clear
nlp_wide_df = nlp_wide_df.rename(columns={
    'radiology': 'Radiology_notes',
    'discharge': 'Discharge_summary_notes'
})

nlp_wide_df = nlp_wide_df.fillna("") #fills NA columns with empty strings

# Save
nlp_wide_df.to_csv("Data/Old/data_trunc_notes_wide_old.csv", index=False) # change after fixing sql

In [19]:
# === Step 7: Combine Radiology and Discharge notes per subject_id ===
nlp_combined_df = nlp_wide_df.copy()

# Concatenate the two columns into one
nlp_combined_df['combined_notes'] = (
    nlp_combined_df['Radiology_notes'].str.strip() + " " +
    nlp_combined_df['Discharge_summary_notes'].str.strip()
).str.strip()

# Combined DataFrame with just subject_id + combined text
nlp_combined_notes_df = nlp_combined_df[['subject_id', 'combined_notes']]

# Save
nlp_combined_notes_df.to_csv("Data/Old/data_trunc_notes_combined_old.csv", index=False) # change after fixing sql

The code chunk below creates `nlp_ready_df`, a dataframe that includes a row for each subject_id and appends 3 new columns with all text truncated based on radiology notes, discharge notes, and combined radiology and discharged notes. I lose information related to the note_id, note_id_type, and more importantly charttime, but here it allows word2vec to work properly. This part of my thesis focuses more on using embeddings from the clinical text to aid in predicting mortality rather than finding the best way to do it (i.e. finding the best time of day where it is more likely to happen than not, or finding the best drug at predicting it, etc.).

In [20]:
# === Step 8: Join nlp_wide_df and nlp_combined_notes_df to data_after_cleaning ===

# === Step i: Load the original cleaned dataset, df_reduced ===
# ie this is df_reduced

# For now, this is `df_clean` the cleaned dataset.

df_clean = pd.read_csv('Data/Old/data_after_cleaning.csv')

# === Step ii: Merge df_reduced with nlp_wide_df ===
# This adds the radiology and discharge notes as 2 new columns to df_reduced
# use data_clean until df_reduced is finalized

nlp_ready_df = df_clean.merge(
    nlp_wide_df,
    on='subject_id',
    how='left'
)

# === Step iii: Merge with combined notes ===
# This adds one new column of all notes combined together as a single note (per patient) to the nlp_ready_df above
nlp_ready_df = nlp_ready_df.merge(
    nlp_combined_notes_df,
    on='subject_id',
    how='left'
)



# Save
nlp_ready_df.to_csv("Data/Old/data_nlp_ready_old.csv", index=False)


In [21]:
# === Step 9: Check shape of dataframes ===
nlp_wide_df.shape

(5208, 3)

In [22]:
nlp_combined_notes_df.shape

(5208, 2)

In [24]:
#df_reduced.shape

In [25]:
nlp_ready_df.shape

(5208, 51)

In [26]:
nlp_ready_df.columns

Index(['subject_id', 'hospital_expire_flag', 'max_age', 'los_icu',
       'first_hosp_stay', 'suspected_infection', 'sofa_score', 'sepsis3',
       'avg_urineoutput', 'glucose_min', 'glucose_max', 'glucose_average',
       'sodium_max', 'sodium_min', 'sodium_average', 'diabetes_without_cc',
       'diabetes_with_cc', 'severe_liver_disease', 'aids', 'renal_disease',
       'heart_rate_min', 'heart_rate_max', 'heart_rate_mean', 'sbp_min',
       'sbp_max', 'sbp_mean', 'dbp_min', 'dbp_max', 'dbp_mean',
       'resp_rate_min', 'resp_rate_max', 'resp_rate_mean', 'spo2_min',
       'spo2_max', 'spo2_mean', 'coma', 'albumin',
       'race_Black or African American', 'race_Hispanic or Latin',
       'race_Others race', 'race_White', 'antibiotic_Vancomycin',
       'antibiotic_Vancomycin Antibiotic Lock', 'antibiotic_Vancomycin Enema',
       'antibiotic_Vancomycin Intrathecal',
       'antibiotic_Vancomycin Oral Liquid', 'gender_F', 'gender_M',
       'Discharge_summary_notes', 'Radiology_note

In [27]:
nlp_ready_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 51 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   subject_id                             5208 non-null   int64  
 1   hospital_expire_flag                   5208 non-null   int64  
 2   max_age                                5208 non-null   int64  
 3   los_icu                                5208 non-null   float64
 4   first_hosp_stay                        5208 non-null   bool   
 5   suspected_infection                    5208 non-null   int64  
 6   sofa_score                             5208 non-null   int64  
 7   sepsis3                                5208 non-null   bool   
 8   avg_urineoutput                        5208 non-null   float64
 9   glucose_min                            5208 non-null   int64  
 10  glucose_max                            5208 non-null   int64  
 11  gluc

## Part 3 Create Note File for Word2Vec

In [28]:
import pandas as pd

# Write the 'Radiology_notes' column to a text file, one line per document
with open("Data/Old/W2V_old/w2v_Radiology_notes.txt", "w", encoding="utf-8") as f: # change after fixing sql
    for line in nlp_ready_df["Radiology_notes"]:
        f.write(str(line).strip() + "\n")

In [29]:
# Write the 'Discharge_summary_notes' column to a text file, one line per document
with open("Data/Old/W2V_old/w2v_Discharge_notes.txt", "w", encoding="utf-8") as f: # change after fixing sql
    for line in nlp_ready_df["Discharge_summary_notes"]:
        f.write(str(line).strip() + "\n")

In [30]:
# Write the 'combined_notes' column to a text file, one line per document
with open("Data/Old/W2V_old/w2v_combined_notes.txt", "w", encoding="utf-8") as f: # change after fixing sql
    for line in nlp_ready_df["combined_notes"]:
        f.write(str(line).strip() + "\n")

## Part 4 Prepare Word2Vec - Proceed to main.rmd

## Part 5 Model Training - Proceed to Sepsis_Model_Training.ipynb

After completing / running / saving models in model training, upload them into the workspace in the following code chunks if necessary.

## Part 6 Create Dataset for BERT

# NOTE

Consider filtering data prior to setting up these BERT datasets

In [40]:
import os
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing

import os
import pandas as pd
from joblib import Parallel, delayed

def write_single_note_clean(row, output_dir):
    try:
        subject_id = str(row["subject_id"]).strip()
        note_id = str(row["note_id"]).strip()
        note_text = row.get("text", "")

        # Ensure note_text is a clean string
        if not isinstance(note_text, str):
            note_text = "" if pd.isna(note_text) else str(note_text)
        note_text = note_text.strip()

        # Skip empty notes
        if not note_text:
            return

        # Make subject directory
        subject_dir = os.path.join(output_dir, subject_id)
        os.makedirs(subject_dir, exist_ok=True)

        # Save note
        file_path = os.path.join(subject_dir, f"{subject_id}_{note_id}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(note_text)

    except Exception as e:
        print(f"Error writing note {row.get('note_id', 'unknown')}: {e}")



# Change output directories / metadata for discharge notes
def write_mimic_notes_parallel_for_bert(df, 
                                        output_dir="BERT/BERT_old/notes_old", 
                                        metadata_csv="BERT/BERT_old/metadata_notes_old.csv", 
                                        n_jobs=num_cores):
    os.makedirs(output_dir, exist_ok=True)
    rows = df.to_dict("records")

    # Write notes in parallel
    Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(write_single_note_clean)(row, output_dir) for row in rows
    )

    # Save metadata
    metadata_cols = ['subject_id', 'note_id', 'note_type_1', 'charttime']
    metadata_df = df.dropna(subset=['subject_id', 'note_id'])
    metadata_df = metadata_df[metadata_cols]
    metadata_df.to_csv(metadata_csv, index=False)

    print(f"✅ Notes saved to: {output_dir}")
    print(f"✅ Metadata saved to: {metadata_csv}")


In [37]:
from datasets import Dataset
from joblib import Parallel, delayed

def create_bert_dataset_from_notes(metadata_csv, notes_root_dir):
    df = pd.read_csv(metadata_csv)

    def load_text(row):
        subject_id = str(row['subject_id'])
        note_id = str(row['note_id'])
        file_path = os.path.join(notes_root_dir, subject_id, f"{subject_id}_{note_id}.txt")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except FileNotFoundError:
            return ""

    texts = Parallel(n_jobs=num_cores)(delayed(load_text)(row) for _, row in df.iterrows())
    df['text'] = texts
    df = df[df['text'].str.strip() != ""]  # Remove blanks

    dataset = Dataset.from_pandas(df.reset_index(drop=True))
    print(f"✅ Dataset created with {len(dataset)} entries")
    return dataset

In [38]:
# Test mimic_notes_parallel_for_bert
import time

sample_df = df.sample(100)
start = time.time()
write_mimic_notes_parallel_for_bert(df = sample_df, output_dir="BERT/BERT_old/notes_old_test", metadata_csv="BERT/BERT_old/metadata_notes_old_test.csv", n_jobs=1)
print(f"Time for 100 notes: {time.time() - start:.2f} seconds")

✅ Notes saved to: BERT/BERT_old/notes_old_test
✅ Metadata saved to: BERT/BERT_old/metadata_notes_old_test.csv
Time for 100 notes: 0.14 seconds


In [39]:
# Extract and save notes - use df from df_full_notes.csv
write_mimic_notes_parallel_for_bert(df)

✅ Notes saved to: BERT/BERT_old/notes_old
✅ Metadata saved to: BERT/BERT_old/metadata_notes_old.csv


In [42]:
# Test Rebuild dataset
import time

start = time.time()
create_bert_dataset_from_notes("BERT/BERT_old/metadata_notes_old_test.csv", "BERT/BERT_old/notes_test")
print(f"Time for 100 notes: {time.time() - start:.2f} seconds")

✅ Dataset created with 0 entries
Time for 100 notes: 2.11 seconds


In [46]:
# Rebuild Dataset
bert_dataset = create_bert_dataset_from_notes("BERT/BERT_old/metadata_notes_old.csv", "BERT/BERT_old/notes_old")

✅ Dataset created with 303994 entries


## Part 7 Prepare BERT ML Pipeline

### Tokenize Function

In [77]:
from transformers import AutoTokenizer

# ------------------------------
# Tokenize dataset with caching
# ------------------------------
def tokenize_bert_dataset_with_cache(hf_dataset, model_name, cache_dir="tokenizer_cache"):
    """
    Tokenize a HuggingFace Dataset and cache the result.
    """
    os.makedirs(cache_dir, exist_ok=True)
    
    # Hash dataset + model name to create cache filename
    hash_input = "".join([str(item) for item in hf_dataset]) + model_name
    dataset_hash = hashlib.md5(hash_input.encode('utf-8')).hexdigest()
    cache_path = os.path.join(cache_dir, f"tokenized_{dataset_hash}.pkl")
    
    if os.path.exists(cache_path):
        print(f"Loading cached tokenized dataset from {cache_path}")
        tokenized_dataset = joblib.load(cache_path)
    else:
        print(f"Tokenizing dataset for {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenized_dataset = hf_dataset.map(
            lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=512),
            batched=False
        )
        joblib.dump(tokenized_dataset, cache_path)
        print(f"Saved tokenized dataset cache: {cache_path}")
    
    return tokenized_dataset

### Extract Embeddings

The following code chunk creates a function that is passed into a function factory defined below that accepts a tokenized_dataset and extracts the embeddings. Effectively, a tokenized_dataset is created using tokenize_bert_dataset() but the embeddings still need to be extracted for expanding the feature space and being prepped for model training in run_full_workflow(), to be defined below.

In [76]:
# ------------------------------
# Patient-level embedding extraction with model-specific caching
# ------------------------------
def extract_patient_embeddings_from_df(df_embeddings, merge_note_types=True,
                                       model_name=None, cache_dir="embedding_cache"):
    """
    Aggregate embeddings per patient, cache per model.
    """
    os.makedirs(cache_dir, exist_ok=True)
    
    hash_input = df_embeddings.to_csv(index=False) + (model_name or "")
    dataset_hash = hashlib.md5(hash_input.encode('utf-8')).hexdigest()
    cache_path = os.path.join(cache_dir, f"patient_embeddings_{dataset_hash}.pkl")
    
    if os.path.exists(cache_path):
        print(f"Loading cached patient embeddings from {cache_path}")
        patient_embeddings_df = joblib.load(cache_path)
    else:
        patient_embeddings_df = df_embeddings.groupby(['subject_id', 'note_type_1']).mean().reset_index()
        joblib.dump(patient_embeddings_df, cache_path)
        print(f"Saved patient embeddings cache: {cache_path}")

    pivoted_dict = {}
    for note_type in patient_embeddings_df['note_type_1'].unique():
        temp_df = patient_embeddings_df[patient_embeddings_df['note_type_1'] == note_type].copy()
        temp_df = temp_df.drop(columns=['note_type_1'])
        pivoted_dict[note_type] = temp_df

    if merge_note_types:
        merged_df = pivoted_dict[list(pivoted_dict.keys())[0]].copy()
        merged_df = merged_df.rename(columns={col: f"{col}_{list(pivoted_dict.keys())[0]}" 
                                              for col in merged_df.columns if col != 'subject_id'})
        for note_type, df_note in list(pivoted_dict.items())[1:]:
            df_renamed = df_note.rename(columns={col: f"{col}_{note_type}" 
                                                 for col in df_note.columns if col != 'subject_id'})
            merged_df = pd.merge(merged_df, df_renamed, on='subject_id', how='left')
        return merged_df

    return pivoted_dict

The below code chunk defines two functions to save embeddings and run a full workflow in order to tokenize, extract embeddings, create pivots, perform radiology only embedding extraction, and finally save the datasets across different BERT models.

In [69]:
import os
import pandas as pd
from datasets import Dataset
# Ensure extract_patient_embeddings() is already imported

# ------------------------------
# Function to save only new embeddings
# ------------------------------
def save_embeddings_only(merged_df, radiology_df, model_name, base_output_dir="BERT/BERT_old"):
    safe_model_name = model_name.replace('/', '_')
    model_dir = os.path.join(base_output_dir, safe_model_name)
    os.makedirs(model_dir, exist_ok=True)

    # Save merged all-note-types DataFrame
    merged_path = os.path.join(model_dir, "merged_all_note_types.csv")
    merged_df.to_csv(merged_path, index=False)
    print(f"✅ Merged all-note-types DataFrame saved to: {merged_path}")

    # Save radiology-only DataFrame
    radiology_path = os.path.join(model_dir, "merged_radiology_only.csv")
    radiology_df.to_csv(radiology_path, index=False)
    print(f"✅ Radiology-only DataFrame saved to: {radiology_path}")

# ------------------------------
# Validation helper
# ------------------------------
def validate_bert_dataset(dataset, required_columns=None):
    if required_columns is None:
        required_columns = ['subject_id', 'note_type_1', 'text']

    # HuggingFace Dataset
    if isinstance(dataset, Dataset):
        dataset_columns = dataset.column_names
    # pandas DataFrame
    elif isinstance(dataset, pd.DataFrame):
        dataset_columns = dataset.columns.tolist()
    else:
        raise TypeError("Dataset must be a pandas DataFrame or HuggingFace Dataset.")

    missing = [col for col in required_columns if col not in dataset_columns]
    if missing:
        raise ValueError(f"❌ Dataset is missing required columns: {missing}")

    print(f"✅ Dataset validated. Columns present: {dataset_columns}")
    return True

### Full Workflow Function

In [75]:
import os
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm
import hashlib
import time

# ------------------------------
# Full workflow with progress ETA
# ------------------------------
def run_full_workflow(raw_notes_df, df_clean, bert_models,
                      batch_size=16, cache_dir="BERT/BERT_cache"):
    """
    End-to-end workflow with:
      - tokenization
      - GPU batching + caching of embeddings
      - patient-level aggregation (via extract_patient_embeddings_from_df)
      - merge with cleaned clinical data
      - radiology-only extraction
      - saving outputs for multiple BERT models
      
    Displays progress checkpoints with ETA estimates.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    if isinstance(raw_notes_df, pd.DataFrame):
        hf_dataset = Dataset.from_pandas(raw_notes_df.reset_index(drop=True))
        print(f"✅ Converted raw_notes DataFrame to HuggingFace Dataset with {len(hf_dataset)} entries")
    else:
        hf_dataset = raw_notes_df
        print(f"✅ Using provided HuggingFace Dataset with {len(hf_dataset)} entries")

    validate_bert_dataset(hf_dataset)

    for model_name in bert_models:
        print(f"\n=== Processing {model_name} ===")
        start_model_time = time.time()

        # ------------------------------
        # 1️⃣ Tokenize with caching
        # ------------------------------
        start_time = time.time()
        tokenized_dataset = tokenize_bert_dataset_with_cache(
            hf_dataset, model_name,
            cache_dir=os.path.join(cache_dir, "tokenizer_cache")
        )
        print(f"Tokenization completed in {time.time() - start_time:.1f}s")

        # ------------------------------
        # 2️⃣ GPU batching + embedding extraction with caching
        # ------------------------------
        embeddings_hash_input = "".join([str(item) for item in tokenized_dataset]) + model_name
        embeddings_cache_path = os.path.join(cache_dir, f"embeddings_{hashlib.md5(embeddings_hash_input.encode()).hexdigest()}.pkl")

        if os.path.exists(embeddings_cache_path):
            print(f"Loading cached embeddings from {embeddings_cache_path}")
            embeddings_df = joblib.load(embeddings_cache_path)
        else:
            print(f"Extracting embeddings for {model_name}...")
            model = AutoModel.from_pretrained(model_name)
            model.to(device)
            model.eval()

            # ------------------------------
            # Determine VRAM-safe batch size
            # ------------------------------
            if device.type == "cuda":
                total_vram = torch.cuda.get_device_properties(device).total_memory
                reserved_vram = torch.cuda.memory_reserved(device)
                free_vram = total_vram - reserved_vram
                # Rough heuristic: 1.5MB per token per sample, plus buffer for long sequences
                max_tokens = max(len(item['input_ids']) for item in tokenized_dataset)
                estimated_bytes_per_sample = 1.5e6 * max_tokens / 512  # scale relative to typical 512 tokens
                safe_batch_size = max(1, int(free_vram / (estimated_bytes_per_sample * 1.2)))  # 20% buffer
                batch_size = min(batch_size, safe_batch_size)
                print(f"⚡ Adjusted batch size for GPU memory: {batch_size}")

            dataloader = DataLoader(tokenized_dataset, batch_size=batch_size,
                                    collate_fn=lambda batch: {
                                        'input_ids': torch.stack([torch.tensor(b['input_ids']) for b in batch]).to(device),
                                        'attention_mask': torch.stack([torch.tensor(b['attention_mask']) for b in batch]).to(device),
                                        'subject_id': [b['subject_id'] for b in batch],
                                        'note_type_1': [b['note_type_1'] for b in batch]
                                    })

            embeddings_list = []
            subject_ids = []
            note_types = []

            total_batches = len(dataloader)
            with torch.no_grad():
                for i, batch in enumerate(tqdm(dataloader, desc="Embedding batches", unit="batch")):
                    batch_start = time.time()
                    try:
                        with torch.cuda.amp.autocast():
                            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
                            emb = outputs.pooler_output.cpu().numpy()
                    except RuntimeError as e:
                        if 'out of memory' in str(e):
                            print(f"⚠️ CUDA OOM at batch {i+1}, reducing batch size and retrying...")
                            torch.cuda.empty_cache()
                            # Simple fallback: process batch one by one
                            emb_list = []
                            for j in range(len(batch['input_ids'])):
                                single_input = batch['input_ids'][j].unsqueeze(0)
                                single_mask = batch['attention_mask'][j].unsqueeze(0)
                                with torch.cuda.amp.autocast():
                                    single_output = model(input_ids=single_input, attention_mask=single_mask)
                                    emb_list.append(single_output.pooler_output.cpu().numpy())
                            emb = np.vstack(emb_list)
                        else:
                            raise e
                    embeddings_list.append(emb)
                    subject_ids.extend(batch['subject_id'])
                    note_types.extend(batch['note_type_1'])
            
                    elapsed = time.time() - batch_start
                    remaining = elapsed * (total_batches - i - 1)
                    if i % 50 == 0 or i == total_batches - 1:
                        print(f"Batch {i+1}/{total_batches} complete, ETA ~{remaining/60:.1f} min")

            # Preallocate embeddings and ID arrays to avoid repeated appends/vstack
            total_rows = sum(emb.shape[0] for emb in embeddings_list)
            embedding_dim = embeddings_list[0].shape[1]
            all_embeddings = np.empty((total_rows, embedding_dim), dtype=np.float32)
            all_subject_ids = [None] * total_rows
            all_note_types = [None] * total_rows
            
            start = 0
            for emb, ids, notes in zip(embeddings_list, subject_ids, note_types):
                n = emb.shape[0]
                all_embeddings[start:start+n, :] = emb
                all_subject_ids[start:start+n] = ids if isinstance(ids, list) else [ids]
                all_note_types[start:start+n] = notes if isinstance(notes, list) else [notes]
                start += n
            
            embeddings_df = pd.DataFrame(all_embeddings, columns=[f"dim_{i}" for i in range(embedding_dim)])
            embeddings_df['subject_id'] = all_subject_ids
            embeddings_df['note_type_1'] = all_note_types

            joblib.dump(embeddings_df, embeddings_cache_path)
            print(f"Saved embeddings cache: {embeddings_cache_path}")
            del model
            try:
                del outputs
            except NameError:
                pass
            torch.cuda.empty_cache()

        # ------------------------------
        # 3️⃣ Patient-level aggregation with caching
        # ------------------------------
        agg_cache_path = os.path.join(cache_dir, f"patient_agg_{model_name.replace('/', '_')}.pkl")
        if os.path.exists(agg_cache_path):
            print(f"Loading cached patient-level embeddings from {agg_cache_path}")
            merged_df = joblib.load(agg_cache_path)
        else:
            start_time = time.time()
            merged_df = extract_patient_embeddings_from_df(
                embeddings_df, merge_note_types=True,
                model_name=model_name,
                cache_dir=cache_dir
            )
            joblib.dump(merged_df, agg_cache_path)
            print(f"Patient aggregation completed in {time.time() - start_time:.1f}s")
            print(f"Saved patient-level cache: {agg_cache_path}")

        # ------------------------------
        # 4️⃣ Merge with cleaned clinical data
        # ------------------------------
        merged_with_cleaned = pd.merge(df_clean, merged_df, on='subject_id', how='left')

        # ------------------------------
        # 5️⃣ Radiology-only DataFrame
        # ------------------------------
        radiology_cols = [col for col in merged_with_cleaned.columns if '_radiology' in col]
        radiology_df = pd.concat([merged_with_cleaned['subject_id'], merged_with_cleaned[radiology_cols]], axis=1)

        # ------------------------------
        # 6️⃣ Save outputs
        # ------------------------------
        output_dir = os.path.join(cache_dir, model_name.replace('/', '_'))
        save_embeddings_only(merged_with_cleaned, radiology_df, model_name, base_output_dir=output_dir)

        print(f"✅ Completed workflow for {model_name} in {(time.time() - start_model_time)/60:.1f} min")
        print(f"   - All note types shape: {merged_with_cleaned.shape}")
        print(f"   - Radiology-only shape: {radiology_df.shape}")


### Embedding Extraction and Assembling BERT Datasets for Model Training

In [None]:
# ------------------------------
# Perform Embedding Extraction and BERT Dataset Assembly for Each Transformer
# ------------------------------
bert_models = [
    'emilyalsentzer/Bio_ClinicalBERT',
    'dmis-lab/biobert-base-cased-v1.2',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
]

# raw_notes_df: can be either a pandas DataFrame or a HuggingFace Dataset
# df_clean: your clinical/demographic dataset
run_full_workflow(
    raw_notes_df=bert_dataset,   # your notes dataset
    df_clean=df_clean,           # your clinical dataset
    bert_models=bert_models,     # list of BERT models
    batch_size=16,               # optional, VRAM-adaptive will adjust automatically
    cache_dir="BERT/BERT_cache"  # optional, where tokenization/embedding caches go
)

Proceed to `sepsis_model_training.ipynb` for model training, testing, etc.