##### Improving Prediction Accuracy of Sepsis Mortality using Machine Learning and Natural Language Processing
## Tyler Kelly

# Machine Learning Pipeline

## Set Up and Install Dependencies

In [None]:
#pip install datasets transformers pandas shap

## Part 0 Preprocessing (Pull Code from Author's ipynb)

The following code is adapted from the github repository 'https://github.com/yuyinglu2000/Sepsis-Mortality'

In [1]:
#AC

import warnings
warnings.filterwarnings("ignore")
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import lightgbm as lgb

### Data Mapping

Begin by creating bigquery search to get the 38 unique features

In [None]:
#df = pd.read_csv('Data/data_raw.csv')

In [None]:
df_raw.shape
# Expect to get a dataframe 808188x38 (38 old columns)

In [None]:
#AC

# regroup the race
race_mapping = {
    'WHITE': 'White',
    'HISPANIC OR LATINO': 'Hispanic or Latin',
    'BLACK/AFRICAN AMERICAN': 'Black or African American',
    'BLACK/CARIBBEAN ISLAND': 'Black or African American',
    'HISPANIC/LATINO - DOMINICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - CENTRAL AMERICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - GUATEMALAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - PUERTO RICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - SALVADORAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - HONDURAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - MEXICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - CUBAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - COLUMBIAN': 'Hispanic or Latin',
    'BLACK/CAPE VERDEAN': 'Black or African American',
    'BLACK/AFRICAN': 'Black or African American',
    'SOUTH AMERICAN': 'Hispanic or Latin',
    'WHITE - BRAZILIAN': 'Hispanic or Latin',
    'WHITE - OTHER EUROPEAN': 'White',
    'WHITE - RUSSIAN': 'White',
    'WHITE - EASTERN EUROPEAN': 'White',
    'ASIAN': 'Others race',
    'ASIAN - SOUTH EAST ASIAN': 'Others race',
    'ASIAN - CHINESE': 'Others race',
    'ASIAN - ASIAN INDIAN': 'Others race',
    'ASIAN - KOREAN': 'Others race',
    'AMERICAN INDIAN/ALASKA NATIVE': 'Others race',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'Others race',
    'MULTIPLE RACE/ETHNICITY': 'Others race',
    'PORTUGUESE': 'Others race',
    'UNKNOWN': 'Others race',
    'OTHER': 'Others race',
    'UNABLE TO OBTAIN': 'Others race',
    'PATIENT DECLINED TO ANSWER': 'Others race'
}

df_raw['race'] = df_raw['race'].map(race_mapping)

In [None]:
#AC

import pandas as pd

# Assuming df is your DataFrame
# df = ... (your DataFrame)

# Define a mapping for antibiotics to their respective groups
antibiotic_mapping = {
    'Gentamicin Sulfate': 'Aminoglycoside',
    'Tobramycin Sulfate': 'Aminoglycoside',
    'Streptomycin Sulfate': 'Aminoglycoside',
    'Neomycin Sulfate': 'Aminoglycoside',
    'Neomycin/Polymyxin B Sulfate': 'Aminoglycoside',
    'Meropenem': 'Carbapenem',
    'Meropenem Graded Challenge': 'Carbapenem',
    'Vancomycin': 'Glycopeptide',
    'Vancomycin Oral Liquid': 'Glycopeptide',
    'Vancomycin Antibiotic Lock': 'Glycopeptide',
    'Vancomycin Enema': 'Glycopeptide',
    'Vancomycin Intrathecal': 'Glycopeptide',
    'Vancomycin Ora': 'Glycopeptide',
    'Linezolid': 'Oxazolidinone',
    'Linezolid Suspension': 'Oxazolidinone',
    'Penicillin G Benzathine': 'Penicillin',
    'Penicillin G Potassium': 'Penicillin',
    'Penicillin V Potassium': 'Penicillin',
    'Sulfameth/Trimethoprim': 'Sulfonamide',
    'Sulfameth/Trimethoprim DS': 'Sulfonamide',
    'Sulfameth/Trimethoprim SS': 'Sulfonamide',
    'Sulfamethoxazole-Trimethoprim': 'Sulfonamide',
    'Sulfameth/Trimethoprim Suspension': 'Sulfonamide',
    'Tetracycline': 'Tetracycline',
    'Tetracycline HCl': 'Tetracycline'
    # Add more mappings as needed
}

# Applying the mapping to the 'antibiotic' column
df_raw['antibiotic'] = df_raw['antibiotic'].map(antibiotic_mapping)

In [None]:
#AC
df_raw['antibiotic'].unique()

### Get Dummy Variables

In [None]:
#AC
df_encoded = pd.get_dummies(df_raw, columns=df_raw.select_dtypes(include=['object']).columns)
df_dropped = df_encoded.dropna()
df_dropped.info()

In [None]:
### Check empty values *for tetracycline* ###
#AC
empty_values = df_dropped['antibiotic_Tetracycline'].isnull().any()
empty_values

In [None]:
df.info

After applying get_dummy_variables there is now 53 columns

### Drop Duplications

In [None]:
#AC with minor edits

duplicated_rows_mask = df_dropped['subject_id'].duplicated(keep=False)

# Extract the duplicated rows
duplicated_rows = df_dropped[duplicated_rows_mask]
new_data  = df_dropped.drop_duplicates()
duplicated_rows_mask = new_data['subject_id'].duplicated(keep=False)

# Extract the duplicated rows
duplicated_rows = new_data[duplicated_rows_mask]
# Separate out columns based on data types
int_float_cols = new_data.select_dtypes(include=['int64', 'float64']).columns
uint8_cols = new_data.select_dtypes(include=['uint8']).columns

# Sort dataframe
# For int and float columns: sort in descending order so that larger values come first
df_raw = new_data.sort_values(by=list(int_float_cols), ascending=False)

# For uint8 columns: sort in descending order so that 1 comes before 0
df_raw = df_raw.sort_values(by=list(uint8_cols), ascending=False)

# Drop duplicates based on subject_id, keeping the first (which are the desired rows after sorting)
df_reduced = df_raw.drop_duplicates(subset='subject_id', keep='first')

# Reset index if needed
df_reduced = df_reduced.reset_index(drop=True)
pd.set_option('display.max_columns', None)
df_reduced

In [None]:
df_reduced.shape

After reducing the dataframe we get the 6401 patients reported in the paper.

In [None]:
df_reduced.info

In [None]:
df_subject = df_reduced['subject_id']

In [None]:
df_subject.to_csv("data_subject_id_ready_to_query.csv", index=False)

## Part 1 Upload data_ready_to_merge.csv to BigQuery

Upload the subject_id dataset to bigquery and merge the dataset to radiology and discharge notes separately. Save the downloaded file to downloads (or find a way to save it directly to my BIOST 2021 Thesis / Main ) as
'data_radiology_notes.csv' and 'data_discharge_notes.csv' then joing below.

Place SQL code chunks below (if time write them into the script)

# READ THIS

data_full_notes_old.csv uses an old outdated dataset but I haven't figured out the correct sql to get the correct dataset at 6401.

For now, I will use `df_old` when using the outdated dataset and `df` when I get the new corrected one.

To get the radiology and discharge notes, I uploaded the data_after_cleaning table to big query and used the below code sql query to get a table with
data_after_cleaning joined to the notes columns found in the mimic-iv discharge and radiology tables.

Above, using the correct subject_id list to get the 6401 patients, I use the same method to obtain the rad_notes and discharge_notes tables using bigquery, and join in an identical manner.

In [2]:
df = pd.read_csv('Data/Old/data_full_notes_old.csv') # df_old
# change after fixing sql

In [3]:
df.shape

(303994, 58)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303994 entries, 0 to 303993
Data columns (total 58 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   int64_field_0                          303994 non-null  int64  
 1   subject_id                             303994 non-null  int64  
 2   hospital_expire_flag                   303994 non-null  int64  
 3   max_age                                303994 non-null  int64  
 4   los_icu                                303994 non-null  float64
 5   first_hosp_stay                        303994 non-null  bool   
 6   suspected_infection                    303994 non-null  int64  
 7   sofa_score                             303994 non-null  int64  
 8   sepsis3                                303994 non-null  bool   
 9   avg_urineoutput                        303994 non-null  float64
 10  glucose_min                            303994 non-null  

## Part 2 Truncate Notes for Word2Vec

In [5]:
from joblib import Parallel, delayed
import pandas as pd
import re
import multiprocessing

In [6]:
# === Step 1: Define cleaning function - Clean individual note text ===
def clean_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    text = re.sub(r'_+', '', text)    # Remove underlines
    text = re.sub(r'[^\w\s.,:;!?()\-\n]', '', text)  # Remove junk, keep clinical symbols
    return text.strip()

In [7]:
# === Step 2: Function to process one group. Process a group into (subject_id, note_type, combined_notes) ===
def process_group(record):
    subject_id = record['subject_id']
    note_type_1 = record['note_type_1'] #this column identifies addendums and base notes to just be 'radiology' notes
    texts = record['text']
    cleaned_notes = [clean_text(text) for text in texts]
    combined_notes = " ".join(cleaned_notes)
    return {
        'subject_id': subject_id,
        'note_type_1': note_type_1,
        'combined_notes': combined_notes
    }

In [8]:
# === Step 3: Group the notes. Load and group your data ===
# Replace this with your actual loading logic / dataframe
# Data loaded above as `df`
grouped_df = (
    df.groupby(['subject_id', 'note_type_1'])['text']
    .apply(list)
    .reset_index()
)

records = grouped_df.to_dict('records')

In [9]:
# === Step 4: Parallel processing with joblib ===

#this num_cores variable is used later for other parallel processing jobs
num_cores = multiprocessing.cpu_count() - 1

processed = Parallel(n_jobs=num_cores)(
    delayed(process_group)(record) for record in records
)

The following step creates a long dataframe for notes per patient by type (some patients have radiology, discharge, both, or none for notes), it is later converted to wide to potentially compare NLP to see if discharge notes significantly provide insight into mortality rates.

In [10]:
# === Step 5: Create DataFrame and save to csv ===
nlp_long_df = pd.DataFrame(processed).sort_values(by=['subject_id', 'note_type_1'])

nlp_long_df.to_csv("Data/Old/data_trunc_notes_old.csv", index=False) # change after fixing sql

In [54]:
nlp_long_df.shape

(10402, 3)

In [11]:
# === Step 6: Pivot to wide format for multiple columns ===
# Convert note_type to columns like 'Radiology_notes', etc.
nlp_wide_df = nlp_long_df.pivot(
    index='subject_id',
    columns='note_type_1',
    values='combined_notes'
).reset_index()

nlp_wide_df.columns.name = None # Remove category label

# Rename columns to make clear
nlp_wide_df = nlp_wide_df.rename(columns={
    'radiology': 'Radiology_notes',
    'discharge': 'Discharge_summary_notes'
})

nlp_wide_df = nlp_wide_df.fillna("") #fills NA columns with empty strings

# Save
nlp_wide_df.to_csv("Data/Old/data_trunc_notes_wide_old.csv", index=False) # change after fixing sql

In [12]:
# === Step 7: Combine Radiology and Discharge notes per subject_id ===
nlp_combined_df = nlp_wide_df.copy()

# Concatenate the two columns into one
nlp_combined_df['combined_notes'] = (
    nlp_combined_df['Radiology_notes'].str.strip() + " " +
    nlp_combined_df['Discharge_summary_notes'].str.strip()
).str.strip()

# Combined DataFrame with just subject_id + combined text
nlp_combined_notes_df = nlp_combined_df[['subject_id', 'combined_notes']]

# Save
nlp_combined_notes_df.to_csv("Data/Old/data_trunc_notes_combined_old.csv", index=False) # change after fixing sql

The code chunk below creates `nlp_ready_df`, a dataframe that includes a row for each subject_id and appends 3 new columns with all text truncated based on radiology notes, discharge notes, and combined radiology and discharged notes. I lose information related to the note_id, note_id_type, and more importantly charttime, but here it allows word2vec to work properly. This part of my thesis focuses more on using embeddings from the clinical text to aid in predicting mortality rather than finding the best way to do it (i.e. finding the best time of day where it is more likely to happen than not, or finding the best drug at predicting it, etc.).

In [52]:
# === Step 8: Join nlp_wide_df and nlp_combined_notes_df to data_after_cleaning ===

# === Step i: Load the original cleaned dataset, df_reduced ===
# ie this is df_reduced

# For now, this is `df_clean` the cleaned dataset.

df_clean = pd.read_csv('Data/Old/data_after_cleaning.csv')

# === Step ii: Merge df_reduced with nlp_wide_df ===
# This adds the radiology and discharge notes as 2 new columns to df_reduced
# use data_clean until df_reduced is finalized

nlp_ready_df = df_clean.merge(
    nlp_wide_df,
    on='subject_id',
    how='left'
)

# === Step iii: Merge with combined notes ===
# This adds one new column of all notes combined together as a single note (per patient) to the nlp_ready_df above
nlp_ready_df = nlp_ready_df.merge(
    nlp_combined_notes_df,
    on='subject_id',
    how='left'
)



# Save
nlp_ready_df.to_csv("Data/Old/data_nlp_ready_old.csv", index=False)


In [24]:
# === Step 9: Check shape of dataframes ===
nlp_wide_df.shape

(5208, 3)

In [25]:
nlp_combined_notes_df.shape

(5208, 2)

In [26]:
df_reduced.shape

NameError: name 'df_reduced' is not defined

In [27]:
nlp_ready_df.shape

(5208, 51)

In [28]:
nlp_ready_df.columns

Index(['subject_id', 'hospital_expire_flag', 'max_age', 'los_icu',
       'first_hosp_stay', 'suspected_infection', 'sofa_score', 'sepsis3',
       'avg_urineoutput', 'glucose_min', 'glucose_max', 'glucose_average',
       'sodium_max', 'sodium_min', 'sodium_average', 'diabetes_without_cc',
       'diabetes_with_cc', 'severe_liver_disease', 'aids', 'renal_disease',
       'heart_rate_min', 'heart_rate_max', 'heart_rate_mean', 'sbp_min',
       'sbp_max', 'sbp_mean', 'dbp_min', 'dbp_max', 'dbp_mean',
       'resp_rate_min', 'resp_rate_max', 'resp_rate_mean', 'spo2_min',
       'spo2_max', 'spo2_mean', 'coma', 'albumin',
       'race_Black or African American', 'race_Hispanic or Latin',
       'race_Others race', 'race_White', 'antibiotic_Vancomycin',
       'antibiotic_Vancomycin Antibiotic Lock', 'antibiotic_Vancomycin Enema',
       'antibiotic_Vancomycin Intrathecal',
       'antibiotic_Vancomycin Oral Liquid', 'gender_F', 'gender_M',
       'Discharge_summary_notes', 'Radiology_note

In [29]:
nlp_ready_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 51 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   subject_id                             5208 non-null   int64  
 1   hospital_expire_flag                   5208 non-null   int64  
 2   max_age                                5208 non-null   int64  
 3   los_icu                                5208 non-null   float64
 4   first_hosp_stay                        5208 non-null   bool   
 5   suspected_infection                    5208 non-null   int64  
 6   sofa_score                             5208 non-null   int64  
 7   sepsis3                                5208 non-null   bool   
 8   avg_urineoutput                        5208 non-null   float64
 9   glucose_min                            5208 non-null   int64  
 10  glucose_max                            5208 non-null   int64  
 11  gluc

## Part 3 Create Note File for Word2Vec

In [30]:
import pandas as pd

# Write the 'Radiology_notes' column to a text file, one line per document
with open("Data/Old/W2V_old/w2v_Radiology_notes.txt", "w", encoding="utf-8") as f: # change after fixing sql
    for line in nlp_ready_df["Radiology_notes"]:
        f.write(str(line).strip() + "\n")

In [32]:
# Write the 'Discharge_summary_notes' column to a text file, one line per document
with open("Data/Old/W2V_old/w2v_Discharge_notes.txt", "w", encoding="utf-8") as f: # change after fixing sql
    for line in nlp_ready_df["Discharge_summary_notes"]:
        f.write(str(line).strip() + "\n")

In [33]:
# Write the 'combined_notes' column to a text file, one line per document
with open("Data/Old/W2V_old/w2v_combined_notes.txt", "w", encoding="utf-8") as f: # change after fixing sql
    for line in nlp_ready_df["combined_notes"]:
        f.write(str(line).strip() + "\n")

## Part 4 Prepare Word2Vec - Proceed to main.rmd

## Part 5 Model Training - Proceed to Sepsis_Model_Training.ipynb

After completing / running / saving models in model training, upload them into the workspace in the following code chunks if necessary.

## Part 6 Create Dataset for Bert

Metadata csv files for radiology and discharge are created and saved earlier

In [34]:
import os
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing

import os
import pandas as pd
from joblib import Parallel, delayed

def write_single_note_clean(row, output_dir):
    try:
        subject_id = str(row["subject_id"]).strip()
        note_id = str(row["note_id"]).strip()
        note_text = row.get("text", "")

        # Ensure note_text is a clean string
        if not isinstance(note_text, str):
            note_text = "" if pd.isna(note_text) else str(note_text)
        note_text = note_text.strip()

        # Skip empty notes
        if not note_text:
            return

        # Make subject directory
        subject_dir = os.path.join(output_dir, subject_id)
        os.makedirs(subject_dir, exist_ok=True)

        # Save note
        file_path = os.path.join(subject_dir, f"{subject_id}_{note_id}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(note_text)

    except Exception as e:
        print(f"Error writing note {row.get('note_id', 'unknown')}: {e}")



# Change output directories / metadata for discharge notes
def write_mimic_notes_parallel_for_bert(df, 
                                        output_dir="BERT/BERT_old/rad_notes", 
                                        metadata_csv="BERT/BERT_old/metadata_rad_notes_old.csv", 
                                        n_jobs=num_cores):
    os.makedirs(output_dir, exist_ok=True)
    rows = df.to_dict("records")

    # Write notes in parallel
    Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(write_single_note_clean)(row, output_dir) for row in rows
    )

    # Save metadata
    metadata_cols = ['subject_id', 'note_id', 'note_type_1', 'charttime']
    metadata_df = df.dropna(subset=['subject_id', 'note_id'])
    metadata_df = metadata_df[metadata_cols]
    metadata_df.to_csv(metadata_csv, index=False)

    print(f"‚úÖ Notes saved to: {output_dir}")
    print(f"‚úÖ Metadata saved to: {metadata_csv}")


In [35]:
from datasets import Dataset
from joblib import Parallel, delayed

def create_bert_dataset_from_notes(metadata_csv, notes_root_dir):
    df = pd.read_csv(metadata_csv)

    def load_text(row):
        subject_id = str(row['subject_id'])
        note_id = str(row['note_id'])
        file_path = os.path.join(notes_root_dir, subject_id, f"{subject_id}_{note_id}.txt")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except FileNotFoundError:
            return ""

    texts = Parallel(n_jobs=num_cores)(delayed(load_text)(row) for _, row in df.iterrows())
    df['text'] = texts
    df = df[df['text'].str.strip() != ""]  # Remove blanks

    dataset = Dataset.from_pandas(df.reset_index(drop=True))
    print(f"‚úÖ Dataset created with {len(dataset)} entries")
    return dataset

In [2]:
from transformers import AutoTokenizer

def tokenize_bert_dataset(dataset, model_name='emilyalsentzer/Bio_ClinicalBERT', text_column='text'):
    """
    Tokenizes a HuggingFace dataset for a specified BERT model.

    Args:
        dataset: HuggingFace dataset containing at least `text_column`.
        model_name: BERT model name to use for tokenization.
        text_column: Column name containing the text to tokenize.

    Returns:
        tokenized: HuggingFace dataset with tokenized fields.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(example):
        return tokenizer(
            example[text_column],
            truncation=True,
            padding='max_length',
            max_length=512
        )

    print(f"üîÑ Tokenizing dataset for {model_name}...")
    tokenized = dataset.map(tokenize_function, batched=True)
    return tokenized

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModel
import pandas as pd
import numpy as np
import os
import joblib
import hashlib

def extract_patient_embeddings(tokenized_dataset,
                               model_name='emilyalsentzer/Bio_ClinicalBERT',
                               batch_size=16,
                               use_pooler=True,
                               device=None,
                               cache_dir='embedding_cache',
                               seed=42,
                               merge_note_types=False):
    """
    Extract patient-level embeddings from a tokenized BERT dataset with caching,
    optionally merging all note types into a single wide DataFrame ready for ML.

    Args:
        tokenized_dataset: HuggingFace Dataset with tokenized clinical notes.
            Must include 'subject_id' and 'note_type'.
        model_name: Pretrained BERT model name.
        batch_size: Batch size for inference.
        use_pooler: If True, use pooler_output; else use mean of last_hidden_state.
        device: 'cuda' or 'cpu'. If None, auto-choose.
        cache_dir: Directory to store cached embeddings.
        seed: Random seed for reproducibility.
        merge_note_types: If True, return a single DataFrame with all note types pivoted
                          as separate columns per patient.

    Returns:
        pivoted_dict OR merged_df:
            - If merge_note_types=False: dict of note_type -> patient-level DataFrame
            - If merge_note_types=True: single wide DataFrame with all note types as columns
    """
    os.makedirs(cache_dir, exist_ok=True)

    # Set seeds for reproducibility
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Generate hash-based cache filename
    hash_input = ''
    for item in tokenized_dataset:
        hash_input += str(item['subject_id'])
        hash_input += str(item['note_type'])
        hash_input += ''.join(map(str, item['input_ids']))
    dataset_hash = hashlib.md5(hash_input.encode('utf-8')).hexdigest()
    safe_model_name = model_name.replace('/', '_')
    cache_path = os.path.join(cache_dir,
                              f"{safe_model_name}_{dataset_hash}_pooler{use_pooler}.pkl")

    # Load cache if exists
    if os.path.exists(cache_path):
        print(f"Loading cached embeddings from {cache_path}...")
        patient_embeddings_df = joblib.load(cache_path)
    else:
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')

        # Load BERT model
        model = AutoModel.from_pretrained(model_name)
        model.to(device)
        model.eval()

        # Collate function
        def collate_fn(batch):
            collated = {
                'input_ids': torch.stack([torch.tensor(b['input_ids']) for b in batch]).to(device),
                'attention_mask': torch.stack([torch.tensor(b['attention_mask']) for b in batch]).to(device),
                'subject_id': [b['subject_id'] for b in batch],
                'note_type': [b['note_type'] for b in batch]
            }
            return collated

        dataloader = DataLoader(tokenized_dataset, batch_size=batch_size, collate_fn=collate_fn)

        embeddings_list = []
        subject_ids = []
        note_types = []

        with torch.no_grad():
            for batch in dataloader:
                outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
                emb = outputs.pooler_output if use_pooler else outputs.last_hidden_state.mean(dim=1)
                embeddings_list.append(emb.cpu().numpy())
                subject_ids.extend(batch['subject_id'])
                note_types.extend(batch['note_type'])

        all_embeddings = np.vstack(embeddings_list)
        df = pd.DataFrame(all_embeddings)
        df['subject_id'] = subject_ids
        df['note_type'] = note_types

        # Aggregate per patient per note_type
        patient_embeddings_df = df.groupby(['subject_id', 'note_type']).mean().reset_index()
        joblib.dump(patient_embeddings_df, cache_path)
        print(f"Saved embeddings to cache: {cache_path}")

    # Pivot each note_type into fully ready-to-merge DataFrames
    pivoted_dict = {}
    for note_type in patient_embeddings_df['note_type'].unique():
        temp_df = patient_embeddings_df[patient_embeddings_df['note_type'] == note_type].copy()
        temp_df = temp_df.drop(columns=['note_type'])
        pivoted_dict[note_type] = temp_df

    if merge_note_types:
        # Merge all note types into single wide DataFrame
        merged_df = pivoted_dict[list(pivoted_dict.keys())[0]].copy()
        merged_df = merged_df.rename(columns={col: f"{col}_{list(pivoted_dict.keys())[0]}" for col in merged_df.columns if col != 'subject_id'})

        for note_type, df_note in list(pivoted_dict.items())[1:]:
            df_renamed = df_note.rename(columns={col: f"{col}_{note_type}" for col in df_note.columns if col != 'subject_id'})
            merged_df = pd.merge(merged_df, df_renamed, on='subject_id', how='left')

        return merged_df

    return pivoted_dict


In [3]:
import os
import pandas as pd
from datasets import Dataset
# Ensure extract_patient_embeddings() is already imported

# ------------------------------
# Function to save only new embeddings
# ------------------------------
def save_embeddings_only(merged_df, radiology_df, model_name, base_output_dir="BERT/BERT_old"):
    safe_model_name = model_name.replace('/', '_')
    model_dir = os.path.join(base_output_dir, safe_model_name)
    os.makedirs(model_dir, exist_ok=True)

    # Save merged all-note-types DataFrame
    merged_path = os.path.join(model_dir, "merged_all_note_types.csv")
    merged_df.to_csv(merged_path, index=False)
    print(f"‚úÖ Merged all-note-types DataFrame saved to: {merged_path}")

    # Save radiology-only DataFrame
    radiology_path = os.path.join(model_dir, "merged_radiology_only.csv")
    radiology_df.to_csv(radiology_path, index=False)
    print(f"‚úÖ Radiology-only DataFrame saved to: {radiology_path}")

# ------------------------------
# End-to-end workflow (updated)
# ------------------------------
def run_full_workflow(raw_notes, data_after_cleaning_df, bert_models):
    """
    End-to-end workflow: tokenization, embedding extraction, pivoting, radiology-only extraction,
    and saving embedding datasets for multiple BERT models.

    Args:
        raw_notes: pandas DataFrame OR HuggingFace Dataset with columns ['subject_id', 'note_type', 'text']
        data_after_cleaning_df: pandas DataFrame with clinical/demographic features
        bert_models: list of BERT model names
    """
    # Convert pandas DataFrame to HuggingFace Dataset if needed
    if isinstance(raw_notes, pd.DataFrame):
        hf_dataset = Dataset.from_pandas(raw_notes.reset_index(drop=True))
        print(f"‚úÖ Converted raw_notes DataFrame to HuggingFace Dataset with {len(hf_dataset)} entries")
    else:
        hf_dataset = raw_notes
        print(f"‚úÖ Using provided HuggingFace Dataset with {len(hf_dataset)} entries")

    for model_name in bert_models:
        print(f"\n=== Processing {model_name} ===")

        # 1Ô∏è‚É£ Tokenize
        tokenized_dataset = tokenize_bert_dataset(hf_dataset, model_name=model_name)

        # 2Ô∏è‚É£ Extract embeddings, pivot all note types
        merged_df = extract_patient_embeddings(tokenized_dataset,
                                               model_name=model_name,
                                               merge_note_types=True)

        # 3Ô∏è‚É£ Merge with cleaned clinical data
        merged_with_cleaned = pd.merge(data_after_cleaning_df, merged_df,
                                       on='subject_id', how='left')

        # 4Ô∏è‚É£ Radiology-only DataFrame
        radiology_cols = [col for col in merged_with_cleaned.columns if col.endswith('_radiology')]
        radiology_df = pd.concat([merged_with_cleaned['subject_id'], merged_with_cleaned[radiology_cols]], axis=1)

        # 5Ô∏è‚É£ Save embedding datasets
        save_embeddings_only(merged_with_cleaned, radiology_df, model_name)

        print(f"‚úÖ Completed workflow for {model_name}")
        print(f"   - All note types shape: {merged_with_cleaned.shape}")
        print(f"   - Radiology-only shape: {radiology_df.shape}")

# ------------------------------
# Example usage
# ------------------------------
bert_models = [
    'emilyalsentzer/Bio_ClinicalBERT',
    'dmis-lab/biobert-base-cased-v1.2',
    'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
]

# raw_notes: can be either a pandas DataFrame or a HuggingFace Dataset
# data_after_cleaning_df: your clinical/demographic dataset
run_full_workflow(raw_notes, data_after_cleaning_df, bert_models)


NameError: name 'raw_notes' is not defined

In [38]:
# Load Notes
bert_rad = pd.read_csv("Data/Old/data_full_notes_old.csv")

In [39]:
# Test mimic_notes_parallel_for_bert
import time

sample_df = df.sample(100)
start = time.time()
write_mimic_notes_parallel_for_bert(df = sample_df, output_dir="BERT/BERT_old/rad_notes_test", metadata_csv="BERT/BERT_old/metadata_rad_notes_old_test.csv", n_jobs=1)
print(f"Time for 100 notes: {time.time() - start:.2f} seconds")

‚úÖ Notes saved to: BERT/BERT_old/rad_notes_test
‚úÖ Metadata saved to: BERT/BERT_old/metadata_rad_notes_old_test.csv
Time for 100 notes: 0.11 seconds


In [40]:
# Extract and save notes
write_mimic_notes_parallel_for_bert(bert_rad)

‚úÖ Notes saved to: BERT/BERT_old/rad_notes
‚úÖ Metadata saved to: BERT/BERT_old/metadata_rad_notes_old.csv


In [42]:
# Test Rebuild dataset
import time

start = time.time()
create_bert_dataset_from_notes("BERT/BERT_old/metadata_rad_notes_old_test.csv", "BERT/BERT_old/rad_notes_test")
print(f"Time for 100 notes: {time.time() - start:.2f} seconds")

‚úÖ Dataset created with 100 entries
Time for 100 notes: 0.10 seconds


In [43]:
# Rebuild Dataset
rad_bert_dataset_rebuilt = create_bert_dataset_from_notes("BERT/BERT_old/metadata_rad_notes_old.csv", "BERT/BERT_old/rad_notes")

‚úÖ Dataset created with 303994 entries


In [44]:
# Tokenize with Clinical BERT
tokenized_dataset = tokenize_bert_dataset(rad_bert_dataset_rebuilt)

üîÑ Tokenizing dataset...


Map:   0%|          | 0/303994 [00:00<?, ? examples/s]

In [45]:
# Save for later
save_tokenized_dataset(tokenized_dataset)

Saving the dataset (0/4 shards):   0%|          | 0/303994 [00:00<?, ? examples/s]

‚úÖ Tokenized dataset saved to: BERT/BERT_old/tokenized_dataset


## Part 7 Clinical BERT ML Pipeline

In [3]:
# Placeholder for further clinical bert preprocessing if necessary

Once the preprocessing for clinical BERT is complete, proceed to `sepsis_model_training.ipynb` for model training, testing, etc.