##### Improving Prediction Accuracy of Sepsis using Natural Language Processing
## Tyler Kelly

# Set Up and Install Dependencies

In [None]:
#pip install datasets transformers pandas shap

## Part 0 Preprocessing (Pull Code from Author's ipynb)

The following code is adapted from the github repository 'https://github.com/yuyinglu2000/Sepsis-Mortality'

In [None]:
#AC

import warnings
warnings.filterwarnings("ignore")
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import lightgbm as lgb

### Data Mapping

Begin by running a BigQuery query to retrieve the 38 unique features.

In [None]:
# Load the raw extract. Every preprocessing cell below reads `df_raw`, so it
# has to be defined here for the notebook to run from a fresh kernel.
df_raw = pd.read_csv('Data/data_raw.csv')

In [None]:
# Sanity check: dimensions of the raw frame before any recoding.
df_raw.shape
# Expect to get a dataframe 808188x38 (38 old columns)

In [None]:
#AC

# Regroup the detailed race labels into four coarse categories.
race_mapping = {
    'WHITE': 'White',
    'HISPANIC OR LATINO': 'Hispanic or Latin',
    'BLACK/AFRICAN AMERICAN': 'Black or African American',
    'BLACK/CARIBBEAN ISLAND': 'Black or African American',
    'HISPANIC/LATINO - DOMINICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - CENTRAL AMERICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - GUATEMALAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - PUERTO RICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - SALVADORAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - HONDURAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - MEXICAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - CUBAN': 'Hispanic or Latin',
    'HISPANIC/LATINO - COLUMBIAN': 'Hispanic or Latin',
    'BLACK/CAPE VERDEAN': 'Black or African American',
    'BLACK/AFRICAN': 'Black or African American',
    'SOUTH AMERICAN': 'Hispanic or Latin',
    'WHITE - BRAZILIAN': 'Hispanic or Latin',
    'WHITE - OTHER EUROPEAN': 'White',
    'WHITE - RUSSIAN': 'White',
    'WHITE - EASTERN EUROPEAN': 'White',
    'ASIAN': 'Others race',
    'ASIAN - SOUTH EAST ASIAN': 'Others race',
    'ASIAN - CHINESE': 'Others race',
    'ASIAN - ASIAN INDIAN': 'Others race',
    'ASIAN - KOREAN': 'Others race',
    'AMERICAN INDIAN/ALASKA NATIVE': 'Others race',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'Others race',
    'MULTIPLE RACE/ETHNICITY': 'Others race',
    'PORTUGUESE': 'Others race',
    'UNKNOWN': 'Others race',
    'OTHER': 'Others race',
    'UNABLE TO OBTAIN': 'Others race',
    'PATIENT DECLINED TO ANSWER': 'Others race'
}

# Make the recode safe to re-run: a value that is already one of the coarse
# group labels maps to itself, so executing this cell twice no longer turns the
# whole column into NaN. Labels found in neither set still become NaN, exactly
# as with a plain `.map(race_mapping)`. Raw-label entries take precedence.
race_lookup = {**{group: group for group in race_mapping.values()}, **race_mapping}
df_raw['race'] = df_raw['race'].map(race_lookup)

In [None]:
#AC

import pandas as pd

# Assuming df is your DataFrame
# df = ... (your DataFrame)

# Define a mapping for antibiotics to their respective groups
antibiotic_mapping = {
    'Gentamicin Sulfate': 'Aminoglycoside',
    'Tobramycin Sulfate': 'Aminoglycoside',
    'Streptomycin Sulfate': 'Aminoglycoside',
    'Neomycin Sulfate': 'Aminoglycoside',
    'Neomycin/Polymyxin B Sulfate': 'Aminoglycoside',
    'Meropenem': 'Carbapenem',
    'Meropenem Graded Challenge': 'Carbapenem',
    'Vancomycin': 'Glycopeptide',
    'Vancomycin Oral Liquid': 'Glycopeptide',
    'Vancomycin Antibiotic Lock': 'Glycopeptide',
    'Vancomycin Enema': 'Glycopeptide',
    'Vancomycin Intrathecal': 'Glycopeptide',
    'Vancomycin Ora': 'Glycopeptide',
    'Linezolid': 'Oxazolidinone',
    'Linezolid Suspension': 'Oxazolidinone',
    'Penicillin G Benzathine': 'Penicillin',
    'Penicillin G Potassium': 'Penicillin',
    'Penicillin V Potassium': 'Penicillin',
    'Sulfameth/Trimethoprim': 'Sulfonamide',
    'Sulfameth/Trimethoprim DS': 'Sulfonamide',
    'Sulfameth/Trimethoprim SS': 'Sulfonamide',
    'Sulfamethoxazole-Trimethoprim': 'Sulfonamide',
    'Sulfameth/Trimethoprim Suspension': 'Sulfonamide',
    'Tetracycline': 'Tetracycline',
    'Tetracycline HCl': 'Tetracycline'
    # Add more mappings as needed
}

# Applying the mapping to the 'antibiotic' column.
# Group names are added as identity entries so the cell can be re-run without
# wiping already-recoded values to NaN; drugs that are in neither set still
# become NaN, exactly as with a plain `.map(antibiotic_mapping)`.
antibiotic_lookup = {
    **{group: group for group in antibiotic_mapping.values()},
    **antibiotic_mapping,
}
df_raw['antibiotic'] = df_raw['antibiotic'].map(antibiotic_lookup)

In [None]:
#AC
# Distinct antibiotic groups present after the recode (unmapped drugs show up as NaN).
df_raw['antibiotic'].unique()

### Get Dummy Variables

In [None]:
#AC
# One-hot encode every object (string) column. The dummy dtype is pinned to
# uint8 because pandas >= 2.0 produces bool dummies by default, which would
# leave the later `select_dtypes(include=['uint8'])` step with nothing to select.
object_cols = df_raw.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df_raw, columns=object_cols, dtype='uint8')

# Drop every row that still has a missing value in any column.
df_dropped = df_encoded.dropna()
df_dropped.info()

In [None]:
### Check empty values *for tetracycline* ###
#AC
# `df_dropped` is the result of `dropna()`, so this is expected to evaluate to False.
empty_values = df_dropped['antibiotic_Tetracycline'].isnull().any()
empty_values

In [None]:
# Column overview after one-hot encoding. (`df` is not defined until Part 1,
# and `.info` has to be called to print the summary.)
df_encoded.info()

After applying `pd.get_dummies`, there are now 53 columns.

### Drop Duplications

In [None]:
#AC with minor edits

# Remove rows that are exact duplicates across every column.
new_data = df_dropped.drop_duplicates()

# Rows whose subject_id still occurs more than once (kept for inspection).
duplicated_rows_mask = new_data['subject_id'].duplicated(keep=False)
duplicated_rows = new_data[duplicated_rows_mask]

# Separate out columns based on data types
int_float_cols = new_data.select_dtypes(include=['int64', 'float64']).columns
uint8_cols = new_data.select_dtypes(include=['uint8']).columns

# Sort once, in descending order, with the uint8 (one-hot) columns as the
# leading keys and the int/float columns as tie-breakers. This is the ordering
# the previous "sort by int/float, then sort by uint8" sequence produced when
# the second sort was stable, without depending on that stability.
# The result gets its own name so `df_raw` is not overwritten and the earlier
# recoding cells remain re-runnable.
sort_keys = list(uint8_cols) + list(int_float_cols)
df_sorted = new_data.sort_values(by=sort_keys, ascending=False, kind='mergesort')

# Drop duplicates based on subject_id, keeping the first (which are the desired rows after sorting)
df_reduced = df_sorted.drop_duplicates(subset='subject_id', keep='first')

# Reset index if needed
df_reduced = df_reduced.reset_index(drop=True)
pd.set_option('display.max_columns', None)
df_reduced

In [None]:
# (rows, columns) of the frame that holds one row per subject_id.
df_reduced.shape

After reducing the dataframe we get the 6401 patients reported in the paper.

In [None]:
# Call the method: a bare `df_reduced.info` only shows the bound-method repr,
# not the dtype / non-null summary.
df_reduced.info()

In [None]:
# Persist the de-duplicated frame (without the index) for the upload step in Part 1.
df_reduced.to_csv("data_ready_to_merge.csv", index=False)

## Part 1 Upload data_ready_to_merge.csv to BigQuery

Upload the dataset to BigQuery and merge it with the radiology and discharge notes. Save the downloaded result (to Downloads, or directly to BIOST 2021 Thesis / Main if possible) as
`data_full_notes.csv`.

Place SQL code chunks below (if time write them into the script)

# READ THIS

`data_full_notes_old.csv` comes from an outdated dataset; I have not yet worked out the SQL that reproduces the correct 6,401-patient dataset.

The plan is to use `df_old` for the outdated dataset and `df` for the corrected one. Note that the cell below currently loads the outdated file into `df`.

In [None]:
# Notes table used by Part 2; this path points at the outdated extract.
df = pd.read_csv("Data/Old/data_full_notes_old.csv")
# change after fixing sql

## Part 2 Truncate Notes

In [None]:
from joblib import Parallel, delayed
import pandas as pd
import re
import multiprocessing

In [None]:
# === Step 1: Define cleaning function - Clean individual note text ===
def clean_text(text):
    """Return a cleaned, single-line version of one note.

    Parameters
    ----------
    text : str or missing value
        Raw note text. Anything ``pd.isna`` flags (``None``, ``NaN``) is
        treated as an empty note.

    Returns
    -------
    str
        The text with underscore runs removed, every character other than
        word characters, whitespace and ``. , : ; ! ? ( ) -`` removed, all
        whitespace runs collapsed to a single space, and leading/trailing
        whitespace stripped. Missing input yields ``""``.
    """
    if pd.isna(text):
        return ""
    text = re.sub(r'_+', '', text)    # Remove underlines
    text = re.sub(r'[^\w\s.,:;!?()\-\n]', '', text)  # Remove junk, keep clinical symbols
    # Normalize whitespace last: deleting underscores or junk characters that
    # sat between two spaces would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
# === Step 2: Function to process one group. Process a group into (subject_id, note_type, combined_notes) ===
def process_group(record):
    """Collapse one grouped record into a single cleaned note string.

    Parameters
    ----------
    record : mapping
        Must provide ``'subject_id'``, ``'note_type_1'`` and ``'text'``, where
        ``'text'`` is an iterable of raw note strings.

    Returns
    -------
    dict
        ``subject_id`` and ``note_type_1`` passed through unchanged, plus
        ``combined_notes``: every note run through ``clean_text`` and joined
        with single spaces.
    """
    patient = record['subject_id']
    # Per the original author, note_type_1 folds addendums and base notes
    # together (e.g. both count as 'radiology').
    category = record['note_type_1']
    merged_text = " ".join(map(clean_text, record['text']))
    return dict(
        subject_id=patient,
        note_type_1=category,
        combined_notes=merged_text,
    )

In [None]:
# === Step 3: Group the notes ===
# `df` was loaded above. Gather every note text into one list per
# (subject_id, note_type_1) pair, then flatten the result back into columns.
group_keys = ['subject_id', 'note_type_1']
notes_per_group = df.groupby(group_keys)['text'].apply(list)
grouped_df = notes_per_group.reset_index()

# One dict per group, ready to be handed to process_group.
records = grouped_df.to_dict(orient='records')

In [None]:
# Dimensions of the notes table loaded into `df`.
df.shape

(303994, 58)

In [None]:
# === Step 4: Parallel processing with joblib ===
# Leave one core free, but never ask for fewer than one worker: on a
# single-core machine cpu_count() - 1 is 0, which joblib rejects.
num_cores = max(1, multiprocessing.cpu_count() - 1)

processed = Parallel(n_jobs=num_cores)(
    delayed(process_group)(record) for record in records
)

In [None]:
# === Step 5: Create DataFrame and save to csv ===
# One row per (subject_id, note_type_1), ordered by those two keys.
long_sort_keys = ['subject_id', 'note_type_1']
nlp_long_df = pd.DataFrame(processed)
nlp_long_df = nlp_long_df.sort_values(by=long_sort_keys)

nlp_long_df.to_csv("Data/Old/data_trunc_notes_old.csv", index=False)  # change after fixing sql

In [None]:
# === Step 6: Pivot to wide format for multiple columns ===
# Each distinct note_type_1 value becomes its own column of combined text.
notes_by_type = nlp_long_df.pivot(
    index='subject_id',
    columns='note_type_1',
    values='combined_notes',
)
nlp_wide_df = notes_by_type.reset_index()
nlp_wide_df.columns.name = None  # Remove category label

# Clearer column names for the two note types, then blank out missing notes
# so every cell holds a string.
note_column_names = {
    'radiology': 'Radiology_notes',
    'discharge': 'Discharge_summary_notes',
}
nlp_wide_df = nlp_wide_df.rename(columns=note_column_names).fillna("")

# Save
nlp_wide_df.to_csv("Data/Old/data_trunc_notes_wide_old.csv", index=False)  # change after fixing sql

In [None]:
# === Step 7: Combine Radiology and Discharge notes per subject_id ===
radiology_text = nlp_wide_df['Radiology_notes'].str.strip()
discharge_text = nlp_wide_df['Discharge_summary_notes'].str.strip()

# Work on a copy so the wide frame itself does not gain the extra column.
nlp_combined_df = nlp_wide_df.copy()
# Radiology first, then discharge, separated by one space; the outer strip
# removes the stray space left when either side is empty.
nlp_combined_df['combined_notes'] = (radiology_text + " " + discharge_text).str.strip()

# Keep just subject_id + combined text
nlp_combined_notes_df = nlp_combined_df[['subject_id', 'combined_notes']]

# Save
nlp_combined_notes_df.to_csv("Data/Old/data_trunc_notes_combined_old.csv", index=False)  # change after fixing sql

In [None]:
# === Step 8: Join nlp_wide_df and nlp_combined_notes_df to data_after_cleaning ===

# Stand-in for df_reduced until the SQL is fixed.
data_clean = pd.read_csv('Data/Old/data_after_cleaning.csv')

# Both joins are left joins on subject_id, so every row of data_clean is kept:
#   1. nlp_wide_df           -> adds the radiology and discharge note columns
#   2. nlp_combined_notes_df -> adds one column with all notes combined per patient
merge_options = dict(on='subject_id', how='left')
nlp_ready_df = (
    data_clean
    .merge(nlp_wide_df, **merge_options)
    .merge(nlp_combined_notes_df, **merge_options)
)

# Save
nlp_ready_df.to_csv("Data/Old/data_nlp_ready_old.csv", index=False)

In [None]:
# === Step 9: Check shape of dataframes ===
# Wide notes frame from Step 6 (pivoted with subject_id as the index).
nlp_wide_df.shape

(5208, 3)

In [None]:
# Frame holding only subject_id and combined_notes, built in Step 7.
nlp_combined_notes_df.shape

(5208, 2)

In [None]:
# Shape of the frame that was actually merged in Step 8. `data_clean` stands in
# for `df_reduced` until the SQL is fixed, so it is the one to compare against
# nlp_ready_df here.
data_clean.shape

(5208, 49)

In [None]:
# Shape after the two left merges in Step 8.
nlp_ready_df.shape

(5208, 52)

In [None]:
# Column names of the merged frame.
nlp_ready_df.columns

Index(['Unnamed: 0', 'subject_id', 'hospital_expire_flag', 'max_age',
       'los_icu', 'first_hosp_stay', 'suspected_infection', 'sofa_score',
       'sepsis3', 'avg_urineoutput', 'glucose_min', 'glucose_max',
       'glucose_average', 'sodium_max', 'sodium_min', 'sodium_average',
       'diabetes_without_cc', 'diabetes_with_cc', 'severe_liver_disease',
       'aids', 'renal_disease', 'heart_rate_min', 'heart_rate_max',
       'heart_rate_mean', 'sbp_min', 'sbp_max', 'sbp_mean', 'dbp_min',
       'dbp_max', 'dbp_mean', 'resp_rate_min', 'resp_rate_max',
       'resp_rate_mean', 'spo2_min', 'spo2_max', 'spo2_mean', 'coma',
       'albumin', 'race_Black or African American', 'race_Hispanic or Latin',
       'race_Others race', 'race_White', 'antibiotic_Vancomycin',
       'antibiotic_Vancomycin Antibiotic Lock', 'antibiotic_Vancomycin Enema',
       'antibiotic_Vancomycin Intrathecal',
       'antibiotic_Vancomycin Oral Liquid', 'gender_F', 'gender_M',
       'Discharge_summary_notes', '

In [None]:
# Per-column dtypes and non-null counts of the merged frame.
nlp_ready_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 52 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Unnamed: 0                             5208 non-null   int64  
 1   subject_id                             5208 non-null   int64  
 2   hospital_expire_flag                   5208 non-null   int64  
 3   max_age                                5208 non-null   int64  
 4   los_icu                                5208 non-null   float64
 5   first_hosp_stay                        5208 non-null   bool   
 6   suspected_infection                    5208 non-null   int64  
 7   sofa_score                             5208 non-null   int64  
 8   sepsis3                                5208 non-null   bool   
 9   avg_urineoutput                        5208 non-null   float64
 10  glucose_min                            5208 non-null   float64
 11  gluc

## Part 3 Create Note File for Word2Vec

In [None]:
import pandas as pd

# Write the 'Radiology_notes' column to a text file, one line per document.
# A missing note (NaN left by the left merge for a patient without notes) is
# written as an empty line rather than the literal string "nan", so the line
# count still matches the number of rows.
with open("Data/Old/W2V_old/w2v_Radiology_notes.txt", "w", encoding="utf-8") as f: # change after fixing sql
    for line in nlp_ready_df["Radiology_notes"]:
        f.write(("" if pd.isna(line) else str(line).strip()) + "\n")

In [None]:
# Write the 'combined_notes' column to a text file, one line per document.
# A missing note (NaN left by the left merge for a patient without notes) is
# written as an empty line rather than the literal string "nan", so the line
# count still matches the number of rows.
with open("Data/Old/W2V_old/w2v_combined_notes.txt", "w", encoding="utf-8") as f: # change after fixing sql
    for line in nlp_ready_df["combined_notes"]:
        f.write(("" if pd.isna(line) else str(line).strip()) + "\n")

## Part 4 Prepare Word2Vec - Proceed to main.rmd

## Part 5 Model Training - Proceed to Sepsis_Model_Training.ipynb

After completing / running / saving models in model training, upload them into the workspace in the following code chunks if necessary.

## Part 6 Create Dataset for Bert

In [None]:
import pandas as pd
import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def write_single_note_clean(row, output_dir):
    """Write one note to ``<output_dir>/<subject_id>/<note_id>.txt``.

    Parameters
    ----------
    row : mapping
        Must provide ``'subject_id'``, ``'note_id'`` and ``'note_text'``.
    output_dir : str
        Root directory; the per-subject sub-directory is created if needed.

    Notes
    -----
    The text is stripped of leading/trailing whitespace before writing. A
    missing note (``None``/``NaN``) is written as an empty file instead of
    raising ``AttributeError`` on ``.strip()``.
    """
    subject_id = str(row['subject_id'])
    note_id = str(row['note_id'])
    raw_text = row['note_text']
    # NaN is a float and has no .strip(); treat any missing value as empty text.
    note_text = "" if pd.isna(raw_text) else str(raw_text).strip()

    subject_dir = os.path.join(output_dir, subject_id)
    os.makedirs(subject_dir, exist_ok=True)

    file_path = os.path.join(subject_dir, f"{note_id}.txt")
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(note_text)

# Change for directories/output directories for combined/discharge notes
def write_mimic_notes_parallel_for_bert(df, output_dir="pat_notes/rad_notes", metadata_csv="rad_notes_metadata.csv", max_workers=8):
    """Write every note in ``df`` to its own text file and save a metadata CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``subject_id``, ``note_id``, ``note_text``,
        ``category``, ``chartdate`` and ``charttime``.
    output_dir : str
        Root directory for the note files (one sub-directory per subject_id).
    metadata_csv : str
        Path of the CSV that receives the metadata columns (no note text).
    max_workers : int
        Number of concurrent writer threads.

    Raises
    ------
    KeyError
        If any required column is missing. This is checked before any file is
        written, so a schema problem does not leave a half-written directory.

    Notes
    -----
    A thread pool is used instead of a process pool. The work is file I/O, and
    worker processes can fail to import functions that were defined
    interactively in a notebook, which is a common cause of
    ``BrokenProcessPool``.
    """
    # Local import: the notebook's import cell only brings in ProcessPoolExecutor.
    from concurrent.futures import ThreadPoolExecutor

    metadata_cols = ['subject_id', 'note_id', 'category', 'chartdate', 'charttime']
    missing_cols = [col for col in metadata_cols + ['note_text'] if col not in df.columns]
    if missing_cols:
        raise KeyError(f"DataFrame is missing required columns: {missing_cols}")

    os.makedirs(output_dir, exist_ok=True)
    rows = df.to_dict("records")

    task = partial(write_single_note_clean, output_dir=output_dir)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # list(...) drains the iterator so any exception raised by a writer surfaces here.
        list(executor.map(task, rows))

    df[metadata_cols].to_csv(metadata_csv, index=False)

    print(f"âœ… Notes saved to: {output_dir}")
    print(f"âœ… Metadata saved to: {metadata_csv}")


In [None]:
from datasets import Dataset

def create_bert_dataset_from_notes(metadata_csv, notes_root_dir):
    """Rebuild a ``datasets.Dataset`` from the metadata CSV and the note files.

    Parameters
    ----------
    metadata_csv : str
        CSV with at least ``subject_id`` and ``note_id`` columns.
    notes_root_dir : str
        Root directory containing ``<subject_id>/<note_id>.txt`` files.

    Returns
    -------
    datasets.Dataset
        The metadata rows plus a ``note_text`` column. Rows whose file is
        missing or blank are dropped.
    """
    df = pd.read_csv(metadata_csv)

    def load_text(subject_id, note_id):
        """Read one note file; a missing file counts as an empty note."""
        file_path = os.path.join(notes_root_dir, str(subject_id), f"{note_id}.txt")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read().strip()
        except FileNotFoundError:
            return ""

    # Iterate over the two id columns directly instead of df.apply(axis=1).
    # A row-wise apply returns a DataFrame for an empty frame, which breaks the
    # column assignment, and it can upcast integer ids when every column is
    # numeric, which would change the file name being looked up.
    texts = [load_text(sid, nid) for sid, nid in zip(df['subject_id'], df['note_id'])]
    df['note_text'] = pd.Series(texts, index=df.index, dtype=object)
    df = df[df['note_text'] != ""]  # Remove blanks (texts are already stripped)

    dataset = Dataset.from_pandas(df.reset_index(drop=True))
    print(f"âœ… Dataset created with {len(dataset)} entries")
    return dataset


In [None]:
from transformers import AutoTokenizer

def tokenize_bert_dataset(dataset, model_name='emilyalsentzer/Bio_ClinicalBERT', text_column='note_text'):
    """Tokenize ``dataset[text_column]`` with the tokenizer for ``model_name``.

    Each text is truncated and padded to exactly 512 tokens. The tokenizer is
    applied in batches through ``dataset.map`` and the mapped dataset is
    returned.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    encode_options = dict(truncation=True, padding='max_length', max_length=512)

    def tokenize_function(example):
        # `example` is a batch here because map() is called with batched=True.
        return tokenizer(example[text_column], **encode_options)

    print("ðŸ”„ Tokenizing dataset...")
    return dataset.map(tokenize_function, batched=True)


In [None]:
def save_tokenized_dataset(tokenized_dataset, output_path="Data/Old/BERT_old/clinical_bert_dataset"):
    """Persist ``tokenized_dataset`` under ``output_path`` and report where it went."""
    tokenized_dataset.save_to_disk(output_path)
    confirmation = f"âœ… Tokenized dataset saved to: {output_path}"
    print(confirmation)


In [None]:
# Step 1: Extract and save notes
# Read the file this notebook writes at the end of Part 2
# (saved as data_nlp_ready_old.csv, not data_nlp_ready.csv).
bert_rad = pd.read_csv("Data/Old/data_nlp_ready_old.csv")
# NOTE(review): write_mimic_notes_parallel_for_bert reads note_id, note_text,
# category, chartdate and charttime; the column listing printed for
# nlp_ready_df in Part 2 does not show them. Confirm this is the intended input.
write_mimic_notes_parallel_for_bert(bert_rad)

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [None]:
# Step 2: Rebuild dataset
# The notes root must match the default output_dir used in Step 1
# ("pat_notes/rad_notes"); with "rad_notes" no file is found and the dataset is empty.
rad_dataset = create_bert_dataset_from_notes("rad_notes_metadata.csv", "pat_notes/rad_notes")

In [None]:
# Step 3: Tokenize with Clinical BERT
# Uses the function defaults: the 'emilyalsentzer/Bio_ClinicalBERT' tokenizer on the 'note_text' column.
tokenized_dataset = tokenize_bert_dataset(rad_dataset)

In [None]:
# Step 4: Save for later
# Written to the function's default path, "Data/Old/BERT_old/clinical_bert_dataset".
save_tokenized_dataset(tokenized_dataset)