In [None]:
!pip install numpy pandas nltk scikit-learn xgboost transformers matplotlib tqdm joblib torch openpyxl tqdm

In [None]:
import numpy as np
import pandas as pd
import nltk
import os
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, recall_score, roc_curve, precision_recall_curve, auc
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from xgboost import XGBClassifier
from transformers import BertModel, BertTokenizer
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
from sklearn.decomposition import PCA
warnings.filterwarnings('ignore')
import joblib
joblib.parallel_backend('loky', inner_max_num_threads=1)
import matplotlib
import torch
matplotlib.use('Agg')

# NLTK setup: download the NLTK resources this notebook relies on.
nltk.download('stopwords')  # backs stopwords.words('english') used for stop_words
# NOTE(review): no tokenizer call is visible in this notebook -- confirm 'punkt' is still needed.
nltk.download('punkt')
nltk.download('wordnet')  # corpus presumably needed by WordNetLemmatizer -- confirm

In [None]:

# Global parameters
# NOTE(review): patient_subset, feature_subset, summarized and seed are reassigned
# by the experiment loop in the last cell, so the values below only apply until
# that loop runs.
patient_subset = 1   # 1 - All patients, 2 - Patients with Notes, 3 - Patients without Notes
feature_subset = 1   # 1 - Demo, 2 - Demo + Tabular, 3 - Demo + Tabular + Notes
feat_sel = 0         # 1 - Feature selection on (RandomForest), 0 = off
test_split_ratio = 0.2
summarized = 1       # 0 - No summarization, 1 - DeepSeek LLM based summarization
seed = 0
# NOTE(review): the name `vectorize_text` is rebound by `def vectorize_text(...)`
# in a later cell, so this integer flag is no longer reachable after that cell runs.
vectorize_text = 1   # 1 - TF-IDF, 2 - BERT



# Shared text-processing / scaling helpers created once at module level.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
min_max_scaler = MinMaxScaler()

In [None]:
###############################################################################
# Utility function: Append a single row (dict) to CSV, with an auto-incremented Run ID
###############################################################################
def append_results_to_csv(results_dict, csv_file):
    """Append one result row to ``csv_file``, tagging it with the next 'Run ID'.

    The new Run ID is one more than the largest numeric 'Run ID' already stored
    in the file. It is 1 when the file does not exist, is empty, has no
    'Run ID' column, or holds no numeric Run ID yet.

    Args:
        results_dict: Mapping of column name -> value for the new row. Its
            'Run ID' key is set (or overwritten) in place.
        csv_file: Path of the CSV log to create or extend.
    """
    try:
        df_existing = pd.read_csv(csv_file) if os.path.exists(csv_file) else None
    except pd.errors.EmptyDataError:
        # A zero-byte / whitespace-only file has no header to parse: start fresh.
        df_existing = None

    max_run_id = 0
    if df_existing is not None and 'Run ID' in df_existing.columns:
        run_ids = pd.to_numeric(df_existing['Run ID'], errors='coerce')
        # max() of an empty or all-NaN column is NaN, which would turn the new
        # Run ID into NaN; only trust it when at least one numeric ID exists.
        if run_ids.notna().any():
            max_run_id = int(run_ids.max())

    results_dict['Run ID'] = max_run_id + 1
    df_new = pd.DataFrame([results_dict])

    if df_existing is None:
        df_new.to_csv(csv_file, index=False)
    else:
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        df_combined.to_csv(csv_file, index=False)

In [None]:
# Keep only the words that satisfy token_pattern=r'[a-zA-Z]{2,}'
def filter_tokens_in_notes(notes):
    """Reduce each note to its alphabetic tokens of two or more letters.

    Args:
        notes: Iterable of note texts. Entries that are not strings (for
            example ``None`` or NaN for a missing note) are treated as empty.

    Returns:
        A list with one string per input note: the matching tokens joined by
        single spaces, or ``''`` when the note has no matching token.
    """
    pattern = re.compile(r'[a-zA-Z]{2,}')
    return [
        # findall() raises TypeError on non-strings, so missing notes map to ''.
        ' '.join(pattern.findall(note)) if isinstance(note, str) else ''
        for note in notes
    ]

In [None]:
###############################################################################
# Preprocessing function
###############################################################################
def preprocess(suhi_df):
    """Clean the 'day_readmit' label column of ``suhi_df`` in place.

    Rows without a label are dropped, the label value 2 is recoded to 0, and
    the column is cast to int. The same (mutated) frame is returned.
    """
    label = 'day_readmit'
    suhi_df.dropna(subset=[label], inplace=True)
    labels = suhi_df[label]
    # Recode 2 -> 0 before the integer cast.
    suhi_df[label] = labels.mask(labels == 2, 0).astype(int)
    return suhi_df


In [None]:
# ---------------- CONFIG ----------------
# Value grids iterated by the experiment loop in the last cell (one run per combination).
PATIENT_SUBSETS = [1, 2]     # 1 - All patients, 2 - Patients with Notes
FEATURE_SUBSETS = [2, 3]     # 2 - Demo + Tabular, 3 - Demo + Tabular + Notes
ENGAGED_VALUES = [0, 1]      # values of the 'engaged' column used to filter the data
SEEDS =  [0, 1] # original value: range(30)
SUMMARIZED_OPTIONS = [0, 1]  # 0 - No summarization, 1 - LLM based summarization
VECTORIZERS = [1,2]  # 1=TF-IDF, 2=BERT
FILE_PATH = '../data/suhi_data.xlsx'  # Excel workbook read by load_and_preprocess
OUTPUT_CSV = 'training_log.csv'       # results log written by append_results_to_csv
TEST_SPLIT_RATIO = 0.2                # test_size passed to train_test_split
FEAT_SEL = False                      # NOTE(review): not referenced by the code visible in this notebook
N_COMPONENT = 50                      # number of PCA components kept for the BERT embeddings

# Fixed hyperparameters per classifier, keyed by class name and unpacked as
# keyword arguments in train_and_evaluate_models.
BEST_PARAMS = {
    "RandomForestClassifier": { "n_estimators": 200, "max_depth": 5, "min_samples_split": 20 },
    "AdaBoostClassifier": { "n_estimators": 100, "algorithm": "SAMME", "learning_rate": 0.10 },
    "XGBClassifier": {  "n_estimators": 10, "max_depth": 5, "learning_rate": 0.10 },
}


In [None]:
def load_and_preprocess(file_path, patient_subset, engaged, summarized, feature_subset):
    """Read the Excel data and return the cohort selected by the run settings.

    Args:
        file_path: Path of the Excel workbook to read.
        patient_subset: When 2, rows without 'COMBINED_NOTES' are dropped.
        engaged: Only rows whose 'engaged' column equals this value are kept.
        summarized: When 1 (and ``feature_subset`` is not 2), 'COMBINED_NOTES'
            is replaced by 'FEW_SHORT_LLM_SUMMARY'.
        feature_subset: Feature set identifier; 2 disables the summary swap.

    Returns:
        The filtered frame after ``preprocess`` has cleaned the label column.
    """
    raw = pd.read_excel(file_path)
    cohort = raw[raw['engaged'] == engaged]

    if patient_subset == 2:
        cohort = cohort.dropna(subset=['COMBINED_NOTES'])

    use_summary = summarized == 1 and feature_subset != 2
    if use_summary:
        # Literal 'nan' strings become empty text; the cleaned summary then
        # stands in for the raw notes.
        summary = cohort['FEW_SHORT_LLM_SUMMARY'].replace('nan', '')
        cohort = cohort.assign(FEW_SHORT_LLM_SUMMARY=summary, COMBINED_NOTES=summary)

    return preprocess(cohort)


In [None]:
import torch
from transformers import BertModel, BertTokenizer

# Load BERT and tokenizer once, at module level, so they are not re-created
# every time text is vectorized.
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Switch to evaluation mode: the model is only used to produce embeddings.
bert_model.eval()

# Pick device: CUDA when available, otherwise CPU; the model is moved there once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

print("Using device:", device)


In [None]:
def vectorize_text(df, feature_subset, vectorize_text, summarized):
    """Build the numeric feature frame, optionally adding note-text features.

    Text features are only produced when ``feature_subset == 3``. Missing
    notes are treated as empty text for both vectorizers, and the decision to
    vectorize depends only on the arguments (no module-level run state).

    Args:
        df: Input frame. It is not modified; the function works on a copy.
            Must contain 'COMBINED_NOTES' when text features are requested.
        feature_subset: 3 enables text features; any other value skips them.
        vectorize_text: 1 = TF-IDF columns (named after the vocabulary terms),
            2 = BERT pooled embeddings reduced with PCA to at most
            ``N_COMPONENT`` columns (named '0', '1', ...).
        summarized: 0 uses ``min_df=20`` for TF-IDF, anything else ``min_df=10``.

    Returns:
        A DataFrame with object/string, datetime and '*nores*' columns removed,
        text features appended (index reset to 0..n-1 in that case), string
        column names, and remaining NaN values replaced by 0.

    Note:
        The TF-IDF vocabulary and the PCA projection are fitted on every row
        passed in, i.e. before any train/test split done by the caller.
    """
    # Work on a copy so the caller's frame keeps its columns and index.
    suhi_df = df.copy()
    use_text = feature_subset == 3
    text_features_df = None

    if use_text and vectorize_text in (1, 2):
        # Assign the cleaned column back instead of using a chained in-place
        # fillna, which does not update the frame under pandas copy-on-write.
        raw_notes = suhi_df['COMBINED_NOTES'].fillna('').astype(str)
        notes = filter_tokens_in_notes(raw_notes)
        suhi_df['COMBINED_NOTES'] = notes

        if vectorize_text == 1:
            min_df = 20 if summarized == 0 else 10
            tfidf_vectorizer = TfidfVectorizer(min_df=min_df)
            tfidf_matrix = tfidf_vectorizer.fit_transform(notes)
            text_features_df = pd.DataFrame(
                tfidf_matrix.toarray(),
                columns=tfidf_vectorizer.get_feature_names_out(),
            )
        else:
            text_features_df = _bert_pca_features(notes)

    # Drop textual (object/string) columns, date columns and 'nores' columns.
    drop_columns = [
        col for col, dtype in suhi_df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype)
        or isinstance(dtype, pd.StringDtype)
        or pd.api.types.is_datetime64_any_dtype(dtype)
        or 'nores' in str(col)
    ]
    suhi_df = suhi_df.drop(columns=drop_columns)

    # If we have vectorized text, merge it in (positional alignment after reset).
    if text_features_df is not None:
        suhi_df = suhi_df.reset_index(drop=True)
        suhi_w_vectors_df = pd.concat([suhi_df, text_features_df], axis=1)
    else:
        suhi_w_vectors_df = suhi_df

    suhi_w_vectors_df.columns = suhi_w_vectors_df.columns.astype(str)

    # Fill NaN with 0
    return suhi_w_vectors_df.fillna(0)


def _bert_pca_features(notes):
    """Embed each note with BERT's pooled output and reduce the matrix with PCA."""
    embeddings = []
    for text in tqdm(notes):
        tokens = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        tokens = {key: value.to(device) for key, value in tokens.items()}
        with torch.no_grad():
            output = bert_model(**tokens)
        embeddings.append(output.pooler_output.cpu().squeeze().numpy())

    if not embeddings:
        return pd.DataFrame()

    # PCA is fitted once, after all notes are embedded; fitting inside the loop
    # fails on the first note because one sample cannot yield N_COMPONENT components.
    embedding_matrix = np.array(embeddings)
    n_components = min(N_COMPONENT, embedding_matrix.shape[0], embedding_matrix.shape[1])
    # Fixed random_state keeps the projection reproducible if a randomized solver is chosen.
    pca = PCA(n_components=n_components, random_state=0)
    return pd.DataFrame(pca.fit_transform(embedding_matrix))

In [None]:

def train_and_evaluate_models(X, y, seed):
    """Fit the three configured classifiers on one split and score them.

    Args:
        X: Feature matrix.
        y: Binary target.
        seed: Random state for the train/test split and for every classifier.

    Returns:
        Dict mapping model name -> dict with 'accuracy' and 'roc_auc' (from the
        model's own predictions / positive-class probabilities) and
        'sensitivity' / 'specificity' (recall of class 1 / class 0 after
        thresholding the positive-class probability at 0.35), rounded to 4 places.
    """
    classifiers = {
        "RandomForest": RandomForestClassifier(random_state=seed, **BEST_PARAMS["RandomForestClassifier"]),
        "AdaBoost": AdaBoostClassifier(random_state=seed, **BEST_PARAMS["AdaBoostClassifier"]),
        "XGBoost": XGBClassifier(random_state=seed, **BEST_PARAMS["XGBClassifier"]),
    }

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT_RATIO, random_state=seed)

    def score(fitted):
        """Compute the four rounded metrics of a fitted classifier on the test split."""
        predicted = fitted.predict(X_test)
        positive_proba = fitted.predict_proba(X_test)[:, 1]
        flagged = (positive_proba >= 0.35).astype(int)
        return {
            "accuracy": round(accuracy_score(y_test, predicted), 4),
            "roc_auc": round(roc_auc_score(y_test, positive_proba), 4),
            "sensitivity": round(recall_score(y_test, flagged), 4),
            "specificity": round(recall_score(y_test, flagged, pos_label=0), 4),
        }

    results = {}
    for label, classifier in classifiers.items():
        classifier.fit(X_train, y_train)
        results[label] = score(classifier)
    return results

In [None]:
# Enumerate every run configuration lazily, in the same order as six nested
# loops (patient subset outermost, vectorizer innermost).
experiment_grid = (
    (p_subset, f_subset, engaged_value, run_seed, summarized_flag, vec_option)
    for p_subset in PATIENT_SUBSETS
    for f_subset in FEATURE_SUBSETS
    for engaged_value in ENGAGED_VALUES
    for run_seed in SEEDS
    for summarized_flag in SUMMARIZED_OPTIONS
    for vec_option in VECTORIZERS
)

# The loop targets are module-level names and are rebound on every iteration.
for patient_subset, feature_subset, engaged, seed, summarized, vectorize_option in experiment_grid:
    try:
        print(f"Running: Patient={patient_subset}, Feature={feature_subset}, "
              f"Engaged={engaged}, Seed={seed}, Summarized={summarized}, "
              f"Vectorizer={vectorize_option}")

        # Load -> vectorize -> split features from the 'day_readmit' target.
        df = load_and_preprocess(FILE_PATH, patient_subset, engaged, summarized, feature_subset)
        df = vectorize_text(df, feature_subset, vectorize_option, summarized)
        X = df.drop('day_readmit', axis=1)
        y = df['day_readmit']

        results = train_and_evaluate_models(X, y, seed)

        # One flat row per run: the settings first, then "<model>_<metric>" columns.
        final_results = {
            "Patient Subset": patient_subset,
            "Feature Subset": feature_subset,
            "Engaged": engaged,
            "Seed": seed,
            "Summarized": summarized,
            "Vectorizer": vectorize_option,
        }
        final_results.update(
            (f"{clf}_{metric}", val)
            for clf, metrics in results.items()
            for metric, val in metrics.items()
        )

        append_results_to_csv(final_results, OUTPUT_CSV)

    except Exception as exc:
        # Best effort: report the failing configuration and move on to the next one.
        print(f"Error: {exc} | Params: {patient_subset, feature_subset, engaged, seed, summarized, vectorize_option}")
