In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import joblib
import json
import os
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### Set Up Paths

In [3]:
BASE_PATH = "/content/drive/MyDrive/Work/Capstone-TeamFolder/Capstone_Organized" # CHANGE THIS WHEN YOU USE THE NOTEBOOK, ENSURE THE FINAL DESTINATION IS Capstone_Organized
INPUT_PATH = BASE_PATH + "/1-Data/ED_Model_Training_Dataset.csv"
ML_ARTIFACT_PATH = BASE_PATH + "/3-Model_Training/3.1-Traditional_ML/3.1.0-Traditional_ML_Artifacts"

os.makedirs(ML_ARTIFACT_PATH, exist_ok=True)

OUTPUT_MODEL_PATH = os.path.join(ML_ARTIFACT_PATH, "gb_model.joblib")
OUTPUT_PIPELINE_PATH = os.path.join(ML_ARTIFACT_PATH, "ml_preprocessor.joblib")
OUTPUT_FEATURES_PATH = os.path.join(ML_ARTIFACT_PATH, "ml_feature_columns.json")

### Import Dataset

In [4]:
# IMPORT DATA
df_full = pd.read_csv(INPUT_PATH)

print(f"Loaded {len(df_full)} rows.")
print("\n=====DataFrame Info=====")
df_full.info()

print("\n=====DataFrame Head=====")
display(df_full.head())

Loaded 4200 rows.

=====DataFrame Info=====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4200 entries, 0 to 4199
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    4200 non-null   int64  
 1   sex                    4200 non-null   object 
 2   triage_notes           4200 non-null   object 
 3   heart_rate             4200 non-null   int64  
 4   bp_systolic            4200 non-null   int64  
 5   bp_diastolic           4200 non-null   int64  
 6   resp_rate              4200 non-null   int64  
 7   temperature_C          4200 non-null   float64
 8   oxygen_saturation      4200 non-null   float64
 9   ESI                    4200 non-null   int64  
 10  recent_admissions_30d  4200 non-null   int64  
 11  admitted               4200 non-null   int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 393.9+ KB

=====DataFrame Head=====


Unnamed: 0,age,sex,triage_notes,heart_rate,bp_systolic,bp_diastolic,resp_rate,temperature_C,oxygen_saturation,ESI,recent_admissions_30d,admitted
0,67,M,67-year-old male presents with fever for 2 hou...,148,182,67,29,37.8,94.0,5,0,1
1,74,M,74-year-old male presents with dizziness for 3...,96,176,75,29,36.7,89.0,3,0,1
2,3,M,3-year-old male presents with headache for 6 h...,89,146,97,26,37.8,88.0,4,0,1
3,28,M,28-year-old male presents with dizziness for 2...,147,156,90,13,35.9,85.0,1,0,1
4,19,M,19-year-old male presents with chest pain for ...,65,187,77,22,39.9,91.0,3,0,0


### Preprocessing Data

In [5]:
# PII MASKING: age

def bucket_age(a):
    """Bins the age column into categorical ranges."""
    if pd.isna(a): return None
    a = int(a)
    if a <= 17: return "0-17"
    if a <= 34: return "18-34"
    if a <= 49: return "35-49"
    if a <= 64: return "50-64"
    return "65+"

df_full["age_bucket"] = df_full["age"].apply(bucket_age)

In [6]:
# PII MASKING: triage_notes

EMAIL_RE  = re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b", re.IGNORECASE)
PHONE_RE  = re.compile(r"\b(?:\+?1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b")
SSN_RE    = re.compile(r"\b\d{3}-?\d{2}-?\d{4}\b")
DATE_RE   = re.compile(r"\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}-\d{2}-\d{2})\b")

# Normalize unicode dashes to ASCII hyphen
DASHES = dict.fromkeys(map(ord, "\u2010\u2011\u2012\u2013\u2014\u2015"), "-")

AGE_PATTERNS = [
    re.compile(r"\b(\d{1,3})\W*(?:year|yrs?|yo|y\/?o)\W*(?:old|of\s+age)?\b", re.IGNORECASE), # Catch X-year-old, X y/o, X yrs, etc.
    re.compile(r"\bage[d]?\s*(\d{1,3})\b", re.IGNORECASE), # Catch aged X / age X
    re.compile(r"\b(in\s+(?:his|her|their|a|the)\s+)\d{2}s\b", re.IGNORECASE) # Catching decades (in his 40s)
]

# Gendered words/titles and pronouns
GENDER_RE   = re.compile(r"\b(male|female|man|woman|boy|girl|gentleman|lady|mr\.?|mrs\.?|ms\.?)\b", re.IGNORECASE)
PRONOUN_RE  = re.compile(r"\b(he|she|him|her|his|hers)\b", re.IGNORECASE)
_PRONOUN_MAP = {'he':'they','she':'they','him':'them','her':'them','his':'their','hers':'their'}

def _neutralize_pronouns(text: str) -> str:
    def _sub(m):
        src = m.group(1)
        repl = _PRONOUN_MAP[src.lower()]
        if src.isupper():     return repl.upper()
        if src[0].isupper():  return repl.capitalize()
        return repl
    return PRONOUN_RE.sub(_sub, text)

def _remove_age(m):
    """Removes the entire matched age phrase."""
    return ""

def redact_text(s):
    """Redact PII + demographics; REMOVES all numeric ages."""
    if pd.isna(s):
        return None
    t = str(s).translate(DASHES)

    # PII
    t = EMAIL_RE.sub("[EMAIL]", t)
    t = PHONE_RE.sub("[PHONE]", t)
    t = SSN_RE.sub("[SSN]", t)
    t = DATE_RE.sub("[DATE]", t)

    # Ages → REMOVE (using the aggressive patterns above)
    for rx in AGE_PATTERNS:
        # This replaces the entire matched phrase (e.g., "67-year-old") with "[AGE]"
        t = rx.sub(_remove_age, t)

    # Gendered terms and pronouns
    t = GENDER_RE.sub("the patient", t)
    t = _neutralize_pronouns(t)

    return re.sub(r"\s+", " ", t).strip()

df_full["triage_notes_redacted"] = df_full["triage_notes"].apply(redact_text)
test_note_redacted = df_full["triage_notes_redacted"].iloc[0]
print("\nOriginal Note:")
print(test_note_redacted)


Original Note:
the patient presents with fever for 2 hours, rated 7/10. Onset after a fall, associated with productive cough. Denies recent travel or sick contacts. Past medical history includes no significant history. Current medications: metformin. Vital signs on arrival: HR 148 bpm, BP 182/67 mmHg, RR 29 breaths/min, Temp 37.8 °C, O₂ sat 94%. Patient triaged as ESI level 5.


In [7]:
# TEXT CLEANING

def clean_text_for_ml(text: str) -> str:
    """
    Cleans the triage notes by removing bracketed tags, non-alphabetic characters, and excess whitespace.
    """

    if not isinstance(text, str):
        return ""
    text = re.sub(r'\\[.*?\\]', ' ', text) # Remove text within brackets like [AGE]
    text = re.sub(r'[^a-zA-Z\s]', ' ', text) # Remove special characters and punctuation
    text = text.lower() # Convert to lowercase
    text = re.sub(r'\s+', ' ', text).strip() # Consolidate whitespace
    return text

print("\nCleaned Note:")
print(clean_text_for_ml(test_note_redacted))


Cleaned Note:
the patient presents with fever for hours rated onset after a fall associated with productive cough denies recent travel or sick contacts past medical history includes no significant history current medications metformin vital signs on arrival hr bpm bp mmhg rr breaths min temp c o sat patient triaged as esi level


In [8]:
# DEFINE TARGET & SPLIT DATA

TARGET_COLUMN = 'admitted'

y=df_full[TARGET_COLUMN]
X=df_full.drop(columns=[TARGET_COLUMN, 'age', 'triage_notes'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

print(f"Training set size: {len(X_train)} rows")
print(f"Test set size: {len(X_test)} rows")

Training set size: 3360 rows
Test set size: 840 rows


In [9]:
# FEATURE GROUPS

NUMERIC_FEATURES = [
    'heart_rate', 'bp_systolic', 'bp_diastolic', 'resp_rate',
    'temperature_C', 'oxygen_saturation', 'ESI', 'recent_admissions_30d'
]

CATEGORICAL_FEATURES = ['sex', 'age_bucket']
TEXT_FEATURE = 'triage_notes_redacted'

In [10]:
# CREATE PREPROCESSING TRANSFORMERS

tfidf_pipe = TfidfVectorizer(
    max_features=184,
    stop_words='english',
    preprocessor=clean_text_for_ml  # Use our new ML-specific cleaner
)

categorical_pipe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# CREATE FULL PREPROCESSING PIPELINE

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', 'passthrough', NUMERIC_FEATURES),
        ('categorical', categorical_pipe, CATEGORICAL_FEATURES),
        ('text', tfidf_pipe, TEXT_FEATURE)
    ],
    remainder='drop'
)

In [11]:
# FIT THE PREPROCESSING PIPELINE ON TRAINING DATASET
preprocessor.fit(X_train)

# SAVE THE PIPELINE ARTIFACT
joblib.dump(preprocessor, OUTPUT_PIPELINE_PATH)
print(f"Preprocessing pipeline saved to: {OUTPUT_PIPELINE_PATH}")

Preprocessing pipeline saved to: /content/drive/MyDrive/Work/Capstone-TeamFolder/Capstone_Organized/3-Model_Training/3.1-Traditional_ML/3.1.0-Traditional_ML_Artifacts/ml_preprocessor.joblib


### Train Model

In [12]:
# TRANSFORM DATA WITH SAVED PIPELINE
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# GET FEATURE NAMES FOR SAVING
ohe_feature_names = preprocessor.named_transformers_['categorical'].get_feature_names_out(CATEGORICAL_FEATURES)
tfidf_feature_names = preprocessor.named_transformers_['text'].get_feature_names_out()
final_feature_names = NUMERIC_FEATURES + list(ohe_feature_names) + list(tfidf_feature_names)

print(f"Total features created: {len(final_feature_names)}")
with open(OUTPUT_FEATURES_PATH, 'w') as f:
    json.dump(final_feature_names, f)
print(f"Feature name list saved to: {OUTPUT_FEATURES_PATH}")

Total features created: 142
Feature name list saved to: /content/drive/MyDrive/Work/Capstone-TeamFolder/Capstone_Organized/3-Model_Training/3.1-Traditional_ML/3.1.0-Traditional_ML_Artifacts/ml_feature_columns.json


In [13]:
# TRAIN GRADIENT BOOSTING MODEL (WITH GRIDSEARCH CV)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 10, 15],
    'scale_pos_weight': [1, 2, 3]
}

xgb_model = xgb.XGBClassifier(
    random_state=14,
    objective='binary:logistic',
    eval_metric='logloss'
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_transformed, y_train)

print(f"\n=============== GridSearchCV Complete ===============")
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"Best AUC score from CV: {grid_search.best_score_:.4f}")

Fitting 3 folds for each of 81 candidates, totalling 243 fits

Best hyperparameters: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 300, 'scale_pos_weight': 2}
Best AUC score from CV: 0.5337


### Model Evaluation

In [14]:
# GET THE BEST MODEL
best_ml_model = grid_search.best_estimator_

# EVALUATE ON THE TEST SET
print("\n--- Final Evaluation on Test Set ---")

X_test_transformed_df = pd.DataFrame(X_test_transformed.toarray(), columns=final_feature_names) # Align columns (best practices) and convert to dense array

y_pred_proba = best_ml_model.predict_proba(X_test_transformed_df)[:, 1]
y_pred = best_ml_model.predict(X_test_transformed_df)

roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Retrained Model Test AUC: {roc_auc:.4f}")

print("\n========== Test Set Classification Report (threshold 0.5) ==========")
print(classification_report(y_test, y_pred))

print("\n========== Test Set Confusion Matrix ==========")
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
print(f"True Positives (TP): {tp}")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")

# --- 3. Save the Final Model Artifact ---
joblib.dump(best_ml_model, OUTPUT_MODEL_PATH)
print(f"\nSuccessfully saved retrained ML model to: {OUTPUT_MODEL_PATH}")


--- Final Evaluation on Test Set ---
Retrained Model Test AUC: 0.5204

              precision    recall  f1-score   support

           0       0.69      0.91      0.78       574
           1       0.35      0.11      0.17       266

    accuracy                           0.65       840
   macro avg       0.52      0.51      0.47       840
weighted avg       0.58      0.65      0.59       840


True Positives (TP): 29
True Negatives (TN): 520
False Positives (FP): 54
False Negatives (FN): 237

Successfully saved retrained ML model to: /content/drive/MyDrive/Work/Capstone-TeamFolder/Capstone_Organized/3-Model_Training/3.1-Traditional_ML/3.1.0-Traditional_ML_Artifacts/gb_model.joblib
