In [2]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


IMPORT LIBRARIES

In [3]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             precision_score, recall_score, f1_score, confusion_matrix,
                             classification_report, roc_curve, precision_recall_curve)
import joblib
import math
import time
import json
from tqdm import tqdm

In [4]:
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    SHAP_AVAILABLE = False


LOAD DATA

In [5]:
DATA_PATH = "/content/drive/MyDrive/final dataset.csv"
OUTPUT_DIR = "/content/drive/MyDrive/readmission_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

RANDOM_STATE = 42
TEST_SIZE = 0.2
CV_FOLDS = 3
N_JOBS = -1
SCORING_METRIC = "roc_auc"
RISK_THRESHOLD = 0.5

In [6]:
def find_target_col(df):
    """Find a readmission-like target column name in dataframe."""
    candidates = [c for c in df.columns if any(k in c.lower() for k in ['readmit','readmission','readmitted','readmit_30','readmission_30'])]
    return candidates[0] if candidates else None

def remove_existing_risk_cols(df):
    risk_cols = [c for c in df.columns if 'risk' in c.lower()]
    if risk_cols:
        print("Removing existing risk columns:", risk_cols)
        df = df.drop(columns=risk_cols, errors='ignore')
    return df

def safe_to_numeric(s):
    try:
        return pd.to_numeric(s, errors='coerce')
    except Exception:
        return s

LOAD & CLEAN

In [7]:
print("Loading data from:", DATA_PATH)
df_raw = pd.read_csv(DATA_PATH, dtype=object)
print("Initial shape:", df_raw.shape)
df_raw = remove_existing_risk_cols(df_raw)

# Detect target
target_col = find_target_col(df_raw)
if target_col is None:
    raise RuntimeError("No readmission target column found. Please include a column name containing 'readmit' or 'readmission'.")

print("Detected target column:", target_col)

Loading data from: /content/drive/MyDrive/final dataset.csv
Initial shape: (5000, 37)
Detected target column: Readmission


In [8]:
df_raw.columns

Index(['Patient Name', 'Admission ID', 'Age', 'Sex', 'Weight',
       'Admission Date', 'Admission Time', 'Consultant Doctor Name',
       'Doctor Name', 'Doctor ID', 'Problem Type', 'Discharge Date',
       'Discharge Time', 'Readmission', 'Blood Pressure', 'Insulin',
       'Blood Group', 'Cholesterol', 'Platelets', 'Diabetics',
       'Problem Description', 'Nurse Name', 'Patient Phone Number',
       'Patient Mail ID', 'weather', 'air_quality_index', 'social_event_count',
       'Hemoglobin (g/dL)', 'WBC Count (10^9/L)', 'Platelet Count (10^9/L)',
       'Urine Protein (mg/dL)', 'Urine Glucose (mg/dL)', 'ECG Result',
       'Pulse Rate (bpm)', 'State', 'City', 'Location'],
      dtype='object')

NORMALIZE THE VALUES

In [9]:
df = df_raw.copy()
df[target_col] = df[target_col].astype(str).str.strip().str.lower().map({
    'yes':'1','y':'1','true':'1','1':'1','no':'0','n':'0','false':'0','0':'0'
})
df[target_col] = pd.to_numeric(df[target_col], errors='coerce')
print("Target value counts (including NaN):")
print(df[target_col].value_counts(dropna=False).to_string())

Target value counts (including NaN):
Readmission
0    2500
1    2500


DROP ROWS WITH MISSING VALUES

In [10]:
before = len(df)
df = df[df[target_col].notna()].copy()
print(f"Dropped {before - len(df)} rows with missing target. Remaining: {len(df)}")

Dropped 0 rows with missing target. Remaining: 5000


EXPLORATORY DATA ANALYTICS

In [11]:
print("\n--- Basic EDA ---")
print("Columns:", list(df.columns))
print("\nNumeric sample summary (attempt coercion):")
# attempt to coerce commonly numeric columns for summary
sample_numeric_cols = []
for c in df.columns:
    coerced = pd.to_numeric(df[c], errors='coerce')
    if coerced.notna().sum() > max(10, 0.01 * len(df)):  # if column has many numeric-like values
        sample_numeric_cols.append(c)
print("Numeric-candidate columns:", sample_numeric_cols[:20])
if 'age' in [c.lower() for c in df.columns]:
    ac = [c for c in df.columns if c.lower()=='age'][0]
    print(df[ac].astype(str).describe())


--- Basic EDA ---
Columns: ['Patient Name', 'Admission ID', 'Age', 'Sex', 'Weight', 'Admission Date', 'Admission Time', 'Consultant Doctor Name', 'Doctor Name', 'Doctor ID', 'Problem Type', 'Discharge Date', 'Discharge Time', 'Readmission', 'Blood Pressure', 'Insulin', 'Blood Group', 'Cholesterol', 'Platelets', 'Diabetics', 'Problem Description', 'Nurse Name', 'Patient Phone Number', 'Patient Mail ID', 'weather', 'air_quality_index', 'social_event_count', 'Hemoglobin (g/dL)', 'WBC Count (10^9/L)', 'Platelet Count (10^9/L)', 'Urine Protein (mg/dL)', 'Urine Glucose (mg/dL)', 'ECG Result', 'Pulse Rate (bpm)', 'State', 'City', 'Location']

Numeric sample summary (attempt coercion):
Numeric-candidate columns: ['Age', 'Weight', 'Readmission', 'Cholesterol', 'Platelets', 'Patient Phone Number', 'air_quality_index', 'social_event_count', 'Hemoglobin (g/dL)', 'WBC Count (10^9/L)', 'Platelet Count (10^9/L)', 'Urine Protein (mg/dL)', 'Urine Glucose (mg/dL)', 'Pulse Rate (bpm)']
count     5000
un

In [12]:
def quick_plots(df_local, target):
    try:
        sns.set()
        plt.figure(figsize=(5,3))
        sns.countplot(x=target, data=df_local)
        plt.title("Target distribution")
        plt.show()
    except Exception as e:
        print("Skipping quick_plots due to:", e)

In [13]:
print("\n--- Feature engineering ---")
df_fe = df.copy()


--- Feature engineering ---


In [14]:
cols_lower = {c.lower(): c for c in df_fe.columns}

In [15]:
age_col = None
for candidate in ['age','patient_age','age_years']:
    if candidate in cols_lower:
        age_col = cols_lower[candidate]
        break

if age_col:
    df_fe[age_col] = pd.to_numeric(df_fe[age_col], errors='coerce')
    df_fe['age_bucket'] = pd.cut(df_fe[age_col], bins=[0,30,50,65,80,200], labels=['<=30','31-50','51-65','66-80','80+'])
    print("Created age_bucket from", age_col)
else:
    print("No age column found.")

Created age_bucket from Age


In [16]:
admit_col = None
discharge_col = None
for c in df_fe.columns:
    low = c.lower()
    if 'admit' in low and admit_col is None:
        admit_col = c
    if 'discharg' in low and discharge_col is None:
        discharge_col = c

if admit_col:
    df_fe[admit_col] = pd.to_datetime(df_fe[admit_col], errors='coerce')
if discharge_col:
    df_fe[discharge_col] = pd.to_datetime(df_fe[discharge_col], errors='coerce')

if admit_col and discharge_col:
    df_fe['los_days'] = (df_fe[discharge_col] - df_fe[admit_col]).dt.days
    df_fe.loc[(df_fe['los_days'] < 0) | (df_fe['los_days'] > 3650), 'los_days'] = np.nan
    print("Computed los_days from", admit_col, "and", discharge_col)
else:
    print("Admission/discharge dates not both found -> skipping LOS feature.")

# Comorbidity columns detection and count
comorbidity_keywords = ['diabetes','hypertension','hyperten','cancer','copd','asthma','heart','renal','kidney','stroke']
comorb_cols = [c for c in df_fe.columns if any(k in c.lower() for k in comorbidity_keywords)]
print("Detected comorbidity-like columns:", comorb_cols[:20])
for c in comorb_cols:
    df_fe[c] = pd.to_numeric(df_fe[c], errors='coerce').fillna(df_fe[c].astype(str).str.lower().map({'yes':1,'y':1,'true':1,'1':1,'no':0,'n':0,'false':0}))
    df_fe[c] = pd.to_numeric(df_fe[c], errors='coerce').fillna(0).astype(int)
if comorb_cols:
    df_fe['comorbidity_count'] = df_fe[comorb_cols].sum(axis=1)
else:
    df_fe['comorbidity_count'] = 0

# Date-based features (admit weekday/month)
if admit_col:
    df_fe['admit_weekday'] = df_fe[admit_col].dt.weekday
    df_fe['admit_month'] = df_fe[admit_col].dt.month

# Coerce some lab-like columns to numeric if present
lab_keywords = ['hemoglobin','hb','wbc','platelet','creatinine','cholesterol','glucose','pulse']
for c in df_fe.columns:
    if any(k in c.lower() for k in lab_keywords):
        df_fe[c] = pd.to_numeric(df_fe[c], errors='coerce')

# Drop personal-identifying or long-text columns
drop_candidates = [c for c in df_fe.columns if any(k in c.lower() for k in ['name','address','phone','mail','email','notes','description','image','photo','url'])]
if drop_candidates:
    print("Dropping candidate PII / long-text columns (count={}): {}".format(len(drop_candidates), drop_candidates[:10]))
    df_fe = df_fe.drop(columns=drop_candidates, errors='ignore')

print("Feature engineering completed. Shape:", df_fe.shape)

Admission/discharge dates not both found -> skipping LOS feature.
Detected comorbidity-like columns: []
Dropping candidate PII / long-text columns (count=7): ['Patient Name', 'Consultant Doctor Name', 'Doctor Name', 'Problem Description', 'Nurse Name', 'Patient Phone Number', 'Patient Mail ID']
Feature engineering completed. Shape: (5000, 32)


PREPARE FEATURE MATRIX

In [17]:
print("\nPreparing feature matrix X and target y...")
# Ensure target is int
df_fe[target_col] = df_fe[target_col].astype(int)
X = df_fe.drop(columns=[target_col], errors='ignore')
y = df_fe[target_col]

# Identify numeric and categorical features
numeric_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
# Treat small-cardinality object columns as categorical
cat_cols = [c for c in X.columns if (X[c].dtype == object or X[c].nunique() < 50) and c not in numeric_cols]

# Remove columns that are obviously datetime objects from feature lists
numeric_cols = [c for c in numeric_cols if not np.issubdtype(type(X[c].dtype), np.datetime64)]
# Make sure we don't include admit/discharge date columns directly
for dtcol in [admit_col, discharge_col]:
    if dtcol in cat_cols: cat_cols.remove(dtcol)
    if dtcol in numeric_cols: numeric_cols.remove(dtcol)

print("Numeric cols count:", len(numeric_cols))
print("Categorical cols count:", len(cat_cols))

# Preprocessor
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, cat_cols)
], remainder='drop', sparse_threshold=0)


Preparing feature matrix X and target y...
Numeric cols count: 9
Categorical cols count: 21


SPLIT

In [18]:
print("\nTrain/test split (test_size={})".format(TEST_SIZE))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE,
                                                    random_state=RANDOM_STATE, stratify=y)
print("Train size:", X_train.shape, "Test size:", X_test.shape)


Train/test split (test_size=0.2)
Train size: (4000, 31) Test size: (1000, 31)


MODEL TRAINING & HYPER TUNING

In [19]:
results = {}

def evaluate_pipeline(name, pipeline, X_test, y_test):
    y_prob = pipeline.predict_proba(X_test)[:,1]
    y_pred = (y_prob >= RISK_THRESHOLD).astype(int)
    metrics = {}
    metrics['roc_auc'] = roc_auc_score(y_test, y_prob)
    metrics['pr_auc']  = average_precision_score(y_test, y_prob)
    metrics['accuracy'] = accuracy_score(y_test, y_pred)
    metrics['precision'] = precision_score(y_test, y_pred, zero_division=0)
    metrics['recall'] = recall_score(y_test, y_pred, zero_division=0)
    metrics['f1'] = f1_score(y_test, y_pred, zero_division=0)
    metrics['confusion_matrix'] = confusion_matrix(y_test, y_pred)
    print(f"\n{name} Evaluation:")
    print(f"ROC-AUC: {metrics['roc_auc']:.4f} | PR-AUC: {metrics['pr_auc']:.4f} | Acc: {metrics['accuracy']:.4f} | F1: {metrics['f1']:.4f}")
    print("Confusion matrix:\n", metrics['confusion_matrix'])
    print("Classification report:\n", classification_report(y_test, y_pred, digits=4))
    return metrics

LOGIESTIC REGRESSION

In [20]:
print("\nTraining Logistic Regression with GridSearchCV...")
lr_pipe = Pipeline(steps=[('pre', preprocessor),
                          ('clf', LogisticRegression(solver='saga', max_iter=3000, class_weight='balanced', random_state=RANDOM_STATE))])
lr_param_grid = {
    'clf__C': [0.01, 0.1, 1.0, 5.0],
    'clf__penalty': ['l2']  # keep L2 for stability; L1 is also possible with saga
}
lr_search = GridSearchCV(lr_pipe, lr_param_grid, scoring=SCORING_METRIC, cv=StratifiedKFold(n_splits=CV_FOLDS), n_jobs=N_JOBS, verbose=1)
lr_search.fit(X_train, y_train)
print("LR best params:", lr_search.best_params_)
lr_best = lr_search.best_estimator_
results['LogisticRegression'] = evaluate_pipeline("Logistic Regression", lr_best, X_test, y_test)



Training Logistic Regression with GridSearchCV...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
LR best params: {'clf__C': 0.1, 'clf__penalty': 'l2'}

Logistic Regression Evaluation:
ROC-AUC: 0.4805 | PR-AUC: 0.4975 | Acc: 0.4860 | F1: 0.4787
Confusion matrix:
 [[250 250]
 [264 236]]
Classification report:
               precision    recall  f1-score   support

           0     0.4864    0.5000    0.4931       500
           1     0.4856    0.4720    0.4787       500

    accuracy                         0.4860      1000
   macro avg     0.4860    0.4860    0.4859      1000
weighted avg     0.4860    0.4860    0.4859      1000



RANDOM FOREST

In [21]:
print("\nTraining Random Forest with RandomizedSearchCV...")
rf_pipe = Pipeline(steps=[('pre', preprocessor),
                          ('clf', RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE, n_jobs=1))])
rf_param_dist = {
    'clf__n_estimators': [200, 400, 600],
    'clf__max_depth': [6, 10, 15, None],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4],
    'clf__max_features': ['sqrt', 0.3, 0.6]
}
rf_search = RandomizedSearchCV(rf_pipe, rf_param_dist, n_iter=20, scoring=SCORING_METRIC,
                               cv=StratifiedKFold(n_splits=CV_FOLDS), random_state=RANDOM_STATE, n_jobs=N_JOBS, verbose=1)
rf_search.fit(X_train, y_train)
print("RF best params:", rf_search.best_params_)
rf_best = rf_search.best_estimator_
results['RandomForest'] = evaluate_pipeline("Random Forest", rf_best, X_test, y_test)



Training Random Forest with RandomizedSearchCV...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
RF best params: {'clf__n_estimators': 600, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 4, 'clf__max_features': 'sqrt', 'clf__max_depth': None}

Random Forest Evaluation:
ROC-AUC: 0.4834 | PR-AUC: 0.5005 | Acc: 0.4970 | F1: 0.4830
Confusion matrix:
 [[262 238]
 [265 235]]
Classification report:
               precision    recall  f1-score   support

           0     0.4972    0.5240    0.5102       500
           1     0.4968    0.4700    0.4830       500

    accuracy                         0.4970      1000
   macro avg     0.4970    0.4970    0.4966      1000
weighted avg     0.4970    0.4970    0.4966      1000



XG BOOST

In [22]:
if XGBOOST_AVAILABLE:
    print("\nTraining XGBoost with RandomizedSearchCV...")
    xgb_pipe = Pipeline(steps=[('pre', preprocessor),
                               ('clf', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE))])
    # compute scale_pos_weight for imbalance
    pos = int(y_train.sum())
    neg = len(y_train) - pos
    scale_pos_weight = (neg / pos) if pos>0 else 1.0
    xgb_param_dist = {
        'clf__n_estimators': [100, 200, 400],
        'clf__max_depth': [3, 5, 7],
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__subsample': [0.6, 0.8, 1.0],
        'clf__colsample_bytree': [0.6, 0.8, 1.0],
        'clf__scale_pos_weight': [scale_pos_weight]
    }
    xgb_search = RandomizedSearchCV(xgb_pipe, xgb_param_dist, n_iter=20, scoring=SCORING_METRIC,
                                    cv=StratifiedKFold(n_splits=CV_FOLDS), random_state=RANDOM_STATE, n_jobs=N_JOBS, verbose=1)
    xgb_search.fit(X_train, y_train)
    print("XGB best params:", xgb_search.best_params_)
    xgb_best = xgb_search.best_estimator_
    results['XGBoost'] = evaluate_pipeline("XGBoost", xgb_best, X_test, y_test)
else:
    print("\nXGBoost not available — training HistGradientBoosting as a strong fallback.")
    hgb_pipe = Pipeline(steps=[('pre', preprocessor),
                               ('clf', HistGradientBoostingClassifier(random_state=RANDOM_STATE))])
    hgb_param_dist = {
        'clf__max_iter': [100, 200],
        'clf__max_leaf_nodes': [15, 31, 63]
    }
    hgb_search = RandomizedSearchCV(hgb_pipe, hgb_param_dist, n_iter=6, scoring=SCORING_METRIC,
                                    cv=StratifiedKFold(n_splits=CV_FOLDS), random_state=RANDOM_STATE, n_jobs=N_JOBS, verbose=1)
    hgb_search.fit(X_train, y_train)
    hgb_best = hgb_search.best_estimator_
    results['HistGB'] = evaluate_pipeline("HistGradientBoosting", hgb_best, X_test, y_test)


XGBoost not available — training HistGradientBoosting as a strong fallback.
Fitting 3 folds for each of 6 candidates, totalling 18 fits

HistGradientBoosting Evaluation:
ROC-AUC: 0.4920 | PR-AUC: 0.5042 | Acc: 0.5020 | F1: 0.4866
Confusion matrix:
 [[266 234]
 [264 236]]
Classification report:
               precision    recall  f1-score   support

           0     0.5019    0.5320    0.5165       500
           1     0.5021    0.4720    0.4866       500

    accuracy                         0.5020      1000
   macro avg     0.5020    0.5020    0.5016      1000
weighted avg     0.5020    0.5020    0.5016      1000



MODEL SELECTION

In [23]:
print("\n--- Model selection by ROC-AUC ---")
best_name = max(results.items(), key=lambda kv: kv[1]['roc_auc'])[0]
print("Model scores (ROC-AUC):")
for k,v in results.items():
    print(f" - {k}: {v['roc_auc']:.4f} (Acc={v['accuracy']:.4f}, F1={v['f1']:.4f})")
print("Selected best model:", best_name)

if best_name == 'LogisticRegression':
    best_pipeline = lr_best
elif best_name == 'RandomForest':
    best_pipeline = rf_best
elif best_name == 'XGBoost' and XGBOOST_AVAILABLE:
    best_pipeline = xgb_best
elif 'XGBoost' not in results and 'HistGB' in results:
    best_pipeline = hgb_best
else:
    # fallback
    best_pipeline = rf_best if 'rf_best' in locals() else lr_best

# Save best pipeline
best_model_path = os.path.join(OUTPUT_DIR, "best_readmission_pipeline.joblib")
joblib.dump(best_pipeline, best_model_path)
print("Saved best pipeline to:", best_model_path)


--- Model selection by ROC-AUC ---
Model scores (ROC-AUC):
 - LogisticRegression: 0.4805 (Acc=0.4860, F1=0.4787)
 - RandomForest: 0.4834 (Acc=0.4970, F1=0.4830)
 - HistGB: 0.4920 (Acc=0.5020, F1=0.4866)
Selected best model: HistGB
Saved best pipeline to: /content/drive/MyDrive/readmission_output/best_readmission_pipeline.joblib


EXPLANABILITY

In [24]:
print("\nComputing feature names (post-preprocessing)...")
# try to reconstruct feature names
feature_names = []
try:
    pre = best_pipeline.named_steps['pre']
    # ColumnTransformer.get_feature_names_out is available on modern sklearn
    try:
        feature_names = pre.get_feature_names_out()
    except Exception:
        # attempt manual concatenation
        num_names = numeric_cols
        cat_encoder = pre.named_transformers_['cat'].named_steps['onehot'] if 'cat' in pre.named_transformers_ else None
        if cat_encoder is not None:
            cat_names = cat_encoder.get_feature_names_out(cat_cols)
            feature_names = list(num_names) + list(cat_names)
        else:
            feature_names = list(num_names) + list(cat_cols)
except Exception as e:
    print("Could not extract feature names automatically:", e)
    feature_names = list(X.columns)

# Feature importances for tree models
clf = best_pipeline.named_steps.get('clf', None)
if hasattr(clf, "feature_importances_"):
    try:
        importances = clf.feature_importances_
        # align length
        fn = feature_names if len(feature_names)==len(importances) else [f"f{i}" for i in range(len(importances))]
        imp_df = pd.DataFrame({'feature': fn, 'importance': importances}).sort_values('importance', ascending=False)
        imp_df.to_csv(os.path.join(OUTPUT_DIR, "feature_importances.csv"), index=False)
        print("Saved feature importances to output directory.")
    except Exception as e:
        print("Could not produce feature importances:", e)
else:
    print("Best model does not expose feature_importances_ (likely LR).")

# SHAP explanations (if available)
if SHAP_AVAILABLE:
    try:
        print("Computing SHAP values (sample) — this can take time...")
        # generate background by sampling training set after preprocessing
        # Use best_pipeline to transform
        preproc = best_pipeline.named_steps['pre']
        # take a small background
        bg = X_train.sample(min(200, len(X_train)), random_state=RANDOM_STATE)
        X_bg = preproc.transform(bg)
        # create shap explainer for estimator
        explainer = shap.Explainer(best_pipeline.named_steps['clf'], X_bg, feature_names=feature_names)
        sample_for_shap = preproc.transform(X_test.sample(min(100, len(X_test)), random_state=RANDOM_STATE))
        shap_vals = explainer(sample_for_shap)
        # save summary plot
        shap.summary_plot(shap_vals, feature_names=feature_names, show=False)
        plt.savefig(os.path.join(OUTPUT_DIR, "shap_summary.png"), bbox_inches='tight')
        plt.close()
        print("Saved SHAP summary plot.")
    except Exception as e:
        print("SHAP failed:", e)
else:
    print("SHAP not installed; skip SHAP analysis.")



Computing feature names (post-preprocessing)...
Best model does not expose feature_importances_ (likely LR).
SHAP not installed; skip SHAP analysis.


SCORE FULL DATA

In [25]:
print("\nScoring full dataset and saving predictions...")
X_full = X.copy()
pred_probs = best_pipeline.predict_proba(X_full)[:,1]
df_out = df_fe.copy()
df_out['Predicted_Risk_Score'] = pred_probs
df_out['Predicted_Readmission'] = np.where(df_out['Predicted_Risk_Score'] >= RISK_THRESHOLD, 'Yes', 'No')

out_csv = os.path.join(OUTPUT_DIR, "readmission_predictions_with_features.csv")
df_out.to_csv(out_csv, index=False)
print("Saved predictions CSV to:", out_csv)


Scoring full dataset and saving predictions...
Saved predictions CSV to: /content/drive/MyDrive/readmission_output/readmission_predictions_with_features.csv


PLOT ROC AND PR CURVES

In [26]:
print("Plotting ROC and PR curves for best model on test set...")
y_prob_test = best_pipeline.predict_proba(X_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_prob_test)
precision, recall, _ = precision_recall_curve(y_test, y_prob_test)
roc_auc_val = roc_auc_score(y_test, y_prob_test)
pr_auc_val = average_precision_score(y_test, y_prob_test)

plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f'ROC (AUC={roc_auc_val:.3f})')
plt.plot([0,1],[0,1],'--', color='gray')
plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate'); plt.title('ROC Curve'); plt.legend(); plt.grid(True)
plt.savefig(os.path.join(OUTPUT_DIR, "roc_curve.png"))
plt.close()

plt.figure(figsize=(6,5))
plt.plot(recall, precision, label=f'PR (AUC={pr_auc_val:.3f})')
plt.xlabel('Recall'); plt.ylabel('Precision'); plt.title('Precision-Recall Curve'); plt.legend(); plt.grid(True)
plt.savefig(os.path.join(OUTPUT_DIR, "pr_curve.png"))
plt.close()

print("Saved ROC and PR plots to output directory.")


Plotting ROC and PR curves for best model on test set...
Saved ROC and PR plots to output directory.


CREATE INTERPRETABLE RISK SCORING SYSTEM MAPPING

In [28]:
def risk_group(score):
    if score > 0.7:
        return "HIGH"
    if score > 0.3:
        return "MODERATE"
    return "LOW"

df_out['RiskGroup'] = df_out['Predicted_Risk_Score'].apply(risk_group)
df_out.to_csv(out_csv, index=False)  # overwrite with risk group included
print("Added RiskGroup and updated CSV:", out_csv)

# Save a JSON summary of metrics
summary = {
    'best_model': best_name,
    'metrics_test': {
        'roc_auc': results[best_name]['roc_auc'],
        'pr_auc': results[best_name]['pr_auc'],
        'accuracy': results[best_name]['accuracy'],
        'precision': results[best_name]['precision'],
        'recall': results[best_name]['recall'],
        'f1': results[best_name]['f1']
    },
    'timestamp': datetime.utcnow().isoformat()
}
with open(os.path.join(OUTPUT_DIR, "training_summary.json"), "w") as f:
    json.dump(summary, f, indent=2)

print("\nAll done. Artifacts written to:", OUTPUT_DIR)
print("Best model:", best_name, "ROC-AUC:", results[best_name]['roc_auc'])

Added RiskGroup and updated CSV: /content/drive/MyDrive/readmission_output/readmission_predictions_with_features.csv

All done. Artifacts written to: /content/drive/MyDrive/readmission_output
Best model: HistGB ROC-AUC: 0.491992


In [29]:
# === Save your best trained model ===
import joblib

# Suppose you already have your trained best model (e.g., best_pipeline)
# Replace best_pipeline with your actual trained pipeline/model variable
joblib.dump(best_pipeline, "patient_readmission_model.pkl")

print("Model successfully saved as 'patient_readmission_model.pkl'")


Model successfully saved as 'patient_readmission_model.pkl'


In [34]:
import joblib
import pandas as pd
import numpy as np

# Load the saved model
model = joblib.load("patient_readmission_model.pkl")
print("Model loaded successfully!")

# === Helper Functions ===
def create_age_bucket(age):
    if pd.isna(age): return np.nan
    if age <= 30: return '<=30'
    if age <= 50: return '31-50'
    if age <= 65: return '51-65'
    if age <= 80: return '66-80'
    return '80+'

def safe_input(prompt, dtype=str):
    """Helper to safely take input and cast it."""
    val = input(prompt)
    if dtype == float:
        try:
            return float(val)
        except:
            return np.nan
    return val.strip()

# === Collect user input ===
print("\nPlease enter patient details for readmission prediction:\n")

Admission_ID = safe_input("Admission ID: ")
Age = safe_input("Age (in years): ", float)
Sex = safe_input("Sex (Male/Female): ")
Weight = safe_input("Weight (kg): ", float)
Admission_Date = safe_input("Admission Date (YYYY-MM-DD): ")
Admission_Time = safe_input("Admission Time (HH:MM:SS): ")
Doctor_ID = safe_input("Doctor ID: ")
Problem_Type = safe_input("Problem Type (e.g., Cardiology, Neurology): ")
Discharge_Time = safe_input("Discharge Time (HH:MM:SS): ")
Blood_Pressure = safe_input("Blood Pressure (e.g., 120/80): ")
Insulin = safe_input("Insulin (if not available, press Enter): ")
Blood_Group = safe_input("Blood Group (A+, O-, etc.): ")
Cholesterol = safe_input("Cholesterol (mg/dL): ", float)
Platelets = safe_input("Platelets (10^9/L): ", float)
Diabetics = safe_input("Diabetics (Yes/No): ")
weather = safe_input("Weather condition (e.g., Clear, Rainy): ")
air_quality_index = safe_input("Air Quality Index: ", float)
social_event_count = safe_input("Social Event Count (last 30 days): ", float)
Hemoglobin = safe_input("Hemoglobin (g/dL): ", float)
WBC_Count = safe_input("WBC Count (10^9/L): ", float)
Platelet_Count = safe_input("Platelet Count (10^9/L): ", float)
Urine_Protein = safe_input("Urine Protein (mg/dL): ", float)
Urine_Glucose = safe_input("Urine Glucose (mg/dL): ", float)
ECG_Result = safe_input("ECG Result (Normal/Abnormal): ")
Pulse_Rate = safe_input("Pulse Rate (bpm): ", float)
State = safe_input("State: ")
City = safe_input("City: ")
Location = safe_input("Location: ") # Added Location input


# === Process Inputs ===
age_bucket = create_age_bucket(Age)
# You'll need to implement the actual logic for comorbidity_count based on your data
# For this example, we'll keep the placeholder or derive it if possible from collected inputs.
# For a real application, map Diabetics and other relevant inputs to comorbidity count.
comorbidity_count = 1.0 # placeholder


# Extract systolic blood pressure (first value)
try:
    Blood_Pressure = str(Blood_Pressure).split("/")[0]
    Blood_Pressure = float(Blood_Pressure)
except:
    Blood_Pressure = np.nan

# === Create DataFrame ===
new_patient_data = pd.DataFrame({
    'Admission ID': [Admission_ID],
    'Age': [Age],
    'Sex': [Sex],
    'Weight': [Weight],
    'Admission Date': [Admission_Date],
    'Admission Time': [Admission_Time],
    'Doctor ID': [Doctor_ID],
    'Problem Type': [Problem_Type],
    'Discharge Time': [Discharge_Time],
    'Blood Pressure': [Blood_Pressure],
    'Insulin': [Insulin],
    'Blood Group': [Blood_Group],
    'Cholesterol': [Cholesterol],
    'Platelets': [Platelets],
    'Diabetics': [Diabetics],
    'weather': [weather],
    'air_quality_index': [air_quality_index],
    'social_event_count': [social_event_count],
    'Hemoglobin (g/dL)': [Hemoglobin],
    'WBC Count (10^9/L)': [WBC_Count],
    'Platelet Count (10^9/L)': [Platelet_Count],
    'Urine Protein (mg/dL)': [Urine_Protein],
    'Urine Glucose (mg/dL)': [Urine_Glucose],
    'ECG Result': [ECG_Result],
    'Pulse Rate (bpm)': [Pulse_Rate],
    'State': [State],
    'City': [City],
    'Location': [Location], # Added Location to DataFrame
    'age_bucket': [age_bucket],
    'comorbidity_count': [comorbidity_count]
})

# Convert date columns
new_patient_data['Admission Date'] = pd.to_datetime(new_patient_data['Admission Date'], errors='coerce')

# === Make Prediction ===
prediction = model.predict(new_patient_data)[0]
probability = model.predict_proba(new_patient_data)[:, 1][0]

# === Output ===
risk_label = "High" if probability > 0.7 else "Medium" if probability > 0.3 else "Low"

print("\n Prediction Results:")
print(f"Predicted Readmission: {'Yes' if prediction == 1 else 'No'}")
print(f"Readmission Probability: {probability:.2f}")
print(f"Risk Level: {risk_label}")

Model loaded successfully!

Please enter patient details for readmission prediction:

Admission ID: ADM2009
Age (in years): 45
Sex (Male/Female): fem ale
Weight (kg): 56
Admission Date (YYYY-MM-DD): 2025-09-25
Admission Time (HH:MM:SS): 21:05:55
Doctor ID: DOC405
Problem Type (e.g., Cardiology, Neurology): cardiology
Discharge Time (HH:MM:SS): 23:06:32
Blood Pressure (e.g., 120/80): 90
Insulin (if not available, press Enter): 
Blood Group (A+, O-, etc.): o
Cholesterol (mg/dL): 52
Platelets (10^9/L): 54566
Diabetics (Yes/No): 546565
Weather condition (e.g., Clear, Rainy): Rainy
Air Quality Index: 52
Social Event Count (last 30 days): 2
Hemoglobin (g/dL): 45
WBC Count (10^9/L): 453632165
Platelet Count (10^9/L): 336512326
Urine Protein (mg/dL): 52
Urine Glucose (mg/dL): 48
ECG Result (Normal/Abnormal): normal
Pulse Rate (bpm): 23
State: missouri
City: kansascity
Location: missouri,kansascity

 Prediction Results:
Predicted Readmission: Yes
Readmission Probability: 0.61
Risk Level: Medium