# Readmission Risk from Demographics and Medication Patterns

This notebook provides a **minimal runnable demo** of our project and embeds HTML snapshots of the full analysis notebooks (cohort construction, baseline models, the neural network, and logistic regression on the neural network's embeddings).

In [5]:
import pandas as pd
import numpy as np
import joblib

from pathlib import Path

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report,
)

from IPython.display import IFrame, display

# Import our own code
from src.train import split_data
from src.models_baseline import train_logreg, train_logreg_l1

# Processed feature table, addressed relative to the notebook's working directory.
DATA_PATH = Path("data_processed", "admissions_features.csv")
DATA_PATH

PosixPath('data_processed/admissions_features.csv')

In [2]:
# Load the processed feature table.
# The CSV carries a stray "Unnamed: 0" column (presumably the row index written
# out when the file was saved). Left in place it is just the row position, and
# unless split_data removes it, it would be handed to the models as a feature.
# Dropping by name prefix is a no-op if the file is later regenerated without it.
raw_df = pd.read_csv(DATA_PATH)
df = raw_df.loc[:, ~raw_df.columns.str.startswith("Unnamed:")]
print(df.shape)
df.head()

(25000, 64)


Unnamed: 0,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,on_diabetes_med,med_up,med_down,...,diag_3_Respiratory,diag_3_nan,glucose_test_high,glucose_test_no,glucose_test_normal,glucose_test_nan,A1Ctest_high,A1Ctest_no,A1Ctest_normal,A1Ctest_nan
0,8,72,1,18,2,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
1,3,34,2,13,0,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
2,5,45,0,18,0,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
3,2,36,0,12,1,0,0,0,0,0,...,False,False,False,True,False,False,False,True,False,False
4,1,42,0,7,0,0,0,0,0,0,...,True,False,False,True,False,False,False,True,False,False


In [3]:
# Target column name used throughout the project.
label_col = "label"

# Partition the table into train / validation / test features and labels.
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df, label_col=label_col)

# Sanity check 1: how many rows and feature columns each partition holds.
print(
    *(
        f"{caption} {part.shape}"
        for caption, part in (
            ("Train size:", X_train),
            ("Val size:  ", X_val),
            ("Test size: ", X_test),
        )
    ),
    sep="\n",
)

# Sanity check 2: mean of the labels in each partition (the positive rate for 0/1 labels).
print(
    *(
        f"{caption} {target.mean()}"
        for caption, target in (
            ("Train positive rate:", y_train),
            ("Val positive rate:  ", y_val),
            ("Test positive rate: ", y_test),
        )
    ),
    sep="\n",
)

Train size: (17500, 63)
Val size:   (3750, 63)
Test size:  (3750, 63)
Train positive rate: 0.4701714285714286
Val positive rate:   0.47013333333333335
Test positive rate:  0.47013333333333335


In [13]:
# Train standard L2-regularized logistic regression on all features.
# The model itself is built and fitted by train_logreg (imported from
# src.models_baseline); its hyper-parameters are not set in this notebook.
# The fit runs inside joblib's "threading" backend context, so joblib-parallel
# work started during the fit uses that backend.
# NOTE(review): presumably chosen to avoid spawning worker processes from the
# notebook kernel — confirm the original motivation.
with joblib.parallel_backend("threading"):
    logreg = train_logreg(X_train, y_train)
# Evaluate on validation and test sets
def eval_model(model, X, y, set_name="", threshold=0.5):
    """Score a fitted binary classifier on one data split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``; column 1 of its output is
        used as the positive-class probability.
    X
        Feature matrix for the split.
    y
        True binary labels aligned with ``X``.
    set_name : str, optional
        Label stored under the ``"set"`` key of the returned metrics
        (e.g. ``"val"`` or ``"test"``).
    threshold : float, optional
        Probability cutoff for the hard predictions: a row is predicted
        positive when its probability is ``>= threshold``. Defaults to 0.5,
        which reproduces the previous fixed behaviour. Only ``Accuracy`` and
        the returned ``preds`` depend on it; AUROC and AUPRC are computed from
        the raw probabilities.

    Returns
    -------
    tuple
        ``(metrics, preds, probs)`` where ``metrics`` is a dict with keys
        ``"set"``, ``"Accuracy"``, ``"AUROC"`` and ``"AUPRC"``, ``preds`` holds
        the 0/1 predictions at ``threshold`` and ``probs`` the positive-class
        probabilities.
    """
    probs = model.predict_proba(X)[:, 1]
    preds = (probs >= threshold).astype(int)

    metrics = {
        "set": set_name,
        "Accuracy": accuracy_score(y, preds),
        "AUROC": roc_auc_score(y, probs),
        "AUPRC": average_precision_score(y, probs),
    }
    return metrics, preds, probs

# Validation split: score the L2 model and show its metrics.
val_metrics, val_preds, val_probs = eval_model(logreg, X_val, y_val, set_name="val")
print("Validation metrics (L2 logreg):")
display(val_metrics)

# Test split: same evaluation; the predictions and probabilities stay in the
# namespace for later cells.
test_metrics, test_preds, test_probs = eval_model(logreg, X_test, y_test, set_name="test")
print("\nTest metrics (L2 logreg):")
display(test_metrics)

Validation metrics (L2 logreg):


{'set': 'val',
 'Accuracy': 0.6144,
 'AUROC': 0.6490457971140262,
 'AUPRC': 0.6265652047519238}


Test metrics (L2 logreg):


{'set': 'test',
 'Accuracy': 0.6074666666666667,
 'AUROC': 0.6380377730346515,
 'AUPRC': 0.6239669035305411}

In [14]:
# Error breakdown for the L2 model's hard predictions on the test split.
# In scikit-learn's layout, rows are true classes and columns are predicted classes.
print("Confusion matrix (test, L2 logreg):")
cm = confusion_matrix(y_test, test_preds)
print(cm)

# Per-class precision / recall / F1, rounded to three decimals.
print(
    "\nClassification report (test, L2 logreg):",
    classification_report(y_test, test_preds, digits=3),
    sep="\n",
)

Confusion matrix (test, L2 logreg):
[[1407  580]
 [ 892  871]]

Classification report (test, L2 logreg):
              precision    recall  f1-score   support

           0      0.612     0.708     0.657      1987
           1      0.600     0.494     0.542      1763

    accuracy                          0.607      3750
   macro avg      0.606     0.601     0.599      3750
weighted avg      0.606     0.607     0.603      3750



In [15]:
# Fit the L1 logistic-regression baseline on the training split and score it on
# the test split with the same helper used for the L2 model.
logreg_l1 = train_logreg_l1(X_train, y_train)
test_metrics_l1, test_preds_l1, test_probs_l1 = eval_model(
    logreg_l1, X_test, y_test, set_name="test"
)

print("Test metrics (L1 logreg):")
display(test_metrics_l1)

Test metrics (L1 logreg):


{'set': 'test',
 'Accuracy': 0.608,
 'AUROC': 0.6381933503678618,
 'AUPRC': 0.6239882577738436}

## Full Analysis Notebooks (with all figures)

The cells below embed the HTML versions of our four main notebooks:

1. `01_build_cohort`: data creation and preprocessing  
2. `02_model_baselines`: logistic regression and metrics  
3. `03_model_nn`: neural network model and results  
4. `04_lr_on_embeddings`: logistic regression on NN hidden-layer embeddings  

These were run offline; the HTML snapshots show all the figures without needing to retrain during grading.

In [18]:
# Embedded snapshot: data creation and preprocessing.
IFrame(src="notebooks/01_build_cohort.html", width="100%", height=600)

In [21]:
# Embedded snapshot: logistic regression baselines and metrics.
IFrame(src="notebooks/02_model_baselines.html", width="100%", height=600)

In [22]:
# Embedded snapshot: neural network model and results.
IFrame(src="notebooks/03_model_nn.html", width="100%", height=600)

In [23]:
# Embedded snapshot: logistic regression on NN hidden-layer embeddings.
IFrame(src="notebooks/04_lr_on_embeddings.html", width="100%", height=600)