# AI at the Doctor's Office – End-to-End Demo

This single notebook:

1. Loads the **UCI Heart Disease** dataset.
2. Trains two models:
   - Logistic Regression (baseline)
   - Small Neural Network (MLPClassifier)
3. Evaluates both models.
4. Generates explainability artifacts with **LIME** and **SHAP**.
5. Provides an interactive panel to explore how changing patient features
   affects predicted risk.

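You can run this top-to-bottom (Restart Kernel → Run All) during your presentation. The first run needs internet access to install packages and to download the dataset from OpenML.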

In [None]:
# Install dependencies if needed (safe to re-run).
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned, so a later run may pull different
# releases — consider pinning once a known-good set has been confirmed.
%pip install -q scikit-learn pandas numpy shap lime ipywidgets matplotlib seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    RocCurveDisplay
)

import shap
from lime.lime_tabular import LimeTabularExplainer
from ipywidgets import interact, IntSlider

# Initialise SHAP's JavaScript support so its interactive plots can render inline.
shap.initjs()

# One seed shared by the data splits, both models, LIME and the SHAP sampling
# of background/test rows, so a fresh top-to-bottom run is repeatable.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG for any library code that draws from it.
np.random.seed(RANDOM_STATE)

## 1. Load and Inspect the Heart Disease Dataset

In [None]:
# Fetch the dataset from OpenML by name/version; as_frame=True exposes it as a
# pandas DataFrame on `.frame`. Needs network access when not already cached.
heart = fetch_openml(name="heart-disease-uci", version=1, as_frame=True)
# Work on a copy so the fetched object stays untouched by later cells.
df = heart.frame.copy()
# Preview the first rows.
df.head()

In [None]:
# Column dtypes and non-null counts — a quick check for missing values.
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric ones.
df.describe(include='all')

## 2. Preprocessing and Train/Validation/Test Split

In [None]:
# Binarise the target: any value above 0 is treated as "disease present".
target_col = "num"
y_raw = df[target_col].astype(int)
y = (y_raw > 0).astype(int)  # 1 = disease present

X = df.drop(columns=[target_col])

# Split columns by dtype family rather than by two exact dtypes, so numeric
# columns stored as e.g. int32/float32/uint8 are scaled instead of being
# silently routed to the one-hot encoder.
numeric_features = X.select_dtypes(include="number").columns.tolist()
categorical_features = X.select_dtypes(exclude="number").columns.tolist()

# 70 % train, then the remaining 30 % is halved into validation and test.
# Both splits are stratified so each part keeps the overall class balance.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE, stratify=y_temp
)

# Sanity checks: every column is assigned to exactly one branch and no row is lost.
assert len(numeric_features) + len(categorical_features) == X.shape[1]
assert len(X_train) + len(X_valid) + len(X_test) == len(X)

X_train.shape, X_valid.shape, X_test.shape

In [None]:
# Numeric branch: fill gaps with the training median, then standardise.
# Without the imputer a single NaN would pass through StandardScaler and make
# the downstream estimators raise at fit/predict time.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode. handle_unknown="ignore" encodes levels unseen during fit as all
# zeros instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list to its branch; the imputers are no-ops when the data
# has no missing values.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

## 3. Model 1 – Logistic Regression

In [None]:
# Baseline model: preprocessing and logistic regression live in one pipeline,
# so the fitted transformers always travel with the classifier.
logreg = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
logreg_clf = Pipeline([("preprocessor", preprocessor), ("model", logreg)])
logreg_clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (disease present);
# hard labels come from thresholding it at 0.5.
y_valid_proba = logreg_clf.predict_proba(X_valid)[:, 1]
y_valid_pred = (y_valid_proba >= 0.5).astype(int)

valid_acc_logreg = accuracy_score(y_valid, y_valid_pred)
valid_auc_logreg = roc_auc_score(y_valid, y_valid_proba)
print("Validation accuracy (LogReg):", valid_acc_logreg)
print("Validation ROC-AUC (LogReg):", valid_auc_logreg)

In [None]:
# Held-out test metrics for the logistic-regression pipeline
# (positive-class probability, thresholded at 0.5 for hard labels).
y_test_proba = logreg_clf.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_proba >= 0.5).astype(int)

test_acc_logreg = accuracy_score(y_test, y_test_pred)
test_auc_logreg = roc_auc_score(y_test, y_test_proba)
print("Test accuracy (LogReg):", test_acc_logreg)
print("Test ROC-AUC (LogReg):", test_auc_logreg)
print()
# Per-class precision / recall / F1.
print(classification_report(y_test, y_test_pred))

In [None]:
# Confusion matrix on the test set, drawn on an explicit Axes.
cm = confusion_matrix(y_test, y_test_pred)
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix – Logistic Regression")
plt.show()

# ROC curve from the predicted test probabilities.
roc_display = RocCurveDisplay.from_predictions(y_test, y_test_proba)
roc_display.ax_.set_title("ROC Curve – Logistic Regression")
plt.show()

## 4. Model 2 – Small Neural Network (MLPClassifier)

In [None]:
# Small feed-forward network: two hidden layers (32 and 16 units), ReLU, Adam.
mlp = MLPClassifier(
    hidden_layer_sizes=(32, 16),
    activation="relu",
    solver="adam",
    max_iter=500,
    random_state=RANDOM_STATE
)

# Pipeline.fit fits its steps in place, so handing the very same `preprocessor`
# object to two pipelines makes them share one fitted transformer: refitting
# either pipeline would silently change the other. clone() gives this pipeline
# an independent, unfitted copy with identical settings.
mlp_clf = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("model", mlp),
    ]
)

mlp_clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (disease present);
# hard labels come from thresholding it at 0.5.
y_valid_proba_mlp = mlp_clf.predict_proba(X_valid)[:, 1]
y_valid_pred_mlp = (y_valid_proba_mlp >= 0.5).astype(int)

print("Validation accuracy (MLP):", accuracy_score(y_valid, y_valid_pred_mlp))
print("Validation ROC-AUC (MLP):", roc_auc_score(y_valid, y_valid_proba_mlp))

In [None]:
# Held-out test metrics for the MLP pipeline
# (positive-class probability, thresholded at 0.5 for hard labels).
y_test_proba_mlp = mlp_clf.predict_proba(X_test)[:, 1]
y_test_pred_mlp = (y_test_proba_mlp >= 0.5).astype(int)

test_acc_mlp = accuracy_score(y_test, y_test_pred_mlp)
test_auc_mlp = roc_auc_score(y_test, y_test_proba_mlp)
print("Test accuracy (MLP):", test_acc_mlp)
print("Test ROC-AUC (MLP):", test_auc_mlp)
print()
# Per-class precision / recall / F1.
print(classification_report(y_test, y_test_pred_mlp))

In [None]:
# Confusion matrix on the test set, drawn on an explicit Axes.
cm_mlp = confusion_matrix(y_test, y_test_pred_mlp)
fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm_mlp, annot=True, fmt="d", cmap="Greens", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix – MLPClassifier")
plt.show()

# ROC curve from the predicted test probabilities.
roc_display_mlp = RocCurveDisplay.from_predictions(y_test, y_test_proba_mlp)
roc_display_mlp.ax_.set_title("ROC Curve – MLPClassifier")
plt.show()

## 5. LIME – Local Explanations for a Single Patient

In [None]:
class_names = ["no_disease", "disease"]
feature_names = X_train.columns.tolist()

# LIME summarises and perturbs a purely numeric matrix. Non-numeric columns are
# therefore integer-coded for the explainer and decoded back to their original
# levels before the pipeline sees them. With no categorical columns this
# reduces to passing the raw values straight through.
# Maps column position -> Index of that column's levels.
lime_cat_levels = {
    feature_names.index(col): X_train[col].astype("category").cat.categories
    for col in categorical_features
}


def _encode_for_lime(frame):
    """Return `frame` as a float matrix with categorical columns as integer codes.

    Missing values and levels not present in `lime_cat_levels` receive one
    extra trailing code (``len(levels)``) so they remain a valid index into the
    label list handed to LIME.
    """
    encoded = frame.copy()
    for col_pos, levels in lime_cat_levels.items():
        col = feature_names[col_pos]
        codes = pd.Categorical(frame[col], categories=levels).codes
        # pandas marks missing/unknown with -1; move it to the trailing slot.
        encoded[col] = np.where(codes < 0, len(levels), codes)
    return encoded.to_numpy(dtype=float)


def logreg_predict_proba(x):
    """Adapter for LIME: decode a perturbed numeric matrix and score it.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Rows in the integer-coded layout produced by `_encode_for_lime`.

    Returns
    -------
    ndarray of shape (n_samples, 2)
        Class probabilities from the logistic-regression pipeline.
    """
    df_x = pd.DataFrame(np.atleast_2d(x), columns=feature_names)
    for col_pos, levels in lime_cat_levels.items():
        col = feature_names[col_pos]
        codes = df_x[col].round().astype(int).to_numpy()
        # The trailing "missing" code maps back to pandas' -1 (NaN).
        codes = np.where(codes >= len(levels), -1, codes)
        df_x[col] = pd.Categorical.from_codes(codes, categories=levels)
    return logreg_clf.predict_proba(df_x)


# NOTE(review): assumes the numeric columns contain no NaN — LIME computes its
# statistics on this raw matrix, not on the pipeline's output. Confirm, or
# impute before building the explainer.
lime_explainer = LimeTabularExplainer(
    training_data=_encode_for_lime(X_train),
    feature_names=feature_names,
    categorical_features=list(lime_cat_levels),
    categorical_names={
        col_pos: [str(level) for level in levels] + ["<missing>"]
        for col_pos, levels in lime_cat_levels.items()
    },
    class_names=class_names,
    discretize_continuous=True,
    random_state=RANDOM_STATE,
)

# Explain the first test patient (row in the same coded layout as the training matrix).
idx = 0
instance = _encode_for_lime(X_test.iloc[[idx]])[0]

lime_exp = lime_explainer.explain_instance(
    data_row=instance,
    predict_fn=logreg_predict_proba,
    num_features=10,
)

lime_exp.show_in_notebook(show_table=True)

## 6. SHAP – Global and Local Explanations

In [None]:
def logreg_predict_proba_1d(x):
    """Score a raw feature matrix and return only the class-1 probability.

    The incoming array is wrapped in a DataFrame carrying the training column
    names, because the pipeline's ColumnTransformer is configured with column
    names.
    """
    frame = pd.DataFrame(x, columns=X_train.columns)
    proba = logreg_clf.predict_proba(frame)
    return proba[:, 1]


# Kernel SHAP integrates features out over a background set; cap it at 100
# training rows to keep the run time manageable.
background = X_train.sample(n=min(100, len(X_train)), random_state=RANDOM_STATE)

# link="logit": attributions are expressed in log-odds rather than probability.
explainer = shap.KernelExplainer(logreg_predict_proba_1d, background, link="logit")

# Explain at most 100 test rows, with 200 model evaluations per explained row.
X_test_sample = X_test.sample(n=min(100, len(X_test)), random_state=RANDOM_STATE)
shap_values = explainer.shap_values(X_test_sample, nsamples=200)

# Global view: one point per (patient, feature) attribution.
shap.summary_plot(shap_values, X_test_sample, show=True)

In [None]:
# Local explanation for one patient from the already-explained test sample.
patient_idx = 0
x_patient = X_test_sample.iloc[patient_idx : patient_idx + 1]

# Reuse the attributions computed for the summary plot instead of running
# Kernel SHAP again: the estimate is sampled (nsamples=200), so a second run
# could disagree with the point shown for this patient in the global plot,
# and it would cost extra model evaluations for no new information.
shap_values_patient = np.asarray(shap_values)[patient_idx : patient_idx + 1]

# Single-row force plot: base value plus per-feature pushes toward the output.
shap.force_plot(explainer.expected_value, shap_values_patient, x_patient, matplotlib=True)

## 7. Interactive Patient Explorer (Clinician Teaching View)

In [None]:
# Reference patient: numeric columns start at their training median ...
base_patient = X_train.median(numeric_only=True)

# ... and categorical columns at their most frequent training level. The
# column list is the one the preprocessing pipeline was built from, so the
# explorer and the models cannot drift apart on which columns are categorical.
for col in categorical_features:
    base_patient[col] = X_train[col].mode().iloc[0]


def _int_range(col):
    """Return (min, max) of a training column, truncated to ints for IntSlider."""
    return int(X_train[col].min()), int(X_train[col].max())


# Slider bounds for the four features exposed in the interactive explorer.
age_range = _int_range("age")
trestbps_range = _int_range("trestbps")
chol_range = _int_range("chol")
thalach_range = _int_range("thalach")

In [None]:
def explore_patient(age, trestbps, chol, thalach):
    """Score a synthetic patient with both models and print the predicted risk.

    The patient starts as a copy of `base_patient`; the four arguments overwrite
    the corresponding features and every other feature keeps its baseline value.

    Parameters
    ----------
    age, trestbps, chol, thalach : int
        Values written to the features of the same name (supplied by the
        sliders when used with `interact`).

    Returns
    -------
    pandas.DataFrame
        The single-row frame that was scored, columns ordered like `X_train`.
    """
    patient = base_patient.copy()
    patient["age"] = age
    patient["trestbps"] = trestbps
    patient["chol"] = chol
    patient["thalach"] = thalach

    # Defensive fallback: fill any training column the baseline does not carry,
    # using the median for numeric columns and the mode otherwise.
    for col in X_train.columns:
        if col in patient.index:
            continue
        if col in numeric_features:
            patient[col] = X_train[col].median()
        else:
            patient[col] = X_train[col].mode()[0]

    # A Series mixing numbers and labels is object-typed, so the one-row frame
    # starts out all-object; infer_objects() restores numeric dtypes so the
    # models receive (and the notebook displays) properly typed columns.
    df_patient = pd.DataFrame([patient[X_train.columns]]).infer_objects()

    # Probability of class 1 (disease present) for the single row.
    proba_logreg = logreg_clf.predict_proba(df_patient)[0, 1]
    proba_mlp = mlp_clf.predict_proba(df_patient)[0, 1]

    print("Predicted probability of heart disease:")
    print(f"  Logistic Regression: {proba_logreg:.3f}")
    print(f"  MLP (Neural Net):    {proba_mlp:.3f}")

    return df_patient

def _feature_slider(column, bounds, label):
    """Build an integer slider spanning `bounds`, starting at the baseline value of `column`."""
    lower, upper = bounds
    return IntSlider(
        value=int(base_patient[column]),
        min=lower,
        max=upper,
        step=1,
        description=label,
    )


# One slider per adjustable feature; moving any of them re-runs explore_patient.
# The trailing semicolon suppresses the repr of interact's return value.
interact(
    explore_patient,
    age=_feature_slider("age", age_range, "Age"),
    trestbps=_feature_slider("trestbps", trestbps_range, "Rest BP"),
    chol=_feature_slider("chol", chol_range, "Chol"),
    thalach=_feature_slider("thalach", thalach_range, "Max HR"),
);