In [28]:
# Step 1: Load the Healthcare Dataset
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# Read the healthcare CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs on other machines.
DATA_PATH = "D:/datasets/dpp/week6_healthcare_dataset.csv"
df = pd.read_csv(DATA_PATH)

df.head()


Unnamed: 0,Age,Sex,Smoker,Height_cm,Weight_kg,BMI,SBP,DBP,LDL,HDL,...,SBP_Change,Glucose_Change,LDL_to_HDL,TG_to_HDL,CRP_per_BMI,Age_x_BMI,Age_x_SBP,BMI_x_Glucose,Metabolic_Risk_Index,Cardiometabolic_Risk
0,70,Female,No,157.4,56.9,22.4,127,68,126,56,...,12,7,2.25,2.303571,0.070089,1568.0,8890,2688.0,2.482,0
1,39,Male,No,163.5,90.8,34.5,106,80,104,68,...,-1,17,1.529412,1.279412,0.053913,1345.5,4134,3105.0,1.66,0
2,46,Female,No,172.5,63.1,21.4,108,67,75,80,...,8,-11,0.9375,1.8125,0.213084,984.4,4968,1605.0,1.512,0
3,52,Female,No,163.2,86.2,32.8,106,61,69,54,...,-15,14,1.277778,1.12963,0.033232,1705.6,5512,3083.2,1.764,0
4,35,Female,No,181.7,65.2,20.1,99,79,109,60,...,10,13,1.816667,2.933333,0.089055,703.5,3465,1849.2,1.398,0


In [29]:
# Step 2: Define Target and Base Features
# Name of the label column the models are trained to predict.
target = "Cardiometabolic_Risk"

# Start with a small, interpretable feature set. The two lists are kept
# separate because numeric and categorical columns are routed to different
# preprocessing branches when the models are built.
numeric_features = ["Age", "BMI", "SBP", "Fasting_Glucose"]
categorical_features = ["Sex", "Smoker", "Family_History", "Activity_Level"]


In [30]:
# Step 3: Baseline Model (NO Polynomial Features)
# Feature matrix (numeric columns first, then categorical) and label vector.
feature_columns = numeric_features + categorical_features
X, y = df[feature_columns], df[target]

# Stratified 75/25 hold-out split with a fixed seed so the result is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Numeric branch: fill gaps with the column median, then standardise.
baseline_numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
baseline_categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

baseline_preprocessor = ColumnTransformer(transformers=[
    ("num", baseline_numeric_steps, numeric_features),
    ("cat", baseline_categorical_steps, categorical_features),
])

# Preprocessing and classifier live in one pipeline, so every transformer is
# fitted on the training rows only.
baseline_model = Pipeline(steps=[
    ("prep", baseline_preprocessor),
    ("clf", LogisticRegression(max_iter=800)),
])
baseline_model.fit(X_train, y_train)

# ROC AUC on the held-out rows, scored with column 1 of predict_proba.
baseline_test_scores = baseline_model.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, baseline_test_scores)

baseline_auc


0.8467690845739626

In [31]:
# Step 4: Add PolynomialFeatures (Degree = 2)
# Now we upgrade feature intelligence, not the model: the classifier and its
# settings match the baseline; only the numeric branch gains an expansion step.
# This cell reuses X_train / X_test / y_train / y_test created earlier.

# Numeric branch: median imputation, degree-2 polynomial expansion without
# the constant bias column, then standardisation of the expanded columns.
poly_numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories not seen during fit are ignored at transform time.
poly_categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

poly_preprocessor = ColumnTransformer(transformers=[
    ("num", poly_numeric_steps, numeric_features),
    ("cat", poly_categorical_steps, categorical_features),
])

poly_model = Pipeline(steps=[
    ("prep", poly_preprocessor),
    ("clf", LogisticRegression(max_iter=800)),
])
poly_model.fit(X_train, y_train)

# ROC AUC on the held-out rows, scored with column 1 of predict_proba.
poly_test_scores = poly_model.predict_proba(X_test)[:, 1]
poly_auc = roc_auc_score(y_test, poly_test_scores)

poly_auc


0.8472838137472284

In [32]:
# Model Comparison (Key Teaching Moment)
# Two-row summary table: each model's label next to its held-out ROC AUC,
# listed in the order the models were trained.
model_labels = ["Baseline Linear Model", "PolynomialFeatures (degree=2)"]
model_aucs = [baseline_auc, poly_auc]

results = pd.DataFrame({"Model": model_labels, "ROC_AUC": model_aucs})

results

Unnamed: 0,Model,ROC_AUC
0,Baseline Linear Model,0.846769
1,PolynomialFeatures (degree=2),0.847284
