In [12]:
# Step 1: Load the Healthcare Dataset
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# Load dataset.
# The CSV location can be overridden with the HEALTHCARE_DATASET_PATH
# environment variable so the notebook is not tied to one machine's drive
# layout; when the variable is unset, the original path is used unchanged.
DATA_PATH = os.environ.get(
    "HEALTHCARE_DATASET_PATH",
    "D:/datasets/dpp/week6_healthcare_dataset.csv",
)
df = pd.read_csv(DATA_PATH)

# Name of the label column that the models in the later cells predict.
target = "Cardiometabolic_Risk"

# Fail here, with a clear message, rather than deep inside the split cell.
if target not in df.columns:
    raise KeyError(f"Target column {target!r} not found in {DATA_PATH}")

df.head()

Unnamed: 0,Age,Sex,Smoker,Height_cm,Weight_kg,BMI,SBP,DBP,LDL,HDL,...,SBP_Change,Glucose_Change,LDL_to_HDL,TG_to_HDL,CRP_per_BMI,Age_x_BMI,Age_x_SBP,BMI_x_Glucose,Metabolic_Risk_Index,Cardiometabolic_Risk
0,70,Female,No,157.4,56.9,22.4,127,68,126,56,...,12,7,2.25,2.303571,0.070089,1568.0,8890,2688.0,2.482,0
1,39,Male,No,163.5,90.8,34.5,106,80,104,68,...,-1,17,1.529412,1.279412,0.053913,1345.5,4134,3105.0,1.66,0
2,46,Female,No,172.5,63.1,21.4,108,67,75,80,...,8,-11,0.9375,1.8125,0.213084,984.4,4968,1605.0,1.512,0
3,52,Female,No,163.2,86.2,32.8,106,61,69,54,...,-15,14,1.277778,1.12963,0.033232,1705.6,5512,3083.2,1.764,0
4,35,Female,No,181.7,65.2,20.1,99,79,109,60,...,10,13,1.816667,2.933333,0.089055,703.5,3465,1849.2,1.398,0


In [15]:
# Step 2: Define raw vs. constructed feature sets

# Columns fed to the baseline model exactly as they appear in the dataset.
raw_features = [
    "Age",
    "Sex",
    "Smoker",
    "BMI",
    "SBP",
    "DBP",
    "LDL",
    "HDL",
    "Triglycerides",
    "Fasting_Glucose",
    "CRP",
    "eGFR",
    "SBP_Prev",
    "Glucose_Prev",
    "Antihypertensive",
    "Statin",
    "Family_History",
    "Activity_Level",
]

# Every raw feature, followed by the pre-computed derived columns
# (judging by their names: differences, ratios, products and a risk index).
constructed_features = [
    *raw_features,
    "Pulse_Pressure",
    "SBP_Change",
    "Glucose_Change",
    "LDL_to_HDL",
    "TG_to_HDL",
    "CRP_per_BMI",
    "Age_x_BMI",
    "Age_x_SBP",
    "BMI_x_Glucose",
    "Metabolic_Risk_Index",
]


In [16]:
# Step 3: Train–Test Split
# Split ONCE, on the widest feature table, so that every model is trained and
# evaluated on exactly the same rows and the same y_train / y_test.
# Stratifying on the target keeps the class proportions equal in both parts;
# the fixed random_state makes the partition reproducible.
X_train_c, X_test_c, y_train, y_test = train_test_split(
    df[constructed_features], df[target],
    test_size=0.25, random_state=42, stratify=df[target]
)

# constructed_features contains every raw feature, so the raw-feature matrices
# are column subsets of the constructed ones. Deriving them this way guarantees
# row alignment by construction, instead of relying on a second split that
# happens to reuse the same seed.
X_train_r = X_train_c[raw_features]
X_test_r = X_test_c[raw_features]


In [17]:
# Step 4: Column groups used by the preprocessing pipelines
# Columns that are one-hot encoded rather than imputed and scaled.
cat_cols = [
    "Sex",
    "Smoker",
    "Antihypertensive",
    "Statin",
    "Family_History",
    "Activity_Level",
]

# Any feature not listed in cat_cols goes through the numeric branch.
# Original feature order is preserved.
num_raw = list(filter(lambda name: name not in cat_cols, raw_features))
num_con = list(filter(lambda name: name not in cat_cols, constructed_features))


In [18]:
# Step 5: Model 1 – Baseline (raw features only)

# Numeric branch: median-impute missing values, then standardise.
baseline_numeric = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Route numeric and categorical columns to their own transformers;
# categories unseen during fit are ignored at predict time.
baseline_prep = ColumnTransformer(transformers=[
    ("num", baseline_numeric, num_raw),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

baseline = Pipeline(steps=[
    ("prep", baseline_prep),
    ("clf", LogisticRegression(max_iter=900)),
])

baseline.fit(X_train_r, y_train)

# Score the held-out rows using the second predict_proba column.
baseline_scores = baseline.predict_proba(X_test_r)[:, 1]
auc_baseline = roc_auc_score(y_test, baseline_scores)


In [19]:
# Step 6: Model 2 – Raw + constructed features

# Numeric branch: median-impute missing values, then standardise.
constructed_numeric = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Same layout as the baseline, but the numeric branch also receives the
# derived columns (num_con instead of num_raw).
constructed_prep = ColumnTransformer(transformers=[
    ("num", constructed_numeric, num_con),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

constructed = Pipeline(steps=[
    ("prep", constructed_prep),
    ("clf", LogisticRegression(max_iter=900)),
])

constructed.fit(X_train_c, y_train)

# Score the held-out rows using the second predict_proba column.
constructed_scores = constructed.predict_proba(X_test_c)[:, 1]
auc_constructed = roc_auc_score(y_test, constructed_scores)


In [20]:
# Step 7: Model 3 – Constructed features + PolynomialFeatures

# Numeric branch: impute, expand to all degree-2 terms (squares and pairwise
# products, no constant column), then standardise the expanded matrix.
poly_numeric = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
])

# Categorical columns bypass the polynomial expansion and are only one-hot
# encoded.
poly_prep = ColumnTransformer(transformers=[
    ("num", poly_numeric, num_con),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

poly_model = Pipeline(steps=[
    ("prep", poly_prep),
    ("clf", LogisticRegression(max_iter=900)),
])

poly_model.fit(X_train_c, y_train)

# Score the held-out rows using the second predict_proba column.
poly_scores = poly_model.predict_proba(X_test_c)[:, 1]
auc_poly = roc_auc_score(y_test, poly_scores)


In [21]:
# Step 8: Compare Results
# pandas is already imported as `pd` in the first cell, so it is not
# re-imported here (all imports live in one place).

# One row per model, in the order the models were trained. ROC_AUC holds the
# ROC AUC each model obtained on the test rows.
results = pd.DataFrame({
    "Model": [
        "Baseline (Raw Features)",
        "With Constructed Features",
        "Constructed + PolynomialFeatures"
    ],
    "ROC_AUC": [
        auc_baseline,
        auc_constructed,
        auc_poly
    ]
})

results


Unnamed: 0,Model,ROC_AUC
0,Baseline (Raw Features),0.839523
1,With Constructed Features,0.837544
2,Constructed + PolynomialFeatures,0.816321
