In [38]:
# Step 1: Load the Healthcare Dataset
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin


# Load dataset
df = pd.read_csv("D:/datasets/dpp/week6_healthcare_dataset.csv")

df.head()

Unnamed: 0,Age,Sex,Smoker,Height_cm,Weight_kg,BMI,SBP,DBP,LDL,HDL,...,SBP_Change,Glucose_Change,LDL_to_HDL,TG_to_HDL,CRP_per_BMI,Age_x_BMI,Age_x_SBP,BMI_x_Glucose,Metabolic_Risk_Index,Cardiometabolic_Risk
0,70,Female,No,157.4,56.9,22.4,127,68,126,56,...,12,7,2.25,2.303571,0.070089,1568.0,8890,2688.0,2.482,0
1,39,Male,No,163.5,90.8,34.5,106,80,104,68,...,-1,17,1.529412,1.279412,0.053913,1345.5,4134,3105.0,1.66,0
2,46,Female,No,172.5,63.1,21.4,108,67,75,80,...,8,-11,0.9375,1.8125,0.213084,984.4,4968,1605.0,1.512,0
3,52,Female,No,163.2,86.2,32.8,106,61,69,54,...,-15,14,1.277778,1.12963,0.033232,1705.6,5512,3083.2,1.764,0
4,35,Female,No,181.7,65.2,20.1,99,79,109,60,...,10,13,1.816667,2.933333,0.089055,703.5,3465,1849.2,1.398,0


In [39]:
#Select Base Numeric Features
numeric_features = [
    "Age",
    "BMI",
    "Fasting_Glucose",
    "SBP",
    "TG_to_HDL"
]

#Custom Transformer for Ratio & Difference Features
class RatioDifferenceTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()

        # Avoid division by zero
        eps = 1e-6

        # Ratio Features
        X["Glucose_to_Age"] = X["Fasting_Glucose"] / (X["Age"] + eps)
        X["BMI_to_Insulin"] = X["BMI"] / (X["TG_to_HDL"] + eps)

        # Difference Features
        X["BP_Gap"] = X["SBP"] - X["Age"]
        X["Glucose_BMI_Diff"] = X["Fasting_Glucose"] - X["BMI"]

        return X


In [40]:
#Interaction Features using sklearn PolynomialFeatures
interaction_features = [
    "Age",
    "BMI",
    "Fasting_Glucose"
]

poly = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False
)

#combining everything in one pipeline
preprocessing_pipeline = Pipeline(steps=[
    ("ratio_diff", RatioDifferenceTransformer()),
    ("scaling", StandardScaler())
])



In [41]:
#Apply Interaction Features Separately
X_base = df[numeric_features]

# Apply ratio & difference features
X_extended = RatioDifferenceTransformer().fit_transform(X_base)

# Generate interaction features
X_interactions = poly.fit_transform(X_extended[interaction_features])

interaction_feature_names = poly.get_feature_names_out(interaction_features)

X_interactions_df = pd.DataFrame(
    X_interactions,
    columns=interaction_feature_names,
    index=df.index
)

# Combine all features
X_final = pd.concat([X_extended, X_interactions_df], axis=1)

print(X_final.head())


   Age   BMI  Fasting_Glucose  SBP  TG_to_HDL  Glucose_to_Age  BMI_to_Insulin  \
0   70  22.4              120  127   2.303571        1.714286        9.724027   
1   39  34.5               90  106   1.279412        2.307692       26.965496   
2   46  21.4               75  108   1.812500        1.630435       11.806890   
3   52  32.8               94  106   1.129630        1.807692       29.036040   
4   35  20.1               92   99   2.933333        2.628571        6.852270   

   BP_Gap  Glucose_BMI_Diff   Age   BMI  Fasting_Glucose  Age BMI  \
0      57              97.6  70.0  22.4            120.0   1568.0   
1      67              55.5  39.0  34.5             90.0   1345.5   
2      62              53.6  46.0  21.4             75.0    984.4   
3      54              61.2  52.0  32.8             94.0   1705.6   
4      64              71.9  35.0  20.1             92.0    703.5   

   Age Fasting_Glucose  BMI Fasting_Glucose  
0               8400.0               2688.0  
1     

In [43]:
X = X_final
y = df["Cardiometabolic_Risk"]  # if target exists

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

model = LogisticRegression(max_iter=50000)
model.fit(X_train, y_train)

print("Model trained with engineered features")

Model trained with engineered features
