1. Use GridSearchCV & RandomizedSearchCV to optimize model hyperparameters.

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("../data/heart_disease_cleaned.csv")

X = df.drop("target", axis=1)
y = df["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [26]:
# RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

random_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                   n_iter=10, cv=5, scoring='accuracy', verbose=1, random_state=42)
random_search.fit(X_train, y_train)

print("Best Parameters from RandomizedSearchCV:")
print(random_search.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters from RandomizedSearchCV:
{'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}


In [27]:
# GridSearchCV
from sklearn.model_selection import GridSearchCV

grid_params = {
    'n_estimators': [random_search.best_params_['n_estimators']],
    'max_depth': [random_search.best_params_['max_depth']],
    'min_samples_split': [random_search.best_params_['min_samples_split']],
    'min_samples_leaf': [random_search.best_params_['min_samples_leaf']]
}

grid_search = GridSearchCV(estimator=rf, param_grid=grid_params,
                           cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train, y_train)

print("Best Parameters from GridSearchCV:")
print(grid_search.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters from GridSearchCV:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}


2. Compare optimized models with baseline performance.

In [28]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("RandomForestClassifier (Optimized)")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_prob))
print("-" * 50)

RandomForestClassifier (Optimized)
              precision    recall  f1-score   support

           0       0.91      0.88      0.89        33
           1       0.86      0.89      0.88        28

    accuracy                           0.89        61
   macro avg       0.88      0.89      0.88        61
weighted avg       0.89      0.89      0.89        61

Accuracy: 0.8852459016393442
AUC: 0.948051948051948
--------------------------------------------------


2.7 Model Export & Deployment

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Only scale the already numeric (non-binary) features
numerical = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Preprocessor for numerical scaling
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical)
], remainder='passthrough')  # passthrough for already one-hot encoded binary columns

# Get best parameters
best_params = grid_search.best_params_

# Full pipeline
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(**best_params, random_state=42))
])

pipeline.fit(X_train, y_train)

# Save final model
joblib.dump(pipeline, "../models/final_model.pkl")
print("✅ Pipeline exported as final_model.pkl")

✅ Pipeline exported as final_model.pkl
