# 06 – Hyperparameter Tuning

1. Import libraries

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import joblib
from scipy.stats import randint

2. Load dataset

In [2]:
DATA_PATH = Path("C:/Users/youss/Desktop/Heart_Disease_Project/data/heart_disease.csv")
df = pd.read_csv(DATA_PATH)

# define features and target
target = "target" if "target" in df.columns else df.columns[-1]
X = df.drop(columns=[target])
y = df[target]

# separate numerical and categorical features
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in num_cols]

# preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

# base Random Forest model
rf = RandomForestClassifier(random_state=42)
pipe = Pipeline([("preprocess", preprocessor), ("model", rf)])

3. RandomizedSearchCV

In [3]:
param_dist = {
    "model__n_estimators": randint(200, 800),
    "model__max_depth": randint(3, 20),
    "model__min_samples_split": randint(2, 20),
    "model__min_samples_leaf": randint(1, 10),
    "model__max_features": ["sqrt", "log2", None],
}

rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=25,            # number of random combinations to try
    scoring="f1",         # optimize for F1-score
    cv=5,                 # 5-fold cross-validation
    n_jobs=-1,            # use all available cores
    random_state=42,
    verbose=1
)

rs.fit(X, y)
print("Best params (RandomizedSearch):", rs.best_params_)
print("Best F1 score (RandomizedSearch):", rs.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best params (RandomizedSearch): {'model__max_depth': 17, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 8, 'model__n_estimators': 720}
Best F1 score (RandomizedSearch): 0.9905651859877211


4. Optional GridSearchCV

In [4]:
# narrow search space around the best values found by RandomizedSearch
best_params = rs.best_params_

param_grid = {
    "model__n_estimators": [best_params["model__n_estimators"] - 50,
                            best_params["model__n_estimators"],
                            best_params["model__n_estimators"] + 50],
    "model__max_depth": [best_params["model__max_depth"] - 2,
                         best_params["model__max_depth"],
                         best_params["model__max_depth"] + 2],
}

gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=1
)

gs.fit(X, y)
print("Best params (GridSearch):", gs.best_params_)
print("Best F1 score (GridSearch):", gs.best_score_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best params (GridSearch): {'model__max_depth': 15, 'model__n_estimators': 670}
Best F1 score (GridSearch): 0.9972093023255815


5. Save Best Model

In [7]:
# choose the best between RandomizedSearch and GridSearch
final_model = gs.best_estimator_ if gs.best_score_ > rs.best_score_ else rs.best_estimator_

os.makedirs("../models", exist_ok=True)
joblib.dump(final_model, "../models/final_model.pkl")
print("Saved tuned pipeline to ../models/final_model.pkl")

Saved tuned pipeline to ../models/final_model.pkl
