In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Read the heart-disease records from the CSV (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='heart_disease.csv')

In [5]:
# Render the loaded frame as this cell's output for a quick visual check of
# the columns and values.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,diagnosis,target
0,63,1,1,145.0,233.0,1,2,150,0,2.3,3,1.0,6.0,0,0
1,67,1,4,160.0,286.0,0,2,108,1,1.5,2,3.0,3.0,2,1
2,67,1,4,120.0,229.0,0,2,129,1,2.6,2,2.0,7.0,1,1
3,37,1,3,130.0,250.0,0,0,187,0,3.5,3,1.0,3.0,0,0
4,41,0,2,130.0,204.0,0,2,172,0,1.4,1,1.0,3.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110.0,264.0,0,0,132,0,1.2,2,1.0,7.0,1,1
299,68,1,4,144.0,193.0,1,0,141,0,3.4,2,2.0,7.0,2,1
300,57,1,4,130.0,131.0,0,0,115,1,1.2,2,1.0,7.0,3,1
301,57,0,2,130.0,236.0,0,2,174,0,0.0,2,1.0,3.0,1,1


In [7]:
# Trim leading/trailing whitespace from every column label so later lookups
# by name do not fail on stray spaces in the CSV header.
df.columns = df.columns.map(str.strip)

In [9]:
# Features: everything except the two label columns. `diagnosis` is the label
# modelled here; `target` is removed as well so no label information reaches
# the model (NOTE(review): in the previewed rows `target` looks like a
# binarised `diagnosis` - confirm).
X = df.drop(columns=['target', 'diagnosis'])

# The frame preview shows an 'Unnamed: 0' header, the name pandas gives a
# header-less first CSV column (typically a saved row index). If that is a real
# column it is only a row counter and must not be used as a feature.
# errors='ignore' makes this a no-op when the column is absent.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
y = df['diagnosis']

# One-hot encode the columns treated as categorical; the remaining columns
# pass through unchanged.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
X_encoded = pd.get_dummies(X, columns=categorical_cols)

# Scaling
# NOTE(review): the scaler is fit on all rows here, before the train/test
# split performed in the next cell, so test-set statistics influence the
# transform. Fitting on the training partition only would avoid that; it is
# left as-is because the next cell consumes X_scaled directly.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

In [11]:
# Hold out 20% of the rows for testing. Stratifying on y approximately
# preserves the class proportions in both partitions, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [13]:
# Baseline: a seeded random forest with otherwise default hyperparameters,
# used as the reference point for the tuned models.
rf_baseline = RandomForestClassifier(random_state=42)
rf_baseline.fit(X_train, y_train)
y_pred_base = rf_baseline.predict(X_test)

acc_base = accuracy_score(y_test, y_pred_base)
print("Baseline Accuracy:", acc_base)
# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports those entries as 0.0 (the same numbers as the
# default) without emitting UndefinedMetricWarning into the cell output.
print(classification_report(y_test, y_pred_base, zero_division=0))

Baseline Accuracy: 0.5081967213114754
              precision    recall  f1-score   support

           0       0.73      0.91      0.81        33
           1       0.00      0.00      0.00        11
           2       0.14      0.14      0.14         7
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy                           0.51        61
   macro avg       0.17      0.21      0.19        61
weighted avg       0.41      0.51      0.46        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# Exhaustive search space: 3 x 3 x 3 x 3 = 81 hyperparameter combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Evaluate every combination with 5-fold cross-validation on the training
# split only, ranking candidates by accuracy; n_jobs=-1 runs fits in parallel.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Score the best configuration found on the held-out test split.
best_grid = grid_search.best_estimator_
y_pred_grid = best_grid.predict(X_test)
acc_grid = accuracy_score(y_test, y_pred_grid)

print("GridSearchCV Best Params:", grid_search.best_params_)
print("GridSearchCV Accuracy:", acc_grid)
# Classes that are never predicted have undefined precision; zero_division=0
# reports them as 0.0 (unchanged numbers) without UndefinedMetricWarning.
print(classification_report(y_test, y_pred_grid, zero_division=0))

GridSearchCV Best Params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 50}
GridSearchCV Accuracy: 0.5737704918032787
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        33
           1       0.18      0.18      0.18        11
           2       0.17      0.14      0.15         7
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy                           0.57        61
   macro avg       0.22      0.26      0.24        61
weighted avg       0.45      0.57      0.51        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Candidate values to sample from: 4 x 4 x 3 x 3 = 144 possible combinations.
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Try 20 randomly drawn combinations, each scored by 5-fold cross-validated
# accuracy on the training split. random_state fixes which combinations are
# drawn so the search is repeatable.
rand_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
rand_search.fit(X_train, y_train)

# Score the best sampled configuration on the held-out test split.
best_rand = rand_search.best_estimator_
y_pred_rand = best_rand.predict(X_test)
acc_rand = accuracy_score(y_test, y_pred_rand)

print("RandomizedSearchCV Best Params:", rand_search.best_params_)
print("RandomizedSearchCV Accuracy:", acc_rand)
# Classes that are never predicted have undefined precision; zero_division=0
# reports them as 0.0 (unchanged numbers) without UndefinedMetricWarning.
print(classification_report(y_test, y_pred_rand, zero_division=0))

RandomizedSearchCV Best Params: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 20}
RandomizedSearchCV Accuracy: 0.5409836065573771
              precision    recall  f1-score   support

           0       0.74      0.97      0.84        33
           1       0.00      0.00      0.00        11
           2       0.12      0.14      0.13         7
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy                           0.54        61
   macro avg       0.17      0.22      0.20        61
weighted avg       0.42      0.54      0.47        61



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
