In [3]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import numpy as np


df = pd.read_csv('heart_disease_cleaned.csv')
selected_features = ['age', 'cp', 'trestbps', 'chol', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
X = df[selected_features]
y = df['num']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Baseline Model

baseline_model = RandomForestClassifier(random_state=42)
baseline_model.fit(X_train, y_train)
y_pred_base = baseline_model.predict(X_test)
baseline_acc = accuracy_score(y_test, y_pred_base)
print("Baseline Accuracy:", baseline_acc)

# Grid Search

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

print("GridSearchCV Best Params:", grid_search.best_params_)


y_pred_grid = grid_search.predict(X_test)
grid_acc = accuracy_score(y_test, y_pred_grid)
print("GridSearchCV Accuracy:", grid_acc)
print(classification_report(y_test, y_pred_grid))

#  Randomized Search

param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(RandomForestClassifier(random_state=42),
                                   param_distributions=param_dist,
                                   n_iter=20, cv=5, n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, y_train)

print("RandomizedSearchCV Best Params:", random_search.best_params_)


y_pred_random = random_search.predict(X_test)
random_acc = accuracy_score(y_test, y_pred_random)
print("RandomizedSearchCV Accuracy:", random_acc)
print(classification_report(y_test, y_pred_random))

# حفظ أفضل نموذج

best_model = random_search.best_estimator_ if random_acc >= grid_acc else grid_search.best_estimator_
joblib.dump(best_model, 'best_heart_model.pkl')
print("Best model saved as 'best_heart_model.pkl'")


Baseline Accuracy: 0.6
Fitting 5 folds for each of 108 candidates, totalling 540 fits
GridSearchCV Best Params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
GridSearchCV Accuracy: 0.6
              precision    recall  f1-score   support

           0       0.80      0.97      0.88        36
           1       0.00      0.00      0.00         9
           2       0.33      0.20      0.25         5
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         3

    accuracy                           0.60        60
   macro avg       0.23      0.23      0.23        60
weighted avg       0.51      0.60      0.55        60

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomizedSearchCV Best Params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 15, 'bootstrap': True}
RandomizedSearchCV Accuracy: 0.6
              precision    recall  f1-score   support
