In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import os

# 1. Link notebook with Google Drive and access data from your personal GDrive
# The Drive is mounted at /content/gdrive; the dataset and model paths set in
# the next cell are located underneath that mount point.
# NOTE(review): google.colab is only importable inside a Colab runtime, so this
# cell presumably fails elsewhere - confirm before running locally.
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [44]:
# 2. Set the data path for dataset and model location
dataset_dir = "/content/gdrive/My Drive/Heart/Dataset/"  # insert the path here
model_loc = "/content/gdrive/My Drive/Heart/Model/"  # insert the path here

# Listing the folder first shows which files are actually available and fails
# early with a clear error if the dataset path is wrong.
print("Dataset directory contents:", os.listdir(dataset_dir))

# Build the CSV path with os.path.join instead of string concatenation so the
# load still works when dataset_dir is edited to a path without a trailing "/".
df = pd.read_csv(os.path.join(dataset_dir, 'heart.csv'))

Dataset directory contents: ['heart.csv', 'Assignment 2 - Zakwan Zakirin, Shazly Iman.ipynb']


In [45]:
# Sanity check of the loaded frame: print a heading followed by the rows
# returned by df.head(), each on its own line.
print("First few rows of the dataset:", df.head(), sep="\n")

First few rows of the dataset:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  


In [46]:
# Separate the label vector (the 'target' column) from the model inputs
# (every remaining column of the frame).
y = df['target']
X = df.drop(columns=['target'])


In [47]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split repeatable from run to run.
# NOTE(review): no stratify= argument is passed, so class balance between the
# two splits is not enforced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [48]:
# Candidate hyper-parameter values for the exhaustive grid search:
# 2 x 2 x 2 x 2 = 16 combinations in total.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 4],
)

In [49]:
# Initialize the RandomForestClassifier
# This one untuned instance is handed as the base estimator to both the grid
# search and the randomized search in later cells; random_state=42 is fixed so
# repeated runs of the notebook stay comparable.
rf = RandomForestClassifier(random_state=42)

In [50]:
# Exhaustive search: every combination in param_grid is scored by accuracy
# using 5-fold cross-validation on the training split. n_jobs=-1 runs the
# fits in parallel and verbose=2 reports progress while fitting.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [51]:
# Take the winning candidate from the fitted grid search: the best estimator
# together with its hyper-parameters and its cross-validation score.
best_model_grid = grid_search.best_estimator_
best_params_grid, best_score_grid = grid_search.best_params_, grid_search.best_score_

# Measure how that best estimator performs on the held-out test split.
y_pred_grid = best_model_grid.predict(X_test)
test_accuracy_grid = accuracy_score(y_true=y_test, y_pred=y_pred_grid)

In [52]:
# Trim the cv_results_ table down to the searched hyper-parameters and the
# mean cross-validated score of each candidate.
results_grid = pd.DataFrame(grid_search.cv_results_).loc[
    :,
    [
        'param_n_estimators',
        'param_max_depth',
        'param_min_samples_split',
        'param_min_samples_leaf',
        'mean_test_score',
    ],
]

# Report the grid-search summary, one "label value" line per item.
print("\nGrid Search:")
for summary_label, summary_value in (
    ("Best Parameters:", best_params_grid),
    ("Best Cross-Validation Score:", best_score_grid),
    ("Test Accuracy:", test_accuracy_grid),
    ("Grid Search Results:\n", results_grid),
):
    print(summary_label, summary_value)


Grid Search:
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best Cross-Validation Score: 0.8139455782312925
Test Accuracy: 0.8524590163934426
Grid Search Results:
    param_n_estimators param_max_depth param_min_samples_split  \
0                 100              10                       2   
1                 200              10                       2   
2                 100              10                       5   
3                 200              10                       5   
4                 100              10                       2   
5                 200              10                       2   
6                 100              10                       5   
7                 200              10                       5   
8                 100              20                       2   
9                 200              20                       2   
10                100              20                       5   

In [53]:
# Candidate hyper-parameter values sampled by the randomized search:
# 5 x 5 x 3 x 5 = 375 possible combinations (max_depth=None is one option).
param_distr = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[10, 20, 30, 40, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4, 6, 8],
)

In [54]:
# Randomized search: 100 candidates are drawn from param_distr and each is
# scored by accuracy using 5-fold cross-validation on the training split.
# random_state=42 fixes which candidates are drawn; n_jobs=-1 runs the fits in
# parallel and verbose=2 reports progress while fitting.
random_search = RandomizedSearchCV(
    rf,
    param_distr,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    random_state=42,
    verbose=2,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [55]:
# Take the winning candidate from the fitted randomized search: the best
# estimator together with its hyper-parameters and its cross-validation score.
best_model_random = random_search.best_estimator_
best_params_random, best_score_random = random_search.best_params_, random_search.best_score_

# Measure how that best estimator performs on the held-out test split.
y_pred_random = best_model_random.predict(X_test)
test_accuracy_random = accuracy_score(y_true=y_test, y_pred=y_pred_random)

In [56]:
# Trim the cv_results_ table down to the sampled hyper-parameters and the
# mean cross-validated score of each candidate.
results_random = pd.DataFrame(random_search.cv_results_).loc[
    :,
    [
        'param_n_estimators',
        'param_max_depth',
        'param_min_samples_split',
        'param_min_samples_leaf',
        'mean_test_score',
    ],
]

# Report the randomized-search summary, one "label value" line per item.
print("\nRandom Search:")
for summary_label, summary_value in (
    ("Best Parameters:", best_params_random),
    ("Best Cross-Validation Score:", best_score_random),
    ("Test Accuracy:", test_accuracy_random),
    ("Random Search Results:\n", results_random),
):
    print(summary_label, summary_value)



Random Search:
Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 6, 'max_depth': 40}
Best Cross-Validation Score: 0.8181972789115646
Test Accuracy: 0.8852459016393442
Random Search Results:
    param_n_estimators param_max_depth param_min_samples_split  \
0                 300              30                       2   
1                 400              10                       2   
2                 100              10                       2   
3                 200            None                       2   
4                 300              10                      10   
..                ...             ...                     ...   
95                300              40                       2   
96                200              20                       5   
97                300              10                       2   
98                500              30                      10   
99                500              10                      

In [57]:
# Compare Grid Search and Random Search
# One row per metric and one column per search strategy. The 'Best Parameters'
# row stores the parameter dicts while the other two rows store numeric scores.
comparison_df = pd.DataFrame({
    'Metric': ['Best Parameters', 'Best Cross-Validation Score', 'Test Accuracy'],
    'Grid Search': [best_params_grid, best_score_grid, test_accuracy_grid],
    'Random Search': [best_params_random, best_score_random, test_accuracy_random]
})

print("\nComparison of Grid Search and Random Search:")
print(comparison_df)

# Optionally, save the comparison to a CSV file
# Nothing earlier in the notebook creates the model folder, and to_csv does not
# create missing parent directories, so make sure it exists before writing.
os.makedirs(model_loc, exist_ok=True)
# Build the output path once (robust to a missing trailing "/" in model_loc)
# and reuse it for both the write and the confirmation message.
comparison_path = os.path.join(model_loc, 'grid_vs_random_comparison.csv')
comparison_df.to_csv(comparison_path, index=False)
print(f"Comparison results saved to {comparison_path}")


Comparison of Grid Search and Random Search:
                        Metric  \
0              Best Parameters   
1  Best Cross-Validation Score   
2                Test Accuracy   

                                         Grid Search  \
0  {'max_depth': 10, 'min_samples_leaf': 4, 'min_...   
1                                           0.813946   
2                                           0.852459   

                                       Random Search  
0  {'n_estimators': 100, 'min_samples_split': 10,...  
1                                           0.818197  
2                                           0.885246  
Comparison results saved to /content/gdrive/My Drive/Heart/Model/grid_vs_random_comparison.csv
