In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from scipy.stats import randint

# Load the breast-cancer classification dataset bundled with scikit-learn.
data = load_breast_cancer()
X, y = data.data, data.target

# The target is already a two-class label; per the scikit-learn dataset
# documentation 0 = malignant and 1 = benign (this is not a spam dataset).
# The cast only normalises the label dtype to int; it does not remap values.
y = y.astype(int)

# Hold out 20% of the samples for the final comparison; the fixed seed keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reference point: a random forest with default hyperparameters (seeded),
# trained on the training split and scored on the held-out split.
rf_baseline = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_baseline = rf_baseline.predict(X_test)
baseline_acc = accuracy_score(y_true=y_test, y_pred=y_pred_baseline)

# Search space sampled by RandomizedSearchCV.
# scipy `randint(low, high)` entries are drawn from the half-open integer
# range [low, high); list entries are sampled uniformly from their items.
param_dist = dict(
    n_estimators=randint(50, 300),
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)

# Randomized hyperparameter search over `param_dist`:
# 30 sampled candidates, each scored with 5-fold cross-validation on the
# training split only (30 x 5 = 150 fits), using all available CPU cores.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=30,
    cv=5,
    random_state=42,  # makes the sampled candidates reproducible
    n_jobs=-1,
    verbose=1,
)

random_search.fit(X_train, y_train)

# Score the best estimator found by the search on the held-out split.
best_rf = random_search.best_estimator_
y_pred_best = best_rf.predict(X_test)
best_acc = accuracy_score(y_true=y_test, y_pred=y_pred_best)

# Report baseline vs. tuned accuracy and the winning hyperparameters.
print(
    f"Baseline Accuracy: {baseline_acc:.4f}",
    f"Best Accuracy after Hyperparameter Tuning: {best_acc:.4f}",
    f"Best Parameters: {random_search.best_params_}",
    sep="\n",
)


Fitting 5 folds for each of 30 candidates, totalling 150 fits
