In [28]:
#pip install scikit-optimize

In [29]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from skopt.utils import use_named_args
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Load the dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    'Pregnancies','Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'
]
df = pd.read_csv(url, header=None, names=columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [31]:
df.isnull().sum()


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [32]:
# Data Preprocessing
missing_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in missing_columns:
    df[column].replace(0, np.nan, inplace=True)
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,,,30.1,0.349,47,1


In [33]:
df.fillna(df.median(), inplace=True)
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2,122.0,70.0,27.0,125.0,36.8,0.340,27
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1,126.0,60.0,29.0,125.0,30.1,0.349,47


In [34]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [35]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_full, X_test, y_train_full, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
#stratify=y: This parameter ensures that the proportion of classes in the target variable y is maintained in both the training and test sets, which is especially important for imbalanced datasets.

In [36]:
# Define Hyperparameter space
param_space = [
    Integer(50, 300, name='n_estimators'),
    Integer(2, 20, name='max_depth'),
    Integer(2, 20, name='min_samples_split'),
    Integer(1, 20, name='min_samples_leaf'),
    Categorical(['sqrt', 'log2', None], name='max_features'),
    Categorical([True, False], name='bootstrap')

]

In [37]:
# Define the objective function
@use_named_args(param_space)
def objective(**params):
    model = RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        **params
    )
    cv_scores = cross_val_score(
        model, X_train_full, y_train_full, cv=5, scoring='accuracy', n_jobs=-1 # n_jobs: uses all available CPU cores for parallel processing
    )
    score = -np.mean(cv_scores)
    return score

In [38]:
# Run the optimizer 
# gp_minimize: This is a function that performs global optimization using Gaussian processes. It's particularly useful for optimizing functions that are expensive to evaluate or have a complex landscape.
res = gp_minimize(
    func=objective, # This specifies the function you want to minimize.
    dimensions=param_space,
    n_calls=10, # This sets the number of times the optimizer will evaluate the objective function.
    verbose=True # allowing you to see the progress of the optimization, including details about each call and the current best results.
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 18.3392
Function value obtained: -0.7688
Current minimum: -0.7688
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.7919
Function value obtained: -0.7558
Current minimum: -0.7688
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.9018
Function value obtained: -0.7688
Current minimum: -0.7688
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 1.7228
Function value obtained: -0.7590
Current minimum: -0.7688
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 1.8340
Function value obtained: -0.7704
Current minimum: -0.7704
Iteration No: 6 started.

In [39]:
res

          fun: -0.7818072770891644
            x: [208, 5, 10, 19, None, True]
    func_vals: [-7.688e-01 -7.558e-01 -7.688e-01 -7.590e-01 -7.704e-01
                -7.020e-01 -7.460e-01 -7.818e-01 -7.313e-01 -7.671e-01]
      x_iters: [[75, 11, 7, 18, None, True], [104, 2, 15, 12, None, False], [80, 20, 3, 1, 'log2', True], [222, 6, 9, 3, 'sqrt', False], [208, 10, 19, 11, 'log2', False], [181, 13, 5, 2, None, False], [181, 3, 8, 6, None, False], [208, 5, 10, 19, None, True], [172, 12, 16, 13, None, False], [223, 9, 18, 16, None, True]]
       models: [GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1, 1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),
                                        n_restarts_optimizer=2, noise='gaussian',
                                        normalize_y=True, random_state=1619756365)]
        space: Space([Integer(low=50, high=300, prior='uniform', transform='normalize'),
                      Integer(low=2, high=20, prior='uniform',

In [41]:
# Get the best hyperparameters
best_params = {
    'n_estimators': res.x[0],
    'max_depth': res.x[1],
    'min_samples_split': res.x[2],
    'max_features': res.x[4],
    'bootstrap': res.x[5]

}
print('Best Hyperparameters:')
for param, value in best_params.items():
    print(f'{param}: {value}')

Best Hyperparameters:
n_estimators: 208
max_depth: 5
min_samples_split: 10
max_features: None
bootstrap: True


In [42]:
# Evaluate the tuned model
best_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **best_params
)
best_model.fit(X_train_full, y_train_full)
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f'\nTest Accuracy: {acc: .4f}')
print('\nClassification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))


Test Accuracy:  0.7403

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.86      0.81       100
           1       0.67      0.52      0.58        54

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.70       154
weighted avg       0.73      0.74      0.73       154

Confusion Matrix
[[86 14]
 [26 28]]
