In [1]:
# pip install optuna

In [2]:
# Import necessary libraries
import optuna 
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Note: scikit-learn's built-in `load_diabetes` is a regression dataset,
# so the Pima Indians Diabetes classification data is loaded from an
# external CSV file instead.
import pandas as pd

# Location of the Pima Indians Diabetes CSV and the column names to assign to it
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Read the CSV into a DataFrame, labelling the columns with the names above
df = pd.read_csv(url, names=columns)

# Preview the first rows
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
import numpy as np

# Measurements for which a value of zero is not valid and therefore marks missing data
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Turn those zeros into NaN so that they are treated as missing
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace({0: np.nan})

# Fill every missing entry with the mean of its own column.
# NOTE(review): the means are computed over the whole DataFrame, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# imputed values.
df.fillna(value=df.mean(), inplace=True)

# Confirm that no missing values remain (count of NaN per column)
print(df.isna().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [4]:
# Separate the label column (y) from the feature columns (X)
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features: the scaler is fitted on the training rows only and
# the same transformation is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Report the shape of each split
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (537, 8)
Test set shape: (231, 8)


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Objective function that Optuna evaluates once per trial
def objective(trial):
    """Score one random-forest configuration proposed by ``trial``.

    Three integer hyperparameters are requested from the trial:
    ``n_estimators`` in [50, 200], ``max_depth`` in [3, 20] and
    ``min_samples_split`` in [2, 10]. They configure a
    ``RandomForestClassifier`` with a fixed ``random_state``.

    Returns:
        The mean accuracy of 3-fold cross-validation on ``X_train`` /
        ``y_train``, which the study maximizes.
    """
    # The dict is built in the same order the hyperparameters are suggested
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
    }
    forest = RandomForestClassifier(random_state=42, **params)

    # One accuracy value per fold, averaged into a single score for the trial
    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [6]:
# Create a study that maximizes the objective (cross-validated accuracy).
# TPESampler is what Optuna uses when no sampler is given for a single-objective
# study; it is spelled out here only so that it can be seeded, which makes the
# sequence of suggested hyperparameters repeatable from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run 50 trials to search for the best hyperparameters
study.optimize(objective, n_trials=50)

[I 2024-09-24 09:17:08,940] A new study created in memory with name: no-name-494f0e3f-bc11-4654-b842-b32f40d4e4ba
[I 2024-09-24 09:17:12,722] Trial 0 finished with value: 0.7560521415270017 and parameters: {'n_estimators': 192, 'max_depth': 4, 'min_samples_split': 6}. Best is trial 0 with value: 0.7560521415270017.
[I 2024-09-24 09:17:14,306] Trial 1 finished with value: 0.7597765363128491 and parameters: {'n_estimators': 111, 'max_depth': 6, 'min_samples_split': 6}. Best is trial 1 with value: 0.7597765363128491.
[I 2024-09-24 09:17:16,780] Trial 2 finished with value: 0.7579143389199254 and parameters: {'n_estimators': 151, 'max_depth': 19, 'min_samples_split': 5}. Best is trial 1 with value: 0.7597765363128491.
[I 2024-09-24 09:17:17,706] Trial 3 finished with value: 0.7765363128491621 and parameters: {'n_estimators': 62, 'max_depth': 10, 'min_samples_split': 6}. Best is trial 3 with value: 0.7765363128491621.
[I 2024-09-24 09:17:18,740] Trial 4 finished with value: 0.76536312849162

In [7]:
# Print the best result: the highest objective value and the hyperparameters that produced it
print(
    f'Best trial accuracy: {study.best_trial.value}',
    f'Best hyperparameters: {study.best_trial.params}',
    sep='\n',
)

Best trial accuracy: 0.7783985102420856
Best hyperparameters: {'n_estimators': 116, 'max_depth': 16, 'min_samples_split': 2}


In [8]:
from sklearn.metrics import accuracy_score

# Build a random forest from the best trial's hyperparameters and fit it on the training split
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
best_model.fit(X_train, y_train)

# Predict the held-out test split and compare the predictions with the true labels
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report the test accuracy rounded to two decimals
print(f'Test accuracy with best hyperparameters: {test_accuracy:.2f}')

Test accuracy with best hyperparameters: 0.74


Samplers in Optuna

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Objective function used by the sampler comparisons in this section
def objective(trial):
    """Return the cross-validated accuracy of a random forest chosen by ``trial``.

    The trial supplies ``n_estimators`` (50-200), ``max_depth`` (3-20) and
    ``min_samples_split`` (2-10), in that order. The resulting
    ``RandomForestClassifier`` (fixed ``random_state``) is evaluated with
    3-fold cross-validation on ``X_train`` / ``y_train`` and the mean accuracy
    is returned for Optuna to maximize.
    """
    # Keyword arguments are evaluated top to bottom, so the suggestion order is fixed
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        random_state=42,
    )

    cv_accuracy = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return cv_accuracy.mean()
    

In [10]:
# Random-search baseline: hyperparameters come from Optuna's RandomSampler.
# The sampler is seeded so that the same configurations are drawn on every run.
study = optuna.create_study(
    sampler=optuna.samplers.RandomSampler(seed=42),
    direction='maximize',
)
study.optimize(objective, n_trials=50)  # Run 50 trials to find the best hyperparameters

[I 2024-09-24 09:18:21,755] A new study created in memory with name: no-name-edc2475e-129c-4ba0-a14b-83e8a67698ce
[I 2024-09-24 09:18:22,773] Trial 0 finished with value: 0.7616387337057727 and parameters: {'n_estimators': 105, 'max_depth': 18, 'min_samples_split': 7}. Best is trial 0 with value: 0.7616387337057727.
[I 2024-09-24 09:18:24,390] Trial 1 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 166, 'max_depth': 11, 'min_samples_split': 6}. Best is trial 1 with value: 0.7672253258845437.
[I 2024-09-24 09:18:25,214] Trial 2 finished with value: 0.7597765363128491 and parameters: {'n_estimators': 137, 'max_depth': 3, 'min_samples_split': 10}. Best is trial 1 with value: 0.7672253258845437.
[I 2024-09-24 09:18:25,722] Trial 3 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 64, 'max_depth': 10, 'min_samples_split': 6}. Best is trial 3 with value: 0.7728119180633147.
[I 2024-09-24 09:18:26,923] Trial 4 finished with value: 0.759776536312

In [11]:
# Print the best result found by the current study
print(
    f'Best trial accuracy: {study.best_trial.value}',
    f'Best hyperparameters: {study.best_trial.params}',
    sep='\n',
)

Best trial accuracy: 0.7746741154562384
Best hyperparameters: {'n_estimators': 56, 'max_depth': 13, 'min_samples_split': 2}


In [12]:
from sklearn.metrics import accuracy_score

# Build a random forest from the best trial's hyperparameters and fit it on the training split
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
best_model.fit(X_train, y_train)

# Predict the held-out test split and compare the predictions with the true labels
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report the test accuracy rounded to two decimals
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.75


Grid Search Sampler (GridSampler)

In [13]:
# GridSampler takes its candidate values from a search space defined outside
# the objective function: one list of values per hyperparameter name
search_space = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
)

In [14]:
# Create a study that maximizes the objective using GridSampler, which tries
# the combinations of values listed in `search_space`. The sampler is seeded
# so that the order in which the grid is visited is the same on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.GridSampler(search_space, seed=42),
)

# No n_trials is given: GridSampler stops the optimization by itself once
# every combination in the grid has been evaluated.
study.optimize(objective)

[I 2024-09-24 09:19:20,722] A new study created in memory with name: no-name-34eee552-616c-42b4-9e47-2d317376113d
[I 2024-09-24 09:19:21,681] Trial 0 finished with value: 0.7653631284916201 and parameters: {'n_estimators': 100, 'max_depth': 15, 'min_samples_split': 5}. Best is trial 0 with value: 0.7653631284916201.
[I 2024-09-24 09:19:22,157] Trial 1 finished with value: 0.7653631284916201 and parameters: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 5}. Best is trial 0 with value: 0.7653631284916201.
[I 2024-09-24 09:19:23,773] Trial 2 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 150, 'max_depth': 15, 'min_samples_split': 2}. Best is trial 2 with value: 0.7728119180633147.
[I 2024-09-24 09:19:25,762] Trial 3 finished with value: 0.7616387337057727 and parameters: {'n_estimators': 150, 'max_depth': 15, 'min_samples_split': 5}. Best is trial 2 with value: 0.7728119180633147.
[I 2024-09-24 09:19:26,287] Trial 4 finished with value: 0.7672253258845

In [15]:

# Print the best result found by the current study
print(
    f'Best trial accuracy: {study.best_trial.value}',
    f'Best hyperparameters: {study.best_trial.params}',
    sep='\n',
)

Best trial accuracy: 0.7746741154562384
Best hyperparameters: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 2}


In [16]:
from sklearn.metrics import accuracy_score

# Build a random forest from the best trial's hyperparameters and fit it on the training split
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
best_model.fit(X_train, y_train)

# Predict the held-out test split and compare the predictions with the true labels
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report the test accuracy rounded to two decimals
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.74


CMA-ES Sampler (CmaEsSampler)

In [18]:
# CmaEsSampler needs the separate `cmaes` package:
# pip install cmaes

# Create a study that maximizes the objective using the CMA-ES sampler.
# The sampler is seeded so that the suggested hyperparameters are repeatable
# from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.CmaEsSampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2024-09-24 09:22:04,258] A new study created in memory with name: no-name-11196181-2d78-4238-9f0d-1f1f95bd79c3
[I 2024-09-24 09:22:06,226] Trial 0 finished with value: 0.7616387337057727 and parameters: {'n_estimators': 186, 'max_depth': 13, 'min_samples_split': 10}. Best is trial 0 with value: 0.7616387337057727.
[I 2024-09-24 09:22:12,742] Trial 1 finished with value: 0.7746741154562384 and parameters: {'n_estimators': 126, 'max_depth': 8, 'min_samples_split': 8}. Best is trial 1 with value: 0.7746741154562384.
[I 2024-09-24 09:22:14,215] Trial 2 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 163, 'max_depth': 14, 'min_samples_split': 7}. Best is trial 1 with value: 0.7746741154562384.
[I 2024-09-24 09:22:15,226] Trial 3 finished with value: 0.7541899441340782 and parameters: {'n_estimators': 112, 'max_depth': 11, 'min_samples_split': 5}. Best is trial 1 with value: 0.7746741154562384.
[I 2024-09-24 09:22:16,283] Trial 4 finished with value: 0.76908752327

In [19]:

# Print the best result found by the current study
print(
    f'Best trial accuracy: {study.best_trial.value}',
    f'Best hyperparameters: {study.best_trial.params}',
    sep='\n',
)

Best trial accuracy: 0.7746741154562384
Best hyperparameters: {'n_estimators': 126, 'max_depth': 8, 'min_samples_split': 8}


In [20]:
from sklearn.metrics import accuracy_score

# Build a random forest from the best trial's hyperparameters and fit it on the training split
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
best_model.fit(X_train, y_train)

# Predict the held-out test split and compare the predictions with the true labels
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report the test accuracy rounded to two decimals
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.76


In [21]:
# Tabulate the trials of the current study as a DataFrame (one row per trial:
# objective value, timings, suggested parameters, sampler attributes and state)
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_min_samples_split,params_n_estimators,system_attrs_cma:generation,system_attrs_cma:large_n_eval,system_attrs_cma:n_restarts,system_attrs_cma:n_restarts_with_large,system_attrs_cma:optimizer:0,system_attrs_cma:optimizer:1,system_attrs_cma:popsize,system_attrs_cma:poptype,system_attrs_cma:small_n_eval,state
0,0,0.761639,2024-09-24 09:22:04.259542,2024-09-24 09:22:06.226902,0 days 00:00:01.967360,13,10,186,,,,,,,,,,COMPLETE
1,1,0.774674,2024-09-24 09:22:06.228917,2024-09-24 09:22:12.740250,0 days 00:00:06.511333,8,8,126,0.0,0.0,0.0,0.0,,,7.0,small,0.0,COMPLETE
2,2,0.767225,2024-09-24 09:22:12.744202,2024-09-24 09:22:14.215263,0 days 00:00:01.471061,14,7,163,0.0,0.0,0.0,0.0,,,7.0,small,0.0,COMPLETE
3,3,0.75419,2024-09-24 09:22:14.217992,2024-09-24 09:22:15.226041,0 days 00:00:01.008049,11,5,112,0.0,0.0,0.0,0.0,,,7.0,small,0.0,COMPLETE
4,4,0.769088,2024-09-24 09:22:15.231387,2024-09-24 09:22:16.283748,0 days 00:00:01.052361,8,4,102,0.0,0.0,0.0,0.0,,,7.0,small,0.0,COMPLETE
5,5,0.763501,2024-09-24 09:22:16.285747,2024-09-24 09:22:17.612178,0 days 00:00:01.326431,13,8,140,0.0,0.0,0.0,0.0,,,7.0,small,0.0,COMPLETE
6,6,0.769088,2024-09-24 09:22:17.614484,2024-09-24 09:22:19.507514,0 days 00:00:01.893030,15,7,126,0.0,0.0,0.0,0.0,,,7.0,small,0.0,COMPLETE
7,7,0.75419,2024-09-24 09:22:19.511533,2024-09-24 09:22:20.309807,0 days 00:00:00.798274,10,8,81,0.0,0.0,0.0,0.0,,,7.0,small,0.0,COMPLETE
8,8,0.761639,2024-09-24 09:22:20.317501,2024-09-24 09:22:21.661991,0 days 00:00:01.344490,6,10,139,1.0,0.0,0.0,0.0,800495f5060000000000008c0a636d6165732e5f636d61...,4284b014b034b018694681189431800000000000000000...,7.0,small,0.0,COMPLETE
9,9,0.763501,2024-09-24 09:22:21.665170,2024-09-24 09:22:23.739478,0 days 00:00:02.074308,14,8,108,1.0,0.0,0.0,0.0,,,7.0,small,0.0,COMPLETE


In [22]:
# Show the sampler object attached to the current study
study.sampler

<optuna.samplers._cmaes.CmaEsSampler at 0x20f2b4d1d90>

In [23]:
# Show the pruner object attached to the current study.
# Note: the objective functions shown in this notebook never call trial.report(),
# so the pruner has no intermediate values to act on.
study.pruner

<optuna.pruners._median.MedianPruner at 0x20f2c08f790>

### Optuna Visualizations


In [27]:
# For visualizations
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [28]:
# 1. Optimization History: how the objective value evolved over the trials of the current study

plot_optimization_history(study).show()

In [32]:
# 2. Parallel Coordinates Plot: hyperparameter combinations of the trials against their objective value
plot_parallel_coordinate(study).show()

In [33]:
# 3. Slice Plot: objective value against each hyperparameter, one panel per hyperparameter
plot_slice(study).show()

In [34]:
# 4. Contour Plot: objective value over pairs of hyperparameters
plot_contour(study).show()

In [35]:
# 5. Hyperparameter Importance: estimated contribution of each hyperparameter to the objective value
plot_param_importances(study).show()

### Optimizing Multiple ML Models 

In [36]:
# Importing the required libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [48]:
# Objective for Optuna: pick a classifier family, then tune that family's hyperparameters
def objective(trial):
    """Score one classifier configuration proposed by ``trial``.

    The trial first chooses the classifier family (``'SVM'``,
    ``'RandomForest'`` or ``'GradientBoosting'``) and then the hyperparameters
    that belong to the chosen family:

    * SVM: ``C`` (0.1-100, log scale), ``kernel``, ``gamma``.
    * RandomForest: ``n_estimators`` (50-300), ``max_depth`` (3-20),
      ``min_samples_split`` (2-10), ``min_samples_leaf`` (1-10), ``bootstrap``.
    * GradientBoosting: ``n_estimators`` (50-300), ``learning_rate``
      (0.01-0.3, log scale), ``max_depth`` (3-20), ``min_samples_split``
      (2-10), ``min_samples_leaf`` (1-10).

    Every model is created with ``random_state=42``.

    NOTE(review): RandomForest and GradientBoosting use the same parameter
    names (``n_estimators``, ``max_depth``, ``min_samples_split``,
    ``min_samples_leaf``), so their values land in the same trial parameters.

    Returns:
        The mean accuracy of 3-fold cross-validation on ``X_train`` /
        ``y_train``.
    """
    # Choose the algorithm to tune
    classifier_name = trial.suggest_categorical(
        'classifier', ['SVM', 'RandomForest', 'GradientBoosting']
    )

    # Keyword arguments are evaluated top to bottom, which fixes the order in
    # which the hyperparameters are suggested within each branch.
    if classifier_name == 'SVM':
        model = SVC(
            C=trial.suggest_float('C', 0.1, 100, log=True),
            kernel=trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid']),
            gamma=trial.suggest_categorical('gamma', ['scale', 'auto']),
            random_state=42,
        )

    elif classifier_name == 'RandomForest':
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            max_depth=trial.suggest_int('max_depth', 3, 20),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
            random_state=42,
        )

    elif classifier_name == 'GradientBoosting':
        model = GradientBoostingClassifier(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            max_depth=trial.suggest_int('max_depth', 3, 20),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            random_state=42,
        )

    # Cross-validate the chosen model and return the mean accuracy
    cv_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
    return cv_scores.mean()

In [49]:
# Create a study that maximizes the multi-classifier objective.
# No CMA-ES sampler is involved here: TPESampler, which Optuna uses when no
# sampler is given for a single-objective study, is spelled out so that it can
# be seeded and the search becomes repeatable from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run 100 trials across the three classifier families
study.optimize(objective, n_trials=100)

[I 2024-09-24 09:56:16,515] A new study created in memory with name: no-name-e74260d7-e201-4c84-aebe-cd80edf23b78
[I 2024-09-24 09:56:16,571] Trial 0 finished with value: 0.6983240223463687 and parameters: {'classifier': 'SVM', 'C': 27.66959281030136, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 0 with value: 0.6983240223463687.
[I 2024-09-24 09:56:17,382] Trial 1 finished with value: 0.7560521415270017 and parameters: {'classifier': 'RandomForest', 'n_estimators': 113, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 9, 'bootstrap': True}. Best is trial 1 with value: 0.7560521415270017.
[I 2024-09-24 09:56:17,418] Trial 2 finished with value: 0.696461824953445 and parameters: {'classifier': 'SVM', 'C': 8.755902111270748, 'kernel': 'sigmoid', 'gamma': 'auto'}. Best is trial 1 with value: 0.7560521415270017.
[I 2024-09-24 09:56:17,502] Trial 3 finished with value: 0.7616387337057727 and parameters: {'classifier': 'SVM', 'C': 0.23462158572826997, 'kernel': 'sigmoid', 

In [50]:
# Fetch the trial with the highest objective value and report its parameters and score
best_trial = study.best_trial
print(f"Best trial parameters: {best_trial.params}")
print(f"Best trial accuracy: {best_trial.value}")

Best trial parameters: {'classifier': 'SVM', 'C': 0.11079127824222741, 'kernel': 'linear', 'gamma': 'auto'}
Best trial accuracy: 0.7895716945996275


In [51]:
# Tabulate the trials of the current study; parameters that a trial did not
# use (because they belong to another classifier) appear as missing values
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.698324,2024-09-24 09:56:16.517769,2024-09-24 09:56:16.571406,0 days 00:00:00.053637,27.669593,,SVM,auto,sigmoid,,,,,,COMPLETE
1,1,0.756052,2024-09-24 09:56:16.574656,2024-09-24 09:56:17.382018,0 days 00:00:00.807362,,True,RandomForest,,,,12.0,9.0,2.0,113.0,COMPLETE
2,2,0.696462,2024-09-24 09:56:17.385772,2024-09-24 09:56:17.418822,0 days 00:00:00.033050,8.755902,,SVM,auto,sigmoid,,,,,,COMPLETE
3,3,0.761639,2024-09-24 09:56:17.423355,2024-09-24 09:56:17.502205,0 days 00:00:00.078850,0.234622,,SVM,scale,sigmoid,,,,,,COMPLETE
4,4,0.759777,2024-09-24 09:56:17.504994,2024-09-24 09:56:19.609735,0 days 00:00:02.104741,,True,RandomForest,,,,6.0,8.0,3.0,280.0,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,0.789572,2024-09-24 09:57:32.690233,2024-09-24 09:57:32.735532,0 days 00:00:00.045299,0.125239,,SVM,auto,linear,,,,,,COMPLETE
96,96,0.787709,2024-09-24 09:57:32.737768,2024-09-24 09:57:32.782920,0 days 00:00:00.045152,0.170464,,SVM,auto,linear,,,,,,COMPLETE
97,97,0.743017,2024-09-24 09:57:32.787370,2024-09-24 09:57:39.402555,0 days 00:00:06.615185,,,GradientBoosting,,,0.024554,18.0,9.0,2.0,277.0,COMPLETE
98,98,0.787709,2024-09-24 09:57:39.411693,2024-09-24 09:57:39.464119,0 days 00:00:00.052426,0.102210,,SVM,auto,linear,,,,,,COMPLETE


In [52]:
# Count how many trials were spent on each classifier
study.trials_dataframe()['params_classifier'].value_counts()

params_classifier
SVM                 66
RandomForest        25
GradientBoosting     9
Name: count, dtype: int64

In [53]:
# Mean objective value (cross-validated accuracy) of the trials of each classifier
study.trials_dataframe().groupby('params_classifier')['value'].mean()

params_classifier
GradientBoosting    0.742603
RandomForest        0.767821
SVM                 0.775718
Name: value, dtype: float64

In [54]:
# 1. Optimization History of the current study
plot_optimization_history(study).show()

In [55]:
# 2. Slice Plot of the current study
plot_slice(study).show()

In [56]:
# 3. Hyperparameter Importance for the current study
plot_param_importances(study).show()