In [2]:
import numpy as np
import pandas as pd
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insuline", "BMI",
          "DiabetesPedigreeFunction", "Age", "Outcome" ]

df = pd.read_csv(url, names = columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insuline,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
#handling missing values
cols_with_missing_values = ["Glucose", "BloodPressure", "SkinThickness", "Insuline", "BMI"]
df[cols_with_missing_values] = df[cols_with_missing_values].replace(0, np.nan)

df.fillna(df.mean(), inplace=True)

print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insuline                    0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
#split into feature and target
X = df.drop("Outcome", axis = 1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#scaling 
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

print(f"training data shape: {X_train.shape}")
print(f"test data shape: {X_test.shape}")

training data shape: (614, 8)
test data shape: (154, 8)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

#objective function
def objective(trial):
    #values for hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3,20)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )

    score = cross_val_score(model, X_train, y_train, cv = 3, scoring = "accuracy").mean()
    return score

In [11]:
#create a study object and optimize the objective function
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=50)

[I 2025-02-25 23:03:35,822] A new study created in memory with name: no-name-61a5481e-6c9c-4b93-94b2-2aee96bdbab2
[I 2025-02-25 23:03:36,497] Trial 0 finished with value: 0.7768691216323927 and parameters: {'n_estimators': 56, 'max_depth': 12}. Best is trial 0 with value: 0.7768691216323927.
[I 2025-02-25 23:03:38,363] Trial 1 finished with value: 0.7833652160051012 and parameters: {'n_estimators': 164, 'max_depth': 13}. Best is trial 1 with value: 0.7833652160051012.
[I 2025-02-25 23:03:40,355] Trial 2 finished with value: 0.7670731707317073 and parameters: {'n_estimators': 161, 'max_depth': 19}. Best is trial 1 with value: 0.7833652160051012.
[I 2025-02-25 23:03:43,507] Trial 3 finished with value: 0.7736011477761836 and parameters: {'n_estimators': 171, 'max_depth': 8}. Best is trial 1 with value: 0.7833652160051012.
[I 2025-02-25 23:03:44,441] Trial 4 finished with value: 0.7735852064402997 and parameters: {'n_estimators': 52, 'max_depth': 5}. Best is trial 1 with value: 0.78336521

In [12]:
print(f"Best trial accuracy: {study.best_trial.value}")
print(f"best trial hyperparameters: {study.best_trial.params}")

Best trial accuracy: 0.7850231149370317
best trial hyperparameters: {'n_estimators': 118, 'max_depth': 14}


In [13]:
#using the best hyperparameter values from optuna
from sklearn.metrics import accuracy_score

best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test accuray with best hyperparameters: {test_accuracy:.2f}")

Test accuray with best hyperparameters: 0.76


## Samplers in optuna

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 20)

    model = RandomForestClassifier(
        n_estimators=  n_estimators,
        max_depth=max_depth,
        random_state=42
    )
    scor = cross_val_score(model, X_train, y_train, cv = 3, scoring="accuracy").mean()
    return scor


In [16]:
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.RandomSampler())
study.optimize(objective, n_trials = 50)

[I 2025-02-25 23:25:16,962] A new study created in memory with name: no-name-210c408b-ad5a-4b68-89a4-e558adf27e8a
[I 2025-02-25 23:25:18,221] Trial 0 finished with value: 0.7638291088793241 and parameters: {'n_estimators': 85, 'max_depth': 5}. Best is trial 0 with value: 0.7638291088793241.
[I 2025-02-25 23:25:19,746] Trial 1 finished with value: 0.7752351347042882 and parameters: {'n_estimators': 94, 'max_depth': 6}. Best is trial 1 with value: 0.7752351347042882.
[I 2025-02-25 23:25:20,769] Trial 2 finished with value: 0.7703491152558585 and parameters: {'n_estimators': 69, 'max_depth': 20}. Best is trial 1 with value: 0.7752351347042882.
[I 2025-02-25 23:25:21,645] Trial 3 finished with value: 0.7638370795472661 and parameters: {'n_estimators': 69, 'max_depth': 5}. Best is trial 1 with value: 0.7752351347042882.
[I 2025-02-25 23:25:23,929] Trial 4 finished with value: 0.7605770763589988 and parameters: {'n_estimators': 179, 'max_depth': 4}. Best is trial 1 with value: 0.775235134704

In [17]:
print(f"best trial accuracy: {study.best_trial.value}")
print(f"best trial hyperparameter: {study.best_trial.params}")

best trial accuracy: 0.7866331898613104
best trial hyperparameter: {'n_estimators': 163, 'max_depth': 13}


In [18]:
from sklearn.metrics import accuracy_score

best_model = RandomForestClassifier(**study.best_trial.params)

best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)

print(f"accuracy of with best hyperparameter: {accuracy}")

accuracy of with best hyperparameter: 0.7467532467532467


In [21]:
#GridSampler
search_space = {
    "n_estimators" : [50, 100, 150, 200],
    "max_depth" : [5,10, 15, 20]
}

study = optuna.create_study(direction="maximize", sampler = optuna.samplers.GridSampler(search_space))
study.optimize(objective)

[I 2025-02-25 23:33:44,699] A new study created in memory with name: no-name-de255f5e-194c-443f-a40f-0f75a8124ba3
[I 2025-02-25 23:33:45,775] Trial 0 finished with value: 0.7654391838036028 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.7654391838036028.
[I 2025-02-25 23:33:47,598] Trial 1 finished with value: 0.7735772357723577 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 1 with value: 0.7735772357723577.
[I 2025-02-25 23:33:48,142] Trial 2 finished with value: 0.7687151283277539 and parameters: {'n_estimators': 50, 'max_depth': 15}. Best is trial 1 with value: 0.7735772357723577.
[I 2025-02-25 23:33:49,293] Trial 3 finished with value: 0.7752351347042882 and parameters: {'n_estimators': 100, 'max_depth': 15}. Best is trial 3 with value: 0.7752351347042882.
[I 2025-02-25 23:33:50,464] Trial 4 finished with value: 0.7703491152558585 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 3 with value: 0.775235

In [22]:
print(f"best trial accurac: {study.best_trial.value}")
print(f"best trial hyperparameter: {study.best_trial.params}")

best trial accurac: 0.7817391997449387
best trial hyperparameter: {'n_estimators': 50, 'max_depth': 10}


In [25]:
from sklearn.metrics import accuracy_score
best_model = RandomForestClassifier(**study.best_trial.params, random_state = 42)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"best model acc: {acc:.2f}")

best model acc: 0.75


## optuna visualization

In [43]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances
import plotly
import plotly.io as pio
pio.renderers.default = 'png'


In [45]:
plot_optimization_history(study).show()


ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [None]:
plot_parallel_coordinate(study).show()

In [None]:
plot_slice(study).show()

In [None]:
plot_contour(study).show()

In [None]:
plot_param_importances(study).show()

## optimizing multiple ML models

In [28]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [None]:
def objective(trial):
    classifier_name = trial.suggest_categorical("classifier", ["SVM", "RandomForest", "GrdientBoosting"])

    if classifier_name  == "SVM":
        c = trial.suggest_float("C", 0.1, 100, log = True)
        kernel = trial.suggest_categorical("kernel", ["linear", "rbf", "poly", "sigmoid"])
        gamma = trial.suggest_categorical("gamma", ["scale", "auto"])

        model = SVC(C = c, kernel=kernel, gamma=gamma, random_state=42)

    elif classifier_name == "RandomForest":
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
        min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
        bootstrap = trial.suggest_categorical("bootstrap", [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            random_state=42
        )

    elif classifier_name == "GradientBoosting":
        n_estimators = trial.suggest_int("n_estimators", 50, 300)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3, log = True)
        max_depth = trial.suggest_int("max_depth", 3, 20)
        min_samples_split = trial.suggest_int('min_sample_split', 2,10)
        min_samples_leaf = trial.suggest_int("min_sample_leaf", 1,10)

        model = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state = 42
        )

    score = cross_val_score(model, X_train, y_train, scoring="accuracy").mean()
    return score


In [None]:
study=optuna.create_study(direction="maximize")
study.optimize(objective, n_trails = 150)


In [None]:
best_trial = study.best_trial
print("best trial accuracy: ", best_trial.value)
print("best_trial_params: ", best_trial.params)

In [None]:
study.trials_dataframe()

In [None]:
study.trials_dataframe()["params_classifier"].vlaue_counts()

In [None]:
study.trials_dataframe().groupbu("params_classifier")["value"].mean()

In [None]:
plot_optimization_history(study).show()

In [None]:
plot_slice(study).show()

In [None]:
plot_param_importances(study).show()

## experiment on iris dataset

In [None]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import numpy as np

X, y = load_iris(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    params = {
        "verbosity":0,
        "objective":"multi:softprob",
        "num_class":3,
        "eval_metric":"mlogloss",
        "booster":"gbtree",
        "lambda":trial.suggest_float("lambda", 1e-8, 1.0, log = True),
        "alpha":trial.suggest_float("alpha", 1e-8, 1.0, log = True),
        "ete":trial.suggest_float("eta", 0.01, 0.3),
        "gamma":trial.suggest_float("gamma", 1e-8, 1.0, log= True),
        "max_depth":trial.suggest_int("max_depth", 3, 9),
        "min_child_weight":trial.suggest_int("min_child_weight", 1, 10),
        "subsample":trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree":trial.suggest_float("colsample_bytree", trial.suggest_float("colsample_bytree", 0.4, 1.0)),
        "n_estimators":300,
    }

    dtrain = xgb.DMatrix(X_train, label = y_train)
    dtest = xgb.DMatrix(X_test, label = y_test)

    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "eval_mlogloss")

    bst = xgb.train(
        params,
        dtrain,
        num_boost_round = 300,
        evals = [(dtrain, "train"), (dtest, "eval")],
        early_stopping_rounds = 30,
        callbacks = [pruning_callback]
    )

    #prediction
    preds = bst.predict(dtest)
    best_preds = [int(np.argmax(line)) for line in preds]

    #return acc as objective value
    accuracy = accuracy_score(y_test, best_preds)
    return accuracy

study = optuna.create_study(direction="maximize", pruner=optuna.pruners.SuccessiveHalvingPruner())
study.optimize(objective, n_trials=50)

print(f"best trial: {study.best_params}")
print(f"best acc: {study.best_value}")



In [None]:
from optuna.visualization import plot_intermediate_values
plot_intermediate_values(study).show()