In [1]:

""" 
%pip install optuna
%pip install opendatasets
import opendatasets as od
download_link = "https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database"
od.download(download_link)

"""



' \n%pip install optuna\n%pip install opendatasets\nimport opendatasets as od\ndownload_link = "https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database"\nod.download(download_link)\n\n'

In [2]:

dataset_link = "dataset/diabetes.csv"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


df = pd.read_csv(dataset_link)
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:

df.isnull().sum()


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [4]:

# But Here Null value represent by 0,
missing_value_columns =  ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[missing_value_columns] = df[missing_value_columns].replace(0,np.nan)



In [5]:

# simply fill the nan value with mean:
df.fillna(df.mean(),inplace=True)
df.sample(10)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
631,0,102.0,78.0,40.0,90.0,34.5,0.238,24,0
331,2,87.0,58.0,16.0,52.0,32.7,0.166,25,0
358,12,88.0,74.0,40.0,54.0,35.3,0.378,48,0
348,3,99.0,62.0,19.0,74.0,21.8,0.279,26,0
679,2,101.0,58.0,17.0,265.0,24.2,0.614,23,0
253,0,86.0,68.0,32.0,155.548223,35.8,0.238,25,0
691,13,158.0,114.0,29.15342,155.548223,42.3,0.257,44,1
134,2,96.0,68.0,13.0,49.0,21.1,0.647,26,0
57,0,100.0,88.0,60.0,110.0,46.8,0.962,31,0
154,8,188.0,78.0,29.15342,155.548223,47.9,0.137,43,1


In [6]:
from sklearn.model_selection import train_test_split

X = df.iloc[:,:-1]
y = df["Outcome"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42)



In [7]:

# scale the data for better accuracy:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



<br>
<br>

# #01: Optuna Implementation:

<br>
<br>

In [8]:

"""
Here, we will implement the optuan with a **Objective Function:**
"""
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def Objective(trial):
  # suggest which hyperparameter value to choose
  n_estimators = trial.suggest_int("n_estimators",50,200)
  max_depth = trial.suggest_int("max_depth",3,20)
  model = RandomForestClassifier(
      n_estimators=n_estimators,
      max_depth=max_depth,
      random_state=42)
  score = cross_val_score(model,X_train,y_train,cv=3,scoring="accuracy").mean()
  return score



In [9]:

# Create a study with a sampler (Sampler is somthing like an algorithrm which tell base on current all
# situation or previous all the accuracy along with hyperparameter)
study = optuna.create_study(direction="maximize",sampler=optuna.samplers.TPESampler())
study.optimize(Objective,n_trials=50)



[I 2025-06-02 22:35:55,449] A new study created in memory with name: no-name-8edaa569-8eb3-491b-87fc-e7a605c2ae7f
[I 2025-06-02 22:35:56,594] Trial 0 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 164, 'max_depth': 12}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-06-02 22:35:57,247] Trial 1 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 69, 'max_depth': 8}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-06-02 22:35:59,125] Trial 2 finished with value: 0.7616387337057727 and parameters: {'n_estimators': 184, 'max_depth': 8}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-06-02 22:36:00,275] Trial 3 finished with value: 0.7635009310986964 and parameters: {'n_estimators': 146, 'max_depth': 10}. Best is trial 0 with value: 0.7709497206703911.
[I 2025-06-02 22:36:00,733] Trial 4 finished with value: 0.7523277467411545 and parameters: {'n_estimators': 53, 'max_depth': 10}. Best is trial 0 with value: 0.77094972

In [10]:

# Best trail and parameter:
print(study.best_trial.value)
print(study.best_trial.params)



0.7839851024208566
{'n_estimators': 119, 'max_depth': 15}


In [11]:

# test the value:
from sklearn.metrics import  accuracy_score
classifier = RandomForestClassifier(**study.best_trial.params,random_state=42)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)

accuracy_score(y_test,y_pred)*100



74.89177489177489

<br>


# #02 RandomSearchCV with Optuna:

<br>

- Previously, we use TPESampler(), change the sampler we can perfrom RandomSearchCV.

<br>
<br>


In [12]:

study = optuna.create_study(direction="maximize",sampler=optuna.samplers.RandomSampler())
study.optimize(Objective,n_trials=30)


[I 2025-06-02 22:36:36,524] A new study created in memory with name: no-name-2a37934c-43c1-48a9-8c68-98c39dbd8c87
[I 2025-06-02 22:36:37,145] Trial 0 finished with value: 0.7765363128491621 and parameters: {'n_estimators': 90, 'max_depth': 19}. Best is trial 0 with value: 0.7765363128491621.
[I 2025-06-02 22:36:37,978] Trial 1 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 144, 'max_depth': 20}. Best is trial 0 with value: 0.7765363128491621.
[I 2025-06-02 22:36:38,434] Trial 2 finished with value: 0.7541899441340782 and parameters: {'n_estimators': 81, 'max_depth': 6}. Best is trial 0 with value: 0.7765363128491621.
[I 2025-06-02 22:36:39,008] Trial 3 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 98, 'max_depth': 19}. Best is trial 0 with value: 0.7765363128491621.
[I 2025-06-02 22:36:40,000] Trial 4 finished with value: 0.7709497206703911 and parameters: {'n_estimators': 168, 'max_depth': 20}. Best is trial 0 with value: 0.77653631

In [13]:

print(study.best_trial.value)
print(study.best_trial.params)


0.7802607076350093
{'n_estimators': 131, 'max_depth': 17}


In [14]:

from sklearn.metrics import  accuracy_score
classifier = RandomForestClassifier(**study.best_trial.params,random_state=42)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
accuracy_score(y_test,y_pred)*100


74.45887445887446

<br>


# #03 GridSearchCV with Optuna:

<br>




In [15]:

search_space = {
    'n_estimators': [50,100,150,200],
    'max_depth': [5,10,15,20]
}


In [16]:

study = optuna.create_study(direction="maximize",sampler=optuna.samplers.GridSampler(search_space))
study.optimize(Objective)


[I 2025-06-02 22:37:08,295] A new study created in memory with name: no-name-d9c28926-ddbd-4aa2-8764-8b3fed7a8ec9
[I 2025-06-02 22:37:08,913] Trial 0 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.7690875232774674.
[I 2025-06-02 22:37:09,828] Trial 1 finished with value: 0.7672253258845437 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 0 with value: 0.7690875232774674.
[I 2025-06-02 22:37:10,149] Trial 2 finished with value: 0.7728119180633147 and parameters: {'n_estimators': 50, 'max_depth': 15}. Best is trial 2 with value: 0.7728119180633147.
[I 2025-06-02 22:37:10,794] Trial 3 finished with value: 0.7653631284916201 and parameters: {'n_estimators': 100, 'max_depth': 15}. Best is trial 2 with value: 0.7728119180633147.
[I 2025-06-02 22:37:11,421] Trial 4 finished with value: 0.7690875232774674 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 2 with value: 0.772811

In [17]:


from sklearn.metrics import  accuracy_score
classifier = RandomForestClassifier(**study.best_trial.params,random_state=42)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
accuracy_score(y_test,y_pred)*100


73.59307359307358

<br>

# #04 Optuna Graph: Visulization

<br>

In [18]:

from optuna.visualization import plot_optimization_history,plot_parallel_coordinate,plot_slice,plot_contour,plot_param_importances

plot_optimization_history(study).show()


In [19]:

# it's show that, the region we select
# maybe the best value or not, density is almost same in everywhere.
# if any region density is high, then, we should do more study
# in that range

plot_parallel_coordinate(study).show()



In [20]:


plot_slice(study).show()


In [21]:

# where color density is high
# there we will the right value
plot_contour(study).show()



In [22]:


# That tell which hyperparameter is more important:
plot_param_importances(study).show()



---

<br>
<br>
<br>

# #05 Dynamics Search Space:

<br>
<br>
<br>

---

In [23]:

from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC

def Objective(trial):

  classifier_name = trial.suggest_categorical("classifier", ['SVC','RandomForestClassifier','GradientBoostingClassifier'])

  if classifier_name == 'SVC':
    c = trial.suggest_float('C',0.1,100,log=True)
    kernel = trial.suggest_categorical('kernal',['rbf','linear','poly','sigmoid'])
    gamma = trial.suggest_categorical('gamma',['scale','auto'])
    model = SVC(C=c,kernel=kernel,gamma=gamma,random_state=42)

  elif classifier_name == 'RandomForestClassifier':
    n_estimators = trial.suggest_int("n_estimators",50,300)
    max_depth = trial.suggest_int("max_depth",3,20)
    min_samples_split = trial.suggest_int("min_samples_split",2,10)
    min_sample_leaf = trial.suggest_int("min_sample_leaf",1,10)
    bootstrap = trial.suggest_categorical("bootstrap",[True,False])
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_sample_leaf,
        bootstrap=bootstrap,
        random_state=42)

  elif classifier_name == 'GradientBoostingClassifier':
    n_estimators = trial.suggest_int("n_estimators",50,300)
    learning_rate = trial.suggest_float("learning_rate",0.01,0.3,log=True)
    max_depth = trial.suggest_int("max_depth",3,20)
    min_samples_split = trial.suggest_int("min_samples_split",2,10)
    min_sample_leaf = trial.suggest_int("min_sample_leaf",1,10)
    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_sample_leaf,
        random_state=42
        )

  score = cross_val_score(model,X_train,y_train,cv=3,scoring="accuracy").mean()
  return score




In [24]:

study = optuna.create_study(direction="maximize",sampler=optuna.samplers.TPESampler())
study.optimize(Objective,n_trials=100)


[I 2025-06-02 22:37:23,855] A new study created in memory with name: no-name-8d0c4c67-641c-4031-9a39-9545c2d1d434


[I 2025-06-02 22:37:26,259] Trial 0 finished with value: 0.7150837988826817 and parameters: {'classifier': 'GradientBoostingClassifier', 'n_estimators': 137, 'learning_rate': 0.11156472957531484, 'max_depth': 20, 'min_samples_split': 9, 'min_sample_leaf': 10}. Best is trial 0 with value: 0.7150837988826817.
[I 2025-06-02 22:37:26,338] Trial 1 finished with value: 0.6927374301675977 and parameters: {'classifier': 'SVC', 'C': 13.913077431197095, 'kernal': 'sigmoid', 'gamma': 'scale'}. Best is trial 0 with value: 0.7150837988826817.
[I 2025-06-02 22:37:26,420] Trial 2 finished with value: 0.7858472998137803 and parameters: {'classifier': 'SVC', 'C': 0.3149849846620663, 'kernal': 'linear', 'gamma': 'auto'}. Best is trial 2 with value: 0.7858472998137803.
[I 2025-06-02 22:37:29,466] Trial 3 finished with value: 0.7281191806331471 and parameters: {'classifier': 'GradientBoostingClassifier', 'n_estimators': 96, 'learning_rate': 0.22312641054514146, 'max_depth': 16, 'min_samples_split': 7, 'mi

In [25]:

print(study.best_trial.value)
print(study.best_trial.params)


0.7895716945996275
{'classifier': 'SVC', 'C': 0.13425137910046955, 'kernal': 'linear', 'gamma': 'auto'}


In [26]:

study.trials_dataframe()[:10]


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernal,params_learning_rate,params_max_depth,params_min_sample_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.715084,2025-06-02 22:37:23.858189,2025-06-02 22:37:26.258246,0 days 00:00:02.400057,,,GradientBoostingClassifier,,,0.111565,20.0,10.0,9.0,137.0,COMPLETE
1,1,0.692737,2025-06-02 22:37:26.259964,2025-06-02 22:37:26.337450,0 days 00:00:00.077486,13.913077,,SVC,scale,sigmoid,,,,,,COMPLETE
2,2,0.785847,2025-06-02 22:37:26.339531,2025-06-02 22:37:26.420142,0 days 00:00:00.080611,0.314985,,SVC,auto,linear,,,,,,COMPLETE
3,3,0.728119,2025-06-02 22:37:26.422320,2025-06-02 22:37:29.465596,0 days 00:00:03.043276,,,GradientBoostingClassifier,,,0.223126,16.0,5.0,7.0,96.0,COMPLETE
4,4,0.767225,2025-06-02 22:37:29.467243,2025-06-02 22:37:29.560656,0 days 00:00:00.093413,0.336864,,SVC,scale,rbf,,,,,,COMPLETE
5,5,0.767225,2025-06-02 22:37:29.561898,2025-06-02 22:37:30.185413,0 days 00:00:00.623515,,False,RandomForestClassifier,,,,4.0,4.0,6.0,93.0,COMPLETE
6,6,0.761639,2025-06-02 22:37:30.187607,2025-06-02 22:37:31.868895,0 days 00:00:01.681288,,True,RandomForestClassifier,,,,5.0,4.0,9.0,198.0,COMPLETE
7,7,0.716946,2025-06-02 22:37:31.870024,2025-06-02 22:37:31.908651,0 days 00:00:00.038627,3.153984,,SVC,scale,sigmoid,,,,,,COMPLETE
8,8,0.761639,2025-06-02 22:37:31.910278,2025-06-02 22:37:36.302636,0 days 00:00:04.392358,,True,RandomForestClassifier,,,,8.0,8.0,4.0,280.0,COMPLETE
9,9,0.75419,2025-06-02 22:37:36.312520,2025-06-02 22:37:40.289602,0 days 00:00:03.977082,,True,RandomForestClassifier,,,,4.0,8.0,6.0,142.0,COMPLETE


In [27]:

# Base on the result baysian understand that,we will get best result from SVC
study.trials_dataframe()['params_classifier'].value_counts()



params_classifier
SVC                           80
GradientBoostingClassifier    10
RandomForestClassifier        10
Name: count, dtype: int64

In [28]:

# average accuracy:
study.trials_dataframe().groupby('params_classifier')['value'].mean()



params_classifier
GradientBoostingClassifier    0.730912
RandomForestClassifier        0.766108
SVC                           0.777235
Name: value, dtype: float64

In [29]:


plot_contour(study).show()



Contour plot will not be displayed because `C` and `bootstrap` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `C` and `learning_rate` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `C` and `max_depth` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `C` and `min_sample_leaf` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `C` and `min_samples_split` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `C` and `n_estimators` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `bootstrap` and `C` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `bootstrap` and `gamma` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `bootstrap` and `kernal` cannot co-exist in `trial.params`.


Contour plot will not be displayed because `bootstrap` and `learning_


<br>

- We can also use distributed computing with optuna.
- We can intregrate it with sklearn,tensorflow,pytorch and also for mlflow.
- We can also use it in ml and also dl.

<br>

