In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
   ---------------------------------------- 0.0/413.9 kB ? eta -:--:--
   --- ----------------------------------- 41.0/413.9 kB 667.8 kB/s eta 0:00:01
   ----------- ---------------------------- 122.9/413.9 kB 1.2 MB/s eta 0:00:01
   ---------------- ----------------------- 174.1/413.9 kB 1.2 MB/s eta 0:00:01
   ------------------------ --------------- 256.0/413.9 kB 1.3 MB/s eta 0:00:01
   ------------------------ --------------- 256.0/413.9 kB 1.3 MB/s eta 0:00:01
   ------------------------ --------------- 256.0/413.9 kB 1.3 MB/s eta 0:00:01
   ---------------------------- --------- 307.2/413.9 kB 905.4 kB/s eta 0:00:01
   -------------------------------- ----- 358.4/413.9 kB 928.4 kB/s eta 0:00:01
   -------------------------------------- 413.9/413.9 kB 958.

In [13]:
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [9]:
# Location of the CSV that is read in the next cell.
url = (
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/"
    "pima-indians-diabetes.data.csv"
)

# Column labels handed to the CSV reader via `names=`; the last one is the label column.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

In [11]:
# Read the CSV treating every row as data (no header row) and label the fields with
# `columns`.
df = pd.read_csv(url, header=None, names=columns)

# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [15]:
# In these columns a value of 0 is re-encoded as NaN so that it is handled as a missing
# entry by the imputation step that follows.
cols_with_missing_vals = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]
zero_as_nan = df[cols_with_missing_vals].replace(to_replace=0, value=np.nan)
df[cols_with_missing_vals] = zero_as_nan

In [17]:
# Fill every NaN with the mean of its own column.
# NOTE(review): the means are computed over the whole frame, before the train/test split
# made later in this notebook, so held-out rows contribute to the imputed values —
# confirm whether that is acceptable for the evaluation.
df = df.fillna(value=df.mean())

In [19]:
# Sanity check: number of NaN entries left in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [21]:
# The label is the 'Outcome' column; the features are every other column.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

In [25]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): no `stratify=` argument is passed, so the class ratio of `y` is not
# enforced across the two splits — consider `stratify=y` if that matters here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Fit the scaler on the training features only, then apply that same fitted scaler to
# both splits so the test set is transformed with training statistics.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Check the shape of the data
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (614, 8)
Test set shape: (154, 8)


In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [43]:
# define the obj func
def objective(trial):
    """Score one random-forest configuration proposed by ``trial``.

    Two integer hyperparameters are requested from the trial, in this order:
    ``n_estimators`` in [50, 200] and ``max_depth`` in [3, 20]. A
    ``RandomForestClassifier`` built from them (``random_state=42``) is
    evaluated with 3-fold cross-validation on ``X_train`` / ``y_train``.

    Returns:
        The mean accuracy over the three folds.
    """
    forest = RandomForestClassifier(
        # Keyword arguments are evaluated left to right, so the parameters are
        # suggested in the order n_estimators, max_depth.
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        random_state=42,
    )
    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [47]:
# Create the study that will maximize the value returned by `objective`.
# The TPE sampler is seeded so that the sequence of suggested hyperparameters is the same
# on every Restart & Run All; 42 matches the random_state used for the models and the
# data split elsewhere in this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

[32m[I 2026-01-29 12:02:55,076][0m A new study created in memory with name: no-name-6046c1c8-cad1-4a4b-89bb-5c5c4ba39e57[0m


In [51]:
# Run 50 trials of `objective` on the study created in the previous cell.
study.optimize(func=objective, n_trials=50)

[32m[I 2026-01-29 12:03:27,181][0m Trial 0 finished with value: 0.7671050534034752 and parameters: {'n_estimators': 79, 'max_depth': 17}. Best is trial 0 with value: 0.7671050534034752.[0m
[32m[I 2026-01-29 12:03:27,546][0m Trial 1 finished with value: 0.7801370954886019 and parameters: {'n_estimators': 52, 'max_depth': 12}. Best is trial 1 with value: 0.7801370954886019.[0m
[32m[I 2026-01-29 12:03:28,570][0m Trial 2 finished with value: 0.7817551410808226 and parameters: {'n_estimators': 130, 'max_depth': 18}. Best is trial 2 with value: 0.7817551410808226.[0m
[32m[I 2026-01-29 12:03:29,184][0m Trial 3 finished with value: 0.771967160848079 and parameters: {'n_estimators': 66, 'max_depth': 6}. Best is trial 2 with value: 0.7817551410808226.[0m
[32m[I 2026-01-29 12:03:29,854][0m Trial 4 finished with value: 0.7687151283277539 and parameters: {'n_estimators': 98, 'max_depth': 14}. Best is trial 2 with value: 0.7817551410808226.[0m
[32m[I 2026-01-29 12:03:30,652][0m Tria

In [53]:
# Optimising multiple ML models

In [55]:
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.svm import SVC

In [59]:
def _build_svm(trial):
    """Sample SVC hyperparameters from ``trial`` and return the unfitted classifier."""
    return SVC(
        # Keyword arguments are evaluated left to right: C, kernel, gamma.
        C=trial.suggest_float('C', 0.1, 100, log=True),
        kernel=trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid']),
        gamma=trial.suggest_categorical('gamma', ['scale', 'auto']),
        random_state=42,
    )


def _build_random_forest(trial):
    """Sample random-forest hyperparameters from ``trial`` and return the unfitted classifier."""
    return RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
        random_state=42,
    )


def _build_gradient_boosting(trial):
    """Sample gradient-boosting hyperparameters from ``trial`` and return the unfitted classifier."""
    return GradientBoostingClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
        random_state=42,
    )


# Maps each value of the 'classifier' parameter to the function that builds that model.
# The insertion order is also the order of the categorical choices offered to the trial.
# NOTE(review): the random-forest and gradient-boosting builders share parameter names
# ('n_estimators', 'max_depth', ...). The sampler presumably pools their observations
# under those names; prefixing them per model would separate the two, but would also
# change the keys of `study.best_params`, so it is left as-is here.
_MODEL_BUILDERS = {
    'SVM': _build_svm,
    'RandomForest': _build_random_forest,
    'GradientBoosting': _build_gradient_boosting,
}


def objective(trial):
    """Score one model family and hyperparameter set proposed by ``trial``.

    The trial first picks a ``'classifier'`` from ``'SVM'``, ``'RandomForest'``
    and ``'GradientBoosting'``; the matching builder then requests that model's
    hyperparameters from the same trial. The choice list is derived from the
    builder table, so every choice is guaranteed to have a builder.

    Returns:
        The mean accuracy of 3-fold cross-validation on ``X_train`` / ``y_train``.
    """
    classifier_name = trial.suggest_categorical('classifier', list(_MODEL_BUILDERS))
    model = _MODEL_BUILDERS[classifier_name](trial)

    # Perform cross-validation and return the mean accuracy
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score
    
    

In [61]:
# Start a fresh study for the multi-model search. The sampler is passed explicitly and
# seeded so that the sequence of suggested configurations is the same on every
# Restart & Run All; 42 matches the random_state used for the models in this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[32m[I 2026-01-29 12:10:00,727][0m A new study created in memory with name: no-name-d0f65240-cb69-4fbf-afb4-53f066aafa91[0m
[32m[I 2026-01-29 12:10:04,774][0m Trial 0 finished with value: 0.7491869918699187 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 256, 'learning_rate': 0.0225177960583666, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.7491869918699187.[0m
[32m[I 2026-01-29 12:10:04,860][0m Trial 1 finished with value: 0.7101147776183644 and parameters: {'classifier': 'SVM', 'C': 36.54331385634085, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7491869918699187.[0m
[32m[I 2026-01-29 12:10:06,512][0m Trial 2 finished with value: 0.7606009883628247 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 124, 'learning_rate': 0.2468479751310459, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.7606009883628247.[0m
[32m[I 2026-01-