In [9]:
import optuna

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import AdaBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

import sklearn.datasets

In [10]:
def define_scikit_classifiers(trial):
    """
    This funtion contains the different classifier models from 
    scikit-learn from basic to advanced classifiers including 
    light-gbm and xgboost. 
    """
    classifier = None
    models = ['SVC', 'KNeighborsClassifier', 'DecisionTree',
              'RandomForest', 'GBM', 'ADABoost', 'LGBM', 'XGBoost']
    
    clf_name = trial.suggest_categorical('clf', models)
    
    if clf_name == 'SVC':
        svc_c = trial.suggest_float('svc_c', 1e-10, 1e-10, log=True)
        classifier = LinearSVC(C=svc_c)
    elif clf_name =='KNeighborsClassifier':
        n_neighbors = trial.suggest_int('KNN_n_neighbors', 3, 10)
        weights = trial.suggest_categorical('KNN_weights',
                                            ['uniform', 'distance'])
        classifier = KNeighborsClassifier(n_neighbors=n_neighbors,
                                          weights=weights)
    elif clf_name =='DecisionTree':
        max_depth = trial.suggest_int('dt_max_depth', 3, 12)
        max_features = trial.suggest_categorical('dt_max_features',
                                                 ['sqrt', 'log2', None])
        criterion = trial.suggest_categorical('dt_criterion',
                                              ['gini', 'entropy'])
        classifier = DecisionTreeClassifier(max_depth=max_depth,
                                            max_features=max_features,
                                            criterion=criterion)
    elif clf_name =='RandomForest':
        max_depth = trial.suggest_int('rf_max_depth', 3, 12)
        max_features = trial.suggest_categorical('rf_max_features',
                                                 ['sqrt', 'log2', None])
        n_estimators = trial.suggest_int('rf_n_estimators', 50, 250)
        classifier = RandomForestClassifier(max_depth=max_depth,
                                            max_features=max_features,
                                            n_estimators=n_estimators)
    elif clf_name == 'GBM':
        n_estimators = trial.suggest_int('gbm_n_estimators', 50, 250)
        max_depth = trial.suggest_int('gbm_max_depth', 2, 12)
        min_leaf = trial.suggest_int('gbm_min_weight_fraction_leaf', 0, 0.5)
        lr = trial.suggest_float('gbm_learning_rate', 0.1, 1)
        classifier = GradientBoostingClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_weight_fraction_leaf=min_leaf,
            learning_rate=lr, random_state=143
        )
    elif clf_name == 'ADABoost':
        n_estimators = trial.suggest_int('ada_n_estimators', 50, 250)
        lr = trial.suggest_float('ada_learning_rate', 0.1, 1)
        classifier = AdaBoostClassifier(n_estimators=n_estimators,
                                        learning_rate=lr,
                                        random_state=143)
    elif clf_name == 'LGBM':
        num_leaves = trial.suggest_int('lgbm_num_leaves', 20, 100)
        n_estimators = trial.suggest_int('lgbm_n_estimators', 50, 250)
        max_depth = trial.suggest_int('lgbm_max_depth', 2, 12)
        lr = trial.suggest_float('lgbm_learning_rate', 0.1, 1)
        classifier = LGBMClassifier(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    learning_rate=lr,
                                    num_leaves=num_leaves,
                                    random_state=143)
    elif clf_name == 'XGBoost':
        n_estimators = trial.suggest_int('xgbm_n_estimators', 50, 250)
        max_depth = trial.suggest_int('xgbm_max_depth', 2, 12)
        lr = trial.suggest_float('xgbm_learning_rate', 0.01, 0.3)
        classifier = XGBClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   learning_rate=lr,
                                   random_state=143)
    return classifier

In [11]:
def objective(trial):
    """
    This is the objective function that utilizes Optuna. We are using 
    the Iris dataset as example for implementation.
    """
    clf = define_scikit_classifiers(trial)
    iris = sklearn.datasets.load_iris()
    x, y = iris.data, iris.target
    
    X_train, X_val, y_train, y_val = (
        train_test_split(x, y, test_size=0.25, random_state=1337)
    )

    steps = [('clf', clf)]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)
    score = cross_val_score(pipeline, X_val, y_val, n_jobs=-1, cv=cv)
    accuracy = score.mean()

    return accuracy

In [12]:
if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study.optimize(objective, n_trials=15)
    print(f'The best accuracy obtained is: {study.best_trial.value}')
    print(f'The best parameters obtained are: {study.best_trial.params}')

[32m[I 2023-03-16 07:41:25,425][0m A new study created in memory with name: no-name-d043036b-dbaf-454a-8219-f48360b9c9c9[0m


The best accuracy obtained is: 0.975
The best parameters obtained are: {'clf': 'ADABoost', 'ada_n_estimators': 97, 'ada_learning_rate': 0.4591367235913083}
