In [1]:
import pandas as pd

# X_train = pd.read_pickle('wine_X_train.pkl')
# X_test = pd.read_pickle('wine_X_test.pkl')
# y_train = pd.read_pickle('wine_y_train.pkl')
# y_test = pd.read_pickle('wine_y_test.pkl')
X = pd.read_pickle('wine_X.pkl')
y = pd.read_pickle('wine_y.pkl')

In [2]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

### Searched model with Optuna and cross validation

In [3]:
import numpy as np
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

np.random.seed(11)
# Define the objective function for Optuna
# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameters to be optimized
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
    }

    # Train the model with the given hyperparameters using cross-validation
    model = XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=2, shuffle=True)
    scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)

    # Calculate the mean validation accuracy and return it as the objective value
    score = np.mean(scores)
    return score

# Run the optimization using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Print the results
print('Best score:', study.best_value)
print('Best parameters:', study.best_params)

[32m[I 2023-06-12 12:15:14,337][0m A new study created in memory with name: no-name-ed2090ba-2d94-475e-9f87-687768ff1c9d[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
  'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
  'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
[32m[I 2023-06-12 12:15:18,039][0m Trial 0 finished with value: 0.5666027847309136 and parameters: {'n_estimators': 117, 'max_depth': 4, 'learning_rate': 0.07001376232215462, 'subsample': 0.96701076534418, 'colsample_bytree': 0.6044429663668593, 'reg_alpha': 2.6416119362463814e-05, 'reg_lambda': 0.0002579278486191841, 'min_child_weight': 155, 'gamma': 1.4978831428259934e-06}. Best is trial 0 with value: 0.5666027847309136.[0m
  'learning_rate': trial.suggest_loguniform('

Best score: 0.6397809762202753
Best parameters: {'n_estimators': 65, 'max_depth': 9, 'learning_rate': 0.012942731714029907, 'subsample': 0.7670863090474973, 'colsample_bytree': 0.7203656316491017, 'reg_alpha': 2.336994780196003e-05, 'reg_lambda': 1.4976254182722443e-08, 'min_child_weight': 2, 'gamma': 0.07383365854216414}


### Performance report and analysis

In [4]:
# from sklearn.metrics import classification_report
# import joblib
#
# # Save the model
# # joblib.dump(best_estimator, "best_model.joblib")
# # Load the model
# loaded_model = joblib.load("best_model.joblib")
#
# # Use the loaded model for predictions
# y_pred = pd.Series(le.inverse_transform(loaded_model.predict(X_test), index=y_test.index)
# # Generate confusion matrix
# report = classification_report(y_test, y_pred)
# print(report)