In [5]:
import pandas as pd
import xgboost as xgb
import optuna
import numpy as np

# # 1. Load your dataset into a pandas DataFrame
# #    Replace 'your_dataset.csv' with your actual file path or DataFrame source.
# df = pd.read_csv('your_dataset.csv')

# # 2. Separate features, label, and weight
# #    - 'label' is the binary target column.
# #    - 'weight_column' contains the weight for each row.
# X = df.drop(['label', 'weight_column'], axis=1)
# y = df['label']
# w = df['weight_column']

# Fix numpy's global RNG so the synthetic dataset is identical on every run.
np.random.seed(42)

# Shape of the toy dataset: 500 rows, 10 feature columns.
n_samples, n_features = 500, 10

# Feature matrix: i.i.d. standard-normal draws.
X = np.random.randn(n_samples, n_features)

# Binary target with roughly 30% positives. The labels are sampled
# independently of X, so the features carry no real signal.
y = np.random.binomial(n=1, p=0.3, size=n_samples)

# Per-row weights: about 3 for the positive class and about 1 for the
# negative class, each perturbed by a small Gaussian jitter.
w = 0.1 * np.random.randn(n_samples) + np.where(y == 1, 3.0, 1.0)


# 3. Bundle features, labels and sample weights into an XGBoost DMatrix
dtrain = xgb.DMatrix(X, label=y, weight=w)

In [6]:
# 4. Define the Optuna objective function
def objective(trial):
    """Score one hyperparameter configuration by 5-fold cross-validated AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the search space defined below.

    Returns
    -------
    float
        The highest mean validation AUC observed over the boosting rounds.
        The study is expected to maximize this value.

    Notes
    -----
    Reads the module-level ``dtrain`` DMatrix (features, labels and sample
    weights). The 1-based boosting round at which the best AUC was reached is
    stored on the trial as the user attribute ``best_num_boost_round`` so a
    final model can later be refit with the tuned number of trees.
    """
    # 4.1 Suggest values for hyperparameters to tune
    params = {
        'objective': 'binary:logistic',      # binary classification
        'eval_metric': 'auc',                # use AUC for evaluation
        'verbosity': 0,                      # silent
        'tree_method': 'hist',               # faster histogram algorithm
        # tuning search space:
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }

    # 4.2 Run cross-validation with early stopping
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=1000,               # maximum number of trees
        nfold=5,                            # 5-fold CV
        metrics=('auc',),                   # evaluate AUC
        early_stopping_rounds=50,           # stop if no improvement after 50 rounds
        seed=42,                            # for reproducibility
        stratified=True                     # maintain label ratio in folds
    )

    # 4.3 Extract the best average validation AUC
    auc_by_round = cv_results['test-auc-mean']
    best_auc = auc_by_round.max()

    # 4.4 Remember how many boosting rounds produced that score; without this
    #     the tuned tree count is lost once the trial finishes.
    trial.set_user_attr('best_num_boost_round', int(auc_by_round.idxmax()) + 1)

    # 4.5 Return it (Optuna will try to maximize it)
    return best_auc

In [7]:
# 5. Create and run the study
# Seed the sampler explicitly: numpy and xgb.cv are already seeded, but an
# unseeded sampler would propose different hyperparameters on every
# Restart & Run All. TPESampler is Optuna's default single-objective sampler,
# so only the seeding changes.
# NOTE: the 600 s timeout can still end the search before 50 trials on a slow
# machine, in which case fewer trials are compared.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50, timeout=600)

[I 2025-06-27 10:19:19,091] A new study created in memory with name: no-name-4ec5a305-6aee-4a65-862c-110bc4e250c4
[I 2025-06-27 10:19:19,383] Trial 0 finished with value: 0.5297559862778906 and parameters: {'max_depth': 8, 'learning_rate': 0.0453527986617336, 'subsample': 0.5089292726627259, 'colsample_bytree': 0.5290104380184957, 'min_child_weight': 10, 'gamma': 0.12870045725977144}. Best is trial 0 with value: 0.5297559862778906.
[I 2025-06-27 10:19:19,577] Trial 1 finished with value: 0.5205924781120584 and parameters: {'max_depth': 9, 'learning_rate': 0.0004030149436437459, 'subsample': 0.8369050041106911, 'colsample_bytree': 0.8473333633247015, 'min_child_weight': 9, 'gamma': 0.008435406358373596}. Best is trial 0 with value: 0.5297559862778906.
[I 2025-06-27 10:19:19,942] Trial 2 finished with value: 0.5354100532927892 and parameters: {'max_depth': 7, 'learning_rate': 0.0006152908121484206, 'subsample': 0.6213105360740743, 'colsample_bytree': 0.7605242971766957, 'min_child_weight

In [8]:
# 6. Report the winning trial: its score first, then each tuned hyperparameter
best_params = study.best_params  # look the best trial up once
print(f"Best AUC: {study.best_value:.4f}")
print("Best hyperparameters:")
for key, val in best_params.items():
    print(f"  {key}: {val}")

Best AUC: 0.5748
Best hyperparameters:
  max_depth: 9
  learning_rate: 0.00992095788535075
  subsample: 0.7439698662961218
  colsample_bytree: 0.6341045414805973
  min_child_weight: 5
  gamma: 6.99801971161806e-08
