### Default params ###

In [1]:
# Fixed XGBoost settings that are not tuned; they are merged into every trial's
# parameter set and into the final saved parameters.
default_xgb_params_optimize = {
    'predictor': 'cpu_predictor',
    'num_parallel_tree': 6,
    'gamma': 0,
}

### Setup ###

In [2]:
import results as r
import optuna
from xgboost import XGBRegressor
import numpy as np
from sklearn.model_selection import cross_val_score
from pathlib import Path
import pandas as pd

def score_dataset(X, y, model=None, n_jobs=None):
    """Return the cross-validated RMSE of ``model`` on the log-transformed target.

    Args:
        X: Feature matrix passed unchanged to ``cross_val_score``.
        y: Target values; must be strictly positive because ``np.log`` is
            applied before fitting.
        model: Estimator to evaluate. When ``None`` (the default) a fresh
            ``XGBRegressor()`` with library defaults is created for this call.
        n_jobs: Forwarded to ``cross_val_score`` to control fold parallelism.

    Returns:
        The square root of the mean (over 5 folds) of the mean squared error
        computed on ``log(y)``.
    """
    # Build the default estimator per call: a default written in the signature
    # would be instantiated once at definition time and shared by every call.
    if model is None:
        model = XGBRegressor()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error", n_jobs=n_jobs,
    )
    # The scorer returns negated MSE per fold; flip the sign before the root.
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

def optimize_params(optimize_X, optimize_y, n_trials=10):
    """Tune XGBRegressor hyperparameters with Optuna and return the best trial's values.

    Args:
        optimize_X: Feature matrix handed to ``score_dataset`` for every trial.
        optimize_y: Target values handed to ``score_dataset`` for every trial.
        n_trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params``: the sampled hyperparameters of the trial with the
        lowest score. The fixed entries of ``default_xgb_params_optimize`` are
        not sampled, so callers that need them must merge them in themselves.
    """
    def objective(trial):
        """Sample one parameter set and return its cross-validated score."""
        xgb_params = dict(
            **default_xgb_params_optimize,
            # suggest_float(name, low, high, step=q) is the supported
            # replacement for the deprecated
            # suggest_discrete_uniform(name, low, high, q) and samples the
            # same stepped grid.
            max_delta_step=trial.suggest_float(
                'max_delta_step', 0, 10, step=0.1),
            # NOTE(review): XGBoost documents `eta` as an alias of
            # `learning_rate`, and both are sampled and passed here, so one of
            # these two search dimensions is presumably ignored by the model.
            # Confirm which one wins and drop the other.
            eta=trial.suggest_float(
                'eta', 0.1, 1.0, step=0.001),
            colsample_bytree=trial.suggest_float(
                'colsample_bytree', 0.1, 1.0, step=0.001),
            colsample_bylevel=trial.suggest_float(
                'colsample_bylevel', 0.1, 1.0, step=0.001),
            colsample_bynode=trial.suggest_float(
                'colsample_bynode', 0.1, 1.0, step=0.001),
            max_depth=trial.suggest_int("max_depth", 2, 8),
            learning_rate=trial.suggest_float(
                "learning_rate", 1e-4, 1e-1, log=True),
            n_estimators=trial.suggest_int("n_estimators", 1000, 6000),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 5),
            subsample=trial.suggest_float("subsample", 0.2, 1.0),
            reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
            reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
        )
        # NOTE(review): nthread=9 per model together with n_jobs=10 for the
        # cross-validation below may oversubscribe CPU cores — confirm the
        # intended split for the target machine.
        xgb = XGBRegressor(random_state=0, nthread=9, **xgb_params)
        return score_dataset(optimize_X, optimize_y, xgb, 10)

    # Lower score is better, so the study minimizes the objective.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params

### Load data ###

In [3]:
# Read the feature table and the target table, both indexed by the "Id" column.
X, y = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in ("data/X.csv", "data/y.csv")
)

### Start optimization ###

In [4]:
# Run a 300-trial search, then overlay the fixed defaults on the best sampled
# parameters so the dict passed to r.save_result holds the complete set.
xgb_params_optimize = {
    **optimize_params(X, y, n_trials=300),
    **default_xgb_params_optimize,
}
r.save_result(xgb_params_optimize, 'data/XGBRegressor_params.json')

[32m[I 2023-02-19 20:36:13,795][0m A new study created in memory with name: no-name-8cff314d-2c62-46d9-9dc3-644864e67722[0m
  self._init_valid()


  0%|          | 0/300 [00:00<?, ?it/s]

  max_delta_step=trial.suggest_discrete_uniform(
  eta=trial.suggest_discrete_uniform(
  colsample_bytree=trial.suggest_discrete_uniform(
  colsample_bylevel=trial.suggest_discrete_uniform(
  colsample_bynode=trial.suggest_discrete_uniform(


[32m[I 2023-02-19 20:36:25,068][0m Trial 0 finished with value: 0.12695233722885066 and parameters: {'max_delta_step': 0.9, 'eta': 0.309, 'colsample_bytree': 0.592, 'colsample_bylevel': 0.481, 'colsample_bynode': 0.248, 'max_depth': 3, 'learning_rate': 0.02561037666226619, 'n_estimators': 3116, 'min_child_weight': 5, 'subsample': 0.22569508133133, 'reg_alpha': 0.00641108292782174, 'reg_lambda': 82.73600430909259}. Best is trial 0 with value: 0.12695233722885066.[0m
[32m[I 2023-02-19 20:37:07,155][0m Trial 1 finished with value: 3.51097239771956 and parameters: {'max_delta_step': 6.9, 'eta': 0.517, 'colsample_bytree': 0.926, 'colsample_bylevel': 0.894, 'colsample_bynode': 0.223, 'max_depth': 8, 'learning_rate': 0.00025964881450367253, 'n_estimators': 5197, 'min_child_weight': 4, 'subsample': 0.7218166348745649, 'reg_alpha': 0.00142406136722088, 'reg_lambda': 0.0024637763747659613}. Best is trial 0 with value: 0.12695233722885066.[0m
[32m[I 2023-02-19 20:37:27,109][0m Trial 2 fin