### Default params ###

In [32]:
default_xgb_params_optimize = {}
default_xgb_params_optimize['predictor'] = 'cpu_predictor'
default_xgb_params_optimize['num_parallel_tree'] = 4
default_xgb_params_optimize['gamma'] = 0
default_xgb_params_optimize['n_estimators'] = 500

### Setup ###

In [33]:
from playsound import playsound
import results as r
import optuna
from xgboost import XGBRegressor
import numpy as np
from sklearn.model_selection import cross_val_score
from pathlib import Path
import pandas as pd


def score_dataset(X, y, model=XGBRegressor(), n_jobs=None):
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error", n_jobs=n_jobs,
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score


def optimize_params(optimize_X, optimize_y, n_trials=10):
    def objective(trial):
        xgb_params = dict(
            **default_xgb_params_optimize,
            max_delta_step=trial.suggest_float('max_delta_step', 0, 10),
            eta=trial.suggest_float('eta', 0.1, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0),
            colsample_bylevel=trial.suggest_float(
                'colsample_bylevel', 0.1, 1.0),
            colsample_bynode=trial.suggest_float('colsample_bynode', 0.1, 1.0),
            max_depth=trial.suggest_int("max_depth", 2, 16),
            learning_rate=trial.suggest_float(
                "learning_rate", 1e-4, 1e-1, log=True),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 5),
            subsample=trial.suggest_float("subsample", 0.2, 1.0),
            reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
            reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
        )
        xgb = XGBRegressor(random_state=0, nthread=9, **xgb_params)
        return score_dataset(optimize_X, optimize_y, xgb, 10)
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, n_jobs=1,)
    return study.best_params


### Load data ###

In [34]:
X = pd.read_csv("data/X.csv", index_col="Id")
y = pd.read_csv("data/y.csv", index_col="Id")

### Start optimization ###

In [35]:
xgb_params_optimize = optimize_params(X, y, n_trials=500)
xgb_params_optimize.update(default_xgb_params_optimize)
r.save_result(xgb_params_optimize, 'data/XGBRegressor_params.json')

[32m[I 2023-03-01 18:44:52,607][0m A new study created in memory with name: no-name-aea7c7e6-a2ac-4126-9e6f-31f2c0907ea0[0m
[32m[I 2023-03-01 18:44:56,631][0m Trial 0 finished with value: 0.1205899253361519 and parameters: {'max_delta_step': 5.541548462423963, 'eta': 0.2957485907112778, 'colsample_bytree': 0.6249251419740778, 'colsample_bylevel': 0.2870568611077179, 'colsample_bynode': 0.2120448319243723, 'max_depth': 12, 'learning_rate': 0.03666977985540038, 'min_child_weight': 3, 'subsample': 0.7032012817309368, 'reg_alpha': 0.007443694945758678, 'reg_lambda': 0.06403120985563665}. Best is trial 0 with value: 0.1205899253361519.[0m
[32m[I 2023-03-01 18:44:58,093][0m Trial 1 finished with value: 11.389541503811124 and parameters: {'max_delta_step': 2.5452545492620606, 'eta': 0.8616736832483892, 'colsample_bytree': 0.5106580935334837, 'colsample_bylevel': 0.2522251716205729, 'colsample_bynode': 0.4125001816588557, 'max_depth': 11, 'learning_rate': 0.00010995477854028869, 'min_c