### Default params ###

In [21]:
default_xgb_params_optimize = {}
default_xgb_params_optimize['predictor'] = 'cpu_predictor'
default_xgb_params_optimize['num_parallel_tree'] = 4
default_xgb_params_optimize['gamma'] = 0
default_xgb_params_optimize['n_estimators'] = 500

### Setup ###

In [22]:
from playsound import playsound
import results as r
import optuna
from xgboost import XGBRegressor
import numpy as np
from sklearn.model_selection import cross_val_score
from pathlib import Path
import pandas as pd


def score_dataset(X, y, model=XGBRegressor(), n_jobs=None):
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error", n_jobs=n_jobs,
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score


def optimize_params(optimize_X, optimize_y, n_trials=10):
    def objective(trial):
        xgb_params = dict(
            **default_xgb_params_optimize,
            max_delta_step=trial.suggest_float('max_delta_step', 0, 10),
            eta=trial.suggest_float('eta', 0.1, 1.0),
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0),
            colsample_bylevel=trial.suggest_float(
                'colsample_bylevel', 0.1, 1.0),
            colsample_bynode=trial.suggest_float('colsample_bynode', 0.1, 1.0),
            max_depth=trial.suggest_int("max_depth", 2, 16),
            learning_rate=trial.suggest_float(
                "learning_rate", 1e-4, 1e-1, log=True),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 5),
            subsample=trial.suggest_float("subsample", 0.2, 1.0),
            reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
            reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
        )
        xgb = XGBRegressor(random_state=0, nthread=9, **xgb_params)
        return score_dataset(optimize_X, optimize_y, xgb, 10)
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, n_jobs=1,)
    return study.best_params


### Load data ###

In [23]:
X = pd.read_csv("data/X.csv", index_col="Id")
y = pd.read_csv("data/y.csv", index_col="Id")

### Start optimization ###

In [24]:
xgb_params_optimize = optimize_params(X, y, n_trials=700)
xgb_params_optimize.update(default_xgb_params_optimize)
r.save_result(xgb_params_optimize, 'data/XGBRegressor_params.json')

[32m[I 2023-03-02 10:48:28,067][0m A new study created in memory with name: no-name-381928db-b723-4641-b7e3-af99c6f7e81f[0m
[32m[I 2023-03-02 10:48:30,120][0m Trial 0 finished with value: 10.516137900225546 and parameters: {'max_delta_step': 6.804991187322738, 'eta': 0.8864492629001931, 'colsample_bytree': 0.15241217128911655, 'colsample_bylevel': 0.31969349013069237, 'colsample_bynode': 0.9809091030844344, 'max_depth': 10, 'learning_rate': 0.00029798647171223446, 'min_child_weight': 4, 'subsample': 0.8927249260813939, 'reg_alpha': 4.4105400450745655, 'reg_lambda': 0.0058186840086992325}. Best is trial 0 with value: 10.516137900225546.[0m
[32m[I 2023-03-02 10:48:34,419][0m Trial 1 finished with value: 0.12972301837958436 and parameters: {'max_delta_step': 2.7087322614445473, 'eta': 0.9173155946123884, 'colsample_bytree': 0.9708376668281076, 'colsample_bylevel': 0.35295036712705596, 'colsample_bynode': 0.9842815485372335, 'max_depth': 16, 'learning_rate': 0.03151607944763671, 'm