### Default params ###

In [None]:
# Fixed XGBoost settings that are not part of the search: they are merged into
# the parameters of every trial and overlaid on the tuned result before saving.
default_xgb_params_optimize = {
    'predictor': 'cpu_predictor',
    'num_parallel_tree': 4,
    'gamma': 0,
    'n_estimators': 500,
}

### Setup ###

In [None]:
from playsound import playsound
import results as r
import optuna
from xgboost import XGBRegressor
import numpy as np
from sklearn.model_selection import cross_val_score
from pathlib import Path
import pandas as pd


def score_dataset(X, y, model=None, n_jobs=None):
    """Return the 5-fold cross-validated RMSLE of ``model`` on ``(X, y)``.

    The metric for the Housing competition is RMSLE (Root Mean Squared Log
    Error). It is obtained by fitting on ``log(y)`` and taking the square
    root of the mean of the per-fold mean squared errors.

    Args:
        X: Feature matrix forwarded to ``cross_val_score``.
        y: Target values. They are passed through ``np.log``, so they are
            expected to be strictly positive.
        model: Estimator to evaluate. ``None`` (the default) builds a fresh
            ``XGBRegressor()`` on every call, instead of sharing a single
            instance created when the function is defined.
        n_jobs: Forwarded unchanged to ``cross_val_score``.

    Returns:
        The RMSLE as a float; lower is better.
    """
    if model is None:
        model = XGBRegressor()

    log_y = np.log(y)
    # scikit-learn reports the *negated* MSE so that "greater is better";
    # flip the sign back before taking the root.
    neg_mse_per_fold = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_mean_squared_error", n_jobs=n_jobs,
    )
    return np.sqrt(-neg_mse_per_fold.mean())


def optimize_params(optimize_X, optimize_y, n_trials=10, n_jobs=1,):
    """Search XGBRegressor hyper-parameters with Optuna and return the best.

    Every trial samples a parameter set, merges it with the fixed values in
    ``default_xgb_params_optimize`` and scores the resulting regressor with
    ``score_dataset`` (lower is better; the study minimizes it).

    The study is stored in the SQLite database ``../study.db`` under a
    hard-coded study name.
    NOTE(review): ``create_study`` is called without ``load_if_exists=True``;
    per the Optuna documentation it then raises if a study with this name is
    already present in the storage. Confirm that bumping the name for each
    run is intended.

    Args:
        optimize_X: Feature matrix handed to ``score_dataset``.
        optimize_y: Target values handed to ``score_dataset``.
        n_trials: Number of Optuna trials to run.
        n_jobs: Number of trials Optuna runs in parallel.

    Returns:
        ``study.best_params``: the sampled parameters of the best trial. The
        fixed defaults are not part of it.
    """
    def objective(trial):
        # Only ``learning_rate`` is sampled for the shrinkage step. XGBoost
        # documents ``learning_rate`` as an alias of ``eta``; sampling both
        # (as was done previously) passes two conflicting values for a single
        # setting and makes the sampler explore a redundant dimension.
        xgb_params = dict(
            **default_xgb_params_optimize,
            colsample_bytree=trial.suggest_float('colsample_bytree', 0.1, 1.0),
            colsample_bylevel=trial.suggest_float(
                'colsample_bylevel', 0.1, 1.0),
            colsample_bynode=trial.suggest_float('colsample_bynode', 0.1, 1.0),
            max_depth=trial.suggest_int("max_depth", 2, 16),
            learning_rate=trial.suggest_float(
                "learning_rate", 1e-4, 1e-1, log=True),
            min_child_weight=trial.suggest_int("min_child_weight", 1, 5),
            subsample=trial.suggest_float("subsample", 0.2, 1.0),
            reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
            reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
        )
        # Fixed seed so that trials differ only by their sampled parameters.
        xgb = XGBRegressor(random_state=0, nthread=9, **xgb_params)
        # The last argument is the ``n_jobs`` used for cross-validation.
        return score_dataset(optimize_X, optimize_y, xgb, 10)

    study = optuna.create_study(
        direction="minimize",
        # sampler=optuna.samplers.RandomSampler(),
        # pruner=optuna.pruners.MedianPruner(),
        storage="sqlite:///../study.db",  # Specify the storage URL here.
        study_name="feature-engineering-for-house-prices13",
    )
    print(f"Sampler is {study.sampler.__class__.__name__}")
    study.optimize(objective, n_trials=n_trials, n_jobs=n_jobs)
    return study.best_params


### Load data ###

In [None]:
# Read the feature table and then the target table; both use the "Id" column
# as their index.
X, y = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in ("data/X.csv", "data/y.csv")
)

### Start optimization ###

In [None]:
# Run the search, then overlay the fixed defaults on the best parameters found
# (a default wins if both define the same key) and hand the merged dict to
# ``r.save_result`` together with the target JSON path.
xgb_params_optimize = {
    **optimize_params(X, y, n_trials=1000),
    **default_xgb_params_optimize,
}
r.save_result(xgb_params_optimize, 'data/XGBRegressor_params.json')