In [1]:
# !pip install optuna

In [2]:
import json 
import numpy as np 
import pandas as pd

import optuna
import xgboost as xgb

from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split

In [3]:
# Mount Google Drive (this import only exists on Google Colab) and switch the
# working directory to the project folder, so the relative paths used by the
# later cells ("./raw_data/...", "./cv_results/...", "./saved_model/...")
# resolve against it.
from google.colab import drive
# force_remount = True re-mounts even if the drive is already mounted.
drive.mount("/content/gdrive/", force_remount = True) 
%cd "/content/gdrive/My Drive/AAAPortfolio/kaggle_comp"   

Mounted at /content/gdrive/
/content/gdrive/My Drive/AAAPortfolio/kaggle_comp


In [4]:
# Read the pre-cleaned train / test tables.
train = pd.read_csv("./raw_data/train_clean.csv")
test = pd.read_csv("./raw_data/test_clean.csv")

# Split the non-feature columns off the frames: `pop` returns the column and
# removes it, so afterwards `train` / `test` hold feature columns only.
target = train.pop("target")
log_target = train.pop("log_target")   # label used for model fitting below
train_id = train.pop("Id")
test_id = test.pop("Id")               # re-attached when writing the submission

In [5]:
def objective(trial):
    """Optuna objective: 10-fold cross-validate a gblinear XGBoost regressor.

    Samples regularisation / sampling hyperparameters from ``trial``, runs
    ``xgb.cv`` on the module-level ``train`` features against ``log_target``,
    writes the per-round CV history to ``./cv_results/<trial number>.csv``,
    and records the number of boosting rounds kept by early stopping in the
    trial's ``n_estimators`` user attribute.

    Whenever this trial beats every previously completed trial of the study,
    its parameters are also written to
    ``./saved_model/best_model_params_cv.json``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial handed in by ``study.optimize``.

    Returns
    -------
    float
        Mean test RMSE of the last row of the CV history (the value Optuna
        minimises).
    """
    import os  # function-scope import so this cell does not depend on an edited import cell

    # init DMatrix for training
    d_train = xgb.DMatrix(data = train, label = log_target)

    param_grid = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        "booster": trial.suggest_categorical("booster", ["gblinear"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0),
        # NOTE(review): subsample / colsample_bytree are documented as tree-booster
        # parameters; with booster = "gblinear" they are presumably ignored --
        # confirm and drop them from the search space if so.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # cross-validate
    xgb_cv_results = xgb.cv(
        params = param_grid,
        dtrain = d_train,
        num_boost_round = 1000,
        nfold = 10,
        early_stopping_rounds = 100,
        verbose_eval = False,
        as_pandas = True
    )

    # one history row per boosting round that was kept
    trial.set_user_attr("n_estimators", len(xgb_cv_results))

    # save cross-validation results (create the folder on first use)
    os.makedirs("./cv_results", exist_ok = True)
    filepath = "./cv_results/{}.csv".format(trial.number)
    xgb_cv_results.to_csv(filepath, index = False)

    # extract the score of the last history row; the metric is RMSE, not MSE
    rmse = xgb_cv_results["test-rmse-mean"].values[-1]

    # Compare against the best *completed* trial of the study. A local
    # "best so far" variable would be reset on every call and make the
    # comparison always true, so the file would hold the last trial instead
    # of the best one. best_value raises ValueError while no trial has
    # completed yet (i.e. during the first trial).
    try:
        best_so_far = trial.study.best_value
    except ValueError:
        best_so_far = np.inf

    # save params from best model
    if rmse < best_so_far:
        os.makedirs("./saved_model", exist_ok = True)
        with open("./saved_model/best_model_params_cv.json", "w") as outfile:
            json.dump(dict(trial.params), outfile)

    return rmse

In [None]:
# Seed the sampler so the hyperparameter search proposes the same trials on
# every Restart & Run All. TPESampler is the sampler create_study uses by
# default, so only the seed changes, not the search algorithm.
study = optuna.create_study(
    direction = "minimize",
    sampler = optuna.samplers.TPESampler(seed = 42),
)
study.optimize(objective, n_trials = 50)

[32m[I 2022-05-11 06:45:18,646][0m A new study created in memory with name: no-name-a506d716-df53-4856-970a-0989d1d92dfa[0m


In [None]:
# Report the winning trial and persist its sampled hyperparameters.
print("Best trial:")
trial = study.best_trial

# plain-dict copy of the sampled hyperparameters; reused by the training cell
params = dict(trial.params)
with open("./saved_model/final_model_params_cv.json", "w") as outfile:
    json.dump(params, outfile)

print("  Value: ", trial.value)
print("  Params: ")
for name in trial.params:
    print("    {}: {}".format(name, trial.params[name]))

In [8]:
# Refit on the full training set with the tuned hyperparameters.
# The fixed settings the CV objective used (objective / verbosity) are carried
# over so the final model is configured the same way as the cross-validated one.
final_params = {"verbosity": 0, "objective": "reg:squarederror", **params}

# Use the number of boosting rounds early stopping selected during CV (stored
# by `objective` as the "n_estimators" user attribute) instead of always
# training for the CV upper bound; fall back to that bound if it is missing.
final_num_rounds = study.best_trial.user_attrs.get("n_estimators", 1000)

final_model = xgb.train(params = final_params,
                        dtrain = xgb.DMatrix(data = train, label = log_target),
                        num_boost_round = final_num_rounds)

# predictions are on the scale of `log_target`
final_predictions = final_model.predict(xgb.DMatrix(data = test))

In [9]:
# Undo the log transform of the label with exp, pair each price with its test
# Id, and write the submission file.
# NOTE(review): assumes log_target was built with a plain natural log
# (not log1p) -- confirm against the cleaning step.
sale_price = np.exp(final_predictions)
submission = pd.DataFrame({"Id": test_id, "SalePrice": sale_price})
submission.to_csv("./Final Submission/xgboost_prices_final_cv.csv", index = False)