In [None]:
# !pip install optuna

In [None]:
import os, json, pickle 
import pandas as pd
import numpy as np 

import optuna
import xgboost as xgb

from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split

In [None]:
# Colab-only setup: mount Google Drive and change into the project folder so
# that the relative paths used by this notebook (./raw_data, ./Final Model,
# ./Final Submission) resolve against it.
from google.colab import drive
# force_remount = True re-mounts even if the drive is already mounted.
drive.mount("/content/gdrive/", force_remount = True) 
%cd "/content/gdrive/My Drive/AAAPortfolio/kaggle_comp"   

Mounted at /content/gdrive/
/content/gdrive/My Drive/AAAPortfolio/kaggle_comp


In [None]:
# Read the train / test CSVs from the project's raw_data folder.
train = pd.read_csv("./raw_data/train_clean.csv")
test = pd.read_csv("./raw_data/test_clean.csv")

# Pull out the label and identifier columns before they are removed below.
target, log_target = train["target"], train["log_target"]
train_id, test_id = train["Id"], test["Id"]

# Remove the label and Id columns so neither frame carries them as features.
train = train.drop(columns=["target", "log_target", "Id"])
test = test.drop(columns=["Id"])

# Hold out 33% of the training rows (fixed seed) for scoring the tuning
# trials; the model is fitted against log_target, not the raw target.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    log_target,
    test_size=0.33,
    random_state=13,
)

In [None]:
# Hold-out MSE of each tuning trial; objective() appends one value per call.
mse_xgboost = list()

In [None]:
# design note: the objective should track the best MSE seen so far (e.g. saved next to the best-params JSON)
# and only update the saved model + best params when a trial's MSE is lower than the global best

In [None]:
def objective(trial):
    """Optuna objective: fit a gblinear XGBoost model and return its hold-out MSE.

    Samples one hyper-parameter configuration from ``trial``, trains on
    ``X_train`` / ``y_train`` for 1000 boosting rounds and scores the model on
    ``X_test`` / ``y_test`` (the labels are the log-transformed target). The
    score is appended to the module-level ``mse_xgboost`` list. Only when the
    score beats every trial already completed in the trial's study are the
    sampled parameters and the fitted booster written to ``./Final Model/``,
    so the files on disk always belong to the best trial so far.

    Args:
        trial: the Optuna trial that supplies the hyper-parameter suggestions.

    Returns:
        Mean squared error of the predictions on the hold-out split.
    """
    # Best score among the trials that already finished in this study. The
    # previous version reset this to +inf on every call, so every trial
    # overwrote the saved model and the file ended up holding the *last*
    # trial rather than the best one.
    parent_study = getattr(trial, "study", None)
    try:
        best_mse = parent_study.best_value if parent_study is not None else np.inf
    except ValueError:
        # Raised by Optuna while no trial has completed yet (i.e. first trial).
        best_mse = np.inf

    d_train = xgb.DMatrix(data = X_train, label = y_train)
    d_test = xgb.DMatrix(data = X_test, label = y_test)

    # NOTE(review): XGBoost documents subsample and colsample_bytree as
    # tree-booster parameters; with the booster fixed to gblinear they likely
    # have no effect - confirm before reading anything into their tuned values.
    # They are kept so the search space and the saved parameter keys stay the same.
    param_grid = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        "booster": trial.suggest_categorical("booster", ["gblinear"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    xgb_results = xgb.train(
        params = param_grid,
        dtrain = d_train,
        num_boost_round = 1000
    )

    predictions = xgb_results.predict(d_test)
    # scikit-learn's signature is (y_true, y_pred); MSE is symmetric, so the
    # value is unchanged, but the documented order avoids surprises later.
    mse = mean_squared_error(y_test, predictions)
    mse_xgboost.append(mse)

    if mse < best_mse:
        # New best for this study: persist its parameters and fitted booster.
        with open("./Final Model/best_model_params.json", "w") as outfile:
            json.dump(dict(trial.params), outfile)

        # Context manager so the pickle file is flushed and closed promptly.
        with open("./Final Model/best_model.p", "wb") as model_file:
            pickle.dump(xgb_results, model_file)

    return mse

In [None]:
# Run 100 Optuna trials, searching for the parameters that minimise the
# hold-out MSE returned by objective().
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=100)

[32m[I 2022-05-11 01:18:36,937][0m A new study created in memory with name: no-name-bb1d31d1-45a5-4892-bd9b-7162110d08ca[0m
[32m[I 2022-05-11 01:18:56,415][0m Trial 0 finished with value: 0.03379812983825979 and parameters: {'booster': 'gblinear', 'lambda': 0.07747697813025828, 'alpha': 0.10926901363309155, 'subsample': 0.5768453168180254, 'colsample_bytree': 0.8543458439006135}. Best is trial 0 with value: 0.03379812983825979.[0m
[32m[I 2022-05-11 01:19:08,367][0m Trial 1 finished with value: 0.03170039183653088 and parameters: {'booster': 'gblinear', 'lambda': 0.6354833470334907, 'alpha': 0.6399499066064996, 'subsample': 0.22029479767641977, 'colsample_bytree': 0.6724289803458159}. Best is trial 1 with value: 0.03170039183653088.[0m
[32m[I 2022-05-11 01:19:10,305][0m Trial 2 finished with value: 0.03141970153710448 and parameters: {'booster': 'gblinear', 'lambda': 0.26479349938811625, 'alpha': 0.9758866114195973, 'subsample': 0.6104901855335545, 'colsample_bytree': 0.71870

In [None]:
# Report the study's best trial and persist its parameters as JSON.
print("Best trial:")
trial = study.best_trial

# Shallow copy of the winning hyper-parameters, written to the Final Model folder.
params = dict(trial.params)
with open("./Final Model/final_model_params.json", "w") as outfile:
    json.dump(params, outfile)

print("  Value: ", trial.value)
print("  Params: ")
for key, value in params.items():
    print("    {}: {}".format(key, value))

Best trial:
  Value:  0.014990828064675053
  Params: 
    booster: gblinear
    lambda: 0.025063825814236888
    alpha: 5.159073707836065e-05
    subsample: 0.3868360198136104
    colsample_bytree: 0.831045079050417


In [None]:
# Reload the booster that objective() pickled and predict on the competition
# test features. The context manager closes the file handle, which the bare
# pickle.load(open(...)) call left open.
# NOTE: unpickling can execute arbitrary code - only load files this notebook wrote.
with open("./Final Model/best_model.p", "rb") as model_file:
    final_model = pickle.load(model_file)

final_predictions = final_model.predict(xgb.DMatrix(data = test))

In [None]:
# Build the submission table: one row per test Id with the predicted price.
# The model was fitted on log_target, so predictions are exponentiated here.
# NOTE(review): assumes log_target is the natural log of the price - confirm
# against the cleaning step that produced it.
submission = pd.DataFrame({"Id": test_id}).assign(
    SalePrice=np.exp(final_predictions)
)
submission.to_csv("./Final Submission/xgboost_prices_final.csv", index=False)

In [None]:
# final submission score on kaggle was 0.11986, placed 673/4590 (top 15%) 