In [2]:
import numpy as np
import pandas as pd

from utils import get_train_data
from FeatureEngineering import _encode
# We encode the categorical variables ourselves rather than relying on
# XGBoost's built-in handling of categorical features.

# from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### __Setup__

In [3]:
# Load the training features and the target.
X, y = get_train_data()

# Encode the features. `_encode` also returns the target as `y_log`
# (presumably log-transformed, judging by the name -- confirm in FeatureEngineering).
X_encoded, y_log = _encode(X, y)

# Fixed-hyperparameter XGBoost regressor.
# NOTE(review): the cells visible below never reference this `model`; the Optuna
# objective builds its own local regressor for each trial.
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

### __Optuna__

In [4]:
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Objective function minimized by the Optuna study.
def objective(trial):
    """Train an XGBoost regressor with trial-suggested hyperparameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample `n_estimators`, `learning_rate` and
        `max_depth`.

    Returns
    -------
    float
        Root mean squared error of the predictions on the 20% validation
        split, measured on the same scale as `y_log`.
    """
    # The split uses a fixed seed, so every trial is scored on the same
    # train/validation partition and trial values are directly comparable.
    X_train, X_val, y_train, y_val = train_test_split(
        X_encoded, y_log, test_size=0.2, random_state=42
    )

    # Hyperparameter search space
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }

    # Train XGBoost. The seed is passed separately from `param` so that
    # `study.best_params` keeps containing only the tuned hyperparameters.
    model = XGBRegressor(**param, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate. `mean_squared_error(..., squared=False)` was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
    # gives the same RMSE on every scikit-learn version.
    preds = model.predict(X_val)
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
    return rmse

### __Optimization__

In [6]:
# Create a study. The sampler is seeded so that "Restart & Run All" explores the
# same sequence of trials and ends on the same best parameters. TPESampler is the
# sampler Optuna uses by default, so only the seeding changes.
study = optuna.create_study(
    direction='minimize',  # 'minimize' for RMSE, 'maximize' for accuracy, etc.
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize (trials run sequentially, which keeps the seeded run reproducible)
study.optimize(objective, n_trials=50)

# Best parameters and score
print("Best trial:")
print(" Value: ", study.best_value)
print(" Params: ", study.best_params)

[I 2025-01-03 18:19:06,650] A new study created in memory with name: no-name-2271a8e8-b98c-403e-83c8-67d471366b67
[I 2025-01-03 18:19:12,903] Trial 0 finished with value: 0.13421888123332854 and parameters: {'n_estimators': 902, 'learning_rate': 0.010069276755047376, 'max_depth': 4}. Best is trial 0 with value: 0.13421888123332854.
[I 2025-01-03 18:19:20,856] Trial 1 finished with value: 0.14680070953055216 and parameters: {'n_estimators': 613, 'learning_rate': 0.015407733433225724, 'max_depth': 7}. Best is trial 0 with value: 0.13421888123332854.
[I 2025-01-03 18:19:24,168] Trial 2 finished with value: 0.1375309058542543 and parameters: {'n_estimators': 476, 'learning_rate': 0.014372390857423646, 'max_depth': 4}. Best is trial 0 with value: 0.13421888123332854.
[I 2025-01-03 18:19:25,968] Trial 3 finished with value: 0.12847630007473865 and parameters: {'n_estimators': 344, 'learning_rate': 0.04219287557979192, 'max_depth': 3}. Best is trial 3 with value: 0.12847630007473865.
[I 2025-

Best trial:
 Value:  0.12349490722911001
 Params:  {'n_estimators': 291, 'learning_rate': 0.07256489894558181, 'max_depth': 3}


### __Test Data Prediction with Best Parameters__

In [7]:
# FIT with the best hyperparameters from the Optuna study, this time on the full
# training set (no validation hold-out). The seed is pinned for consistency with
# the baseline regressor defined in the setup cell.
best_params = study.best_params
best_model = XGBRegressor(**best_params, random_state=42)
best_model.fit(X_encoded, y_log)

In [8]:
# PREDICT
# Read the test set from disk.
test_data = pd.read_csv('data/test.csv')

# Encode the test features, then align them to the training columns: columns
# missing from the encoded test frame are added and filled with 0, and columns
# not present in `X_encoded` are dropped.
encoded_test = _encode(test_data)
test_data_encoded = encoded_test.reindex(columns=X_encoded.columns, fill_value=0)

# Predict on the scale of `y_log`, then map back with exp.
# NOTE(review): this assumes `_encode` applied a plain natural log to the target;
# if it used log1p, the inverse should be expm1 -- confirm in FeatureEngineering.
test_prediction = best_model.predict(test_data_encoded)
predictions = np.exp(test_prediction)

### __Output Extraction__

In [9]:
# Build the submission table: one row per test `Id` with its predicted `SalePrice`.
submission_columns = {
    "Id": test_data['Id'],
    "SalePrice": predictions,
}
results = pd.DataFrame(submission_columns)

# Write the table without the DataFrame index column.
results.to_csv("submission_XGB_vOptuna.csv", index=False)