In [3]:
import optuna
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd


In [4]:
train_df = pd.read_csv("assets/preprocessed_train.csv")
train_df.drop(columns=["Unnamed: 0"], inplace=True)
y = train_df["SalePrice"]
X = train_df.drop(columns=["SalePrice"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:

# Define objective function for Optuna
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 2000, step=200),  # Wider range, larger step
        "max_depth": trial.suggest_int("max_depth", 5, 30),  # More depth options
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 15),  # Adjusted upper limit
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 8),  # Limited to reasonable values
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),  # Common RF choices
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }

    model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    return rmse  

# Run Optuna optimization
study = optuna.create_study(direction="minimize")  # Minimize RMSE
study.optimize(objective, n_trials=500)  # Reduced trials for faster tuning

# Get best hyperparameters
best_params = study.best_params

# Train final model with best parameters
model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print results
print("Best hyperparameters:", best_params)
print(f"Best RMSE: {rmse}")


[I 2025-04-03 14:48:42,167] A new study created in memory with name: no-name-fc5b9963-85aa-4e05-bc87-308f419ffea6
[I 2025-04-03 14:48:44,099] Trial 0 finished with value: 44715.29449838321 and parameters: {'n_estimators': 1400, 'max_depth': 25, 'min_samples_split': 13, 'min_samples_leaf': 4, 'max_features': None, 'bootstrap': False}. Best is trial 0 with value: 44715.29449838321.
[I 2025-04-03 14:48:45,455] Trial 1 finished with value: 41048.191174964224 and parameters: {'n_estimators': 800, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': None, 'bootstrap': False}. Best is trial 1 with value: 41048.191174964224.
[I 2025-04-03 14:48:46,959] Trial 2 finished with value: 34202.249482621395 and parameters: {'n_estimators': 1200, 'max_depth': 16, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 2 with value: 34202.249482621395.
[I 2025-04-03 14:48:47,418] Trial 3 finished with value: 32145.543558615995 

Best hyperparameters: {'n_estimators': 200, 'max_depth': 24, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False}
Best RMSE: 26838.93246849822
