In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

while any(marker in os.getcwd() for marker in ('exercises', 'notebooks', 'students', 'research', 'projects')):
    os.chdir("..")
sys.path.append('src')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import mlflow
import optuna
from dotenv import load_dotenv
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_log_error # Metric used in the competition for evaluation

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set MLflow tracking URI to local directory
mlflow.set_tracking_uri("projects/proj_2_team_4/mlruns")

# Use for running ui:
# mlflow ui --backend-store-uri "Absolute path"
# e.g.: mlflow ui --backend-store-uri "/Users/wojciechjurewicz/Desktop/Multivariate Data Analysis/Lab/mda2425/projects/proj_2_team_4/mlruns"



In [4]:
# Load environment variables from the correct .env file location
env_path = 'projects/proj_2_team_4/.env'
load_dotenv(env_path)

False

In [6]:
# Get dataset path from environment variable
train_preprocessed_path = os.getenv('TRAIN_PREPROCESSED_PATH')
test_preprocessed_path = os.getenv('VALID_PREPROCESSED_PATH')

df_train = pd.read_csv("projects/proj_2_team_4/datasets/Train_preprocessed.csv")
df_test = pd.read_csv("projects/proj_2_team_4/datasets/Valid_preprocessed.csv")

In [8]:
X_train = df_train.drop(columns=["SalePrice"])
y_train = df_train["SalePrice"]

X_test = df_test.drop(columns=["SalePrice"])
y_test = df_test["SalePrice"]

In [9]:
n_estimators = 200
learning_rate = 0.1
max_depth = 3
subsample = 1.0
max_features = None
min_samples_split = 2
min_samples_leaf = 1
random_state = 42

In [10]:
pipeline = Pipeline([
    ('gbr', GradientBoostingRegressor(
        n_estimators = n_estimators,
        learning_rate = learning_rate,
        max_depth = max_depth,
        subsample = subsample,
        max_features = max_features,
        min_samples_split = min_samples_split,
        min_samples_leaf = min_samples_leaf,
        random_state = random_state
    ))
])


In [11]:
# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)

# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'


def champion_callback(study, frozen_trial):
  """
  Logging callback that will report when a new trial iteration improves upon existing
  best trial values.

  Note: This callback is not intended for use in distributed computing systems such as Spark
  or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
  workers or agents.
  The race conditions with file system state management for distributed trials will render
  inconsistent values with this callback.
  """

  winner = study.user_attrs.get("winner", None)

  if study.best_value and winner != study.best_value:
      study.set_user_attr("winner", study.best_value)
      if winner:
          improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
          print(
              f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
              f"{improvement_percent: .4f}% improvement"
          )
      else:
          print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")

In [12]:
def objective(trial):
    # Define hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'max_features': trial.suggest_categorical('max_features', [None, 'sqrt', 'log2']),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'random_state': 42
    }
    
    # Create and train pipeline
    pipeline = Pipeline([
        ('gbr', GradientBoostingRegressor(**params))
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    y_pred = np.maximum(0, y_pred) # Added to fix RMSLE ValueError
    
    # Return RMSLE as optimization metric
    return root_mean_squared_log_error(y_test, y_pred)
    

In [14]:
# Create study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=3, callbacks=[champion_callback], timeout=7200) # set n_trials to higher when doing final training and summary

# Get best parameters
best_params = study.best_params

# Train final model with best parameters
experiment = mlflow.set_experiment("GradientBoostingRegressor_for_Bulldozers")
with mlflow.start_run(experiment_id=experiment.experiment_id):
    mlflow.log_params(best_params)

# Log tags
    mlflow.set_tags(
        tags={
            "project": "Bluebook for Bulldozers",
            "optimizer_engine": "optuna",
            "model_family": "GradientBoostingRegressor",
            "feature_set_version": 1,
        }
    )
    
    # Train final model
    final_pipeline = Pipeline([
        ('gbr', GradientBoostingRegressor(**best_params))
    ])
    final_pipeline.fit(X_train, y_train)
    y_pred = final_pipeline.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    rmsle = root_mean_squared_log_error(y_test, y_pred)

    # Log metrics
    mlflow.log_metrics({
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "RMSLE": rmsle
    })

    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (Coefficient of Determination): {r2}")
    print(f"Root Mean Squared Log Error (RMSLE): {rmsle} - Metric used in competition")
    mlflow.sklearn.log_model(final_pipeline, "model")

    # Create and log visualization
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', linewidth=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Actual vs Predicted')
    plt.grid(True)
    plt.axis('equal')
    plt.tight_layout()
    actual_vs_predicted_path = "actual_vs_predicted.png"
    plt.savefig(actual_vs_predicted_path)
    plt.close()

    mlflow.log_artifact(actual_vs_predicted_path)
    os.remove(actual_vs_predicted_path)

Initial trial 0 achieved value: 0.3445053328174663
Trial 1 achieved value: 0.33277312411179955 with  3.5256% improvement


2025/05/22 21:31:15 INFO mlflow.tracking.fluent: Experiment with name 'GradientBoostingRegressor_for_Bulldozers' does not exist. Creating a new experiment.


Trial 2 achieved value: 0.3304285024454855 with  0.7096% improvement
Mean Absolute Error (MAE): 8505.443522270594
Mean Squared Error (MSE): 166611896.95110556
Root Mean Squared Error (RMSE): 12907.823091098884
R-squared (Coefficient of Determination): 0.7335905210542838
Root Mean Squared Log Error (RMSLE): 0.3274962593599146 - Metric used in competition


