In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

while any(marker in os.getcwd() for marker in ('exercises', 'notebooks', 'students', 'research', 'projects')):
    os.chdir("..")
sys.path.append('src')

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import mlflow
import optuna
from dotenv import load_dotenv
from catboost import CatBoostRegressor, Pool
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_log_error # Metric used in the competition for evaluation

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Set MLflow tracking URI to local directory
mlflow.set_tracking_uri("projects/proj_2_team_4/mlruns")

# Use for running ui:
# mlflow ui --backend-store-uri "Absolute path"
# e.g.: mlflow ui --backend-store-uri "/Users/wojciechjurewicz/Desktop/Multivariate Data Analysis/Lab/mda2425/projects/proj_2_team_4/mlruns"

In [4]:
# Load environment variables from the correct .env file location
env_path = 'projects/proj_2_team_4/.env'
load_dotenv(env_path)

True

In [5]:
# Get dataset path from environment variable
train_path = os.getenv('TRAIN_RAW_MERGED_PATH')
test_path = os.getenv('VALID_RAW_MERGED_PATH')

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

  df_train = pd.read_csv(train_path)


In [6]:
X_train = df_train.drop(columns=["SalePrice"])
y_train = df_train["SalePrice"]

X_test = df_test.drop(columns=["SalePrice"])
y_test = df_test["SalePrice"]

In [7]:
# Get categorical columns
cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Fill NaN values in categorical columns with 'Missing'
for col in cat_features:
    X_train[col] = X_train[col].fillna('Missing')
    X_test[col] = X_test[col].fillna('Missing')
    
    # Convert any numeric values in categorical columns to strings
    X_train[col] = X_train[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# Create pools with categorical features specified
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, cat_features=cat_features)

train_pool_log = Pool(X_train, np.log1p(y_train), cat_features=cat_features) # Catboost doesnt support RMSLE loss function, so we use log transformation

In [8]:
learning_rate = 0.1
depth = 8
l2_leaf_reg = 3.0
random_strength = 1.0
bagging_temperature = 0.5
border_count = 128
iterations = 1000

In [9]:
pipeline = Pipeline([
    ('catboost', CatBoostRegressor(
        learning_rate=learning_rate,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        random_strength=random_strength,
        bagging_temperature=bagging_temperature,
        border_count=border_count,
        iterations=iterations,
        loss_function='RMSE',
        verbose=100
    ))
])

In [10]:
# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)

# define a logging callback that will report on only new challenger parameter configurations if a
# trial has usurped the state of 'best conditions'


def champion_callback(study, frozen_trial):
  """
  Logging callback that will report when a new trial iteration improves upon existing
  best trial values.

  Note: This callback is not intended for use in distributed computing systems such as Spark
  or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
  workers or agents.
  The race conditions with file system state management for distributed trials will render
  inconsistent values with this callback.
  """

  winner = study.user_attrs.get("winner", None)

  if study.best_value and winner != study.best_value:
      study.set_user_attr("winner", study.best_value)
      if winner:
          improvement_percent = (abs(winner - study.best_value) / study.best_value) * 100
          print(
              f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
              f"{improvement_percent: .4f}% improvement"
          )
      else:
          print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")

In [11]:
def objective(trial):
    # Define hyperparameter search space
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "random_strength": trial.suggest_float("random_strength", 0.5, 2.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "iterations": 1000,
        "loss_function": "RMSE",
        "early_stopping_rounds": 50,
        "verbose": 100
    }
    
    # Create and train pipeline
    pipeline = Pipeline([
        ('catboost', CatBoostRegressor(**params))
    ])
    
    pipeline.fit(train_pool_log)
    y_pred_log = pipeline.predict(test_pool)

    y_pred_log = np.maximum(0, y_pred_log) # Added to fix RMSLE ValueError

    y_pred = np.expm1(y_pred_log)
    
    # Return RMSLE as optimization metric
    return root_mean_squared_log_error(y_test, y_pred)
    

In [12]:
learning_rate = 0.1
depth = 8
l2_leaf_reg = 3.0
random_strength = 1.0
bagging_temperature = 0.5
border_count = 128
iterations = 1000

In [13]:
# Create study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1, callbacks=[champion_callback], timeout=7200, show_progress_bar=True) # set n_trials to higher when doing final training and summary

# Get best parameters
best_params = study.best_params

# Train final model with best parameters
experiment = mlflow.set_experiment("Bluebook_for_bulldozers_CatBoost")
with mlflow.start_run(experiment_id=experiment.experiment_id):
    mlflow.log_params(best_params)

# Log tags
    mlflow.set_tags(
        tags={
            "project": "Bluebook for Bulldozers",
            "optimizer_engine": "optuna",
            "model_family": "catboost",
            "feature_set_version": 1,
        }
    )
    
    # Train final model
    final_pipeline = Pipeline([
        ('catboost', CatBoostRegressor(**best_params))
    ])
    final_pipeline.fit(train_pool_log)
    y_pred_log = final_pipeline.predict(test_pool)
    y_pred = np.expm1(y_pred_log)


    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    rmsle = root_mean_squared_log_error(y_test, y_pred)

    # Log metrics
    mlflow.log_metrics({
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "RMSLE": rmsle
    })

    print(f"Mean Absolute Error (MAE): {mae}")
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"Root Mean Squared Error (RMSE): {rmse}")
    print(f"R-squared (Coefficient of Determination): {r2}")
    print(f"Root Mean Squared Log Error (RMSLE): {rmsle} - Metric used in competition")
    mlflow.sklearn.log_model(final_pipeline, "model")

    # Create and log visualization
    plt.scatter(y_test, y_pred, alpha=0.7)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', linewidth=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Actual vs Predicted')
    plt.grid(True)
    plt.axis('equal')
    plt.tight_layout()
    actual_vs_predicted_path = "actual_vs_predicted.png"
    plt.savefig(actual_vs_predicted_path)
    plt.close()

    mlflow.log_artifact(actual_vs_predicted_path)
    os.remove(actual_vs_predicted_path)

  0%|          | 0/1 [00:00<?, ?it/s]

0:	learn: 0.6386059	total: 595ms	remaining: 9m 54s
100:	learn: 0.2733438	total: 56.9s	remaining: 8m 26s
200:	learn: 0.2525605	total: 1m 50s	remaining: 7m 21s
300:	learn: 0.2430443	total: 2m 46s	remaining: 6m 27s
400:	learn: 0.2369614	total: 3m 41s	remaining: 5m 30s
500:	learn: 0.2330792	total: 4m 39s	remaining: 4m 37s
600:	learn: 0.2301341	total: 5m 34s	remaining: 3m 41s
700:	learn: 0.2267933	total: 6m 30s	remaining: 2m 46s
800:	learn: 0.2245557	total: 7m 26s	remaining: 1m 51s
900:	learn: 0.2223978	total: 8m 23s	remaining: 55.4s
999:	learn: 0.2206564	total: 9m 18s	remaining: 0us


Best trial: 0. Best value: 0.241792: 100%|██████████| 1/1 [09:19<00:00, 559.72s/it, 559.72/7200 seconds]
2025/05/26 19:55:29 INFO mlflow.tracking.fluent: Experiment with name 'Bluebook_for_bulldozers_CatBoost' does not exist. Creating a new experiment.


Initial trial 0 achieved value: 0.24179201660949895
0:	learn: 0.6386059	total: 434ms	remaining: 7m 13s
1:	learn: 0.5910618	total: 814ms	remaining: 6m 46s
2:	learn: 0.5510681	total: 1.15s	remaining: 6m 22s
3:	learn: 0.5161471	total: 1.56s	remaining: 6m 28s
4:	learn: 0.4856289	total: 1.9s	remaining: 6m 17s
5:	learn: 0.4600657	total: 2.28s	remaining: 6m 18s
6:	learn: 0.4381608	total: 2.75s	remaining: 6m 29s
7:	learn: 0.4193202	total: 3.19s	remaining: 6m 35s
8:	learn: 0.4034340	total: 3.64s	remaining: 6m 40s
9:	learn: 0.3901444	total: 4.11s	remaining: 6m 46s
10:	learn: 0.3790306	total: 4.48s	remaining: 6m 42s
11:	learn: 0.3692944	total: 4.96s	remaining: 6m 48s
12:	learn: 0.3611849	total: 5.41s	remaining: 6m 50s
13:	learn: 0.3542889	total: 5.86s	remaining: 6m 52s
14:	learn: 0.3480354	total: 6.28s	remaining: 6m 52s
15:	learn: 0.3429673	total: 6.73s	remaining: 6m 53s
16:	learn: 0.3388056	total: 7.2s	remaining: 6m 56s
17:	learn: 0.3351161	total: 7.65s	remaining: 6m 57s
18:	learn: 0.3318086	tot

