In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Walk up the directory tree until the working directory path no longer contains
# any of the marker names, then make ./src (relative to that directory) importable.
while True:
    if not any(
        marker in os.getcwd()
        for marker in ('exercises', 'notebooks', 'students', 'research', 'projects')
    ):
        break
    os.chdir("..")
sys.path.append("src")

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import mlflow
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow.sklearn
import optuna
from dotenv import load_dotenv
from sklearn.metrics import root_mean_squared_log_error

In [3]:
# Load environment variables (the data paths read further down) from a dotenv file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# find it on another machine.
env_path = '/Users/alanmakowski1/Desktop/project2/.env2'
load_dotenv(dotenv_path=env_path)

True

In [None]:
# Resolve the dataset locations from the environment.
# os.environ[...] is used instead of os.getenv(...) so that a missing variable fails
# immediately with a KeyError naming it, rather than passing None to pd.read_csv.
train_preprocessed_path = os.environ['TRAIN_PREPROCESSED_LOG_PATH']
test_preprocessed_path = os.environ['VALID_PREPROCESSED_PATH']

# NOTE(review): the training variable name contains "LOG" while the validation one
# does not; presumably only the training target is log-transformed - confirm.
df_train = pd.read_csv(train_preprocessed_path)
df_test = pd.read_csv(test_preprocessed_path)

In [5]:
# Separate the predictors from the "SalePrice" target column for each data set.
# df_test was read from the VALID_PREPROCESSED_PATH file, so the "test" split here
# is that validation file.
X_train, y_train = df_train.drop(columns="SalePrice"), df_train["SalePrice"]
X_test, y_test = df_test.drop(columns="SalePrice"), df_test["SalePrice"]

In [6]:
# Use MLflow's standard MLFLOW_TRACKING_URI environment variable when it is set, so the
# notebook can run on other machines; otherwise fall back to the original local store.
mlflow.set_tracking_uri(
    os.getenv(
        "MLFLOW_TRACKING_URI",
        "/Users/alanmakowski1/Desktop/visual_studio_code/mda2425/mlruns",
    )
)

In [7]:
# Baseline KNeighborsRegressor settings, grouped by what they control.
# They are passed to the estimator in the baseline pipeline defined in the next cell.
n_neighbors, weights = 5, 'uniform'     # neighbourhood size and neighbour weighting
algorithm, leaf_size = 'auto', 30       # neighbour-search strategy and its leaf size
metric, p = 'minkowski', 2              # Minkowski distance; p=2 is Euclidean
n_jobs = -1                             # forwarded as the estimator's n_jobs

In [8]:
# Single-step pipeline wrapping a KNN regressor built from the baseline settings.
# Note: the objective function further down builds its own local `pipeline`; this
# module-level one is not referenced by the later cells visible in this notebook.
pipeline = Pipeline(
    steps=[
        (
            "knn",
            KNeighborsRegressor(**{
                "n_neighbors": n_neighbors,
                "weights": weights,
                "algorithm": algorithm,
                "leaf_size": leaf_size,
                "p": p,
                "metric": metric,
                "n_jobs": n_jobs,
            }),
        ),
    ]
)

In [10]:
# Raise Optuna's logging threshold so that only ERROR-level messages are emitted;
# per-trial progress is reported by the champion callback instead.
optuna.logging.set_verbosity(optuna.logging.ERROR)

# Define a logging callback that prints a message only when a trial becomes the new
# best ("champion") result of the study.


def champion_callback(study, frozen_trial):
    """
    Logging callback that reports when a new trial improves upon the existing best value.

    The most recently reported best value is kept in the study's ``"winner"`` user
    attribute; a message is printed only when ``study.best_value`` differs from it.

    Args:
        study: The study being optimized. Its ``best_value``, ``user_attrs`` and
            ``set_user_attr`` members are used.
        frozen_trial: The trial that has just finished. Its ``number`` and ``value``
            are included in the printed message.

    Returns:
        None. The function only prints and updates the ``"winner"`` user attribute.

    Note: This callback is not intended for use in distributed computing systems such as Spark
    or Ray due to the micro-batch iterative implementation for distributing trials to a cluster's
    workers or agents.
    The race conditions with file system state management for distributed trials will render
    inconsistent values with this callback.
    """
    try:
        best_value = study.best_value
    except ValueError:
        # Optuna raises ValueError while the study has no completed trial, e.g. when the
        # first trial failed and the exception was swallowed by `catch=`. Nothing to report.
        return

    winner = study.user_attrs.get("winner", None)

    # Compare against None explicitly: a best value of 0.0 is a legitimate result and
    # must not be treated as "no value".
    if best_value is None or winner == best_value:
        return

    study.set_user_attr("winner", best_value)

    if winner is None:
        print(f"Initial trial {frozen_trial.number} achieved value: {frozen_trial.value}")
        return

    if best_value == 0:
        # Relative improvement is undefined against a zero baseline.
        improvement_percent = float("inf")
    else:
        # abs() in the denominator keeps the percentage positive for negative objectives.
        improvement_percent = (abs(winner - best_value) / abs(best_value)) * 100
    print(
        f"Trial {frozen_trial.number} achieved value: {frozen_trial.value} with "
        f"{improvement_percent: .4f}% improvement"
    )

In [None]:
def objective(trial):
    """
    Optuna objective: fit a KNN regressor with sampled hyperparameters and score it.

    Reads the module-level X_train / y_train / X_test / y_test. The model's predictions
    are passed through ``np.expm1`` before scoring, so the training target is presumably
    on a log1p scale while ``y_test`` is on the original scale (TODO confirm against the
    preprocessing step). The pipeline contains only the KNN step; no preprocessing is
    applied here.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The root mean squared log error between ``y_test`` and the back-transformed
        predictions for ``X_test``.
    """
    # Sample the search space (same parameter names, ranges and order on every trial).
    k = trial.suggest_int("n_neighbors", 3, 20)
    weighting = trial.suggest_categorical("weights", ["uniform", "distance"])
    search_algorithm = trial.suggest_categorical(
        "algorithm", ["auto", "ball_tree", "kd_tree", "brute"]
    )
    leaf = trial.suggest_int("leaf_size", 10, 60)
    minkowski_power = trial.suggest_int("p", 1, 2)  # 1 = Manhattan, 2 = Euclidean

    regressor = KNeighborsRegressor(
        n_neighbors=k,
        weights=weighting,
        algorithm=search_algorithm,
        leaf_size=leaf,
        p=minkowski_power,
        metric="minkowski",
        n_jobs=-1,
    )
    model = Pipeline(steps=[("knn", regressor)])
    model.fit(X_train, y_train)

    # Undo the log1p scale, then floor at zero because RMSLE rejects negative values.
    predicted = np.clip(np.expm1(model.predict(X_test)), 0, None)

    return root_mean_squared_log_error(y_test, predicted)

In [12]:
# Run the hyperparameter search, minimising the RMSLE returned by `objective`.
# NOTE(review): n_trials=1 evaluates a single sampled configuration; raise it for a
# real search. Exceptions inside a trial are caught, so if every trial fails
# `study.best_params` below raises ValueError.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1, callbacks=[champion_callback], show_progress_bar=True,
               catch=(Exception,), timeout=1*60*60)

best_params = study.best_params

# === Start MLflow experiment ===
mlflow.set_experiment("kNRegressor_Bulldozers")
experiment = mlflow.get_experiment_by_name("kNRegressor_Bulldozers")
with mlflow.start_run(experiment_id=experiment.experiment_id):
    mlflow.log_params(best_params)

    mlflow.set_tags(
        tags={
            "project": "Bluebook for Bulldozers",
            "optimizer_engine": "optuna",
            "model_family": "kNRegressor",
            "feature_set_version": 1,
        }
    )

    # Train final model. `best_params` only holds the sampled hyperparameters, so the
    # fixed settings used by `objective` (metric, n_jobs) are passed explicitly to
    # rebuild the same configuration that was tuned.
    final_pipeline = Pipeline([
        ('kNR', KNeighborsRegressor(**best_params, metric="minkowski", n_jobs=-1))
    ])
    final_pipeline.fit(X_train, y_train)

    # `objective` scores np.expm1(predictions) against y_test, i.e. the model predicts
    # on a log1p scale. Apply the same back-transform (and the same floor at zero) here
    # so the metrics and the plot compare like with like.
    y_pred = np.maximum(0, np.expm1(final_pipeline.predict(X_test)))

    # Metrics (all on the back-transformed scale)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    rmsle = root_mean_squared_log_error(y_test, y_pred)

    # Log metrics
    mlflow.log_metrics({
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "RMSLE": rmsle,
    })

    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R2 Score: {r2:.4f}")
    print(f"RMSLE Score: {rmsle:.4f}")

    # Save model. The logged pipeline returns predictions on the log1p scale;
    # consumers must apply np.expm1 themselves.
    mlflow.sklearn.log_model(final_pipeline, "model")

    # Actual vs predicted scatter with the identity line as reference.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(y_test, y_pred, alpha=0.7)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
            color='red', linestyle='--', linewidth=2)
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title('Actual vs Predicted (kNRegressor)')
    ax.grid(True)
    fig.tight_layout()

    plot_path = "actual_vs_predicted.png"
    fig.savefig(plot_path)
    plt.close(fig)

    # Log the plot
    mlflow.log_artifact(plot_path)

  0%|          | 0/1 [00:00<?, ?it/s]

Initial trial 0 achieved value: 0.3968258666857088
MAE: 9951.3771
MSE: 208160858.7282
RMSE: 14427.7808
R2 Score: 0.6672
RMSLE Score: 0.3968


