# Notebook setup

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Climb out of any known sub-folder so that the relative paths used below
# resolve from a common base directory (presumably the repository root).
while True:
    _cwd = os.getcwd()
    if not any(marker in _cwd for marker in ('exercises', 'notebooks', 'students', 'research', 'projects')):
        break
    os.chdir("..")

# Make modules under ./src importable from the notebook.
sys.path.append('src')

In [None]:
# Standard library
import time
from datetime import datetime

# Third-party libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import plotly.graph_objects as go
from joblib import Parallel, delayed


# Machine Learning & Modeling
import mlflow
import mlflow.sklearn
import optuna
import optuna.visualization as vis
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import Pool, CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    r2_score,
    root_mean_squared_log_error,  # Metric used in the competition
)


In [3]:
# Store MLflow runs in a local directory. The path is relative, so it depends on
# the working directory chosen in the setup cell.
mlflow.set_tracking_uri("projects/proj_2_team_4/mlruns")

# To browse the logged runs, start the MLflow UI against the same store:
# mlflow ui --backend-store-uri "Absolute path"
# e.g.: mlflow ui --backend-store-uri "/Users/wojciechjurewicz/Desktop/Multivariate Data Analysis/Lab/mda2425/projects/proj_2_team_4/mlruns"

In [None]:
# Load environment variables from the project's .env file. The path is relative
# to the current working directory.
# NOTE(review): the return value of load_dotenv is ignored, so a missing file is
# not reported here; it would only surface later when a path variable is unset.
env_path = 'projects/proj_2_team_4/.env'
load_dotenv(env_path)

# Data load

In [5]:
# Dataset locations come from environment variables; os.getenv returns None for
# any variable that is not set.
train_catboost_path = os.getenv('TRAIN_RAW_MERGED_PATH')
train_preprocessed_path = os.getenv('TRAIN_PREPROCESSED_PATH')
train_preprocessed_log_path = os.getenv('TRAIN_PREPROCESSED_LOG_PATH')

# The "test" paths are read from the VALID_* variables.
test_catboost_path = os.getenv('VALID_RAW_MERGED_PATH')
test_preprocessed_path = os.getenv('VALID_PREPROCESSED_PATH')

In [None]:
# Three variants of the training data: raw/merged (used for CatBoost),
# preprocessed, and preprocessed with a log transform.
df_train_catboost = pd.read_csv(train_catboost_path)
df_train_preprocessed = pd.read_csv(train_preprocessed_path)
df_train_log_preprocessed = pd.read_csv(train_preprocessed_log_path)

# Hold-out data in preprocessed and raw/merged form.
df_test = pd.read_csv(test_preprocessed_path)
df_test_catboost = pd.read_csv(test_catboost_path)


In [7]:
# To avoid splitting the three training variants differently, the same row indices
# are used for all of them.
# Differently sized sets cause issues when reversing the log-transform on SalePrice.
# NOTE(review): this assumes all three DataFrames share the same index (same rows in
# the same order) — confirm against the preprocessing pipeline.

# Step 1: Generate consistent indices for the split (90% train / 10% validation)
train_idx, valid_idx = train_test_split(df_train_preprocessed.index, test_size=0.10, random_state=42)

# Step 2: Use these indices to slice all DataFrames
train_df_catboost = df_train_catboost.loc[train_idx]
valid_df_catboost = df_train_catboost.loc[valid_idx]

train_df = df_train_preprocessed.loc[train_idx]
valid_df = df_train_preprocessed.loc[valid_idx]

train_df_log = df_train_log_preprocessed.loc[train_idx]
valid_df_log = df_train_log_preprocessed.loc[valid_idx]


In [8]:
# --- Training features / target for each data variant ---
X_train_catboost = train_df_catboost.drop(columns=["SalePrice"])
y_train_catboost = train_df_catboost["SalePrice"]

X_train = train_df.drop(columns=["SalePrice"])
y_train = train_df["SalePrice"]

X_train_log = train_df_log.drop(columns=["SalePrice"])
y_train_log = train_df_log["SalePrice"]



# --- Validation features / target for each data variant ---
X_valid_catboost = valid_df_catboost.drop(columns=["SalePrice"])
y_valid_catboost = valid_df_catboost["SalePrice"]

X_valid = valid_df.drop(columns=["SalePrice"])
y_valid = valid_df["SalePrice"]

X_valid_log = valid_df_log.drop(columns=["SalePrice"])
y_valid_log = valid_df_log["SalePrice"]


# --- Hold-out ("test") features ---
X_test_catboost = df_test_catboost.drop(columns=["SalePrice"])
X_test = df_test.drop(columns=["SalePrice"])

# NOTE(review): df_valid_solutions is loaded here but not used anywhere in the
# visible part of the notebook; the test targets below come from df_test itself.
valid_solutions_path = os.getenv("VALID_SOLUTIONS_PATH")
df_valid_solutions = pd.read_csv(valid_solutions_path)
#df_valid_solutions = pd.read_csv("projects/proj_2_team_4/datasets/ValidSolution.csv")
y_test = df_test["SalePrice"]

y_test_catboost = df_test_catboost["SalePrice"]

# Main base class for Tuning

In [9]:
class BaseModelTuner:
    """Common scaffolding for tuning one model family with Optuna and MLflow.

    Subclasses implement ``define_search_space`` (sample hyperparameters from a
    trial) and ``build_model`` (turn a parameter dict into an estimator).

    Args:
        model_name: Used as the MLflow experiment name and the Optuna study name.
        X_train, y_train: Data the model is fitted on.
        X_valid, y_valid: Data the model is scored on; ``y_valid`` must be on
            the original (untransformed) target scale.
        inverse_transform: Callable mapping predictions back to the original
            target scale. Defaults to the identity function.
        run_name: Optional name for the MLflow runs created by ``objective``.
    """

    def __init__(self, model_name, X_train, y_train, X_valid, y_valid, inverse_transform=None, run_name=None):
        self.model_name = model_name
        self.X_train = X_train
        self.y_train = y_train
        self.X_valid = X_valid
        self.y_valid = y_valid
        self.inverse_transform = inverse_transform or (lambda x: x)  # Default: identity function
        self.run_name = run_name

    def define_search_space(self, trial):
        """Return a dict of hyperparameters sampled from ``trial``."""
        raise NotImplementedError

    def build_model(self, params):
        """Return an unfitted estimator configured with ``params``."""
        raise NotImplementedError

    def objective(self, trial=None, params=None):
        """Fit one model, score it and log the result to MLflow.

        Args:
            trial: Optuna trial to sample hyperparameters from. When given, it
                takes precedence over ``params``.
            params: Explicit hyperparameters, used when ``trial`` is None
                (e.g. to retrain a previously found configuration).

        Returns:
            Tuple ``(rmsle, elapsed_seconds)``. ``rmsle`` is ``inf`` when the
            model produced non-finite predictions; nothing is logged then.

        Raises:
            ValueError: If neither ``trial`` nor ``params`` is provided.
        """
        # Fail fast with a clear message instead of an unrelated error raised
        # later from inside build_model.
        if trial is None and params is None:
            raise ValueError("objective() needs either an Optuna trial or a params dict")

        start = time.time()

        # 1) Sample hyperparameters & build the model
        if trial is not None:
            params = self.define_search_space(trial)
        model = self.build_model(params)

        # 2) Fit and get raw predictions
        model.fit(self.X_train, self.y_train)
        y_pred_raw = model.predict(self.X_valid)

        # 3) Clip to [0, inf), then invert the target transformation
        y_pred = np.maximum(0, y_pred_raw)
        y_pred = self.inverse_transform(y_pred)

        # 4) Bail early if any predictions are infinite or NaN. The metric
        #    functions would raise on them; an infinite loss ranks the trial last.
        if not np.all(np.isfinite(y_pred)):
            return float("inf"), time.time() - start

        # 5) Compute metrics against the *raw* y_valid
        rmsle_val = root_mean_squared_log_error(self.y_valid, y_pred)
        r2_val = r2_score(self.y_valid, y_pred)
        training_time = time.time() - start

        # 6) Log to MLflow (the fitted model is stored only for explicit retrains)
        mlflow.set_experiment(self.model_name)
        with mlflow.start_run(run_name=self.run_name, nested=True):
            mlflow.log_params(params)
            mlflow.log_metrics({
                "RMSLE": rmsle_val,
                "R2": r2_val,
                "training_time": training_time
            })
            if trial is None:
                mlflow.sklearn.log_model(model, "model")

        # 7) Return both objectives for the multi-objective study (RMSLE, time)
        return rmsle_val, training_time

    def run_study(self, n_trials=30, timeout=None, show_progress_bar=True):
        """Create a two-objective study (both minimised) and optimise ``objective``.

        The study is stored on ``self.study``.
        """
        self.study = optuna.create_study(
            directions=["minimize", "minimize"],
            study_name=self.model_name
        )
        self.study.optimize(self.objective, n_trials=n_trials, timeout=timeout, show_progress_bar=show_progress_bar)


# Model classes

## Linear Regression

### No regularization

In [10]:
class LinearRegressionTuner(BaseModelTuner):
    """Tuner for an unregularised ``LinearRegression`` model."""

    def define_search_space(self, trial):
        """Sample the only tuned option: whether to fit an intercept."""
        use_intercept = trial.suggest_categorical("fit_intercept", [True, False])
        return {"fit_intercept": use_intercept}

    def build_model(self, params):
        """Create a ``LinearRegression`` from the given parameters."""
        estimator = LinearRegression(**params)
        return estimator

### Ridge

In [11]:
class RidgeRegressionTuner(BaseModelTuner):
    """Tuner for ``Ridge`` regression (L2-regularised linear model)."""

    def define_search_space(self, trial):
        """Sample intercept flag, regularisation strength and solver."""
        return {
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            # Log-uniform sampling; suggest_loguniform is deprecated in Optuna 3+,
            # suggest_float(..., log=True) is its documented replacement.
            "alpha": trial.suggest_float("alpha", 1e-4, 100.0, log=True),
            "solver": trial.suggest_categorical("solver", ["auto", "svd", "cholesky", "lsqr", "saga"]),
        }

    def build_model(self, params):
        """Create a ``Ridge`` estimator from the given parameters."""
        return Ridge(**params)

### Lasso

In [12]:
class LassoRegressionTuner(BaseModelTuner):
    """Tuner for ``Lasso`` regression (L1-regularised linear model)."""

    def define_search_space(self, trial):
        """Sample intercept flag and regularisation strength."""
        return {
            "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
            # Log-uniform sampling; suggest_loguniform is deprecated in Optuna 3+,
            # suggest_float(..., log=True) is its documented replacement.
            "alpha": trial.suggest_float("alpha", 1e-4, 100.0, log=True),
        }

    def build_model(self, params):
        """Create a ``Lasso`` estimator from the given parameters."""
        return Lasso(**params)

## KNeighborsRegressor

In [13]:
class KNTuner(BaseModelTuner):
    """Tuner for ``KNeighborsRegressor``."""

    # Settings that are not tuned. They are applied in build_model as well,
    # because a retrain from ``trial.params`` only receives the values obtained
    # via trial.suggest_* and would otherwise silently drop them.
    _FIXED_PARAMS = {
        "metric": "minkowski",
        "n_jobs": -1,
    }

    def define_search_space(self, trial):
        """Sample neighbourhood size, weighting, search algorithm and distance power."""
        return {
            "n_neighbors": trial.suggest_int("n_neighbors", 5, 15),
            "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
            "algorithm": trial.suggest_categorical("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
            "leaf_size": trial.suggest_int("leaf_size", 20, 40),
            "p": trial.suggest_int("p", 1, 2),  # 1 = Manhattan, 2 = Euclidean
            **self._FIXED_PARAMS,
        }

    def build_model(self, params):
        """Create the regressor; explicit ``params`` override the fixed defaults."""
        return KNeighborsRegressor(**{**self._FIXED_PARAMS, **params})

## GradientBoostingRegressor

In [14]:
class GradientBoostingTuner(BaseModelTuner):
    """Tuner for ``GradientBoostingRegressor`` with early stopping enabled."""

    # Early-stopping settings that are not tuned. They are applied in
    # build_model as well, because a retrain from ``trial.params`` only receives
    # the values obtained via trial.suggest_* — without this the retrained
    # model would run without early stopping and differ from the tuned one.
    _FIXED_PARAMS = {
        "n_iter_no_change": 10,
        "validation_fraction": 0.1,
    }

    def define_search_space(self, trial):
        """Sample ensemble size, learning rate, tree depth and row subsampling."""
        return {
            "n_estimators": trial.suggest_int("n_estimators", 50, 300),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
            "max_depth": trial.suggest_int("max_depth", 3, 13),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            **self._FIXED_PARAMS,
        }

    def build_model(self, params):
        """Create the regressor; explicit ``params`` override the fixed defaults."""
        return GradientBoostingRegressor(**{**self._FIXED_PARAMS, **params})

## RandomForestRegressor

In [15]:
class RandomForestTuner(BaseModelTuner):
    """Tuner for ``RandomForestRegressor``."""

    def define_search_space(self, trial):
        """Sample forest size, tree shape limits, feature sampling and bootstrapping."""
        space = {}
        space["n_estimators"] = trial.suggest_int("n_estimators", 100, 300)
        space["max_depth"] = trial.suggest_int("max_depth", 5, 10)
        space["min_samples_split"] = trial.suggest_int("min_samples_split", 2, 15)
        space["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 1, 8)
        space["max_features"] = trial.suggest_categorical("max_features", ["sqrt", "log2"])
        space["bootstrap"] = trial.suggest_categorical("bootstrap", [True, False])
        return space

    def build_model(self, params):
        """Create the forest from ``params``, passing ``n_jobs=-1`` alongside them."""
        forest = RandomForestRegressor(**params, n_jobs=-1)
        return forest


## Multi-layer Perceptron Regressor (MLPRegressor)

In [16]:
class MLPTuner(BaseModelTuner):
    """Tuner for ``MLPRegressor``."""

    def define_search_space(self, trial):
        """Sample network layout, activation, L2 penalty, schedule and iteration cap."""
        return {
            # NOTE(review): tuple-valued categorical choices work with the default
            # in-memory study but may trigger an Optuna warning about
            # non-primitive choices — confirm before switching to persistent storage.
            "hidden_layer_sizes": trial.suggest_categorical("hidden_layer_sizes", [(50,), (100,), (100,50)]),
            "activation": trial.suggest_categorical("activation", ["relu", "tanh"]),
            # Log-uniform sampling; suggest_loguniform is deprecated in Optuna 3+,
            # suggest_float(..., log=True) is its documented replacement.
            "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
            "learning_rate": trial.suggest_categorical("learning_rate", ["constant", "adaptive"]),
            "max_iter": trial.suggest_int("max_iter", 200, 1000),
        }

    def build_model(self, params):
        """Create an ``MLPRegressor`` from the given parameters."""
        return MLPRegressor(**params)

## XGBoost

In [17]:
class XGBoostTuner(BaseModelTuner):
    """Tuner for ``XGBRegressor``."""

    def define_search_space(self, trial):
        """Sample learning rate, depth, number of trees, row subsampling and tree method."""
        space = {}
        space["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.3)
        space["max_depth"] = trial.suggest_int("max_depth", 3, 8)
        space["n_estimators"] = trial.suggest_int("n_estimators", 100, 1000)
        space["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)
        space["tree_method"] = trial.suggest_categorical("tree_method", ["approx", "hist"])
        return space

    def build_model(self, params):
        """Create an ``XGBRegressor`` from the given parameters."""
        booster = XGBRegressor(**params)
        return booster

## DecisionTreeRegressor

In [18]:
class DecisionTreeTuner(BaseModelTuner):
    """Tuner for ``DecisionTreeRegressor``."""

    def define_search_space(self, trial):
        """Sample depth limit, split/leaf size limits and split criterion."""
        space = {}
        space["max_depth"] = trial.suggest_int("max_depth", 3, 15)
        space["min_samples_split"] = trial.suggest_int("min_samples_split", 2, 12)
        space["min_samples_leaf"] = trial.suggest_int("min_samples_leaf", 1, 8)
        space["criterion"] = trial.suggest_categorical(
            "criterion",
            ["squared_error", "friedman_mse", "absolute_error", "poisson"],
        )
        return space

    def build_model(self, params):
        """Create a ``DecisionTreeRegressor`` from the given parameters."""
        tree = DecisionTreeRegressor(**params)
        return tree


## CatBoostRegressor

In [19]:
class CatBoostTuner(BaseModelTuner):
    """Tuner for ``CatBoostRegressor`` trained on ``log1p`` of the target.

    Overrides ``objective`` because the features are wrapped in CatBoost
    ``Pool`` objects with explicit categorical columns.
    """

    def define_search_space(self, trial):
        """Sample learning rate, tree depth, number of iterations and L2 leaf penalty."""
        return {
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "depth": trial.suggest_int("depth", 4, 10),
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0),
        }

    def build_model(self, params):
        """Create a silent ``CatBoostRegressor`` that optimises RMSE."""
        return CatBoostRegressor(**params, loss_function="RMSE", verbose=False)

    def objective(self, trial=None, params=None):
        """Fit CatBoost on the log-transformed target and score it with RMSLE.

        Args:
            trial: Optuna trial to sample hyperparameters from; takes
                precedence over ``params`` when given.
            params: Explicit hyperparameters, used when ``trial`` is None.

        Returns:
            Tuple ``(rmsle, elapsed_seconds)``; ``rmsle`` is ``inf`` when the
            back-transformed predictions are not finite (nothing is logged then).
        """
        # CatBoost does not have RMSLE loss function. We use RMSE and proper transformation instead.
        y_train_mod = np.log1p(self.y_train)

        cat_feats = self.X_train.select_dtypes(['object', 'category']).columns.tolist()

        # Cast categorical columns to str on *copies*: astype returns a new
        # DataFrame, so the caller's data is no longer modified on every trial.
        str_dtypes = {c: str for c in cat_feats}
        X_train_cb = self.X_train.astype(str_dtypes)
        X_valid_cb = self.X_valid.astype(str_dtypes)

        train_pool = Pool(X_train_cb, y_train_mod, cat_features=cat_feats)
        valid_pool = Pool(X_valid_cb, cat_features=cat_feats)

        # Timing deliberately excludes the Pool construction above.
        start = time.time()

        if trial is not None:
            params = self.define_search_space(trial)
        model = self.build_model(params)

        model.fit(train_pool)
        preds_log = model.predict(valid_pool)
        duration = time.time() - start

        # Clip before inverting, mirroring BaseModelTuner.objective, so the
        # back-transformed predictions are never negative.
        preds = np.expm1(np.maximum(0, preds_log))

        # An overflow in expm1 would make the metric raise and abort the whole
        # study; report an infinite loss instead so the trial simply ranks last.
        if not np.all(np.isfinite(preds)):
            return float("inf"), duration

        score = root_mean_squared_log_error(self.y_valid, preds)

        mlflow.set_experiment(self.model_name)
        with mlflow.start_run(run_name=self.run_name, nested=True):
            mlflow.log_params(model.get_params())
            mlflow.log_metrics({
                "Validation RMSLE": score,
                "training_time": duration
            })

            # The fitted model is stored only for explicit retrains.
            if trial is None:
                mlflow.sklearn.log_model(model, "model")

        return score, duration

# Config for studies

In [20]:
# Timestamp suffix that keeps experiment names from separate notebook runs apart.
experiment_tag = datetime.now().strftime("%d.%m-%H:%M")

# Data bundles shared between models.
# Models fitted on the log-transformed training data; predictions are mapped
# back with expm1 before scoring against the raw targets.
# NOTE(review): these models are validated on X_valid rather than X_valid_log —
# presumably the feature columns are identical in both variants; confirm.
_log_target_data = {
    "X_train": X_train_log,
    "y_train": y_train_log,
    "X_valid": X_valid,
    "y_valid": y_valid,
    "X_test": X_test,
    "y_test": y_test,
    "inverse_transform": np.expm1,
}

# Models fitted directly on the preprocessed data (no target transform).
_raw_target_data = {
    "X_train": X_train,
    "y_train": y_train,
    "X_valid": X_valid,
    "y_valid": y_valid,
    "X_test": X_test,
    "y_test": y_test,
    "inverse_transform": None,
}

# CatBoost uses the raw/merged data variant.
_catboost_data = {
    "X_train": X_train_catboost,
    "y_train": y_train_catboost,
    "X_valid": X_valid_catboost,
    "y_valid": y_valid_catboost,
    "X_test": X_test_catboost,
    "y_test": y_test_catboost,
    "inverse_transform": None,
}

# One row per model: (tuner class, experiment label, data bundle, n_trials, timeout)
_model_specs = [
    (KNTuner, "KNeighbours", _log_target_data, 30, 800),
    (LinearRegressionTuner, "LinearRegression", _log_target_data, 2, 400),
    (RidgeRegressionTuner, "RidgeRegression", _log_target_data, 20, 450),
    (LassoRegressionTuner, "LassoRegression", _log_target_data, 10, 450),
    (DecisionTreeTuner, "DecisionTree", _raw_target_data, 150, 800),
    (GradientBoostingTuner, "GradientBoosting", _raw_target_data, 100, 800),
    (RandomForestTuner, "RandomForest", _raw_target_data, 100, 800),
    (MLPTuner, "MLP", _log_target_data, 100, 800),
    (XGBoostTuner, "XGBoost", _raw_target_data, 200, 1100),
    (CatBoostTuner, "CatBoost", _catboost_data, 200, 1100),
]

model_configs = [
    {
        "model_cls": tuner_cls,
        "experiment_name": f"{label}_{experiment_tag}",
        **data,
        "n_trials": n_trials,
        "timeout": timeout,
        # Shared params
        "show_progress_bar": False,
        "run_name": None,
    }
    for tuner_cls, label, data, n_trials, timeout in _model_specs
]

# Studies

In [None]:
# Tuner classes to run in this session; remove an entry to skip that model.
not_excluded = {
   KNTuner,
   LinearRegressionTuner,
   RidgeRegressionTuner,
   LassoRegressionTuner,
   DecisionTreeTuner,
   GradientBoostingTuner,
   RandomForestTuner,
   MLPTuner,
   XGBoostTuner,
   CatBoostTuner
}

# Keep only the configurations whose tuner class is enabled above.
model_configs = [
   cfg
   for cfg in model_configs
   if cfg["model_cls"] in not_excluded
]

# Set Optuna logging level to CRITICAL to suppress per-trial info and warnings
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

# Maps tuner.model_name -> finished Optuna study (filled in after the parallel run).
studies = {}

def run_study(config):
    """Tune one model on the validation set and export its Pareto-front plot.

    Args:
        config: One entry of ``model_configs``.

    Returns:
        ``(model_name, study)`` on success, or ``None`` if anything raised
        (the error is printed rather than propagated).
    """
    try:
        tuner_cls = config["model_cls"]
        study_label = "parallel_validation_set_" + config["experiment_name"]
        tuner = tuner_cls(
            study_label,
            config["X_train"],
            config["y_train"],
            config["X_valid"],
            config["y_valid"],
            inverse_transform=config.get("inverse_transform"),
        )

        started_at = datetime.now().strftime('%d.%m-%H:%M')
        print(f"Running study for validation_set {tuner_cls.__name__}. Time is {started_at}")

        tuner.run_study(
            n_trials=config["n_trials"],
            timeout=config["timeout"],
            show_progress_bar=config["show_progress_bar"],
        )

        # RMSLE vs. training time trade-off for this model.
        pareto_fig = vis.plot_pareto_front(tuner.study, target_names=["RMSLE", "Training Time"])
        pareto_fig.update_layout(title=f"{tuner.model_name} Pareto Front: RMSLE vs Training Time")
        pareto_fig.show()

        # Save the figure next to the notebook's working directory.
        output_stem = f"{tuner.model_name}_pareto_front"
        pareto_fig.write_html(f"{output_stem}.html")
        pareto_fig.write_image(f"{output_stem}.png")

        return tuner.model_name, tuner.study

    except Exception as e:
        # One failing model must not abort the other studies.
        print(f"Error with {config['model_cls'].__name__}: {str(e)}")
        return None
    
# Run one study per model configuration in parallel with joblib (all cores).
results = Parallel(n_jobs=-1)(delayed(run_study)(config) for config in model_configs)

# Failed studies come back as None; register only the (name, study) pairs.
studies.update(outcome for outcome in results if outcome)


# Testing best models in RMSLE, training time, and combined

In [None]:
def normalize(lst):
    """Min-max scale a sequence of numbers into ``[0, 1]``.

    Args:
        lst: Sequence (list or NumPy array) of numbers.

    Returns:
        A list with one scaled value per input element. An empty input yields
        an empty list, and a constant input yields all zeros.
    """
    # len() rather than truthiness so NumPy arrays are handled as well.
    if len(lst) == 0:
        return []
    min_val, max_val = min(lst), max(lst)
    if max_val == min_val:
        # Avoid dividing by zero when every value is the same.
        return [0.0 for _ in lst]
    span = max_val - min_val
    return [(x - min_val) / span for x in lst]

# Store all retrained models and their metrics for the final pareto plot
all_models = []
all_metrics = []
all_params = []  # Hyperparameters of each retrained model (used for hover text)

for config in model_configs:
    # Studies are registered under the tuner's model_name, which run_study()
    # builds as "parallel_validation_set_<experiment_name>"; the lookup key
    # must use the same prefix or no study is ever found.
    model_name = "parallel_validation_set_" + config["experiment_name"]
    study = studies.get(model_name)
    if study is None:
        print(f"No study found for {model_name}")
        continue

    # Skip trials without values and trials that reported a non-finite
    # objective (the tuners return inf for unusable predictions). Such trials
    # tend to finish quickly and would otherwise be picked as "best_time".
    trials = [
        t for t in study.trials
        if t.values is not None and np.all(np.isfinite(t.values))
    ]
    if not trials:
        print(f"No valid trials for {model_name}")
        continue

    rmsle_vals = np.array([t.values[0] for t in trials])
    time_vals = np.array([t.values[1] for t in trials])
    rmsle_norm = normalize(rmsle_vals)
    time_norm = normalize(time_vals)
    # Equal-weight blend of the two normalised objectives.
    combined = np.array([0.5 * r + 0.5 * t for r, t in zip(rmsle_norm, time_norm)])

    # Retrain the top 1% of trials per category, but at least two.
    n_select = max(2, int(np.ceil(0.01 * len(trials))))

    idx_rmsle = np.argsort(rmsle_vals)
    idx_time = np.argsort(time_vals)
    idx_combined = np.argsort(combined)

    selected = set()

    def pick_unique(indices):
        """Take up to n_select indices not already claimed by an earlier category."""
        chosen = []
        for idx in indices:
            if idx not in selected:
                chosen.append(idx)
                selected.add(idx)
            if len(chosen) == n_select:
                break
        return chosen

    # Order matters: earlier categories get first pick of shared trials.
    best_rmsle = pick_unique(idx_rmsle)
    best_time = pick_unique(idx_time)
    best_combined = pick_unique(idx_combined)

    # For each selected trial, retrain on the training data, score on the
    # test set and log to MLflow
    for category, indices in zip(
        ["best_rmsle", "best_time", "best_combined"],
        [best_rmsle, best_time, best_combined]
    ):
        for idx in indices:
            trial = trials[idx]
            params = trial.params

            # Instantiate the correct tuner, with the test set in the validation slot
            try:
                tuner = config["model_cls"](
                    "test_set_results"+"_"+experiment_tag,
                    config["X_train"],
                    config["y_train"],
                    config["X_test"],
                    config["y_test"],
                    inverse_transform=config.get("inverse_transform"),
                    run_name=f"{config['experiment_name']}_{category}_{idx}"
                )

                print(f"Running study for test_set {category, config['model_cls'].__name__}. Time is {datetime.now().strftime('%d.%m-%H:%M')}")

                # Train model and get metrics
                rmsle, train_time = tuner.objective(trial=None, params=params)

                # Save model and metrics for pareto plot
                all_models.append(f"{config['experiment_name']}_{category}_{idx}")
                all_metrics.append((rmsle, train_time))
                all_params.append(params)  # Store the parameters

            except Exception as e:
                # One failing retrain must not abort the remaining ones.
                print(f"Error with {config['model_cls'].__name__}: {str(e)}")
                continue

# Build, save and log one scatter plot covering every retrained model
if all_metrics:
    rmsle_vals, time_vals = zip(*all_metrics)

    # Hover text: model label followed by one "name: value" line per hyperparameter
    hover_texts = [
        f"{model_name}<br>Parameters:<br>" + "<br>".join(f"{k}: {v}" for k, v in params.items())
        for model_name, params in zip(all_models, all_params)
    ]

    fig = go.Figure()
    fig.add_scatter(
        x=rmsle_vals,
        y=time_vals,
        mode='markers+text',
        text=all_models,
        textposition="top center",
        hovertext=hover_texts,
        hoverinfo='text',
        marker={"size": 10},
    )
    fig.update_layout(
        title="Pareto Front: All Models RMSLE vs Training Time",
        xaxis_title="RMSLE",
        yaxis_title="Training Time (s)",
    )

    # Save the plot and attach it to MLflow.
    # NOTE(review): no run is opened in this cell, so the artifact presumably
    # lands in whichever run MLflow treats as active — confirm this is intended.
    fig.write_html("pareto_front_all_models.html")
    mlflow.log_artifact("pareto_front_all_models.html")