# Model tuning

In [None]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

from lightgbm import LGBMRegressor

import utils

  from .autonotebook import tqdm as notebook_tqdm


#### Loading training data

The training dataset is loaded using [pandas](https://pandas.pydata.org/). After separating the predictors (X) from the variable to be estimated (y) the dataset is split into a dataset for training and testing the point estimator as well as a calibration dataset to conformalise the qunatile regressors used to estimate predictive uncertainty. While the training/testing split is done with a 80/20 ratio the testing/calibration split is conducted using a 50/50 ratio. The splitting is facilitated using the train_test_split functionality of [https://scikit-learn.org](scikit-learn). The coordinates were also detached from the data since they shall not be used as predictors here.

In [2]:
# Preparing the data

training_data = pd.read_csv("./data/train.csv")
variables = list(training_data.columns)
y = training_data["rh98"]
X = training_data.drop(columns=["rh98"])

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_calib, y_test, y_calib = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [4]:
train_coords = X_train[["lat", "lon"]]
X_train.drop(columns=["lat", "lon"], inplace=True)

test_coords = X_test[["lat", "lon"]]
X_test.drop(columns=["lat", "lon"], inplace=True)

calib_coords = X_calib[["lat", "lon"]]
X_calib.drop(columns=["lat", "lon"], inplace=True)

#### Model tuning and fitting
Before fiting the model, in this case a tree-base ensemble approach namely [Light Gradient Boosting](https://lightgbm.readthedocs.io/en/stable/), its hyperparameters are tuned on the training dataset using thr optimisation framework [Optuna](https://optuna.readthedocs.io/en/stable/). This is done by defining an objective function alongside the parameter space. The Roost Mean Squared Error is used as the score function which has to be minimize. The tuhing ran for 300 epochs. Optuna also provides a dashboard to reviev tuning performance and assess the importance of individual hyperparameters. In can be started using the following command: optuna-dashboard sqlite:///tuning.sqlite3. I can be accessed through: [http://127.0.0.1:8080/dashboard/](http://127.0.0.1:8080/dashboard/). The tuning results are stored in a sqlite database for later use or extension. The RSME was used as the loss function which had be minimized here.

In [5]:
try:
    def objective(trial):
        n_estimators = trial.suggest_int("n_estimators", 10, 1000)
        learning_rate = trial.suggest_float("learning_rate", 0.001, 0.5)
        max_depth = trial.suggest_int("max_depth", 3, 50)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
        subsample = trial.suggest_float("subsample", 0.1, 1.0)
        reg_alpha = trial.suggest_float("reg_alpha", 0.0001, 100, log=True)
        reg_lambda = trial.suggest_float("reg_lambda", 0.0001, 100, log=True)

        model = LGBMRegressor(objective='regression',
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    max_depth=max_depth,
                                    colsample_bytree=colsample_bytree,
                                    subsample=subsample,
                                    boosting_type='gbdt',
                                    reg_alpha=reg_alpha,
                                    reg_lambda=reg_lambda,
                                    random_state=42,
                                    n_jobs=14,
                                    verbosity=-1)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = root_mean_squared_error(y_test, y_pred)
        
        return score

    study = optuna.create_study(study_name=f"OGH_2025_Canopy_Prediction_LightGradientBoosting", direction="minimize", storage="sqlite:///tuning.sqlite3", load_if_exists=False)
    study.optimize(objective, n_trials=300)
    print(f"Best parameters: {study.best_params}")
except:
    print("Model has already been tuned. You may review this process under: http://127.0.0.1:8080/dashboard/")

Model has already been tuned. You may review this process under: http://127.0.0.1:8080/dashboard/


#### Fitting the LGBM regressor using the tuned hyperparameters
The tuned regressor is fitted to the training dataset using the best performing hyperparameters from the study stored in the sqlite database. The trained model is subsequently pickled for later use.

In [6]:
study_lgb = optuna.load_study(study_name=f"OGH_2025_Canopy_Prediction_LightGradientBoosting", storage="sqlite:///tuning.sqlite3")
lgb = LGBMRegressor(**study_lgb.best_params, random_state=42)
lgb.fit(X_train, y_train)
with open(f'./lightGradientBoosting.pickle', 'wb') as handle:
        pickle.dump(lgb, handle, protocol=pickle.HIGHEST_PROTOCOL)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6750
[LightGBM] [Info] Number of data points in the train set: 31628, number of used features: 64
[LightGBM] [Info] Start training from score 1681.126281


#### Tuning upper and lower quantile 
Point estimates alongside the performance indicators of the underlying estimator might be lacking utility and interpretability in certain use cases. Infromaton regading the uncertainly and reliability of the point estimates might improve on both aspects. Since the predictive uncertainty shall also be quantified two qunatile regressors are fitted to the data corresponding to the 0.025 and 0.975 quantile respectively. This was also done using optuna. The qunatile regressors are also fitted using the tuned hyperparameters and subsequently pickled for later use. While quantile regression is a straighforward way to quantify uncertainties it sure has its pitfalls as shown later.

In [7]:
confidence = 0.95
miscoverage = 1-confidence
alpha_low = round(miscoverage/2, 3)
alpha_upp = round(1-miscoverage/2, 3)

In [8]:

def objective(trial, alpha):
    n_estimators = trial.suggest_int("n_estimators", 10, 1000)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.5)
    max_depth = trial.suggest_int("max_depth", 3, 50)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    reg_alpha = trial.suggest_float("reg_alpha", 0.0001, 100, log=True)
    reg_lambda = trial.suggest_float("reg_lambda", 0.0001, 100, log=True)
    
    model = LGBMRegressor(objective='quantile',
                          alpha=alpha,
                        n_estimators=n_estimators,
                        learning_rate=learning_rate,
                        max_depth=max_depth,
                        colsample_bytree=colsample_bytree,
                        subsample=subsample,
                        boosting_type='gbdt',
                        reg_alpha=reg_alpha,
                        reg_lambda=reg_lambda,
                        verbose=-1,
                        random_state=42,
                        n_jobs=12)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_train) 
    score = root_mean_squared_error(y_train, y_pred)
    
    return score

try:
  study_low = optuna.create_study(study_name=f"OGH_2025_Canopy_Prediction_{alpha_low}_QuantileRegressor", direction="minimize", storage="sqlite:///tuning.sqlite3", load_if_exists=False)
  study_low.optimize(lambda trial: objective(trial, alpha=alpha_low), n_trials=300)
except:
    print("Model has already been tuned. You may review this process under: http://127.0.0.1:8080/dashboard/")

try:
  study_upp = optuna.create_study(study_name=f"OGH_2025_Canopy_Prediction_{alpha_upp}_QuantileRegressor", direction="minimize", storage="sqlite:///tuning.sqlite3", load_if_exists=False)
  study_upp.optimize(lambda trial: objective(trial, alpha=alpha_upp), n_trials=300)
except:
    print("Model has already been tuned. You may review this process under: http://127.0.0.1:8080/dashboard/")

Model has already been tuned. You may review this process under: http://127.0.0.1:8080/dashboard/
Model has already been tuned. You may review this process under: http://127.0.0.1:8080/dashboard/


In [9]:
study_low = optuna.load_study(study_name=f"OGH_2025_Canopy_Prediction_{alpha_low}_QuantileRegressor", storage="sqlite:///tuning.sqlite3")
study_low = LGBMRegressor(**study_low.best_params, random_state=42)
study_low.fit(X_train, y_train)
with open(f'./{alpha_low}_quantileRegressor.pickle', 'wb') as handle:
        pickle.dump(study_low, handle, protocol=pickle.HIGHEST_PROTOCOL)

study_upp = optuna.load_study(study_name=f"OGH_2025_Canopy_Prediction_{alpha_upp}_QuantileRegressor", storage="sqlite:///tuning.sqlite3")
study_upp = LGBMRegressor(**study_upp.best_params, random_state=42)
study_upp.fit(X_train, y_train)
with open(f'./{alpha_upp}_quantileRegressor.pickle', 'wb') as handle:
        pickle.dump(study_upp, handle, protocol=pickle.HIGHEST_PROTOCOL)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002819 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6750
[LightGBM] [Info] Number of data points in the train set: 31628, number of used features: 64
[LightGBM] [Info] Start training from score 1681.126281
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003615 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6750
[LightGBM] [Info] Number of data points in the train set: 31628, number of used features: 64
[LightGBM] [Info] Start training from score 1681.126281
