In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor  # Replacing CatBoost
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import optuna
from sklearn.preprocessing import StandardScaler

# ✅ Load Processed Dataset
df = pd.read_csv("../data/processed/UHI_Weather_Building_Sentinel_LST_Featured_Cleaned.csv")

# ✅ Define Features & Target
X = df.drop(columns=["uhi_index"])
y = df["uhi_index"]

# ✅ Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "../models/scaler.pkl")

# ✅ Define Objective Function for Hyperparameter Tuning
def objective(trial):
    model_type = trial.suggest_categorical("model_type", ["xgb", "lgb", "hgb"])  # Replacing catboost with HGB
    
    if model_type == "xgb":
        model = XGBRegressor(
            n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=100),
            learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            random_state=42
        )
    elif model_type == "lgb":
        model = LGBMRegressor(
            n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=100),
            learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            random_state=42
        )
    else:
        model = HistGradientBoostingRegressor(
            max_iter=trial.suggest_int("max_iter", 100, 1000, step=100),
            learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            random_state=42
        )
    
    scores = cross_val_score(model, X_train_scaled, y_train, scoring='r2', cv=5)
    return np.mean(scores)

# ✅ Hyperparameter Tuning with Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
best_params = study.best_params

# ✅ Train Best Model
if best_params['model_type'] == 'xgb':
    final_model = XGBRegressor(**{k: v for k, v in best_params.items() if k != 'model_type'})
elif best_params['model_type'] == 'lgb':
    final_model = LGBMRegressor(**{k: v for k, v in best_params.items() if k != 'model_type'})
else:
    final_model = HistGradientBoostingRegressor(**{k: v for k, v in best_params.items() if k != 'model_type'})

final_model.fit(X_train_scaled, y_train)

# ✅ Evaluate Model
y_pred = final_model.predict(X_test_scaled)
print(f"🎯 R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"📉 MAE: {mean_absolute_error(y_test, y_pred):.4f}")

# ✅ Save Final Model
joblib.dump(final_model, "../models/UHI_best_model.pkl")
print("✅ Optimized Model Saved Successfully!")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-02-26 19:21:41,236] A new study created in memory with name: no-name-aeafb28d-c48a-4e42-b38b-874825189ada
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
[I 2025-02-26 19:21:45,935] Trial 0 finished with value: 0.9090911407493213 and parameters: {'model_type': 'xgb', 'n_estimators': 800, 'learning_rate': 0.2038185918008321, 'max_depth': 3}. Best is trial 0 with value: 0.9090911407493213.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
[I 2025-02-26 19:22:05,546] Trial 1 finished with value: 0.9398858950411839 and parameters: {'model_type': 'xgb', 'n_estimators': 500, 'learning_rate': 0.028859983323604996, 'max_depth': 10}. Best is trial 1 with value: 0.9398858950411839.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
[I 2025-02-26 19:22:47,257] Trial 2 finished with value: 0.9008772237428992 and parameters: {'model_type': 'hgb', 'max_iter': 1000, 'learning_rate': 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score 0.000568




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000424 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score -0.006741




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000425 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3328
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score -0.005734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000475 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3328
[LightGBM] [Info] Number of data points in the train set: 7187, number of used features: 22
[LightGBM] [Info] Start training from score 0.003715








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000637 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7187, number of used features: 22
[LightGBM] [Info] Start training from score -0.000827


[I 2025-02-26 19:23:27,147] Trial 5 finished with value: 0.9305207326759234 and parameters: {'model_type': 'lgb', 'n_estimators': 900, 'learning_rate': 0.09489750887162716, 'max_depth': 7}. Best is trial 1 with value: 0.9398858950411839.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score 0.000568


  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),






[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score -0.006741
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3328
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score -0.005734








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3328
[LightGBM] [Info] Number of data points in the train set: 7187, number of used features: 22
[LightGBM] [Info] Start training from score 0.003715
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7187, number of used features: 22
[LightGBM] [Info] Start training from score -0.000827






[I 2025-02-26 19:23:30,720] Trial 6 finished with value: 0.7979203736271787 and parameters: {'model_type': 'lgb', 'n_estimators': 600, 'learning_rate': 0.03721211093595231, 'max_depth': 3}. Best is trial 1 with value: 0.9398858950411839.




  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score 0.000568
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score -0.006741








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000476 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3328
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score -0.005734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3328
[LightGBM] [Info] Number of data points in the train set: 7187, number of used features: 22
[LightGBM] [Info] Start training from score 0.003715








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000534 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7187, number of used features: 22
[LightGBM] [Info] Start training from score -0.000827


[I 2025-02-26 19:23:35,078] Trial 7 finished with value: 0.9008048843460523 and parameters: {'model_type': 'lgb', 'n_estimators': 200, 'learning_rate': 0.07938241950455747, 'max_depth': 8}. Best is trial 1 with value: 0.9398858950411839.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000482 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score 0.000568


  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score -0.006741








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3328
[LightGBM] [Info] Number of data points in the train set: 7186, number of used features: 22
[LightGBM] [Info] Start training from score -0.005734
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000475 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3328
[LightGBM] [Info] Number of data points in the train set: 7187, number of used features: 22
[LightGBM] [Info] Start training from score 0.003715




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000448 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3329
[LightGBM] [Info] Number of data points in the train set: 7187, number of used features: 22
[LightGBM] [Info] Start training from score -0.000827






[I 2025-02-26 19:23:41,034] Trial 8 finished with value: 0.8667670367485959 and parameters: {'model_type': 'lgb', 'n_estimators': 700, 'learning_rate': 0.03916862955707373, 'max_depth': 4}. Best is trial 1 with value: 0.9398858950411839.




  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
[I 2025-02-26 19:23:43,587] Trial 9 finished with value: 0.8754737868875363 and parameters: {'model_type': 'xgb', 'n_estimators': 200, 'learning_rate': 0.03571467872378473, 'max_depth': 6}. Best is trial 1 with value: 0.9398858950411839.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
[I 2025-02-26 19:23:51,940] Trial 10 finished with value: 0.9376045670697302 and parameters: {'model_type': 'xgb', 'n_estimators': 400, 'learning_rate': 0.2807707605474118, 'max_depth': 10}. Best is trial 1 with value: 0.9398858950411839.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
[I 2025-02-26 19:24:00,495] Trial 11 finished with value: 0.9373469684892249 and parameters: {'model_type': 'xgb', 'n_estimators': 400, 'learning_rate': 0.28053403557511575, 'max_depth': 10}. Best is trial 1 with value: 0.9398858950411839.
  learning_rate=trial.suggest_loguniform("learning_rate", 0.01, 0.3),
[I 

🎯 R² Score: 0.9481
📉 MAE: 0.1485
✅ Optimized Model Saved Successfully!
