import statements

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import numpy as np
from datetime import datetime
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from dotenv import load_dotenv
import joblib  # Added for saving models
load_dotenv()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.config import TRANSFORMED_DATA_DIR, MODELS_DIR  # Added MODELS_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow

Data Loading and test train split

In [2]:
# Load the tabular data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df.head(5)

# Split the data into training and testing sets
# Training period: January 29, 2024, to December 31, 2024
# Test period: January 1, 2025, to January 31, 2025
X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2025, 1, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Select lagged ride count features
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_lags = X_train[past_ride_columns]
X_test_lags = X_test[past_ride_columns]

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,start_station_name,target
0,4,9,3,0,2,0,0,0,1,2,...,4,6,8,10,7,2,0,2024-01-29 00:00:00,8 Ave & W 31 St,4
1,9,3,0,2,0,0,0,1,2,3,...,6,8,10,7,2,0,4,2024-01-29 01:00:00,8 Ave & W 31 St,0
2,3,0,2,0,0,0,1,2,3,3,...,8,10,7,2,0,4,0,2024-01-29 02:00:00,8 Ave & W 31 St,1
3,0,2,0,0,0,1,2,3,3,8,...,10,7,2,0,4,0,1,2024-01-29 03:00:00,8 Ave & W 31 St,1
4,2,0,0,0,1,2,3,3,8,5,...,7,2,0,4,0,1,1,2024-01-29 04:00:00,8 Ave & W 31 St,1


(24336, 674)
(24336,)
(2232, 674)
(2232,)


Average function definition

In [3]:
# Model 1: Baseline Model (Naive Lag - Previous Hour)
class BaselineModelPreviousHour:
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        pass

    def predict(self, X_test: pd.DataFrame) -> np.array:
        return X_test["rides_t-1"]

model = BaselineModelPreviousHour()
predictions = model.predict(X_test_lags)
test_mae = mean_absolute_error(y_test, predictions)
print(f"BaselineModelPreviousHour MAE: {test_mae:.4f}")
mlflow = set_mlflow_tracking()
log_model_to_mlflow(model, X_test_lags, "BaselineModelPreviousHour", "mean_absolute_error", score=test_mae)

# Save the model to the models folder
model_path = MODELS_DIR / "baseline_previous_hour.pkl"
joblib.dump(model, model_path)
print(f"BaselineModelPreviousHour saved to: {model_path}")

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


BaselineModelPreviousHour MAE: 4.1080


INFO:src.experiment_utils:Experiment set to: BaselineModelPreviousHour
INFO:src.experiment_utils:Logged mean_absolute_error: 4.107974910394265
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/09 22:27:01 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'BaselineModelPreviousHour' already exists. Creating a new version of this model...
2025/05/09 22:27:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: BaselineModelPreviousHour, version 5
Created version '5' of model 'BaselineModelPreviousHour'.
INFO:src.experiment_utils:Model logged with name: BaselineModelPreviousHour


🏃 View run spiffy-bird-588 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/0/runs/aecb6e0c85884275aa8d0e57e7769ca7
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/0


<mlflow.models.model.ModelInfo at 0x27e49aab080>

['C:\\Users\\singh\\Downloads\\CDS500_Applied_ML_DS\\Projects\\CDA500Final\\models\\baseline_previous_hour.pkl']

BaselineModelPreviousHour saved to: C:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\models\baseline_previous_hour.pkl


Custom scikit-learn based class that extracts the temporal features from the dataset

In [4]:
# Model 2: LightGBM with All Lag Features for Past 28 Days
# Select the last 28 days of lag features (rides_t-1 to rides_t-672, where 672 = 28*24)
lag_columns_28_days = [f"rides_t-{i}" for i in range(1, 28*24 + 1)]
lag_columns_28_days = [col for col in lag_columns_28_days if col in X_train_lags.columns]
X_train_28_days = X_train_lags[lag_columns_28_days]
X_test_28_days = X_test_lags[lag_columns_28_days]

model = lgb.LGBMRegressor()
model.fit(X_train_28_days, y_train)
predictions = model.predict(X_test_28_days)
test_mae = mean_absolute_error(y_test, predictions)
print(f"LightGBM_28DaysLags MAE: {test_mae:.4f}")
log_model_to_mlflow(model, X_test_28_days, "LightGBM_28DaysLags", "mean_absolute_error", score=test_mae)

# Save the model to the models folder
model_path = MODELS_DIR / "lightgbm_28days_lags.pkl"
joblib.dump(model, model_path)
print(f"LightGBM_28DaysLags saved to: {model_path}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65856
[LightGBM] [Info] Number of data points in the train set: 24336, number of used features: 672
[LightGBM] [Info] Start training from score 17.580950


INFO:src.experiment_utils:Experiment set to: LightGBM_28DaysLags


LightGBM_28DaysLags MAE: 3.0292


INFO:src.experiment_utils:Logged mean_absolute_error: 3.0291538263036446
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/09 22:27:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 9
Created version '9' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run fearless-foal-992 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/7/runs/cb06bf232b64435ab6489c8331f55172
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/7


<mlflow.models.model.ModelInfo at 0x27e46ff4fe0>

['C:\\Users\\singh\\Downloads\\CDS500_Applied_ML_DS\\Projects\\CDA500Final\\models\\lightgbm_28days_lags.pkl']

LightGBM_28DaysLags saved to: C:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\models\lightgbm_28days_lags.pkl


Pipeline

In [5]:
# Model 3: LightGBM with Feature Reduction (Top 10 Features using Feature Importance)
selector_model = lgb.LGBMRegressor()
selector_model.fit(X_train_lags, y_train)

selector = SelectKBest(score_func=f_regression, k=10)
X_train_reduced = selector.fit_transform(X_train_lags, y_train)
X_test_reduced = selector.transform(X_test_lags)

selected_features = X_train_lags.columns[selector.get_support()].tolist()
print(f"Selected Features: {selected_features}")

model = lgb.LGBMRegressor()
model.fit(X_train_reduced, y_train)
predictions = model.predict(X_test_reduced)
test_mae = mean_absolute_error(y_test, predictions)
print(f"LightGBM_Top10Features MAE: {test_mae:.4f}")
log_model_to_mlflow(model, X_test_reduced, "LightGBM_Top10Features", "mean_absolute_error", score=test_mae)

# Save the model to the models folder
model_path = MODELS_DIR / "lightgbm_top10_features.pkl"
joblib.dump(model, model_path)
print(f"LightGBM_Top10Features saved to: {model_path}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 65856
[LightGBM] [Info] Number of data points in the train set: 24336, number of used features: 672
[LightGBM] [Info] Start training from score 17.580950


Selected Features: ['rides_t-672', 'rides_t-504', 'rides_t-360', 'rides_t-336', 'rides_t-312', 'rides_t-192', 'rides_t-168', 'rides_t-144', 'rides_t-24', 'rides_t-1']
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 980
[LightGBM] [Info] Number of data points in the train set: 24336, number of used features: 10
[LightGBM] [Info] Start training from score 17.580950


INFO:src.experiment_utils:Experiment set to: LightGBM_Top10Features


LightGBM_Top10Features MAE: 3.1664


INFO:src.experiment_utils:Logged mean_absolute_error: 3.1663847563577625
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/09 22:28:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 10
Created version '10' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run masked-cow-382 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/8/runs/527bcce8834f4ae6bb003e71036ea5d3
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/8


<mlflow.models.model.ModelInfo at 0x27e49aab080>

['C:\\Users\\singh\\Downloads\\CDS500_Applied_ML_DS\\Projects\\CDA500Final\\models\\lightgbm_top10_features.pkl']

LightGBM_Top10Features saved to: C:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\models\lightgbm_top10_features.pkl


Running LGM with Hyper Parameters

In [6]:
# Model 4: Gradient Boosting with Temporal Features
X_train_temp = X_train.copy()
X_test_temp = X_test.copy()
X_train_temp["hour"] = X_train_temp["pickup_hour"].dt.hour
X_train_temp["day_of_week"] = X_train_temp["pickup_hour"].dt.dayofweek
X_train_temp["month"] = X_train_temp["pickup_hour"].dt.month
X_test_temp["hour"] = X_test_temp["pickup_hour"].dt.hour
X_test_temp["day_of_week"] = X_test_temp["pickup_hour"].dt.dayofweek
X_test_temp["month"] = X_test_temp["pickup_hour"].dt.month

features_to_use = past_ride_columns + ["hour", "day_of_week", "month"]
X_train_temp = X_train_temp[features_to_use]
X_test_temp = X_test_temp[features_to_use]

model = GradientBoostingRegressor(n_estimators=200, max_depth=5, random_state=42)
model.fit(X_train_temp, y_train)
predictions = model.predict(X_test_temp)
test_mae = mean_absolute_error(y_test, predictions)
print(f"GradientBoosting_TemporalFeatures MAE: {test_mae:.4f}")
log_model_to_mlflow(model, X_test_temp, "GradientBoosting_TemporalFeatures", "mean_absolute_error", score=test_mae)

# Save the model to the models folder
model_path = MODELS_DIR / "gradient_boosting_temporal_features.pkl"
joblib.dump(model, model_path)
print(f"GradientBoosting_TemporalFeatures saved to: {model_path}")

GradientBoosting_TemporalFeatures MAE: 3.0904


INFO:src.experiment_utils:Experiment set to: GradientBoosting_TemporalFeatures
INFO:src.experiment_utils:Logged mean_absolute_error: 3.090369498386376
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'GradientBoostingRegressor' already exists. Creating a new version of this model...
2025/05/09 22:44:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: GradientBoostingRegressor, version 2
Created version '2' of model 'GradientBoostingRegressor'.
INFO:src.experiment_utils:Model logged with name: GradientBoostingRegressor


🏃 View run abundant-doe-298 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/9/runs/0335161b548d4483ae5135b0f4d24c92
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/9


<mlflow.models.model.ModelInfo at 0x27e3be29cd0>

['C:\\Users\\singh\\Downloads\\CDS500_Applied_ML_DS\\Projects\\CDA500Final\\models\\gradient_boosting_temporal_features.pkl']

GradientBoosting_TemporalFeatures saved to: C:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\models\gradient_boosting_temporal_features.pkl


Logging model and results in MLFlow

In [7]:
# Model 5: LightGBM with Broader Lagged Features, Cyclic Temporal Features, Interactions, and Tuning
lag_columns_7_days = [f"rides_t-{i}" for i in range(1, 7*24 + 1)]
lag_columns_7_days = [col for col in lag_columns_7_days if col in X_train_lags.columns]
X_train_7_days = X_train_lags[lag_columns_7_days]
X_test_7_days = X_test_lags[lag_columns_7_days]

X_train_enhanced = X_train_7_days.copy()
X_test_enhanced = X_test_7_days.copy()

X_train_enhanced["hour"] = X_train["pickup_hour"].dt.hour
X_train_enhanced["day_of_week"] = X_train["pickup_hour"].dt.dayofweek
X_train_enhanced["month"] = X_train["pickup_hour"].dt.month
X_test_enhanced["hour"] = X_test["pickup_hour"].dt.hour
X_test_enhanced["day_of_week"] = X_test["pickup_hour"].dt.dayofweek
X_test_enhanced["month"] = X_test["pickup_hour"].dt.month

X_train_enhanced["hour_sin"] = np.sin(2 * np.pi * X_train_enhanced["hour"] / 24)
X_train_enhanced["hour_cos"] = np.cos(2 * np.pi * X_train_enhanced["hour"] / 24)
X_train_enhanced["day_of_week_sin"] = np.sin(2 * np.pi * X_train_enhanced["day_of_week"] / 7)
X_train_enhanced["day_of_week_cos"] = np.cos(2 * np.pi * X_train_enhanced["day_of_week"] / 7)
X_train_enhanced["month_sin"] = np.sin(2 * np.pi * X_train_enhanced["month"] / 12)
X_train_enhanced["month_cos"] = np.cos(2 * np.pi * X_train_enhanced["month"] / 12)
X_test_enhanced["hour_sin"] = np.sin(2 * np.pi * X_test_enhanced["hour"] / 24)
X_test_enhanced["hour_cos"] = np.cos(2 * np.pi * X_test_enhanced["hour"] / 24)
X_test_enhanced["day_of_week_sin"] = np.sin(2 * np.pi * X_test_enhanced["day_of_week"] / 7)
X_test_enhanced["day_of_week_cos"] = np.cos(2 * np.pi * X_test_enhanced["day_of_week"] / 7)
X_test_enhanced["month_sin"] = np.sin(2 * np.pi * X_test_enhanced["month"] / 12)
X_test_enhanced["month_cos"] = np.cos(2 * np.pi * X_test_enhanced["month"] / 12)

X_train_enhanced["is_winter"] = X_train["pickup_hour"].dt.month.isin([12, 1, 2]).astype(int)
X_test_enhanced["is_winter"] = X_test["pickup_hour"].dt.month.isin([12, 1, 2]).astype(int)

X_train_enhanced["is_weekend"] = X_train["pickup_hour"].dt.dayofweek.isin([5, 6]).astype(int)
X_test_enhanced["is_weekend"] = X_test["pickup_hour"].dt.dayofweek.isin([5, 6]).astype(int)

holiday_dates = [(12, 25), (12, 31), (1, 1)]
X_train_enhanced["is_holiday"] = X_train["pickup_hour"].apply(
    lambda x: 1 if (x.month, x.day) in holiday_dates else 0
)
X_test_enhanced["is_holiday"] = X_test["pickup_hour"].apply(
    lambda x: 1 if (x.month, x.day) in holiday_dates else 0
)

trend_lags = [f"rides_t-{i}" for i in range(1, 7*24 + 1)]
trend_lags = [col for col in trend_lags if col in X_train_lags.columns]
X_train_enhanced["trend_7d"] = X_train_lags[trend_lags].mean(axis=1)
X_test_enhanced["trend_7d"] = X_test_lags[trend_lags].mean(axis=1)

X_train_enhanced["rides_t1_hour"] = X_train_enhanced["rides_t-1"] * X_train_enhanced["hour"]
X_test_enhanced["rides_t1_hour"] = X_test_enhanced["rides_t-1"] * X_test_enhanced["hour"]
X_train_enhanced["rides_t24_day_of_week"] = X_train_enhanced["rides_t-24"] * X_train_enhanced["day_of_week"]
X_test_enhanced["rides_t24_day_of_week"] = X_test_enhanced["rides_t-24"] * X_test_enhanced["day_of_week"]

param_grid = {
    "num_leaves": [31, 50],
    "max_depth": [10, 15],
    "learning_rate": [0.01, 0.05],
    "n_estimators": [200, 500]
}
model = lgb.LGBMRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_train_enhanced, y_train)

best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

predictions = best_model.predict(X_test_enhanced)
test_mae = mean_absolute_error(y_test, predictions)
print(f"LightGBM_EnhancedLags_CyclicTemporal_Interactions MAE: {test_mae:.4f}")
log_model_to_mlflow(best_model, X_test_enhanced, "LightGBM_EnhancedLags_CyclicTemporal_Interactions", "mean_absolute_error", score=test_mae)

# Save the model to the models folder
model_path = MODELS_DIR / "lightgbm_enhanced_lags_cyclic_temporal_interactions.pkl"
joblib.dump(best_model, model_path)
print(f"LightGBM_EnhancedLags_CyclicTemporal_Interactions saved to: {model_path}")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17331
[LightGBM] [Info] Number of data points in the train set: 24336, number of used features: 183
[LightGBM] [Info] Start training from score 17.580950


INFO:src.experiment_utils:Experiment set to: LightGBM_EnhancedLags_CyclicTemporal_Interactions


Best Parameters: {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 200, 'num_leaves': 31}
LightGBM_EnhancedLags_CyclicTemporal_Interactions MAE: 3.0278


INFO:src.experiment_utils:Logged mean_absolute_error: 3.027770997920668
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/09 22:50:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 11
Created version '11' of model 'LGBMRegressor'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressor


🏃 View run dashing-cod-216 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/13/runs/7da24c2689cd44dda9322d08a828d388
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/13


<mlflow.models.model.ModelInfo at 0x27e4bd3e060>

['C:\\Users\\singh\\Downloads\\CDS500_Applied_ML_DS\\Projects\\CDA500Final\\models\\lightgbm_enhanced_lags_cyclic_temporal_interactions.pkl']

LightGBM_EnhancedLags_CyclicTemporal_Interactions saved to: C:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\models\lightgbm_enhanced_lags_cyclic_temporal_interactions.pkl


In [8]:
# Print feature importance for insight
feature_importance = pd.DataFrame({
    "Feature": X_train_enhanced.columns,
    "Importance": best_model.feature_importances_
}).sort_values(by="Importance", ascending=False)
print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
         Feature  Importance
0      rides_t-1         346
167  rides_t-168         185
1      rides_t-2         178
23    rides_t-24         162
22    rides_t-23         133
..           ...         ...
88    rides_t-89           7
175    month_sin           5
177    is_winter           0
178   is_weekend           0
179   is_holiday           0

[183 rows x 2 columns]


| **Hyperparameter**       | **Description**                                                                                                                                                                                                 | **Impact**                                                                                                                                                                                                                     |
|---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `num_leaves`             | Maximum number of leaves in one tree. Larger values increase model complexity and accuracy but may lead to overfitting.                                                                                         | High: Directly controls the complexity of the model. Larger values can improve accuracy but increase the risk of overfitting.                                                                                                  |
| `max_depth`              | Maximum depth of a tree. Limits the depth of the tree to prevent overfitting. Use `-1` for no limit.                                                                                                           | High: Limits the model's ability to capture complex patterns. Shallower trees reduce overfitting but may underfit.                                                                                                             |
| `learning_rate`          | Step size for updating weights. Smaller values make training slower but improve generalization.                                                                                                                | High: Affects convergence speed and generalization. Lower values often require more iterations (`n_estimators`).                                                                                                               |
| `n_estimators`           | Number of boosting iterations (trees). Higher values improve accuracy but increase training time.                                                                                                              | High: Controls the number of trees in the ensemble. Too many trees can lead to overfitting if `learning_rate` is not adjusted.                                                                                                |
| `feature_fraction`       | Fraction of features (columns) to randomly select for each tree. Helps prevent overfitting.                                                                                                                    | Medium-High: Reduces overfitting and speeds up training. Lower values may lead to underfitting.                                                                                                                               |
| `bagging_fraction`       | Fraction of data (rows) to randomly select for each iteration. Works with `bagging_freq`.                                                                                                                       | Medium-High: Reduces overfitting and improves generalization. Lower values may lead to underfitting.                                                                                                                           |
| `bagging_freq`           | Frequency of bagging. For example, `5` means bagging is performed every 5 iterations.                                                                                                                          | Medium: Works with `bagging_fraction` to control how often bagging is applied.                                                                                                                                                |
| `min_child_samples`      | Minimum number of data points required in a leaf. Larger values prevent overfitting by limiting leaf size.                                                                                                     | Medium: Controls overfitting by limiting the size of leaf nodes. Higher values may lead to underfitting.                                                                                                                       |
| `colsample_bytree`       | Alias for `feature_fraction`. Fraction of features to randomly select for each tree.                                                                                                                            | Medium: Similar to `feature_fraction`, reduces overfitting and speeds up training.                                                                                                                                            |
| `subsample`              | Alias for `bagging_fraction`. Fraction of data to randomly select for each iteration.                                                                                                                           | Medium: Similar to `bagging_fraction`, reduces overfitting and improves generalization.                                                                                                                                       |
| `reg_alpha`              | L1 regularization term on weights. Adds a penalty for large coefficients to reduce overfitting.                                                                                                                | Medium: Helps reduce overfitting, especially in high-dimensional data.                                                                                                                                                        |
| `reg_lambda`             | L2 regularization term on weights. Adds a penalty for large coefficients to reduce overfitting.                                                                                                                | Medium: Similar to `reg_alpha`, but penalizes squared coefficients.                                                                                                                                                           |
| `max_bin`                | Maximum number of bins for discretizing continuous features. Higher values improve accuracy but increase training time.                                                                                         | Medium: Affects how continuous features are bucketed. Higher values improve precision but increase computational cost.                                                                                                         |
| `min_split_gain`         | Minimum gain required to split a node. Higher values prevent splitting nodes with low information gain.                                                                                                         | Medium-Low: Helps control overfitting by limiting unnecessary splits.                                                                                                                                                         |
| `boosting_type`          | Type of boosting algorithm. Options: `gbdt` (default), `dart`, `goss`.                                                                                                                                          | Medium-Low: Affects the boosting strategy. `dart` and `goss` are alternatives to `gbdt` for specific use cases.                                                                                                               |
| `objective`              | Objective function to optimize. Common options: `regression`, `regression_l1`, `huber`, `fair`.                                                                                                                | Medium-Low: Determines the loss function. Impacts how the model optimizes predictions.                                                                                                                                         |
| `verbosity`              | Controls the level of logging. Higher values provide more detailed logs.                                                                                                                                       | Low: Does not affect model performance but helps with debugging.                                                                                                                                                              |
| `random_state`           | Seed for reproducibility. Ensures consistent results across runs.                                                                                                                                               | Low: Does not affect model performance but ensures reproducibility.                                                                                                                                                           |
| `early_stopping_round`   | Stops training if validation score does not improve for a specified number of rounds.                                                                                                                           | Low: Helps save time during training but does not directly affect model performance.                                                                                                                                           |