In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [19]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")

In [20]:
df["pickup_hour"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 87620 entries, 0 to 87619
Series name: pickup_hour
Non-Null Count  Dtype         
--------------  -----         
87620 non-null  datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 684.7 KB


In [21]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2023, 9, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(55900, 674)
(55900,)
(31720, 674)
(31720,)


In [22]:
X_train

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,2023-01-29,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-30,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-31,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-02-01,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-02-02,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55895,90,64,43,20,13,11,16,29,48,61,...,76,91,97,101,80,94,100,69,2023-08-27,263
55896,18,12,5,5,4,22,48,74,95,109,...,90,80,80,72,91,67,53,26,2023-08-28,263
55897,12,11,10,5,4,15,44,97,118,107,...,59,79,84,69,80,57,37,20,2023-08-29,263
55898,26,11,8,0,8,10,44,97,126,121,...,75,92,91,95,72,68,63,30,2023-08-30,263


In [23]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"rides_t-{7*24}",  # 1 week ago
            f"rides_t-{14*24}", # 2 weeks ago
            f"rides_t-{21*24}", # 3 weeks ago
            f"rides_t-{28*24}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek

        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

add_temporal_features = TemporalFeatureEngineer()


In [25]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

In [26]:
features = df.drop(columns=["target"])
target = df["target"]
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

for train_index, test_index in tscv.split(features):
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train size: 14605, Test size: 14603
Train size: 29208, Test size: 14603
Train size: 43811, Test size: 14603
Train size: 58414, Test size: 14603
Train size: 73017, Test size: 14603


In [19]:
X_train

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,2023-01-29,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-30,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-01-31,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-02-01,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2023-02-02,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73012,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,1,0,0,2023-09-06,220
73013,0,0,0,0,0,0,0,1,2,2,...,0,0,0,0,0,0,0,0,2023-09-07,220
73014,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,2023-09-08,220
73015,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,2023-09-09,220


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
import lightgbm as lgb

# Define the pipeline
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,  # Feature engineering step 1
    add_temporal_features,  # Feature engineering step 2
    lgb.LGBMRegressor()  # LightGBM model
)

# Define hyperparameter search space
param_distributions = {
    "lgbmregressor__num_leaves": [20, 31, 50, 100],
    "lgbmregressor__max_depth": [-1, 5, 10, 15],
    "lgbmregressor__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "lgbmregressor__subsample": [0.7, 0.8, 0.9, 1.0]
}

# Create RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    scoring="neg_mean_absolute_error",  # MAE as scoring metric
    n_iter=10,  # Number of random samples
    cv=3,  # 3-fold cross-validation
    verbose=1,
    random_state=42
)

# Train with hyperparameter tuning
random_search.fit(X_train, y_train)

# Print best parameters
print("Best Parameters:", random_search.best_params_)
from sklearn.metrics import mean_absolute_error
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Predict & compute MAE
y_pred = best_model.predict(X_test)
best_mae = mean_absolute_error(y_test, y_pred)
print(best_mae)


# Optuna hyper parameter tuning

In [None]:
import optuna
import lightgbm as lgb
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error

# Define a fixed set of hyperparameters (except learning rate)
fixed_params = {
    "num_leaves": 50,
    "max_depth": 10,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "n_estimators": 200  # A reasonable default
}

# Objective function for learning rate tuning
def tune_learning_rate(trial):
    learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)

    pipeline = make_pipeline(
        add_feature_average_rides_last_4_weeks,
        add_temporal_features,
        lgb.LGBMRegressor(learning_rate=learning_rate, **fixed_params)
    )

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict and evaluate MAE
    y_pred = pipeline.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

# Optimize learning rate
study_lr = optuna.create_study(direction="minimize")
study_lr.optimize(tune_learning_rate, n_trials=20)  # Try 20 learning rates

best_learning_rate = study_lr.best_params["learning_rate"]
print(f"Best Learning Rate: {best_learning_rate}")

[I 2025-03-03 12:42:47,921] A new study created in memory with name: no-name-1bb3b5ef-48f0-4eff-a054-d8830637128a
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066570 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:42:55,703] Trial 0 finished with value: 4.203580427580282 and parameters: {'learning_rate': 0.010908981164979825}. Best is trial 0 with value: 4.203580427580282.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073654 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:03,231] Trial 1 finished with value: 3.9510272371210897 and parameters: {'learning_rate': 0.012285272825043311}. Best is trial 1 with value: 3.9510272371210897.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073207 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:09,132] Trial 2 finished with value: 3.999554769301414 and parameters: {'learning_rate': 0.03807716807839043}. Best is trial 1 with value: 3.9510272371210897.




  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:14,850] Trial 3 finished with value: 3.996486751450546 and parameters: {'learning_rate': 0.04891447222299709}. Best is trial 1 with value: 3.9510272371210897.




  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066316 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:22,074] Trial 4 finished with value: 4.177481419457089 and parameters: {'learning_rate': 0.010957244017599635}. Best is trial 1 with value: 3.9510272371210897.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:29,684] Trial 5 finished with value: 11.00637372754458 and parameters: {'learning_rate': 0.0030598539514132186}. Best is trial 1 with value: 3.9510272371210897.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:35,306] Trial 6 finished with value: 4.0534253016348964 and parameters: {'learning_rate': 0.1994630667307587}. Best is trial 1 with value: 3.9510272371210897.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:40,389] Trial 7 finished with value: 4.154706808126708 and parameters: {'learning_rate': 0.1687470767507048}. Best is trial 1 with value: 3.9510272371210897.




  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:48,179] Trial 8 finished with value: 15.945683301346307 and parameters: {'learning_rate': 0.0011357141001088955}. Best is trial 1 with value: 3.9510272371210897.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:43:54,681] Trial 9 finished with value: 3.758150098818564 and parameters: {'learning_rate': 0.028645648841153587}. Best is trial 9 with value: 3.758150098818564.




  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:00,235] Trial 10 finished with value: 4.202369482232958 and parameters: {'learning_rate': 0.05637850914904884}. Best is trial 9 with value: 3.758150098818564.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:07,969] Trial 11 finished with value: 6.446575522073322 and parameters: {'learning_rate': 0.006245601849621494}. Best is trial 9 with value: 3.758150098818564.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:14,468] Trial 12 finished with value: 3.7638820111324574 and parameters: {'learning_rate': 0.027038970017202412}. Best is trial 9 with value: 3.758150098818564.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072832 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:20,650] Trial 13 finished with value: 3.8191743499923017 and parameters: {'learning_rate': 0.030539068998363693}. Best is trial 9 with value: 3.758150098818564.




  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:25,629] Trial 14 finished with value: 4.392875598253259 and parameters: {'learning_rate': 0.09437677242226807}. Best is trial 9 with value: 3.758150098818564.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070536 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:32,080] Trial 15 finished with value: 3.790716588540088 and parameters: {'learning_rate': 0.027002157894176107}. Best is trial 9 with value: 3.758150098818564.




  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:39,343] Trial 16 finished with value: 7.942348867491545 and parameters: {'learning_rate': 0.0049060738769410505}. Best is trial 9 with value: 3.758150098818564.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:44,246] Trial 17 finished with value: 4.3035353377057675 and parameters: {'learning_rate': 0.07755615927332357}. Best is trial 9 with value: 3.758150098818564.
  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.236350 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:51,652] Trial 18 finished with value: 3.6173726282064362 and parameters: {'learning_rate': 0.02153048982254091}. Best is trial 18 with value: 3.6173726282064362.




  learning_rate = trial.suggest_loguniform("learning_rate", 0.001, 0.3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:44:58,530] Trial 19 finished with value: 3.5887826470266924 and parameters: {'learning_rate': 0.017608258814279996}. Best is trial 19 with value: 3.5887826470266924.


Best Learning Rate: 0.017608258814279996


In [28]:
# Objective function for tuning all other hyperparameters
def tune_hyperparameters(trial):
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "learning_rate": best_learning_rate  # Use best found learning rate
    }

    pipeline = make_pipeline(
        add_feature_average_rides_last_4_weeks,
        add_temporal_features,
        lgb.LGBMRegressor(**params)
    )

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict and evaluate MAE
    y_pred = pipeline.predict(X_test)
    return mean_absolute_error(y_test, y_pred)

# Run hyperparameter tuning with Optuna
study_hp = optuna.create_study(direction="minimize")
study_hp.optimize(tune_hyperparameters, n_trials=30)  # Run 30 trials

# Get the best parameters
best_params = study_hp.best_params
print("Best Hyperparameters:", best_params)

# Train final model with best hyperparameters
final_pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor(**best_params)
)

final_pipeline.fit(X_train, y_train)
y_pred_final = final_pipeline.predict(X_test)

# Compute final MAE
final_mae = mean_absolute_error(y_test, y_pred_final)
print(f"Final MAE after tuning: {final_mae}")

[I 2025-03-03 12:48:20,919] A new study created in memory with name: no-name-b69a3130-19ad-4c56-974f-fdd923d365a4
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069188 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:48:34,767] Trial 0 finished with value: 3.830953272441932 and parameters: {'num_leaves': 95, 'max_depth': 11, 'subsample': 0.7364533352704296, 'colsample_bytree': 0.6925637247977867, 'reg_alpha': 0.32665333982882105, 'reg_lambda': 0.012961366615435201, 'n_estimators': 357}. Best is trial 0 with value: 3.830953272441932.




  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.150487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:48:43,461] Trial 1 finished with value: 3.62599863417002 and parameters: {'num_leaves': 45, 'max_depth': 8, 'subsample': 0.6714458411566353, 'colsample_bytree': 0.9550150147903447, 'reg_alpha': 0.03254215132803435, 'reg_lambda': 2.2076409792539353, 'n_estimators': 297}. Best is trial 1 with value: 3.62599863417002.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFram

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:48:48,622] Trial 2 finished with value: 4.0988132945915625 and parameters: {'num_leaves': 53, 'max_depth': 9, 'subsample': 0.5511746950974608, 'colsample_bytree': 0.9671161021610761, 'reg_alpha': 0.015814456171390816, 'reg_lambda': 0.09843885200263464, 'n_estimators': 126}. Best is trial 1 with value: 3.62599863417002.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Data

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:48:54,081] Trial 3 finished with value: 3.928961112792655 and parameters: {'num_leaves': 45, 'max_depth': 10, 'subsample': 0.9632334908291634, 'colsample_bytree': 0.7809763935508908, 'reg_alpha': 0.0013040191084446036, 'reg_lambda': 1.6386639892366446, 'n_estimators': 136}. Best is trial 1 with value: 3.62599863417002.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Data

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:49:13,952] Trial 4 finished with value: 3.8649364378949485 and parameters: {'num_leaves': 169, 'max_depth': 10, 'subsample': 0.5635645257157165, 'colsample_bytree': 0.5474803579411266, 'reg_alpha': 0.009063408164387452, 'reg_lambda': 0.32529400716280993, 'n_estimators': 448}. Best is trial 1 with value: 3.62599863417002.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Da

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:49:24,407] Trial 5 finished with value: 3.83077403555777 and parameters: {'num_leaves': 91, 'max_depth': 7, 'subsample': 0.9962759831035997, 'colsample_bytree': 0.9013652834552938, 'reg_alpha': 0.00017944457741328937, 'reg_lambda': 0.15812636134187794, 'n_estimators': 398}. Best is trial 1 with value: 3.62599863417002.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Data

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.075294 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:49:30,886] Trial 6 finished with value: 4.458063526292736 and parameters: {'num_leaves': 170, 'max_depth': 8, 'subsample': 0.5278907901340938, 'colsample_bytree': 0.639491210353138, 'reg_alpha': 0.00016294325851258482, 'reg_lambda': 0.037867779036578224, 'n_estimators': 108}. Best is trial 1 with value: 3.62599863417002.




  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:49:35,151] Trial 7 finished with value: 3.7248789840973577 and parameters: {'num_leaves': 179, 'max_depth': 3, 'subsample': 0.7717010067090402, 'colsample_bytree': 0.759852505689305, 'reg_alpha': 1.469741782980076e-05, 'reg_lambda': 0.215293680907851, 'n_estimators': 392}. Best is trial 1 with value: 3.62599863417002.




  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:49:41,908] Trial 8 finished with value: 3.610270945755558 and parameters: {'num_leaves': 164, 'max_depth': 6, 'subsample': 0.6325503376957964, 'colsample_bytree': 0.882572784572313, 'reg_alpha': 0.0005288225973295262, 'reg_lambda': 0.0006182848248655189, 'n_estimators': 237}. Best is trial 8 with value: 3.610270945755558.




  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:49:57,238] Trial 9 finished with value: 3.5202971836836445 and parameters: {'num_leaves': 127, 'max_depth': 10, 'subsample': 0.5082042241739079, 'colsample_bytree': 0.5726683115827711, 'reg_alpha': 2.3580519960053, 'reg_lambda': 0.8441510180435834, 'n_estimators': 252}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFr

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062309 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:50:10,068] Trial 10 finished with value: 3.58048182015489 and parameters: {'num_leaves': 134, 'max_depth': 14, 'subsample': 0.8816471963726946, 'colsample_bytree': 0.505403775811829, 'reg_alpha': 8.836885051082064, 'reg_lambda': 7.595580233293326e-05, 'n_estimators': 229}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Dat

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061779 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:50:22,954] Trial 11 finished with value: 3.607054536176153 and parameters: {'num_leaves': 130, 'max_depth': 15, 'subsample': 0.8504508212234516, 'colsample_bytree': 0.5014099961847607, 'reg_alpha': 4.953769840621784, 'reg_lambda': 2.8396484466549767e-05, 'n_estimators': 233}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066365 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:50:35,606] Trial 12 finished with value: 3.5948510600797485 and parameters: {'num_leaves': 119, 'max_depth': 14, 'subsample': 0.8737393043152855, 'colsample_bytree': 0.589039306755977, 'reg_alpha': 8.043023163501555, 'reg_lambda': 0.0007066729742273947, 'n_estimators': 216}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a D

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:50:50,696] Trial 13 finished with value: 3.6289162849226457 and parameters: {'num_leaves': 138, 'max_depth': 13, 'subsample': 0.85222424339212, 'colsample_bytree': 0.6118512937504494, 'reg_alpha': 0.618744247435022, 'reg_lambda': 1.3895610437035159e-05, 'n_estimators': 297}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a D

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063899 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:50:59,776] Trial 14 finished with value: 3.6315145644527984 and parameters: {'num_leaves': 79, 'max_depth': 12, 'subsample': 0.9270616618873471, 'colsample_bytree': 0.505928667683491, 'reg_alpha': 0.9560380207569389, 'reg_lambda': 0.0012232924403431612, 'n_estimators': 195}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a D

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068682 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:51:05,769] Trial 15 finished with value: 3.721142090956696 and parameters: {'num_leaves': 200, 'max_depth': 5, 'subsample': 0.7729623999436692, 'colsample_bytree': 0.6999100287810361, 'reg_alpha': 0.1054339397104097, 'reg_lambda': 0.003174131455279687, 'n_estimators': 270}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Da

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.084849 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:51:17,386] Trial 16 finished with value: 3.669815051342385 and parameters: {'num_leaves': 144, 'max_depth': 15, 'subsample': 0.6652877312856161, 'colsample_bytree': 0.5601753631325406, 'reg_alpha': 2.1727058781828124, 'reg_lambda': 0.00011751062204149159, 'n_estimators': 168}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064388 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:51:31,883] Trial 17 finished with value: 3.654924455589917 and parameters: {'num_leaves': 110, 'max_depth': 12, 'subsample': 0.9080017751044347, 'colsample_bytree': 0.6646926349079545, 'reg_alpha': 0.08946267448114593, 'reg_lambda': 9.17403928153815, 'n_estimators': 353}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Data

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070978 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:51:43,844] Trial 18 finished with value: 3.6599476083635967 and parameters: {'num_leaves': 150, 'max_depth': 13, 'subsample': 0.8184069504005974, 'colsample_bytree': 0.834332782853963, 'reg_alpha': 8.071003242901346, 'reg_lambda': 6.934161292639892e-05, 'n_estimators': 176}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a D

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:51:53,158] Trial 19 finished with value: 3.725692767716268 and parameters: {'num_leaves': 71, 'max_depth': 4, 'subsample': 0.6095646960168627, 'colsample_bytree': 0.5660651330376618, 'reg_alpha': 1.6681581100807252, 'reg_lambda': 0.008350233978684, 'n_estimators': 260}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFr

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.159416 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:52:28,652] Trial 20 finished with value: 3.744705231096323 and parameters: {'num_leaves': 112, 'max_depth': 11, 'subsample': 0.717502499634478, 'colsample_bytree': 0.521808109895211, 'reg_alpha': 0.2195843841068504, 'reg_lambda': 0.0001648234362475738, 'n_estimators': 332}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Da

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:52:45,653] Trial 21 finished with value: 3.601412176604224 and parameters: {'num_leaves': 122, 'max_depth': 14, 'subsample': 0.8965017708398585, 'colsample_bytree': 0.5949813331747655, 'reg_alpha': 9.217735834089554, 'reg_lambda': 0.00037998518599085813, 'n_estimators': 205}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.105535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:53:02,507] Trial 22 finished with value: 3.5818610784895077 and parameters: {'num_leaves': 107, 'max_depth': 13, 'subsample': 0.8248744979093632, 'colsample_bytree': 0.5979235040670001, 'reg_alpha': 2.9047664358872645, 'reg_lambda': 0.0022558789130101235, 'n_estimators': 227}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:53:18,590] Trial 23 finished with value: 3.62275774254723 and parameters: {'num_leaves': 100, 'max_depth': 13, 'subsample': 0.8120926898517755, 'colsample_bytree': 0.6445510983765311, 'reg_alpha': 2.184544618161621, 'reg_lambda': 0.003063711888896181, 'n_estimators': 267}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Dat

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065547 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:53:23,369] Trial 24 finished with value: 3.678872176809889 and parameters: {'num_leaves': 21, 'max_depth': 11, 'subsample': 0.8132431146850746, 'colsample_bytree': 0.7110984978821882, 'reg_alpha': 0.5763445703704586, 'reg_lambda': 0.025939353778197744, 'n_estimators': 168}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Da

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064525 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:53:52,484] Trial 25 finished with value: 3.9952403237270104 and parameters: {'num_leaves': 154, 'max_depth': 14, 'subsample': 0.5002738502575493, 'colsample_bytree': 0.541949626889349, 'reg_alpha': 2.4703894703012614, 'reg_lambda': 0.0026647085935734122, 'n_estimators': 493}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071844 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:54:11,755] Trial 26 finished with value: 3.6810550763457295 and parameters: {'num_leaves': 131, 'max_depth': 12, 'subsample': 0.9486465625522413, 'colsample_bytree': 0.6075996208045535, 'reg_alpha': 0.1746610743160722, 'reg_lambda': 1.1354109650786446, 'n_estimators': 320}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a Da

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:54:25,453] Trial 27 finished with value: 3.6071949529507563 and parameters: {'num_leaves': 81, 'max_depth': 15, 'subsample': 0.7066872742199716, 'colsample_bytree': 0.5735731006482854, 'reg_alpha': 0.04038790786382453, 'reg_lambda': 4.041287506666597e-05, 'n_estimators': 244}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:54:43,286] Trial 28 finished with value: 3.612052109580532 and parameters: {'num_leaves': 188, 'max_depth': 10, 'subsample': 0.7702615920174012, 'colsample_bytree': 0.643482196209823, 'reg_alpha': 0.0034336062977246115, 'reg_lambda': 0.00022155166745181247, 'n_estimators': 284}. Best is trial 9 with value: 3.5202971836836445.
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070918 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 12:54:53,063] Trial 29 finished with value: 3.8239030452394176 and parameters: {'num_leaves': 100, 'max_depth': 9, 'subsample': 0.8480265982027029, 'colsample_bytree': 0.7249340521431494, 'reg_alpha': 0.44029430020074833, 'reg_lambda': 0.01340721541176389, 'n_estimators': 149}. Best is trial 9 with value: 3.5202971836836445.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].

Best Hyperparameters: {'num_leaves': 127, 'max_depth': 10, 'subsample': 0.5082042241739079, 'colsample_bytree': 0.5726683115827711, 'reg_alpha': 2.3580519960053, 'reg_lambda': 0.8441510180435834, 'n_estimators': 252}
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267
Final MAE after tuning: 4.160487813976321


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


In [27]:
import optuna
import lightgbm as lgb
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error

# Define the objective function
def objective(trial):
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", -1, 15),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500)
    }

    # Define the pipeline with feature engineering and LGBMRegressor
    pipeline = make_pipeline(
        add_feature_average_rides_last_4_weeks,
        add_temporal_features,
        lgb.LGBMRegressor(**params)
    )

    # Train the model
    pipeline.fit(X_train, y_train)

    # Predict on test data
    y_pred = pipeline.predict(X_test)

    # Compute MAE (Lower is better)
    return mean_absolute_error(y_test, y_pred)

# Run optimization with Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)  # Run 30 trials

# Print best parameters
print("Best Hyperparameters:", study.best_params_)

# Train final model with best hyperparameters
best_params = study.best_params_
final_pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor(**best_params)
)

final_pipeline.fit(X_train, y_train)
y_pred_final = final_pipeline.predict(X_test)

# Compute final MAE
final_mae = mean_absolute_error(y_test, y_pred_final)
print(f"Final MAE after tuning: {final_mae}")

[I 2025-03-03 00:58:56,621] A new study created in memory with name: no-name-7f607f64-486b-4d46-9a79-4bb29a553ac2
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 00:58:59,288] Trial 0 finished with value: 8.480232558747542 and parameters: {'num_leaves': 27, 'max_depth': 11, 'learning_rate': 0.01489726617616917, 'subsample': 0.5157514964387868, 'colsample_bytree': 0.8921081849153656, 'reg_alpha': 0.00017920491815914892, 'reg_lambda': 4.80726198752688e-05, 'n_estimators': 60}. Best is trial 0 with value: 8.480232558747542.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lamb

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 00:59:11,477] Trial 1 finished with value: 3.6858701461396595 and parameters: {'num_leaves': 29, 'max_depth': 8, 'learning_rate': 0.014548790475350107, 'subsample': 0.7556013304302056, 'colsample_bytree': 0.5559399123004378, 'reg_alpha': 0.030372961220879967, 'reg_lambda': 3.4563635457854507, 'n_estimators': 391}. Best is trial 1 with value: 3.6858701461396595.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072956 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 00:59:20,237] Trial 2 finished with value: 5.2262433191858015 and parameters: {'num_leaves': 37, 'max_depth': 13, 'learning_rate': 0.005563729209750151, 'subsample': 0.8662643445607485, 'colsample_bytree': 0.9808401520169666, 'reg_alpha': 1.1840587102359112, 'reg_lambda': 0.0016930466685439074, 'n_estimators': 284}. Best is trial 1 with value: 3.6858701461396595.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lam

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.071829 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 00:59:24,904] Trial 3 finished with value: 8.804694522780911 and parameters: {'num_leaves': 41, 'max_depth': 9, 'learning_rate': 0.007357654132551573, 'subsample': 0.5387802289311732, 'colsample_bytree': 0.9287810401059915, 'reg_alpha': 0.0673618006461403, 'reg_lambda': 0.0003351150674929031, 'n_estimators': 116}. Best is trial 1 with value: 3.6858701461396595.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambd

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 00:59:28,408] Trial 4 finished with value: 3.958909134550711 and parameters: {'num_leaves': 90, 'max_depth': 6, 'learning_rate': 0.07055539919100041, 'subsample': 0.8353789721282365, 'colsample_bytree': 0.610815179628752, 'reg_alpha': 1.358660154119222, 'reg_lambda': 0.03875420056049469, 'n_estimators': 157}. Best is trial 1 with value: 3.6858701461396595.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 00:59:34,502] Trial 5 finished with value: 6.211111233964284 and parameters: {'num_leaves': 59, 'max_depth': 9, 'learning_rate': 0.00897688624348907, 'subsample': 0.8743473864128117, 'colsample_bytree': 0.6259877973551622, 'reg_alpha': 4.200672242711083e-05, 'reg_lambda': 0.026219656560343092, 'n_estimators': 145}. Best is trial 1 with value: 3.6858701461396595.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lamb

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 00:59:43,035] Trial 6 finished with value: 3.6110394957102607 and parameters: {'num_leaves': 131, 'max_depth': -1, 'learning_rate': 0.030068006503670367, 'subsample': 0.5141956093032208, 'colsample_bytree': 0.9227261187950799, 'reg_alpha': 0.0001896433258382781, 'reg_lambda': 0.11301759949439863, 'n_estimators': 129}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_l

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066211 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:00:03,641] Trial 7 finished with value: 3.8903468317564576 and parameters: {'num_leaves': 120, 'max_depth': 11, 'learning_rate': 0.015668050337003008, 'subsample': 0.652773414935856, 'colsample_bytree': 0.5735490712478288, 'reg_alpha': 0.0013437433217616682, 'reg_lambda': 0.21488668780125447, 'n_estimators': 494}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_la

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:00:13,964] Trial 8 finished with value: 3.7241014001915795 and parameters: {'num_leaves': 71, 'max_depth': 13, 'learning_rate': 0.011729951961693455, 'subsample': 0.8221621744497501, 'colsample_bytree': 0.8317014210496343, 'reg_alpha': 0.002367367944304672, 'reg_lambda': 0.06771092330220697, 'n_estimators': 257}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lam

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065629 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:00:17,029] Trial 9 finished with value: 4.298372480643262 and parameters: {'num_leaves': 57, 'max_depth': 15, 'learning_rate': 0.11693486021583487, 'subsample': 0.8216681218458524, 'colsample_bytree': 0.8057481085785212, 'reg_alpha': 1.8792844463885252e-05, 'reg_lambda': 5.8937632577428225, 'n_estimators': 87}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambd

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.067681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:00:31,600] Trial 10 finished with value: 4.0827280197236 and parameters: {'num_leaves': 150, 'max_depth': -1, 'learning_rate': 0.04342925158156559, 'subsample': 0.9914977170125114, 'colsample_bytree': 0.7204666852561619, 'reg_alpha': 0.0002926391785928622, 'reg_lambda': 0.3453371969636644, 'n_estimators': 244}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambd

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063091 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:00:40,867] Trial 11 finished with value: 3.7353280346601805 and parameters: {'num_leaves': 105, 'max_depth': 3, 'learning_rate': 0.02671832895846464, 'subsample': 0.6720386095446487, 'colsample_bytree': 0.5109252086001864, 'reg_alpha': 0.03951048464082864, 'reg_lambda': 9.953725548184037, 'n_estimators': 426}. Best is trial 6 with value: 3.6110394957102607.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072869 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:00:45,588] Trial 12 finished with value: 3.9320358540965517 and parameters: {'num_leaves': 141, 'max_depth': 4, 'learning_rate': 0.030744475049639307, 'subsample': 0.6905286633671174, 'colsample_bytree': 0.7184702339988607, 'reg_alpha': 0.026829108159070976, 'reg_lambda': 0.7582971361262424, 'n_estimators': 369}. Best is trial 6 with value: 3.6110394957102607.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.083642 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:03,793] Trial 13 finished with value: 3.8175924040630966 and parameters: {'num_leaves': 123, 'max_depth': 0, 'learning_rate': 0.021915185410499683, 'subsample': 0.5984677563732322, 'colsample_bytree': 0.6769919188404971, 'reg_alpha': 0.19119946490284018, 'reg_lambda': 1.4692161226850917, 'n_estimators': 343}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lamb

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:06,384] Trial 14 finished with value: 3.824355007213766 and parameters: {'num_leaves': 88, 'max_depth': 2, 'learning_rate': 0.051631926267352767, 'subsample': 0.7565864958470474, 'colsample_bytree': 0.8018494399375069, 'reg_alpha': 0.002178739262378864, 'reg_lambda': 0.006533033916780655, 'n_estimators': 192}. Best is trial 6 with value: 3.6110394957102607.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:16,221] Trial 15 finished with value: 4.253465685449847 and parameters: {'num_leaves': 109, 'max_depth': 6, 'learning_rate': 0.1372455933339761, 'subsample': 0.7470761108860966, 'colsample_bytree': 0.5267885793668182, 'reg_alpha': 0.0065825399995033925, 'reg_lambda': 2.0155515690524695, 'n_estimators': 348}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambd

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:20,100] Trial 16 finished with value: 4.26675484920168 and parameters: {'num_leaves': 74, 'max_depth': 1, 'learning_rate': 0.01974703045796077, 'subsample': 0.5814681645848532, 'colsample_bytree': 0.9910982384091177, 'reg_alpha': 5.024649049484895, 'reg_lambda': 0.14679337945670776, 'n_estimators': 429}. Best is trial 6 with value: 3.6110394957102607.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.076765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:25,631] Trial 17 finished with value: 4.097202406280357 and parameters: {'num_leaves': 132, 'max_depth': 8, 'learning_rate': 0.06833620693962514, 'subsample': 0.9583734814275622, 'colsample_bytree': 0.8585338207024402, 'reg_alpha': 0.00013006095111506608, 'reg_lambda': 0.006683153475884533, 'n_estimators': 216}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_l

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078783 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:30,577] Trial 18 finished with value: 3.9994111237021324 and parameters: {'num_leaves': 25, 'max_depth': 5, 'learning_rate': 0.04224223224371151, 'subsample': 0.7548436553559369, 'colsample_bytree': 0.7669982711662733, 'reg_alpha': 0.22270953062028767, 'reg_lambda': 2.46338000602308, 'n_estimators': 299}. Best is trial 6 with value: 3.6110394957102607.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.079793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:39,194] Trial 19 finished with value: 4.359102191427615 and parameters: {'num_leaves': 107, 'max_depth': 7, 'learning_rate': 0.19124872835393145, 'subsample': 0.6076903463307156, 'colsample_bytree': 0.9253144793251535, 'reg_alpha': 0.000840532216527005, 'reg_lambda': 0.583110890833771, 'n_estimators': 409}. Best is trial 6 with value: 3.6110394957102607.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070968 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:43,537] Trial 20 finished with value: 3.874234351684533 and parameters: {'num_leaves': 48, 'max_depth': 3, 'learning_rate': 0.01027561851198206, 'subsample': 0.7234517286172545, 'colsample_bytree': 0.6493074042219241, 'reg_alpha': 0.010736539523322902, 'reg_lambda': 2.3372054441758216e-05, 'n_estimators': 308}. Best is trial 6 with value: 3.6110394957102607.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.078000 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:01:54,269] Trial 21 finished with value: 3.7134200840109743 and parameters: {'num_leaves': 69, 'max_depth': 14, 'learning_rate': 0.012166222168076257, 'subsample': 0.799141724767609, 'colsample_bytree': 0.8481954146701742, 'reg_alpha': 0.003538951767033702, 'reg_lambda': 0.05668969593434433, 'n_estimators': 236}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lam

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070608 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:02:03,591] Trial 22 finished with value: 3.681018274877105 and parameters: {'num_leaves': 66, 'max_depth': 15, 'learning_rate': 0.014976120748192599, 'subsample': 0.7847151389574248, 'colsample_bytree': 0.8765458370544233, 'reg_alpha': 0.007812779345988102, 'reg_lambda': 0.0017222495334447805, 'n_estimators': 205}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_l

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.068078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:02:13,010] Trial 23 finished with value: 3.6921023285284207 and parameters: {'num_leaves': 96, 'max_depth': 11, 'learning_rate': 0.019767668565477615, 'subsample': 0.908933892533891, 'colsample_bytree': 0.9360139710740365, 'reg_alpha': 1.1130059993313653e-05, 'reg_lambda': 0.0006128665013776889, 'n_estimators': 172}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:02:19,043] Trial 24 finished with value: 3.6763519554289252 and parameters: {'num_leaves': 79, 'max_depth': 10, 'learning_rate': 0.028205878446655917, 'subsample': 0.7078088653871575, 'colsample_bytree': 0.8819121758988244, 'reg_alpha': 0.013424578068024924, 'reg_lambda': 0.0026050179629713874, 'n_estimators': 120}. Best is trial 6 with value: 3.6110394957102607.




  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-5, 10),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.074719 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:02:24,957] Trial 25 finished with value: 3.6268489193076063 and parameters: {'num_leaves': 80, 'max_depth': 15, 'learning_rate': 0.03835506513148021, 'subsample': 0.6365191304661013, 'colsample_bytree': 0.8868763952587985, 'reg_alpha': 0.0005862650216230665, 'reg_lambda': 0.0019159282910604185, 'n_estimators': 111}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.146889 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:02:31,251] Trial 26 finished with value: 3.611578452902638 and parameters: {'num_leaves': 79, 'max_depth': 12, 'learning_rate': 0.037551983337443126, 'subsample': 0.6394942330979904, 'colsample_bytree': 0.9562715030483523, 'reg_alpha': 0.0005925598129714717, 'reg_lambda': 0.00014265053924604164, 'n_estimators': 106}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.069339 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:02:34,780] Trial 27 finished with value: 3.6383169175010797 and parameters: {'num_leaves': 97, 'max_depth': 13, 'learning_rate': 0.06602567847624359, 'subsample': 0.5586753341319608, 'colsample_bytree': 0.9485917953744667, 'reg_alpha': 0.0005148616860732419, 'reg_lambda': 0.00014569028729413103, 'n_estimators': 52}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.082110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:02:40,560] Trial 28 finished with value: 3.676040676743624 and parameters: {'num_leaves': 80, 'max_depth': 12, 'learning_rate': 0.040879214657004136, 'subsample': 0.6393194403169666, 'colsample_bytree': 0.9603351239277065, 'reg_alpha': 8.931480963336875e-05, 'reg_lambda': 0.00013937081885831312, 'n_estimators': 95}. Best is trial 6 with value: 3.6110394957102607.
  "learning_rate": trial.suggest_loguniform("learning_rate", 0.005, 0.2),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-5, 10),
  "reg_

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.099333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
[I 2025-03-03 01:02:46,089] Trial 29 finished with value: 3.9925319499848437 and parameters: {'num_leaves': 121, 'max_depth': 15, 'learning_rate': 0.08679210770552659, 'subsample': 0.5186167725341584, 'colsample_bytree': 0.8996393794315982, 'reg_alpha': 0.00034994233012159445, 'reg_lambda': 3.097436000532844e-05, 'n_estimators': 75}. Best is trial 6 with value: 3.6110394957102607.


AttributeError: 'Study' object has no attribute 'best_params_'

In [29]:
# Run optimization with 30 trials
# study.optimize(objective, n_trials=30)  # Run 30 trials

# # Print best parameters
print("Best Hyperparameters:", study.best_params)  # Fixed the attribute name

# Train final model with best hyperparameters
best_params = study.best_params  # Fixed the attribute name
final_pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor(**best_params)
)

final_pipeline.fit(X_train, y_train)
y_pred_final = final_pipeline.predict(X_test)

# Compute final MAE
final_mae = mean_absolute_error(y_test, y_pred_final)
print(f"Final MAE after tuning: {final_mae}")

Best Hyperparameters: {'num_leaves': 131, 'max_depth': -1, 'learning_rate': 0.030068006503670367, 'subsample': 0.5141956093032208, 'colsample_bytree': 0.9227261187950799, 'reg_alpha': 0.0001896433258382781, 'reg_lambda': 0.11301759949439863, 'n_estimators': 129}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.073465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 161882
[LightGBM] [Info] Number of data points in the train set: 73017, number of used features: 674
[LightGBM] [Info] Start training from score 10.087267
Final MAE after tuning: 3.6110394957102607


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


In [21]:
from sklearn.metrics import mean_absolute_error
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Predict & compute MAE
y_pred = best_model.predict(X_test)
best_mae = mean_absolute_error(y_test, y_pred)
print(best_mae)

3.7376018753407343


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)


| **Hyperparameter**       | **Description**                                                                                                                                                                                                 | **Impact**                                                                                                                                                                                                                     |
|---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `num_leaves`             | Maximum number of leaves in one tree. Larger values increase model complexity and accuracy but may lead to overfitting.                                                                                         | High: Directly controls the complexity of the model. Larger values can improve accuracy but increase the risk of overfitting.                                                                                                  |
| `max_depth`              | Maximum depth of a tree. Limits the depth of the tree to prevent overfitting. Use `-1` for no limit.                                                                                                           | High: Limits the model's ability to capture complex patterns. Shallower trees reduce overfitting but may underfit.                                                                                                             |
| `learning_rate`          | Step size for updating weights. Smaller values make training slower but improve generalization.                                                                                                                | High: Affects convergence speed and generalization. Lower values often require more iterations (`n_estimators`).                                                                                                               |
| `n_estimators`           | Number of boosting iterations (trees). Higher values improve accuracy but increase training time.                                                                                                              | High: Controls the number of trees in the ensemble. Too many trees can lead to overfitting if `learning_rate` is not adjusted.                                                                                                |
| `feature_fraction`       | Fraction of features (columns) to randomly select for each tree. Helps prevent overfitting.                                                                                                                    | Medium-High: Reduces overfitting and speeds up training. Lower values may lead to underfitting.                                                                                                                               |
| `bagging_fraction`       | Fraction of data (rows) to randomly select for each iteration. Works with `bagging_freq`.                                                                                                                       | Medium-High: Reduces overfitting and improves generalization. Lower values may lead to underfitting.                                                                                                                           |
| `bagging_freq`           | Frequency of bagging. For example, `5` means bagging is performed every 5 iterations.                                                                                                                          | Medium: Works with `bagging_fraction` to control how often bagging is applied.                                                                                                                                                |
| `min_child_samples`      | Minimum number of data points required in a leaf. Larger values prevent overfitting by limiting leaf size.                                                                                                     | Medium: Controls overfitting by limiting the size of leaf nodes. Higher values may lead to underfitting.                                                                                                                       |
| `colsample_bytree`       | Alias for `feature_fraction`. Fraction of features to randomly select for each tree.                                                                                                                            | Medium: Similar to `feature_fraction`, reduces overfitting and speeds up training.                                                                                                                                            |
| `subsample`              | Alias for `bagging_fraction`. Fraction of data to randomly select for each iteration.                                                                                                                           | Medium: Similar to `bagging_fraction`, reduces overfitting and improves generalization.                                                                                                                                       |
| `reg_alpha`              | L1 regularization term on weights. Adds a penalty for large coefficients to reduce overfitting.                                                                                                                | Medium: Helps reduce overfitting, especially in high-dimensional data.                                                                                                                                                        |
| `reg_lambda`             | L2 regularization term on weights. Adds a penalty for large coefficients to reduce overfitting.                                                                                                                | Medium: Similar to `reg_alpha`, but penalizes squared coefficients.                                                                                                                                                           |
| `max_bin`                | Maximum number of bins for discretizing continuous features. Higher values improve accuracy but increase training time.                                                                                         | Medium: Affects how continuous features are bucketed. Higher values improve precision but increase computational cost.                                                                                                         |
| `min_split_gain`         | Minimum gain required to split a node. Higher values prevent splitting nodes with low information gain.                                                                                                         | Medium-Low: Helps control overfitting by limiting unnecessary splits.                                                                                                                                                         |
| `boosting_type`          | Type of boosting algorithm. Options: `gbdt` (default), `dart`, `goss`.                                                                                                                                          | Medium-Low: Affects the boosting strategy. `dart` and `goss` are alternatives to `gbdt` for specific use cases.                                                                                                               |
| `objective`              | Objective function to optimize. Common options: `regression`, `regression_l1`, `huber`, `fair`.                                                                                                                | Medium-Low: Determines the loss function. Impacts how the model optimizes predictions.                                                                                                                                         |
| `verbosity`              | Controls the level of logging. Higher values provide more detailed logs.                                                                                                                                       | Low: Does not affect model performance but helps with debugging.                                                                                                                                                              |
| `random_state`           | Seed for reproducibility. Ensures consistent results across runs.                                                                                                                                               | Low: Does not affect model performance but ensures reproducibility.                                                                                                                                                           |
| `early_stopping_round`   | Stops training if validation score does not improve for a specified number of rounds.                                                                                                                           | Low: Helps save time during training but does not directly affect model performance.                                                                                                                                           |