In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Load Citi Bike tabular data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "citibike_tabular_data_28d.parquet")
df["pickup_hour"].info()


<class 'pandas.core.series.Series'>
RangeIndex: 2265 entries, 0 to 2264
Series name: pickup_hour
Non-Null Count  Dtype         
--------------  -----         
2265 non-null   datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 17.8 KB


In [2]:
from datetime import datetime
from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,  # or df_ts or your actual DataFrame
    cutoff_date=datetime(2025, 1, 1, 0, 0, 0),  # within your date range
    target_column="target"  # actual target column name
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)


(1665, 674)
(1665,)
(600, 674)
(600,)


In [3]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [f"rides_t-{7*24}", f"rides_t-{14*24}", f"rides_t-{21*24}", f"rides_t-{28*24}"]
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)
    return X


In [4]:
from sklearn.preprocessing import FunctionTransformer
add_feature_average_rides_last_4_weeks = FunctionTransformer(average_rides_last_4_weeks, validate=False)

from sklearn.base import BaseEstimator, TransformerMixin
class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek
        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

add_temporal_features = TemporalFeatureEngineer()

import lightgbm as lgb
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import mlflow

param_distributions = {
    "lgbmregressor__num_leaves": [2, 50, 70, 256],
    "lgbmregressor__max_depth": [-1, 10, 20, 30],
    "lgbmregressor__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "lgbmregressor__n_estimators": [100, 200, 500, 1000],
    "lgbmregressor__min_child_samples": [10, 20, 30, 50],
    "lgbmregressor__subsample": [0.6, 0.8, 1.0],
    "lgbmregressor__colsample_bytree": [0.6, 0.8, 1.0],
    "lgbmregressor__reg_alpha": [0, 0.1, 0.5, 1.0],
    "lgbmregressor__reg_lambda": [0, 0.1, 0.5, 1.0],
    "lgbmregressor__feature_fraction": [0.6, 0.7, 0.8, 0.9, 1.0], 
    "lgbmregressor__bagging_fraction": [0.6, 0.7, 0.8, 0.9, 1.0], 
    "lgbmregressor__bagging_freq": [1, 5, 10],
}

mlflow.start_run(run_name="LightGBM_LR_Tuning")

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)


Exception: Run with UUID ab56cda4354c4b8396e94db335140d0d is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [7]:
best_lr = random_search.best_params_["lgbmregressor__learning_rate"]
y_pred_lr = random_search.best_estimator_.predict(X_test)
mae_lr = mean_absolute_error(y_test, y_pred_lr)

mlflow.log_param("best_learning_rate", best_lr)
mlflow.log_metric("test_mae_lr", mae_lr)
print("Learning Rate Tuning - Best LR:", best_lr)
print("Learning Rate Tuning - Test MAE:", mae_lr)

mlflow.end_run()


Learning Rate Tuning - Best LR: 0.01
Learning Rate Tuning - Test MAE: 0.8665733777863102
üèÉ View run LightGBM_LR_Tuning at: https://dagshub.com/yaseensiddiqui36/cda500_final_project_citi_bike.mlflow/#/experiments/0/runs/ab56cda4354c4b8396e94db335140d0d
üß™ View experiment at: https://dagshub.com/yaseensiddiqui36/cda500_final_project_citi_bike.mlflow/#/experiments/0


In [8]:
print("Best Parameters:", random_search.best_params_)
print("Best Score (Negative MAE):", random_search.best_score_)

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print("Test Set MAE:", mae)


Best Parameters: {'lgbmregressor__subsample': 0.8, 'lgbmregressor__reg_lambda': 0.5, 'lgbmregressor__reg_alpha': 0, 'lgbmregressor__num_leaves': 2, 'lgbmregressor__n_estimators': 1000, 'lgbmregressor__min_child_samples': 30, 'lgbmregressor__max_depth': 10, 'lgbmregressor__learning_rate': 0.01, 'lgbmregressor__feature_fraction': 0.6, 'lgbmregressor__colsample_bytree': 1.0, 'lgbmregressor__bagging_freq': 5, 'lgbmregressor__bagging_fraction': 1.0}
Best Score (Negative MAE): -0.7058792715887047
Test Set MAE: 0.8665733777863102


In [9]:
param_distributions_2 = {
    "lgbmregressor__learning_rate": [best_lr],
    "lgbmregressor__num_leaves": [31, 50, 70, 100],
    "lgbmregressor__max_depth": [-1, 10, 20, 30],
    "lgbmregressor__n_estimators": [100, 200, 500],
}

mlflow.start_run(run_name="LightGBM_2nd_Tuning")

random_search_2 = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions_2,
    n_iter=5,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    random_state=42,
)
random_search_2.fit(X_train, y_train)

best_params = random_search_2.best_params_
y_pred_final = random_search_2.best_estimator_.predict(X_test)
mae_final = mean_absolute_error(y_test, y_pred_final)

mlflow.log_params(best_params)
mlflow.log_metric("test_mae_final", mae_final)

print("Other Parameters Tuning - Best Params:", best_params)
print("Other Parameters Tuning - Test MAE:", mae_final)

mlflow.end_run()


Fitting 3 folds for each of 5 candidates, totalling 15 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11497
[LightGBM] [Info] Number of data points in the train set: 1110, number of used features: 674
[LightGBM] [Info] Start training from score 0.530631
[CV] END lgbmregressor__learning_rate=0.01, lgbmregressor__max_depth=20, lgbmregressor__n_estimators=100, lgbmregressor__num_leaves=100; total time=   1.0s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12982
[LightGBM] [Info] Number of data points in the train set: 1110, number of used features: 674
[LightGBM] [Info] Start