Import Statements

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
from datetime import datetime
import lightgbm as lgb
from dotenv import load_dotenv
load_dotenv()
from sklearn.metrics import mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow

Data Loading and Test Train split

In [2]:
# Load the tabular data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df.head(5)

# Split the data into training and testing sets
# Training period: January 2024 to August 2024
# Test period: September 2024 to January 2025
X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2025, 1, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Select only the numeric features (lagged ride counts)
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

(24336, 674)
(24336,)
(2232, 674)
(2232,)


In [3]:
# Feature Engineering: Average rides over the last 4 weeks
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
        f"rides_t-{7*24}",  # 1 week ago
        f"rides_t-{14*24}", # 2 weeks ago
        f"rides_t-{21*24}", # 3 weeks ago
        f"rides_t-{28*24}"  # 4 weeks ago
    ]

    # Ensure the required columns exist in the DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

In [4]:
add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,start_station_name,average_rides_last_4_weeks
0,4,9,3,0,2,0,0,0,1,2,...,4,6,8,10,7,2,0,2024-01-29 00:00:00,8 Ave & W 31 St,4.75
1,9,3,0,2,0,0,0,1,2,3,...,6,8,10,7,2,0,4,2024-01-29 01:00:00,8 Ave & W 31 St,2.75
2,3,0,2,0,0,0,1,2,3,3,...,8,10,7,2,0,4,0,2024-01-29 02:00:00,8 Ave & W 31 St,0.75
3,0,2,0,0,0,1,2,3,3,8,...,10,7,2,0,4,0,1,2024-01-29 03:00:00,8 Ave & W 31 St,0.00
4,2,0,0,0,1,2,3,3,8,5,...,7,2,0,4,0,1,1,2024-01-29 04:00:00,8 Ave & W 31 St,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24331,34,27,17,5,4,0,0,2,2,0,...,15,31,16,29,32,29,11,2024-12-31 19:00:00,W 21 St & 6 Ave,28.00
24332,27,17,5,4,0,0,2,2,0,1,...,31,16,29,32,29,11,12,2024-12-31 20:00:00,W 21 St & 6 Ave,22.50
24333,17,5,4,0,0,2,2,0,1,8,...,16,29,32,29,11,12,3,2024-12-31 21:00:00,W 21 St & 6 Ave,9.50
24334,5,4,0,0,2,2,0,1,8,16,...,29,32,29,11,12,3,2,2024-12-31 22:00:00,W 21 St & 6 Ave,4.50


In [5]:
# Feature Engineering: Temporal features (hour, day of week)
class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek
        return X_.drop(columns=["pickup_hour", "start_station_name"])

add_temporal_features = TemporalFeatureEngineer()
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,average_rides_last_4_weeks,hour,day_of_week
0,4,9,3,0,2,0,0,0,1,2,...,4,6,8,10,7,2,0,4.75,0,0
1,9,3,0,2,0,0,0,1,2,3,...,6,8,10,7,2,0,4,2.75,1,0
2,3,0,2,0,0,0,1,2,3,3,...,8,10,7,2,0,4,0,0.75,2,0
3,0,2,0,0,0,1,2,3,3,8,...,10,7,2,0,4,0,1,0.00,3,0
4,2,0,0,0,1,2,3,3,8,5,...,7,2,0,4,0,1,1,0.75,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24331,34,27,17,5,4,0,0,2,2,0,...,15,31,16,29,32,29,11,28.00,19,1
24332,27,17,5,4,0,0,2,2,0,1,...,31,16,29,32,29,11,12,22.50,20,1
24333,17,5,4,0,0,2,2,0,1,8,...,16,29,32,29,11,12,3,9.50,21,1
24334,5,4,0,0,2,2,0,1,8,16,...,29,32,29,11,12,3,2,4.50,22,1


In [6]:
# Create a pipeline with feature engineering and LightGBM
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor()
)

In [7]:
# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions and evaluate
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f"LightGBM with Feature Engineering MAE: {test_mae:.4f}")

# Log the model to MLflow
mlflow = set_mlflow_tracking()
log_model_to_mlflow(pipeline, X_test, "LGBMRegressorWFE", "mean_absolute_error", score=test_mae)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 66142
[LightGBM] [Info] Number of data points in the train set: 24336, number of used features: 675
[LightGBM] [Info] Start training from score 17.580950


INFO:src.experiment_utils:MLflow tracking URI and credentials set.


LightGBM with Feature Engineering MAE: 2.9993


INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE
INFO:src.experiment_utils:Logged mean_absolute_error: 2.9992951412572237
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/01 00:04:54 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'Pipeline' already exists. Creating a new version of this model...
2025/05/01 00:05:10 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 3
Created version '3' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged with name: Pipeline


🏃 View run worried-sloth-632 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/5/runs/25cde462f1de4fc597247fb9331f3f70
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/5


<mlflow.models.model.ModelInfo at 0x1b883193560>