In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `src.*` imports used by later cells resolve. The membership check keeps this
# cell idempotent: re-running it does not append the same entry a second time.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Read the tabular dataset stored under TRANSFORMED_DATA_DIR and leave the
# frame as the cell's last expression so the notebook renders it.
df = pd.read_parquet(
    path=TRANSFORMED_DATA_DIR / "tabular_data.parquet",
)
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-29,4074.14,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-30,4074.14,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-31,4074.14,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-01,4074.14,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-02,4074.14,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69285,0,0,0,0,0,0,0,0,0,0,...,4,0,1,3,0,0,1,2024-12-27,JC116,0
69286,0,0,0,0,0,0,0,0,2,0,...,0,1,2,0,6,2,1,2024-12-28,JC116,0
69287,0,0,0,0,0,1,0,0,2,0,...,1,1,1,0,0,0,0,2024-12-29,JC116,3
69288,0,0,0,0,0,0,2,0,0,0,...,2,1,0,1,3,1,2,2024-12-30,JC116,1


In [5]:
from datetime import datetime
from src.data_utils import split_time_series_data

# Split `df` into train/test features and targets with the project helper,
# using 2024-09-01 00:00:00 as the cutoff and "target" as the label column.
X_train, y_train, X_test, y_test = split_time_series_data(
    df, cutoff_date=datetime(2024, 9, 1), target_column="target"
)

# One shape per line: train features, train target, test features, test target.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")


(44280, 674)
(44280,)
(25010, 674)
(25010,)


In [6]:
# Restrict both splits to the lag features, i.e. every column whose name
# begins with "rides_".
# NOTE(review): the two *_only_numeric frames are not referenced by any later
# cell visible in this notebook - confirm whether they are still needed.
past_ride_columns = [name for name in X_train.columns if name.startswith("rides_")]
X_train_only_numeric = X_train.loc[:, past_ride_columns]
X_test_only_numeric = X_test.loc[:, past_ride_columns]

In [7]:
import pandas as pd

def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    """Add the mean of the four weekly lag columns as a new feature.

    The lags averaged are 168, 336, 504 and 672 steps back, which is 1-4
    weeks when each step is one hour.

    Parameters
    ----------
    X : pd.DataFrame
        Frame holding the ``rides_t-168``, ``rides_t-336``, ``rides_t-504``
        and ``rides_t-672`` columns.

    Returns
    -------
    pd.DataFrame
        A new frame with every column of ``X`` plus
        ``average_rides_last_4_weeks``; ``X`` itself is not modified.

    Raises
    ------
    ValueError
        If any of the four lag columns is absent. The message names the
        first missing column, in 1-week to 4-week order.
    """
    weekly_lag_columns = [
        f"rides_t-{7*24}",   # 1 week ago
        f"rides_t-{14*24}",  # 2 weeks ago
        f"rides_t-{21*24}",  # 3 weeks ago
        f"rides_t-{28*24}"   # 4 weeks ago
    ]

    # Fail fast on the first lag column that the frame does not provide.
    col = next(
        (name for name in weekly_lag_columns if name not in X.columns),
        None,
    )
    if col is not None:
        raise ValueError(f"Missing required column: {col}")

    # `assign` returns a new frame, so the caller's DataFrame stays untouched.
    return X.assign(
        average_rides_last_4_weeks=X[weekly_lag_columns].mean(axis=1)
    )


In [8]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the plain function as a scikit-learn transformer so it can serve as a
# pipeline step; input validation is switched off (validate=False).
add_feature_average_rides_last_4_weeks = FunctionTransformer(
    func=average_rides_last_4_weeks,
    validate=False,
)

In [9]:
# Preview the transformer on the training features; the displayed frame
# should carry the extra `average_rides_last_4_weeks` column.
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-29,4074.14,0.00
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-30,4074.14,0.00
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-31,4074.14,0.00
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-01,4074.14,0.00
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-02,4074.14,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44275,1,1,0,0,0,0,0,1,6,1,...,12,12,5,7,2,1,3,2024-08-27,JC116,1.00
44276,2,0,0,0,0,0,0,2,2,2,...,21,7,7,8,1,1,0,2024-08-28,JC116,0.75
44277,0,0,0,0,0,0,0,2,5,2,...,16,10,10,5,2,0,0,2024-08-29,JC116,0.25
44278,1,0,0,0,0,0,1,0,0,2,...,12,12,6,2,3,1,1,2024-08-30,JC116,0.75


In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that derives calendar features from 'pickup_hour'.

    Adds an 'hour' column (hour of day) and a 'day_of_week' column to a copy
    of the input frame. When ``drop_original`` is true, the 'pickup_hour' and
    'pickup_location_id' columns are removed from the result.
    """

    def __init__(self, drop_original=True):
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with 'hour' and 'day_of_week' added."""
        frame = X.copy()

        # Coerce 'pickup_hour' to datetime only when it is not one already.
        already_datetime = pd.api.types.is_datetime64_any_dtype(frame["pickup_hour"])
        if not already_datetime:
            frame["pickup_hour"] = pd.to_datetime(frame["pickup_hour"])

        # Both features come from the same datetime accessor.
        pickup_time = frame["pickup_hour"].dt
        frame = frame.assign(
            hour=pickup_time.hour,
            day_of_week=pickup_time.dayofweek,
        )

        # Keep the source columns when the caller asked for them.
        if not self.drop_original:
            return frame
        return frame.drop(columns=["pickup_hour", "pickup_location_id"])


In [11]:
# Build the temporal-feature transformer with its default setting spelled out,
# then preview what it produces on the training features.
add_temporal_features = TemporalFeatureEngineer(drop_original=True)
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,hour,day_of_week
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44275,1,1,0,0,0,0,0,1,6,1,...,8,12,12,5,7,2,1,3,0,1
44276,2,0,0,0,0,0,0,2,2,2,...,10,21,7,7,8,1,1,0,0,2
44277,0,0,0,0,0,0,0,2,5,2,...,4,16,10,10,5,2,0,0,0,3
44278,1,0,0,0,0,0,1,0,0,2,...,7,12,12,6,2,3,1,1,0,4


In [12]:
import lightgbm as lgb
from sklearn.pipeline import make_pipeline

# Assemble the full model: the 4-week-average feature step, then the temporal
# feature step, then a LightGBM regressor with a fixed random_state.
pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    lgb.LGBMRegressor(random_state=42)
)


In [13]:
# Display the training features that are passed to `pipeline.fit` in the next cell.
X_train

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-29,4074.14
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-30,4074.14
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-31,4074.14
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-02-01,4074.14
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-02-02,4074.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44275,1,1,0,0,0,0,0,1,6,1,...,8,12,12,5,7,2,1,3,2024-08-27,JC116
44276,2,0,0,0,0,0,0,2,2,2,...,10,21,7,7,8,1,1,0,2024-08-28,JC116
44277,0,0,0,0,0,0,0,2,5,2,...,4,16,10,10,5,2,0,0,2024-08-29,JC116
44278,1,0,0,0,0,0,1,0,0,2,...,7,12,12,6,2,3,1,1,2024-08-30,JC116


In [14]:
# Fit the two feature steps and the LightGBM regressor on the training split.
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.120462 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13688
[LightGBM] [Info] Number of data points in the train set: 44280, number of used features: 674
[LightGBM] [Info] Start training from score 0.164792


In [15]:
from sklearn.metrics import mean_absolute_error
# Run the fitted pipeline on the test features.
predictions = pipeline.predict(X_test)

In [16]:
# Mean absolute error of the test predictions, printed with four decimals.
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

0.1966


In [17]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
from sklearn.metrics import mean_absolute_error

# Load environment variables (e.g., for Hopsworks, remote MLflow, etc.)
load_dotenv()

# Set up MLflow client
mlflow = set_mlflow_tracking()

# Recompute the predictions and the test MAE in this cell so the logged score
# always comes from the same pipeline object that is being registered.
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)

# Log the final model. The helper returns an MLflow ModelInfo object; binding
# it to a name keeps it available for later inspection and stops the notebook
# from echoing its bare repr as the cell output.
model_info = log_model_to_mlflow(
    model=pipeline,
    input_data=X_test,
    experiment_name="LGBMRegressorWFE",
    metric_name="mean_absolute_error",
    model_name="LGBMRegressorWFE",
    score=test_mae
)


INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/06 20:23:24 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorWFE' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE
INFO:src.experiment_utils:Logged mean_absolute_error: 0.1965799558451603
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/06 20:24:10 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'LGBMRegressorWFE'.
2025/05/06 20:25:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressorWFE, version 1
Created version '1' of model 'LGBMRegressorWFE'.
INFO:src.experiment_utils:Model logged with name: LGBMRegressorWFE


🏃 View run likeable-sponge-625 at: https://dagshub.com/vasukrishna001/cityBikes25_rides.mlflow/#/experiments/6/runs/195fe62cf9914f08940584c174e37650
🧪 View experiment at: https://dagshub.com/vasukrishna001/cityBikes25_rides.mlflow/#/experiments/6


<mlflow.models.model.ModelInfo at 0x17cde00bc50>