In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [64]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df

Unnamed: 0,rides_t-112,rides_t-111,rides_t-110,rides_t-109,rides_t-108,rides_t-107,rides_t-106,rides_t-105,rides_t-104,rides_t-103,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,4,49,42,114,57,7,110,112,86,14,...,39,66,205,111,26,30,56,2024-01-28 18:00:00,5626,70
1,49,42,114,57,7,110,112,86,14,103,...,66,205,111,26,30,56,70,2024-01-29 00:00:00,5626,12
2,42,114,57,7,110,112,86,14,103,135,...,205,111,26,30,56,70,12,2024-01-29 06:00:00,5626,112
3,114,57,7,110,112,86,14,103,135,97,...,111,26,30,56,70,12,112,2024-01-29 12:00:00,5626,123
4,57,7,110,112,86,14,103,135,97,10,...,26,30,56,70,12,112,123,2024-01-29 18:00:00,5626,99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6760,108,11,160,180,115,15,162,192,111,9,...,15,37,131,69,10,63,171,2024-12-30 18:00:00,6072,80
6761,11,160,180,115,15,162,192,111,9,139,...,37,131,69,10,63,171,80,2024-12-31 00:00:00,6072,15
6762,160,180,115,15,162,192,111,9,139,162,...,131,69,10,63,171,80,15,2024-12-31 06:00:00,6072,87
6763,180,115,15,162,192,111,9,139,162,120,...,69,10,63,171,80,15,87,2024-12-31 12:00:00,6072,142


In [65]:
corr = df.corr()
corr = corr["target"].nlargest(26)
corr.index

Index(['target', 'rides_t-4', 'rides_t-28', 'rides_t-56', 'rides_t-84',
       'rides_t-24', 'rides_t-32', 'rides_t-112', 'rides_t-52', 'rides_t-60',
       'rides_t-8', 'rides_t-88', 'rides_t-108', 'rides_t-80', 'rides_t-20',
       'rides_t-36', 'rides_t-12', 'rides_t-48', 'rides_t-16', 'rides_t-64',
       'rides_t-40', 'rides_t-44', 'rides_t-76', 'rides_t-92', 'rides_t-104',
       'rides_t-72'],
      dtype='object')

In [66]:
# selected_cols = corr.index.tolist() + ["pickup_hour","pickup_location_id"]
# df_corr = df[selected_cols]
# df_corr

In [67]:
from datetime import datetime

from src.data_utils import split_time_series_data

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2024, 10, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(4925, 114)
(4925,)
(1840, 114)
(1840,)


In [68]:
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

In [69]:
def average_rides_last_4_weeks(X: pd.DataFrame) -> pd.DataFrame:
    last_4_weeks_columns = [
            f"rides_t-{7*4}",  # 1 week ago
            f"rides_t-{14*4}", # 2 weeks ago
            f"rides_t-{21*4}", # 3 weeks ago
            f"rides_t-{28*4}"  # 4 weeks ago
        ]

        # Ensure the required columns exist in the test DataFrame
    for col in last_4_weeks_columns:
        if col not in X.columns:
            raise ValueError(f"Missing required column: {col}")

    # Calculate the average of the last 4 weeks
    X["average_rides_last_4_weeks"] = X[last_4_weeks_columns].mean(axis=1)

    return X

In [70]:
from sklearn.preprocessing import FunctionTransformer

add_feature_average_rides_last_4_weeks = FunctionTransformer(
    average_rides_last_4_weeks, validate=False
)

In [None]:
add_feature_average_rides_last_4_weeks.fit_transform(X_train)

In [91]:
from sklearn.base import BaseEstimator, TransformerMixin

class TemporalFeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_ = X.copy()
        X_["hour"] = X_["pickup_hour"].dt.hour
        X_["day_of_week"] = X_["pickup_hour"].dt.dayofweek
        X_["is_weekend"] = X_["day_of_week"].isin([5, 6]).astype(int)


        return X_.drop(columns=["pickup_hour", "pickup_location_id"])

In [92]:
add_temporal_features = TemporalFeatureEngineer()
add_temporal_features.fit_transform(X_train)

Unnamed: 0,rides_t-112,rides_t-111,rides_t-110,rides_t-109,rides_t-108,rides_t-107,rides_t-106,rides_t-105,rides_t-104,rides_t-103,...,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,average_rides_last_4_weeks,hour,day_of_week,is_weekend
0,4,49,42,114,57,7,110,112,86,14,...,66,205,111,26,30,56,60.00,18,6,1
1,49,42,114,57,7,110,112,86,14,103,...,205,111,26,30,56,70,18.25,0,0,0
2,42,114,57,7,110,112,86,14,103,135,...,111,26,30,56,70,12,83.50,6,0,0
3,114,57,7,110,112,86,14,103,135,97,...,26,30,56,70,12,112,114.25,12,0,0
4,57,7,110,112,86,14,103,135,97,10,...,30,56,70,12,112,123,79.50,18,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4920,163,31,88,245,142,14,250,271,227,15,...,112,119,134,37,45,114,188.50,18,6,1
4921,31,88,245,142,14,250,271,227,15,242,...,119,134,37,45,114,130,21.00,0,0,0
4922,88,245,142,14,250,271,227,15,242,294,...,134,37,45,114,130,16,190.75,6,0,0
4923,245,142,14,250,271,227,15,242,294,294,...,37,45,114,130,16,227,267.75,12,0,0


In [109]:
from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA



# --- 3. Scale the Features ---
# PCA is sensitive to the scale of the features. It's crucial to scale them.
scaler = StandardScaler()
# pca = PCA(n_components=0.98)

In [110]:
import lightgbm as lgb

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(
    add_feature_average_rides_last_4_weeks,
    add_temporal_features,
    scaler,
    lgb.LGBMRegressor()
)



In [111]:
X_train

Unnamed: 0,rides_t-112,rides_t-111,rides_t-110,rides_t-109,rides_t-108,rides_t-107,rides_t-106,rides_t-105,rides_t-104,rides_t-103,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,average_rides_last_4_weeks
0,4,49,42,114,57,7,110,112,86,14,...,39,66,205,111,26,30,56,2024-01-28 18:00:00,5626,60.00
1,49,42,114,57,7,110,112,86,14,103,...,66,205,111,26,30,56,70,2024-01-29 00:00:00,5626,18.25
2,42,114,57,7,110,112,86,14,103,135,...,205,111,26,30,56,70,12,2024-01-29 06:00:00,5626,83.50
3,114,57,7,110,112,86,14,103,135,97,...,111,26,30,56,70,12,112,2024-01-29 12:00:00,5626,114.25
4,57,7,110,112,86,14,103,135,97,10,...,26,30,56,70,12,112,123,2024-01-29 18:00:00,5626,79.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4920,163,31,88,245,142,14,250,271,227,15,...,60,112,119,134,37,45,114,2024-09-29 18:00:00,6072,188.50
4921,31,88,245,142,14,250,271,227,15,242,...,112,119,134,37,45,114,130,2024-09-30 00:00:00,6072,21.00
4922,88,245,142,14,250,271,227,15,242,294,...,119,134,37,45,114,130,16,2024-09-30 06:00:00,6072,190.75
4923,245,142,14,250,271,227,15,242,294,294,...,134,37,45,114,130,16,227,2024-09-30 12:00:00,6072,267.75


In [112]:
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008605 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28831
[LightGBM] [Info] Number of data points in the train set: 4925, number of used features: 116
[LightGBM] [Info] Start training from score 191.778680


In [113]:
from sklearn.metrics import mean_absolute_error
predictions = pipeline.predict(X_test)

In [114]:
test_mae = mean_absolute_error(y_test, predictions)
print(f"{test_mae:.4f}")

34.7374


In [115]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
load_dotenv() 

mlflow = set_mlflow_tracking()
log_model_to_mlflow(pipeline, X_test, "LGBMRegressorWFE", "mean_absolute_error", score=test_mae)

INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


INFO:dagshub:Accessing as thanoojlingampally
INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/repos/thanoojlingampally/citi_bike "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


INFO:dagshub:Initialized MLflow to track repo "thanoojlingampally/citi_bike"


INFO:dagshub:Repository thanoojlingampally/citi_bike initialized!
INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/11 04:36:23 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorWFE' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: LGBMRegressorWFE
INFO:src.experiment_utils:Logged mean_absolute_error: 34.73740653282677
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'Pipeline'.
2025/05/11 04:36:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 1
Created version '1' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged with name: Pipeline


🏃 View run skittish-roo-214 at: https://dagshub.com/thanoojlingampally/citi_bike.mlflow/#/experiments/5/runs/fb13f41859ef4f228f565477b55206cb
🧪 View experiment at: https://dagshub.com/thanoojlingampally/citi_bike.mlflow/#/experiments/5


<mlflow.models.model.ModelInfo at 0x27908a7eaf0>