Import Statements

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from dotenv import load_dotenv
load_dotenv()
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from src.plot_utils import plot_aggregated_time_series

Data Loading and Train/Test Split

In [2]:
# Load the tabular data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
# Shown even though it is not the cell's last expression, because the import cell
# sets InteractiveShell.ast_node_interactivity = "all".
df.head(5)

# Split the data into training and testing sets at the cutoff date (2025-01-01 00:00).
# In the recorded run the data starts at 2024-01-29 and the first test rows are
# stamped 2025-01-01 00:00, so:
# Training period: 2024-01-29 up to (not including) 2025-01-01
# Test period: 2025-01-01 onward
# NOTE(review): the exact boundary handling lives in split_time_series_data — confirm there.
X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2025, 1, 1, 0, 0, 0),
    target_column="target"
)

# Sanity check: features and targets of each split should have the same row count.
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,start_station_name,target
0,4,9,3,0,2,0,0,0,1,2,...,4,6,8,10,7,2,0,2024-01-29 00:00:00,8 Ave & W 31 St,4
1,9,3,0,2,0,0,0,1,2,3,...,6,8,10,7,2,0,4,2024-01-29 01:00:00,8 Ave & W 31 St,0
2,3,0,2,0,0,0,1,2,3,3,...,8,10,7,2,0,4,0,2024-01-29 02:00:00,8 Ave & W 31 St,1
3,0,2,0,0,0,1,2,3,3,8,...,10,7,2,0,4,0,1,2024-01-29 03:00:00,8 Ave & W 31 St,1
4,2,0,0,0,1,2,3,3,8,5,...,7,2,0,4,0,1,1,2024-01-29 04:00:00,8 Ave & W 31 St,1


(24336, 674)
(24336,)
(2232, 674)
(2232,)


Baseline Model using Previous Hour

In [3]:
# Baseline Model: Previous Hour
class BaselineModelPreviousHour:
    """Naive baseline: predict each row's target as its ``rides_t-1`` lag feature.

    The model has no learnable parameters; it only exposes the ``fit``/``predict``
    pair so it can be evaluated and logged like the other models in this notebook.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Do nothing; present only for interface compatibility.

        Args:
            X_train: Training features (unused).
            y_train: Training targets (unused).
        """
        pass

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """Return the previous-hour lag as the prediction for every row.

        Args:
            X_test: Feature frame that contains a ``rides_t-1`` column.

        Returns:
            One-dimensional NumPy array with one prediction per row of ``X_test``.

        Raises:
            KeyError: If ``X_test`` has no ``rides_t-1`` column.
        """
        # Convert to a NumPy array so the return value matches the annotation and
        # the output type of the other baseline models.
        return X_test["rides_t-1"].to_numpy()

# Evaluate the previous-hour baseline on the test split. fit() is not called:
# it is a no-op for this model and predict() only reads a lag column of X_test.
model = BaselineModelPreviousHour()
predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f"BaselineModelPreviousHour MAE: {test_mae:.4f}")
# MLflow tracking is configured only here; the later baseline cells call
# log_model_to_mlflow without repeating this step, so presumably they rely on
# this cell having been run first.
mlflow = set_mlflow_tracking()
# Log the model together with its test MAE under the name "BaselineModelPreviousHour".
log_model_to_mlflow(model, X_test, "BaselineModelPreviousHour", "mean_absolute_error", score=test_mae)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


BaselineModelPreviousHour MAE: 4.1080


INFO:src.experiment_utils:Experiment set to: BaselineModelPreviousHour
INFO:src.experiment_utils:Logged mean_absolute_error: 4.107974910394265
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/01 00:10:35 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'BaselineModelPreviousHour' already exists. Creating a new version of this model...
2025/05/01 00:10:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: BaselineModelPreviousHour, version 4
Created version '4' of model 'BaselineModelPreviousHour'.
INFO:src.experiment_utils:Model logged with name: BaselineModelPreviousHour


🏃 View run fearless-mare-683 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/0/runs/97b1935a03e04392925bf7b874eb8376
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/0


<mlflow.models.model.ModelInfo at 0x1a0e00d5b20>

Baseline Model using the past week (7 days * 24 hours)

In [4]:
# Baseline Model: Previous Week
class BaselineModelPreviousWeek:
    """Naive baseline: predict each row's target as its lag feature from 7 * 24 hours earlier.

    The model has no learnable parameters; it only exposes the ``fit``/``predict``
    pair so it can be evaluated and logged like the other models in this notebook.
    """

    # Lag, in hours, of the feature column used as the prediction (one week).
    LAG_HOURS = 7 * 24

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Do nothing; present only for interface compatibility.

        Args:
            X_train: Training features (unused).
            y_train: Training targets (unused).
        """
        pass

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """Return the one-week lag as the prediction for every row.

        Args:
            X_test: Feature frame that contains a ``rides_t-168`` column.

        Returns:
            One-dimensional NumPy array with one prediction per row of ``X_test``.

        Raises:
            KeyError: If ``X_test`` has no ``rides_t-168`` column.
        """
        # Convert to a NumPy array so the return value matches the annotation and
        # the output type of the other baseline models.
        return X_test[f"rides_t-{self.LAG_HOURS}"].to_numpy()

# Evaluate the previous-week baseline on the test split and log it to MLflow.
# NOTE: `model`, `predictions` and `test_mae` are rebound here, replacing the
# values left behind by the previous-hour baseline cell.
model = BaselineModelPreviousWeek()
predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f"BaselineModelPreviousWeek MAE: {test_mae:.4f}")
# No set_mlflow_tracking() call here: tracking was configured in the previous-hour cell.
log_model_to_mlflow(model, X_test, "BaselineModelPreviousWeek", "mean_absolute_error", score=test_mae)

INFO:src.experiment_utils:Experiment set to: BaselineModelPreviousWeek


BaselineModelPreviousWeek MAE: 4.8396


INFO:src.experiment_utils:Logged mean_absolute_error: 4.839605734767025
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'BaselineModelPreviousWeek' already exists. Creating a new version of this model...
2025/05/01 00:11:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: BaselineModelPreviousWeek, version 3
Created version '3' of model 'BaselineModelPreviousWeek'.
INFO:src.experiment_utils:Model logged with name: BaselineModelPreviousWeek


🏃 View run rare-bear-104 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/1/runs/1e107ccb0939495dae08687bc7575362
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/1


<mlflow.models.model.ModelInfo at 0x1a0e0b25100>

Baseline Model using the Past 4 Weeks (average of the values 7, 14, 21, and 28 days ago, i.e. lags of 1-4 × 7 × 24 hours)

In [5]:
# Baseline Model: Last 4 Weeks
class BaselineModelLast4Weeks:
    """Naive baseline: predict the mean of the lag features from 1, 2, 3 and 4 weeks earlier.

    The model has no learnable parameters; it only exposes the ``fit``/``predict``
    pair so it can be evaluated and logged like the other models in this notebook.
    """

    # Lags, in hours, of the feature columns that are averaged (1 to 4 weeks ago).
    WEEKLY_LAG_HOURS = (7 * 24, 14 * 24, 21 * 24, 28 * 24)

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Do nothing; present only for interface compatibility.

        Args:
            X_train: Training features (unused).
            y_train: Training targets (unused).
        """
        pass

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """Average the four weekly lag columns row by row.

        Args:
            X_test: Feature frame containing ``rides_t-168``, ``rides_t-336``,
                ``rides_t-504`` and ``rides_t-672``.

        Returns:
            One-dimensional NumPy array with one prediction per row of ``X_test``.

        Raises:
            ValueError: If any of the required lag columns is missing; the message
                names the first missing column in lag order.
        """
        last_4_weeks_columns = [f"rides_t-{lag}" for lag in self.WEEKLY_LAG_HOURS]
        # Fail early with a clear message instead of pandas' KeyError on selection.
        for col in last_4_weeks_columns:
            if col not in X_test.columns:
                raise ValueError(f"Missing required column: {col}")
        return X_test[last_4_weeks_columns].mean(axis=1).to_numpy()

# Evaluate the 4-week-average baseline on the test split and log it to MLflow.
# NOTE: `model`, `predictions` and `test_mae` are rebound here; the plotting cell
# further down reads `predictions`, so it visualises this model's output only if
# this cell was the last baseline cell executed.
model = BaselineModelLast4Weeks()
predictions = model.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f"BaselineModelLast4Weeks MAE: {test_mae:.4f}")
# No set_mlflow_tracking() call here: tracking was configured in the previous-hour cell.
log_model_to_mlflow(model, X_test, "BaselineModelLast4Weeks", "mean_absolute_error", score=test_mae)

INFO:src.experiment_utils:Experiment set to: BaselineModelLast4Weeks


BaselineModelLast4Weeks MAE: 3.9582


INFO:src.experiment_utils:Logged mean_absolute_error: 3.9582213261648747
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'BaselineModelLast4Weeks' already exists. Creating a new version of this model...
2025/05/01 00:11:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: BaselineModelLast4Weeks, version 3
Created version '3' of model 'BaselineModelLast4Weeks'.
INFO:src.experiment_utils:Model logged with name: BaselineModelLast4Weeks


🏃 View run carefree-shad-639 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/2/runs/f57a495dbac84f49aabdca4eb77e6894
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/2


<mlflow.models.model.ModelInfo at 0x1a0e1866d80>

Plotting predictions from the last baseline model (4-week average)

In [6]:
# Choose which test-set row to visualise. If the requested position does not
# exist in X_test, fall back to the row in the middle of the test set.
row_id = 1000
n_test_rows = len(X_test)
if not row_id < n_test_rows:
    row_id = n_test_rows // 2
    print(f"Original row_id out of bounds. Using row_id = {row_id} (middle of test set)")

# Preview the first test rows of the station that owns the selected row. The
# frame is displayed although it is not the cell's last expression because the
# import cell sets InteractiveShell.ast_node_interactivity = "all".
station_name = X_test.iloc[row_id]['start_station_name']
X_test[X_test['start_station_name'] == station_name].head()

# `predictions` is whatever the most recently executed baseline cell produced.
plot_aggregated_time_series(X_test, y_test, row_id, predictions)

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,start_station_name
744,4,2,0,1,1,2,8,13,22,22,...,30,19,14,12,4,0,0,3,2025-01-01 00:00:00,University Pl & E 14 St
745,2,0,1,1,2,8,13,22,22,18,...,19,14,12,4,0,0,3,6,2025-01-01 01:00:00,University Pl & E 14 St
746,0,1,1,2,8,13,22,22,18,12,...,14,12,4,0,0,3,6,4,2025-01-01 02:00:00,University Pl & E 14 St
747,1,1,2,8,13,22,22,18,12,16,...,12,4,0,0,3,6,4,4,2025-01-01 03:00:00,University Pl & E 14 St
748,1,2,8,13,22,22,18,12,16,25,...,4,0,0,3,6,4,4,5,2025-01-01 04:00:00,University Pl & E 14 St
