In [1]:
import sys
from pathlib import Path

# Make the project root (the parent of the current working directory)
# importable so that `src/` can be imported. The membership check keeps this
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
PROJECT_ROOT = str(Path("..").resolve())
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from dotenv import load_dotenv
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow

In [4]:
# === Read the transformed tabular dataset from disk ===
df = pd.read_parquet(
    path="../data/transformed/tabular_data.parquet",
)

In [5]:

# === Align column names with the previous pipeline ===
# The mapping is applied along the column axis; the result replaces `df`.
df = df.rename(
    mapper={
        "start_hour": "pickup_hour",
        "start_station_id": "pickup_location_id",
    },
    axis="columns",
)

In [6]:
# === Train/test split ===
# Delegates to the project helper, using 2024-04-01 as the cutoff date and
# the `target` column as the label.
X_train, y_train, X_test, y_test = split_time_series_data(
    target_column="target",
    cutoff_date=datetime(year=2024, month=4, day=1),
    df=df,
)

In [7]:

# === Baseline model: Previous Hour ===
class BaselineModelPreviousHour:
    """Naive baseline that forecasts demand as the previous hour's ride count.

    The model has no learnable state: every prediction is read directly from
    the ``rides_t-1`` feature column of the input frame.
    """

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Do nothing; present so the class exposes a fit/predict pair.

        Args:
            X_train: Training features (unused).
            y_train: Training targets (unused).
        """
        return None

    def predict(self, X_test: pd.DataFrame) -> np.ndarray:
        """Return the rounded ``rides_t-1`` value of each row as an integer array.

        Args:
            X_test: Feature frame that contains a ``rides_t-1`` column.

        Returns:
            A 1-D integer ``np.ndarray`` with one prediction per row, in row
            order.

        Raises:
            KeyError: If ``X_test`` has no ``rides_t-1`` column.
        """
        lagged_rides = np.round(X_test["rides_t-1"]).astype(int)
        # Strip the pandas index so the result matches the declared return
        # type and `predictions[i]` is positional rather than a label lookup
        # against the test-set index.
        return lagged_rides.to_numpy()

In [8]:

# === Score the baseline on the test split ===
# No fit step is run here: the cell goes straight from construction to predict.
model = BaselineModelPreviousHour()
predictions = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"📉 MAE: {mae:.4f}")

📉 MAE: 1.3222


In [12]:
# === Record the baseline run in MLflow ===
# Environment variables are loaded first, then tracking is configured through
# the project helper before the model and its MAE are logged.
load_dotenv()
mlflow = set_mlflow_tracking()

log_model_to_mlflow(
    score=mae,
    metric_name="mae",
    experiment_name="BaselineModelPreviousHour",
    input_data=X_test,
    model=model,
)


INFO:src.experiment_utils:MLflow tracking URI and credentials set.
INFO:src.experiment_utils:Experiment set to: BaselineModelPreviousHour
INFO:src.experiment_utils:Logged mae: 1.3222222222222222
INFO:src.experiment_utils:Model signature inferred.
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1749.29it/s]
2025/05/09 20:52:24 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'BaselineModelPreviousHour' already exists. Creating a new version of this model...
2025/05/09 20:54:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: BaselineModelPreviousHour, version 2
Created version '2' of model 'BaselineModelP

🏃 View run thundering-smelt-456 at: https://dagshub.com/vidyuthkrishna03/citibike-ride-prediction.mlflow/#/experiments/0/runs/128952f046d547f8be6101ab1273fbc7
🧪 View experiment at: https://dagshub.com/vidyuthkrishna03/citibike-ride-prediction.mlflow/#/experiments/0


<mlflow.models.model.ModelInfo at 0x1e80f06ee90>

In [9]:
from src.plot_utils import plot_prediction

# Plot a single prediction: the first test row and its forecast.
# The forecast is taken positionally via `np.asarray(...)[0]` so that it always
# pairs with `X_test.head(1)`. A plain `predictions[0]` is a label lookup when
# `predictions` is a pandas Series, which can raise or select a different row
# if the test index does not start at 0.
plot_prediction(
    X_test.head(1),
    pd.DataFrame({"predicted_demand": [np.asarray(predictions)[0]]}),
)
