In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to the project's own modules
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Make the directory above the current working directory importable so that
# `from src...` imports in later cells resolve.
# NOTE(review): this relies on the kernel's working directory being one level
# below the project root — confirm for how the notebook is launched.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Only add the entry once: re-running this cell must not keep growing sys.path
# with duplicate copies of the same directory.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Location of the tabular Citi Bike dataset inside the transformed-data folder.
# The preview below shows lag columns rides_t-672 ... rides_t-1 plus `target`,
# `pickup_hour` and `pickup_location_id`.
tabular_data_path = TRANSFORMED_DATA_DIR / "citibike_tabular_data_28d.parquet"
df = pd.read_parquet(tabular_data_path)

# Preview the first rows as the cell's output.
df.head()


Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,target,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,2024-02-03,HB101
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-02-04,HB101
2,0,0,0,0,0,0,1,6,5,3,...,0,0,0,0,0,0,0,0,2024-02-05,HB101
3,4,5,4,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,2024-02-06,HB101
4,0,0,0,0,0,0,0,3,9,2,...,4,7,7,3,5,3,1,0,2024-02-07,HB101


In [4]:
from datetime import datetime
from src.data_utils import split_time_series_data

# Timestamp handed to the splitter as the train/test boundary; it has to fall
# inside the date range covered by `df`.
# NOTE(review): presumably rows before the cutoff become the training set —
# confirm against split_time_series_data.
split_cutoff = datetime(2025, 1, 1)

X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=split_cutoff,
    target_column="target",
)

# Report the shape of every split, train first, then test.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)


(1665, 674)
(1665,)
(600, 674)
(600,)


In [5]:
# Inspect the training features. Note that the displayed index labels run up to
# 2143 while the frame has only 1665 rows, so the split apparently keeps the
# original frame's index labels rather than resetting them.
X_train

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,2024-02-03,HB101
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-02-04,HB101
2,0,0,0,0,0,0,1,6,5,3,...,0,0,0,0,0,0,0,0,2024-02-05,HB101
3,4,5,4,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,2024-02-06,HB101
4,0,0,0,0,0,0,0,3,9,2,...,8,4,7,7,3,5,3,1,2024-02-07,HB101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-12-27,JC115
2141,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-12-28,JC115
2142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-12-29,JC115
2143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-12-30,JC115


In [6]:
# Keep just the lagged ride-count columns (names beginning with "rides_"),
# leaving out the non-numeric pickup_hour and pickup_location_id columns.
past_ride_columns = list(
    filter(lambda name: name.startswith("rides_"), X_train.columns)
)

# Apply the same column selection to both splits so they stay aligned.
X_train_only_numeric, X_test_only_numeric = (
    split[past_ride_columns] for split in (X_train, X_test)
)


In [7]:
import xgboost as xgb

# Hyperparameters for the gradient-boosted regressor; everything not listed
# here is left at the library default.
xgb_params = {"max_depth": 10}

# Fit on the lag features only. The fit call stays the last expression so the
# notebook still renders the fitted estimator as the cell output.
model = xgb.XGBRegressor(**xgb_params)
model.fit(X=X_train_only_numeric, y=y_train)


In [8]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out split and score it with mean absolute error
# (expressed in the same unit as the target column).
predictions = model.predict(X_test_only_numeric)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)

# Print the score rounded to four decimals.
print(f"{test_mae:.4f}")


0.9136


In [9]:
# `os` is re-imported here but not referenced directly in this cell.
import os

from dotenv import load_dotenv

from src.experiment_utils import log_model_to_mlflow, set_mlflow_tracking

# Pull variables from a local .env file into the process environment before the
# tracking helper runs (its log line reports "tracking URI and credentials set").
load_dotenv()
mlflow = set_mlflow_tracking()

# Experiment to log under, and the name of the metric that `score` belongs to.
# Per the run log, the registered model name ("XGBRegressor") differs from the
# experiment name.
experiment_name = "XGBoost-Citibike"
metric_name = "mean_absolute_error"

# The test features are passed alongside the model; the log shows a model
# signature being inferred. The call is the last expression, so the returned
# ModelInfo object is displayed as the cell output.
log_model_to_mlflow(
    model,
    X_test_only_numeric,
    experiment_name,
    metric_name,
    score=test_mae,
)


INFO:src.experiment_utils:MLflow tracking URI and credentials set.
2025/05/10 05:03:22 INFO mlflow.tracking.fluent: Experiment with name 'XGBoost-Citibike' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: XGBoost-Citibike
INFO:src.experiment_utils:Logged mean_absolute_error: 0.9135712527783258
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/10 05:03:35 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Successfully registered model 'XGBRegressor'.
2025/05/10 05:03:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBRegressor, version 1
Created version '1' of model 'XGBRegressor'.
INFO:src.experiment_utils:Model logged with name: XGBRegressor


🏃 View run bright-moose-380 at: https://dagshub.com/yaseensiddiqui36/cda500_final_project_citi_bike.mlflow/#/experiments/3/runs/3ce29e3ae94946a9aa5a487f240368d1
🧪 View experiment at: https://dagshub.com/yaseensiddiqui36/cda500_final_project_citi_bike.mlflow/#/experiments/3


<mlflow.models.model.ModelInfo at 0x1e750a49790>