Import statements

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
from datetime import datetime
import xgboost as xgb
from dotenv import load_dotenv
load_dotenv()
from sklearn.metrics import mean_absolute_error

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.config import TRANSFORMED_DATA_DIR
from src.data_utils import split_time_series_data
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow

Data Loading and Test Train split

In [2]:
# Load the tabular data
df = pd.read_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet")
df.head(5)

# Split the data into training and testing sets
# Training period: January 2024 to August 2024
# Test period: September 2024 to January 2025
X_train, y_train, X_test, y_test = split_time_series_data(
    df,
    cutoff_date=datetime(2025, 1, 1, 0, 0, 0),
    target_column="target"
)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Select only the numeric features (lagged ride counts)
past_ride_columns = [c for c in X_train.columns if c.startswith("rides_")]
X_train_only_numeric = X_train[past_ride_columns]
X_test_only_numeric = X_test[past_ride_columns]

(24336, 674)
(24336,)
(2232, 674)
(2232,)


XGBoost Model Predictions and Logging

In [3]:
# Train an XGBoost model
model = xgb.XGBRegressor(max_depth=10)
model.fit(X_train_only_numeric, y_train)

# Make predictions and evaluate
predictions = model.predict(X_test_only_numeric)
test_mae = mean_absolute_error(y_test, predictions)
print(f"XGBoost MAE: {test_mae:.4f}")

# Log the model to MLflow
mlflow = set_mlflow_tracking()
log_model_to_mlflow(model, X_test_only_numeric, "XGBoost", "mean_absolute_error", score=test_mae)

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


XGBoost MAE: 3.3661


INFO:src.experiment_utils:Experiment set to: XGBoost
INFO:src.experiment_utils:Logged mean_absolute_error: 3.366122007369995
INFO:src.experiment_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/01 00:02:45 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORKS_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'XGBRegressor' already exists. Creating a new version of this model...
2025/05/01 00:03:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBRegressor, version 2
Created version '2' of model 'XGBRegressor'.
INFO:src.experiment_utils:Model logged with name: XGBRegressor


🏃 View run righteous-ray-764 at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/3/runs/e5b0b97dc6fe443a8c0fb3ab8609593e
🧪 View experiment at: https://dagshub.com/singhvarun0405/CDA500PF1.mlflow/#/experiments/3


<mlflow.models.model.ModelInfo at 0x217f56d2fc0>

In [4]:
import pandas as pd
import numpy as np
def safe_mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    # Only compute MAPE for non-zero actual values to avoid division by zero issues
    non_zero_mask = y_true != 0
    if not np.any(non_zero_mask):
        return np.nan  # Return NaN if all actual values are 0
    y_true_non_zero = y_true[non_zero_mask]
    y_pred_non_zero = y_pred[non_zero_mask]
    # Add a small epsilon to the denominator for numerical stability
    epsilon = 1e-2  # Larger epsilon to prevent extreme values
    mape = np.mean(np.abs((y_true_non_zero - y_pred_non_zero) / (y_true_non_zero + epsilon))) * 100
    return mape

mape = safe_mean_absolute_percentage_error(y_test, predictions)
print(f"XGBoost Safe MAPE (non-zero actuals): {mape:.4f}%")

XGBoost Safe MAPE (non-zero actuals): 59.4620%


In [5]:
# Additional diagnostics for high MAPE
import numpy as np
import pandas as pd

# Recompute MAE for reference
print(f"XGBoost MAE (recomputed): {test_mae:.4f}")

# Compute percentage errors for non-zero actuals
non_zero_mask = y_test != 0
y_test_non_zero = y_test[non_zero_mask]
predictions_non_zero = predictions[non_zero_mask]
percentage_errors = np.abs((y_test_non_zero - predictions_non_zero) / (y_test_non_zero + 1e-2)) * 100

# Analyze the distribution of percentage errors
print("\nDistribution of Percentage Errors (non-zero actuals):")
print(pd.Series(percentage_errors).describe())

# Analyze the distribution of actual ride counts in the test set
print("\nDistribution of Actual Ride Counts in Test Set:")
print(y_test.describe())

# Analyze the distribution of actual ride counts for non-zero values
print("\nDistribution of Non-Zero Actual Ride Counts in Test Set:")
print(y_test_non_zero.describe())

# Visualize predictions for a few rows to inspect errors
from src.plot_utils import plot_aggregated_time_series

# Select a few rows to visualize (e.g., rows with high percentage errors)
high_error_indices = np.argsort(percentage_errors)[-3:]  # Top 3 highest percentage errors
for idx in high_error_indices:
    original_idx = y_test_non_zero.index[idx]
    print(f"\nVisualizing row {original_idx} (Percentage Error: {percentage_errors[idx]:.2f}%)")
    plot_aggregated_time_series(X_test, y_test, original_idx, predictions)

XGBoost MAE (recomputed): 3.3661

Distribution of Percentage Errors (non-zero actuals):
count    1879.000000
mean       59.461965
std        90.834545
min         0.005991
25%        15.250596
50%        33.244406
75%        67.250652
max      1613.194494
Name: target, dtype: float64

Distribution of Actual Ride Counts in Test Set:
count    2232.000000
mean        9.217294
std         9.269306
min         0.000000
25%         1.000000
50%         7.000000
75%        14.000000
max        53.000000
Name: target, dtype: float64

Distribution of Non-Zero Actual Ride Counts in Test Set:
count    1879.000000
mean       10.948909
std         9.115958
min         1.000000
25%         4.000000
50%         9.000000
75%        16.000000
max        53.000000
Name: target, dtype: float64

Visualizing row 1078 (Percentage Error: 108.44%)

Visualizing row 252 (Percentage Error: 87.53%)

Visualizing row 876 (Percentage Error: 21.35%)
