In [46]:
%load_ext autoreload
%autoreload 2
import sys
import os
import hopsworks
import pandas as pd
from datetime import timedelta
import plotly.express as px


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as config
from src.inference import get_feature_store, fetch_predictions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import hopsworks
import pandas as pd
from datetime import timedelta

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as config
from src.inference import get_feature_store

# Fetch actual rides without time filtering
def fetch_hourly_rides_unfiltered():
    fs = get_feature_store()
    fg = fs.get_feature_group(
        name="recent_time_series_hourly_feature_group",
        version=1
    )

    query = fg.select_all()
    return query.read()

# Fetch predictions without time filtering
def fetch_predictions_unfiltered(feature_group_name):
    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=feature_group_name,
        version=1
    )

    query = fg.select_all()
    return query.read()

# Fetch actual rides data (unfiltered)
df_actual = fetch_hourly_rides_unfiltered()
df_actual["pickup_hour"] = pd.to_datetime(df_actual["pickup_hour"]).dt.tz_localize(None)
df_actual.sort_values(['start_station_name', 'pickup_hour'], inplace=True)
print("\nUnfiltered Actual Rides Data (recent_time_series_hourly_feature_group):")
print("Shape:", df_actual.shape)
print("pickup_hour range:", df_actual["pickup_hour"].min(), "to", df_actual["pickup_hour"].max())
print(df_actual)

# Define the list of models and corresponding feature group names
models = [
    "baseline_previous_hour",
    "lightgbm_28days_lags",
    "lightgbm_top10_features",
    "gradient_boosting_temporal_features",
    "lightgbm_enhanced_lags_cyclic_temporal_interactions"
]

prediction_feature_groups = {
    "baseline_previous_hour": "predictions_model_baseline",
    "lightgbm_28days_lags": "predictions_model_lgbm_28days",
    "lightgbm_top10_features": "predictions_model_lgbm_top10",
    "gradient_boosting_temporal_features": "predictions_model_gbt",
    "lightgbm_enhanced_lags_cyclic_temporal_interactions": "predictions_model_lgbm_enhanced"
}

# Fetch unfiltered predictions for all models to inspect available hours
all_predictions = {}
for model_name in models:
    df_pred = fetch_predictions_unfiltered(prediction_feature_groups[model_name])
    df_pred["pickup_hour"] = pd.to_datetime(df_pred["pickup_hour"]).dt.tz_localize(None)
    df_pred.sort_values(['start_station_name', 'pickup_hour'], inplace=True)
    print(f"\nUnfiltered Predicted Rides Data ({prediction_feature_groups[model_name]}):")
    print("Shape:", df_pred.shape)
    print("pickup_hour range:", df_pred["pickup_hour"].min(), "to", df_pred["pickup_hour"].max())
    print(df_pred)
    all_predictions[model_name] = df_pred

# Determine the most recent hour in df_actual
if df_actual.empty:
    print("No actual rides data found. Cannot proceed.")
else:
    recent_actual_hour = df_actual["pickup_hour"].max()
    print(f"\nMost recent actual rides hour: {recent_actual_hour}")

    # Filter actual rides for the most recent hour
    df_actual_filtered = df_actual[df_actual["pickup_hour"] == recent_actual_hour]
    print("\nFiltered Actual Rides Data:")
    print("Shape:", df_actual_filtered.shape)
    print(df_actual_filtered)

    # Check if predictions exist for this hour and calculate MAE
    for model_name in models:
        print(f"\nProcessing model: {model_name}")
        
        # Filter predictions for the most recent actual hour
        df_pred = all_predictions[model_name]
        df_pred_filtered = df_pred[df_pred["pickup_hour"] == recent_actual_hour]

        print(f"Predicted Rides Data for {model_name} at {recent_actual_hour}:")
        print("Shape:", df_pred_filtered.shape)
        print(df_pred_filtered)

        # Merge actual and predicted data
        merged_df = pd.merge(df_actual_filtered, df_pred_filtered, on=['start_station_name', 'pickup_hour'], how='inner')
        if merged_df.empty:
            print(f"No matching data found for {model_name} at {recent_actual_hour}. Skipping.")
            continue

        # Calculate the absolute error
        merged_df['absolute_error'] = abs(merged_df['predicted_rides'] - merged_df['rides'])

        # Calculate the MAE for this hour
        mae = merged_df['absolute_error'].mean()

        # Display the results
        print(f"MAE for {model_name} at {recent_actual_hour}: {mae:.4f}")
        print("Details:")
        print(merged_df[['start_station_name', 'pickup_hour', 'rides', 'predicted_rides', 'absolute_error']])

2025-05-10 04:53:53,826 INFO: Initializing external client
2025-05-10 04:53:53,827 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:53:54,684 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.57s) 

Unfiltered Actual Rides Data (recent_time_series_hourly_feature_group):
Shape: (2100, 3)
pickup_hour range: 2025-04-11 04:00:00 to 2025-05-10 07:00:00
             pickup_hour start_station_name  rides
41   2025-04-11 04:00:00   11 Ave & W 41 St      1
1701 2025-04-11 05:00:00   11 Ave & W 41 St      4
1846 2025-04-11 06:00:00   11 Ave & W 41 St      9
1626 2025-04-11 07:00:00   11 Ave & W 41 St     18
11   2025-04-11 08:00:00   11 Ave & W 41 St     16
...                  ...                ...    ...
987  2025-05-10 03:00:00    W 31 St & 7 Ave      0
2088 2025-05-10 04:00:00    W 31 St & 7 Ave      1
2093 2025-05-10 05:00:00    W 31 St & 7 Ave      1
2096 2025-05-10 06:00:00    W 31 St & 7 Ave     10
2098 2025-05-10 07:00:00    W 31 St & 7 Ave      5

[2100 rows x



To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:53:58,465 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.74s) 

Unfiltered Predicted Rides Data (predictions_model_baseline):
Shape: (21, 3)
pickup_hour range: 2025-04-17 08:00:00 to 2025-05-10 09:00:00
   start_station_name  predicted_rides         pickup_hour
9    11 Ave & W 41 St                1 2025-04-17 08:00:00
13   11 Ave & W 41 St               21 2025-05-09 09:00:00
8    11 Ave & W 41 St               24 2025-05-09 17:00:00
20   11 Ave & W 41 St                0 2025-05-10 06:00:00
0    11 Ave & W 41 St                2 2025-05-10 07:00:00
3    11 Ave & W 41 St               24 2025-05-10 08:00:00
19   11 Ave & W 41 St                1 2025-05-10 09:00:00
10    W 21 St & 6 Ave               26 2025-04-17 08:00:00
14    W 21 St & 6 Ave               22 2025-05-09 09:00:00
6     W 21 St & 6 Ave                9 2025-05-



To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:54:03,579 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.59s) 

Unfiltered Predicted Rides Data (predictions_model_lgbm_28days):
Shape: (21, 3)
pickup_hour range: 2025-04-17 08:00:00 to 2025-05-10 09:00:00
   start_station_name  predicted_rides         pickup_hour
9    11 Ave & W 41 St             11.0 2025-04-17 08:00:00
13   11 Ave & W 41 St             16.0 2025-05-09 09:00:00
8    11 Ave & W 41 St             10.0 2025-05-09 17:00:00
20   11 Ave & W 41 St              1.0 2025-05-10 06:00:00
0    11 Ave & W 41 St              2.0 2025-05-10 07:00:00
3    11 Ave & W 41 St             10.0 2025-05-10 08:00:00
19   11 Ave & W 41 St             11.0 2025-05-10 09:00:00
10    W 21 St & 6 Ave              6.0 2025-04-17 08:00:00
14    W 21 St & 6 Ave             10.0 2025-05-09 09:00:00
6     W 21 St & 6 Ave              5.0 2025-



To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:54:06,756 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.58s) 

Unfiltered Predicted Rides Data (predictions_model_lgbm_top10):
Shape: (18, 3)
pickup_hour range: 2025-04-17 08:00:00 to 2025-05-10 08:00:00
   start_station_name  predicted_rides         pickup_hour
9    11 Ave & W 41 St              8.0 2025-04-17 08:00:00
13   11 Ave & W 41 St             18.0 2025-05-09 09:00:00
8    11 Ave & W 41 St             14.0 2025-05-09 17:00:00
17   11 Ave & W 41 St              3.0 2025-05-10 06:00:00
0    11 Ave & W 41 St              6.0 2025-05-10 07:00:00
3    11 Ave & W 41 St             14.0 2025-05-10 08:00:00
10    W 21 St & 6 Ave             18.0 2025-04-17 08:00:00
14    W 21 St & 6 Ave             16.0 2025-05-09 09:00:00
6     W 21 St & 6 Ave              9.0 2025-05-09 17:00:00
15    W 21 St & 6 Ave              1.0 2025-0



To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:54:09,856 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.81s) 

Unfiltered Predicted Rides Data (predictions_model_gbt):
Shape: (15, 3)
pickup_hour range: 2025-04-17 08:00:00 to 2025-05-10 08:00:00
   start_station_name  predicted_rides         pickup_hour
9    11 Ave & W 41 St              6.0 2025-04-17 08:00:00
13   11 Ave & W 41 St             18.0 2025-05-09 09:00:00
8    11 Ave & W 41 St             14.0 2025-05-09 17:00:00
0    11 Ave & W 41 St              5.0 2025-05-10 07:00:00
3    11 Ave & W 41 St             14.0 2025-05-10 08:00:00
10    W 21 St & 6 Ave             18.0 2025-04-17 08:00:00
14    W 21 St & 6 Ave             16.0 2025-05-09 09:00:00
6     W 21 St & 6 Ave              9.0 2025-05-09 17:00:00
2     W 21 St & 6 Ave              4.0 2025-05-10 07:00:00
4     W 21 St & 6 Ave              9.0 2025-05-10 08



To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:54:12,948 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.63s) 

Unfiltered Predicted Rides Data (predictions_model_lgbm_enhanced):
Shape: (15, 3)
pickup_hour range: 2025-04-17 08:00:00 to 2025-05-10 08:00:00
   start_station_name  predicted_rides         pickup_hour
9    11 Ave & W 41 St              5.0 2025-04-17 08:00:00
13   11 Ave & W 41 St             19.0 2025-05-09 09:00:00
8    11 Ave & W 41 St             13.0 2025-05-09 17:00:00
0    11 Ave & W 41 St              6.0 2025-05-10 07:00:00
3    11 Ave & W 41 St             13.0 2025-05-10 08:00:00
10    W 21 St & 6 Ave             25.0 2025-04-17 08:00:00
14    W 21 St & 6 Ave             19.0 2025-05-09 09:00:00
6     W 21 St & 6 Ave             13.0 2025-05-09 17:00:00
2     W 21 St & 6 Ave              3.0 2025-05-10 07:00:00
4     W 21 St & 6 Ave             13.0 202

In [2]:
%load_ext autoreload
%autoreload 2
import sys
import os
import hopsworks
import pandas as pd
from datetime import timedelta
import plotly.express as px

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as config
from src.inference import get_feature_store

# Fetch actual rides for the last 24 hours from the recent feature group
def fetch_hourly_rides(hours):
    current_hour = (pd.Timestamp.now(tz="Etc/UTC") - timedelta(hours=hours)).floor('h')
    
    fs = get_feature_store()
    fg = fs.get_feature_group(
        name="recent_time_series_hourly_feature_group",
        version=1
    )

    query = fg.select_all()
    query = query.filter(fg.pickup_hour >= current_hour)

    return query.read()

# Fetch actual rides data for the last 24 hours
df_actual = fetch_hourly_rides(24)
df_actual["pickup_hour"] = pd.to_datetime(df_actual["pickup_hour"]).dt.tz_localize(None)
df_actual.sort_values(['pickup_hour'])
print("\nActual Rides Data:")
print(df_actual)

# Define the list of models and corresponding feature group names
models = [
    "baseline_previous_hour",
    "lightgbm_28days_lags",
    "lightgbm_top10_features",
    "gradient_boosting_temporal_features",
    "lightgbm_enhanced_lags_cyclic_temporal_interactions"
]

prediction_feature_groups = {
    "baseline_previous_hour": "predictions_model_baseline",
    "lightgbm_28days_lags": "predictions_model_lgbm_28days",
    "lightgbm_top10_features": "predictions_model_lgbm_top10",
    "gradient_boosting_temporal_features": "predictions_model_gbt",
    "lightgbm_enhanced_lags_cyclic_temporal_interactions": "predictions_model_lgbm_enhanced"
}

# Fetch predictions for each model and create individual MAE plots
fs = get_feature_store()
mae_results = []

for model_name in models:
    print(f"\nProcessing model: {model_name}")
    
    # Fetch predictions for the last 24 hours
    fg = fs.get_feature_group(
        name=prediction_feature_groups[model_name],
        version=1
    )

    current_hour = (pd.Timestamp.now(tz="Etc/UTC") - timedelta(hours=24)).floor('h')
    query = fg.select_all()
    query = query.filter(fg.pickup_hour >= current_hour)
    df_pred = query.read()
    df_pred["pickup_hour"] = pd.to_datetime(df_pred["pickup_hour"]).dt.tz_localize(None)
    print(f"Predicted Rides Data for {model_name}:")
    print(df_pred)

    # Merge actual and predicted data
    merged_df = pd.merge(df_actual, df_pred, on=['start_station_name', 'pickup_hour'], how='inner')
    if merged_df.empty:
        print(f"No matching data found for {model_name}. Skipping plot.")
        continue

    # Calculate the absolute error
    merged_df['absolute_error'] = abs(merged_df['predicted_rides'] - merged_df['rides'])

    # Group by 'pickup_hour' and calculate the mean absolute error (MAE)
    mae_by_hour = merged_df.groupby('pickup_hour')['absolute_error'].mean().reset_index()
    mae_by_hour.rename(columns={'absolute_error': 'MAE'}, inplace=True)
    mae_by_hour.sort_values('pickup_hour', inplace=True)  # Ensure chronological order
    mae_by_hour['model'] = model_name  # Add model name for plotting
    mae_results.append(mae_by_hour)

    # Create a Plotly plot for this model
    fig = px.line(
        mae_by_hour,
        x='pickup_hour',
        y='MAE',
        title=f'Mean Absolute Error (MAE) by Pickup Hour for {model_name}',
        labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},
        markers=True
    )

    # Show the plot
    fig.show()

    # Print the average MAE for this model
    avg_mae = mae_by_hour["MAE"].mean()
    print(f"Average MAE for {model_name}: {avg_mae:.4f}")

# Combine all MAE results for a comparative plot
if mae_results:
    mae_combined = pd.concat(mae_results, ignore_index=True)
    fig = px.line(
        mae_combined,
        x='pickup_hour',
        y='MAE',
        color='model',
        title='Mean Absolute Error (MAE) by Pickup Hour for All Models',
        labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},
        markers=True
    )
    fig.show()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2025-05-10 05:06:20,787 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 05:06:20,794 INFO: Initializing external client
2025-05-10 05:06:20,794 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 05:06:21,559 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.42s) 

Actual Rides Data:
           pickup_hour start_station_name  rides
0  2025-05-09 10:00:00    W 31 St & 7 Ave      7
1  2025-05-09 15:00:00    W 31 St & 7 Ave     15
2  2025-05-09 17:00:00   11 Ave & W 41 St     21
3  2025-05-09 21:00:00   11 Ave & W 41 St      4
4  2025-05-10 01:00:00   11 Ave & W 41 St      1
..                 ...                ...    ...
76 2025-05-10 06:00:00   11 Ave & W 41 St      2
77 2025-05-10 06:00:00    W 31 St & 7 Ave     10
78 2025-05-10 07:00:00   11 Ave & W 41 St      7
79 2025-05-10 07:00:00    W 31 St & 7 Ave      5
80 2025-05-10 07:00:00    W 21 St & 6 Ave      2

[81 rows x 3 columns]
2025-05-10 05:06:23,585 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 05:06:23,604 INFO: Initializing 



To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 05:06:24,276 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907

Processing model: baseline_previous_hour
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.38s) 
Predicted Rides Data for baseline_previous_hour:
   start_station_name  predicted_rides         pickup_hour
0    11 Ave & W 41 St                2 2025-05-10 07:00:00
1     W 31 St & 7 Ave               10 2025-05-10 07:00:00
2     W 21 St & 6 Ave                3 2025-05-10 07:00:00
3    11 Ave & W 41 St               24 2025-05-10 08:00:00
4     W 21 St & 6 Ave                9 2025-05-10 08:00:00
5     W 31 St & 7 Ave               12 2025-05-10 08:00:00
6     W 21 St & 6 Ave                9 2025-05-09 17:00:00
7     W 31 St & 7 Ave               12 2025-05-09 17:00:00
8    11 Ave & W 41 St               24 2025-05-09 17:00:00
9     W 31 St & 7 Ave               23 2025-05-09 09:00:00
10   11 Ave & W 41 St               2

Average MAE for baseline_previous_hour: 5.6667

Processing model: lightgbm_28days_lags
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.36s) 
Predicted Rides Data for lightgbm_28days_lags:
   start_station_name  predicted_rides         pickup_hour
0    11 Ave & W 41 St              2.0 2025-05-10 07:00:00
1     W 31 St & 7 Ave              5.0 2025-05-10 07:00:00
2     W 21 St & 6 Ave              1.0 2025-05-10 07:00:00
3    11 Ave & W 41 St             10.0 2025-05-10 08:00:00
4     W 21 St & 6 Ave              5.0 2025-05-10 08:00:00
5     W 31 St & 7 Ave              4.0 2025-05-10 08:00:00
6     W 21 St & 6 Ave              5.0 2025-05-09 17:00:00
7     W 31 St & 7 Ave              4.0 2025-05-09 17:00:00
8    11 Ave & W 41 St             10.0 2025-05-09 17:00:00
9     W 31 St & 7 Ave             16.0 2025-05-09 09:00:00
10   11 Ave & W 41 St             16.0 2025-05-09 09:00:00
11    W 21 St & 6 Ave             10.0 2025-05-09 09:00:00
12    W 21 St

Average MAE for lightgbm_28days_lags: 7.2500

Processing model: lightgbm_top10_features
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.39s) 
Predicted Rides Data for lightgbm_top10_features:
   start_station_name  predicted_rides         pickup_hour
0    11 Ave & W 41 St              6.0 2025-05-10 07:00:00
1     W 31 St & 7 Ave              9.0 2025-05-10 07:00:00
2     W 21 St & 6 Ave              3.0 2025-05-10 07:00:00
3    11 Ave & W 41 St             14.0 2025-05-10 08:00:00
4     W 21 St & 6 Ave              9.0 2025-05-10 08:00:00
5     W 31 St & 7 Ave             10.0 2025-05-10 08:00:00
6     W 21 St & 6 Ave              9.0 2025-05-09 17:00:00
7     W 31 St & 7 Ave             10.0 2025-05-09 17:00:00
8    11 Ave & W 41 St             14.0 2025-05-09 17:00:00
9     W 31 St & 7 Ave             20.0 2025-05-09 09:00:00
10   11 Ave & W 41 St             18.0 2025-05-09 09:00:00
11    W 21 St & 6 Ave             16.0 2025-05-09 09:00:00
12    W 2

Average MAE for lightgbm_top10_features: 5.3333

Processing model: gradient_boosting_temporal_features
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.38s) 
Predicted Rides Data for gradient_boosting_temporal_features:
   start_station_name  predicted_rides         pickup_hour
0    11 Ave & W 41 St              7.0 2025-05-10 07:00:00
1     W 31 St & 7 Ave              7.0 2025-05-10 07:00:00
2     W 21 St & 6 Ave              4.0 2025-05-10 07:00:00
3    11 Ave & W 41 St             14.0 2025-05-10 08:00:00
4     W 21 St & 6 Ave              9.0 2025-05-10 08:00:00
5     W 31 St & 7 Ave             11.0 2025-05-10 08:00:00
6     W 21 St & 6 Ave              9.0 2025-05-09 17:00:00
7     W 31 St & 7 Ave             11.0 2025-05-09 17:00:00
8    11 Ave & W 41 St             14.0 2025-05-09 17:00:00
9     W 31 St & 7 Ave             17.0 2025-05-09 09:00:00
10   11 Ave & W 41 St             18.0 2025-05-09 09:00:00
11    W 21 St & 6 Ave             16.0 20

Average MAE for gradient_boosting_temporal_features: 4.7500

Processing model: lightgbm_enhanced_lags_cyclic_temporal_interactions
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.38s) 
Predicted Rides Data for lightgbm_enhanced_lags_cyclic_temporal_interactions:
   start_station_name  predicted_rides         pickup_hour
0    11 Ave & W 41 St              8.0 2025-05-10 07:00:00
1     W 31 St & 7 Ave             11.0 2025-05-10 07:00:00
2     W 21 St & 6 Ave              5.0 2025-05-10 07:00:00
3    11 Ave & W 41 St             13.0 2025-05-10 08:00:00
4     W 21 St & 6 Ave             13.0 2025-05-10 08:00:00
5     W 31 St & 7 Ave             11.0 2025-05-10 08:00:00
6     W 21 St & 6 Ave             13.0 2025-05-09 17:00:00
7     W 31 St & 7 Ave             11.0 2025-05-09 17:00:00
8    11 Ave & W 41 St             13.0 2025-05-09 17:00:00
9     W 31 St & 7 Ave             19.0 2025-05-09 09:00:00
10   11 Ave & W 41 St             19.0 2025-05-09 09:00:

Average MAE for lightgbm_enhanced_lags_cyclic_temporal_interactions: 4.9167


In [12]:
%load_ext autoreload
%autoreload 2
import sys
import os
from datetime import datetime, timedelta
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
from sklearn.feature_selection import SelectKBest, f_regression
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as config
from src.inference import get_feature_store
from src.data_utils import transform_ts_data_info_features
from src.inference import load_batch_of_features_from_store
from src.inference import load_model_from_registry
from src.inference import get_model_predictions
from src.inference import BaselineModelPreviousHour

# Step 1: Calculate current time
current_date = pd.Timestamp.now(tz='Etc/UTC')
current_hour_floored = current_date.floor('h')
current_hour_ceiled = current_date.ceil('h')
print(f"Current time: {current_date}")
print(f"Current hour (floored): {current_hour_floored}")
print(f"Current hour (ceiled): {current_hour_ceiled}")

# Step 2: Fetch actual rides without time filtering
def fetch_hourly_rides_unfiltered():
    fs = get_feature_store()
    fg = fs.get_feature_group(
        name="recent_time_series_hourly_feature_group",
        version=1
    )

    query = fg.select_all()
    return query.read()

# Fetch and inspect actual rides data
df_actual = fetch_hourly_rides_unfiltered()
df_actual["pickup_hour"] = pd.to_datetime(df_actual["pickup_hour"]).dt.tz_localize(None)
df_actual.sort_values(['start_station_name', 'pickup_hour'], inplace=True)
print("\nUnfiltered Actual Rides Data (recent_time_series_hourly_feature_group):")
print("Shape:", df_actual.shape)
print("pickup_hour range:", df_actual["pickup_hour"].min(), "to", df_actual["pickup_hour"].max())
print("Sample data:")
print(df_actual)

# Check for current floored and ceiled hours in actual rides
df_actual_floored = df_actual[df_actual["pickup_hour"] == current_hour_floored]
print(f"\nActual Rides Data for current hour (floored: {current_hour_floored}):")
print("Shape:", df_actual_floored.shape)
print(df_actual_floored)

df_actual_ceiled = df_actual[df_actual["pickup_hour"] == current_hour_ceiled]
print(f"\nActual Rides Data for current hour (ceiled: {current_hour_ceiled}):")
print("Shape:", df_actual_ceiled.shape)
print(df_actual_ceiled)

# Step 3: Fetch predicted rides without time filtering (using predictions_model_baseline)
def fetch_predictions_unfiltered(feature_group_name):
    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=feature_group_name,
        version=1
    )

    query = fg.select_all()
    return query.read()

# Fetch and inspect predicted rides data
feature_group_name = "predictions_model_baseline"
df_pred = fetch_predictions_unfiltered(feature_group_name)
df_pred["pickup_hour"] = pd.to_datetime(df_pred["pickup_hour"]).dt.tz_localize(None)
df_pred.sort_values(['start_station_name', 'pickup_hour'], inplace=True)
print(f"\nUnfiltered Predicted Rides Data ({feature_group_name}):")
print("Shape:", df_pred.shape)
print("pickup_hour range:", df_pred["pickup_hour"].min(), "to", df_pred["pickup_hour"].max())
print("Sample data:")
print(df_pred)

# Check for current floored and ceiled hours in predicted rides
df_pred_floored = df_pred[df_pred["pickup_hour"] == current_hour_floored]
print(f"\nPredicted Rides Data for current hour (floored: {current_hour_floored}):")
print("Shape:", df_pred_floored.shape)
print(df_pred_floored)

df_pred_ceiled = df_pred[df_pred["pickup_hour"] == current_hour_ceiled]
print(f"\nPredicted Rides Data for current hour (ceiled: {current_hour_ceiled}):")
print("Shape:", df_pred_ceiled.shape)
print(df_pred_ceiled)

# Step 4: Replicate the feature pipeline data fetch from 17_inference_pipeline.ipynb
feature_store = get_feature_store()
feature_view = feature_store.get_feature_view(
    name="citi_bike_recent_hourly_feature_view",
    version=1
)

fetch_data_to = current_date.floor('h')
fetch_data_from = current_date - timedelta(days=1*29)
print(f"\nFeature Pipeline: Fetching data from {fetch_data_from} to {fetch_data_to}")

ts_data = feature_view.get_batch_data(
    start_time=(fetch_data_from - timedelta(days=1)),
    end_time=(fetch_data_to + timedelta(days=1)),
)
ts_data = ts_data[ts_data.pickup_hour.between(fetch_data_from, fetch_data_to)]
ts_data.sort_values(["start_station_name", "pickup_hour"]).reset_index(drop=True)
ts_data["pickup_hour"] = ts_data["pickup_hour"].dt.tz_localize(None)

print("\nFeature Pipeline ts_data (after filtering):")
print("Shape:", ts_data.shape)
print("pickup_hour range:", ts_data["pickup_hour"].min(), "to", ts_data["pickup_hour"].max())
print("Sample data:")
print(ts_data)

# Transform the data into features (as in 17_inference_pipeline.ipynb)
features = transform_ts_data_info_features(ts_data, window_size=24*28, step_size=23)
features_next_hour = features.groupby("start_station_name").last().reset_index()
recent_hour = features_next_hour["pickup_hour"].max()

print("\nFeature Pipeline features_next_hour:")
print("Shape:", features_next_hour.shape)
print("Most recent hour in features_next_hour:", recent_hour)
print("Sample data:")
print(features_next_hour)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Current time: 2025-05-10 08:06:15.903420+00:00
Current hour (floored): 2025-05-10 08:00:00+00:00
Current hour (ceiled): 2025-05-10 09:00:00+00:00
2025-05-10 04:06:15,904 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-10 04:06:15,914 INFO: Initializing external client
2025-05-10 04:06:15,915 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:06:16,510 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.87s) 

Unfiltered Actual Rides Data (recent_time_series_hourly_feature_group):
Shape: (2097, 3)
pickup_hour range: 2025-04-11 04:00:00 to 2025-05-10 06:00:00
Sample data:
             pickup_hour start_station_name  rides
41   2025-04-11 04:00:00   11 Ave & W 41 St      1
1701 2025-04-11 05:00:00   11 Ave & W 41 St      4
1846 2025-04-11 06:00:00   11 Ave & W 41 St      9
1626 2025-04-11 07:00:00   11 Ave & W 41 St     18
11   2025-04-11 08:00:00   11 Ave & W 41 St     16
...                  ...                ...    ...
1902 2025-05-10 02:00:00    W 31 St & 7 Ave      1
987  2025-05-10 03:00:00    W 31 St & 7 Ave      0
2088 2025-05-10 04:00:00    W 31 St & 7 Ave      1
2093 2025-05-10 05:00:00    W 31 St & 7 Ave      1
2096 2025-05-10 06:00:00    W 31 St & 7 Ave     10




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:06:19,497 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.59s) 

Unfiltered Predicted Rides Data (predictions_model_baseline):
Shape: (12, 3)
pickup_hour range: 2025-04-17 08:00:00 to 2025-05-10 08:00:00
Sample data:
   start_station_name  predicted_rides         pickup_hour
9    11 Ave & W 41 St                1 2025-04-17 08:00:00
8    11 Ave & W 41 St               24 2025-05-09 17:00:00
0    11 Ave & W 41 St                2 2025-05-10 07:00:00
3    11 Ave & W 41 St               24 2025-05-10 08:00:00
10    W 21 St & 6 Ave               26 2025-04-17 08:00:00
6     W 21 St & 6 Ave                9 2025-05-09 17:00:00
2     W 21 St & 6 Ave                0 2025-05-10 07:00:00
4     W 21 St & 6 Ave                9 2025-05-10 08:00:00
11    W 31 St & 7 Ave                7 2025-04-17 08:00:00
7     W 31 St & 7 Ave             



To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 04:06:22,553 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907

Feature Pipeline: Fetching data from 2025-04-11 08:06:15.903420+00:00 to 2025-05-10 08:00:00+00:00
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.57s) 


Unnamed: 0,pickup_hour,start_station_name,rides
0,2025-04-11 09:00:00+00:00,11 Ave & W 41 St,16
1,2025-04-11 10:00:00+00:00,11 Ave & W 41 St,14
2,2025-04-11 11:00:00+00:00,11 Ave & W 41 St,7
3,2025-04-11 12:00:00+00:00,11 Ave & W 41 St,14
4,2025-04-11 13:00:00+00:00,11 Ave & W 41 St,12
...,...,...,...
2077,2025-05-10 02:00:00+00:00,W 31 St & 7 Ave,1
2078,2025-05-10 03:00:00+00:00,W 31 St & 7 Ave,0
2079,2025-05-10 04:00:00+00:00,W 31 St & 7 Ave,1
2080,2025-05-10 05:00:00+00:00,W 31 St & 7 Ave,1



Feature Pipeline ts_data (after filtering):
Shape: (2082, 3)
pickup_hour range: 2025-04-11 09:00:00 to 2025-05-10 06:00:00
Sample data:
             pickup_hour start_station_name  rides
0    2025-04-26 22:00:00    W 21 St & 6 Ave      1
1    2025-04-18 18:00:00    W 21 St & 6 Ave     14
2    2025-04-16 16:00:00   11 Ave & W 41 St      8
3    2025-04-14 00:00:00    W 31 St & 7 Ave      0
4    2025-05-09 01:00:00    W 21 St & 6 Ave      1
...                  ...                ...    ...
2092 2025-05-10 05:00:00   11 Ave & W 41 St      0
2093 2025-05-10 05:00:00    W 31 St & 7 Ave      1
2094 2025-05-10 06:00:00    W 21 St & 6 Ave      3
2095 2025-05-10 06:00:00   11 Ave & W 41 St      2
2096 2025-05-10 06:00:00    W 31 St & 7 Ave     10

[2082 rows x 3 columns]

Feature Pipeline features_next_hour:
Shape: (3, 674)
Most recent hour in features_next_hour: 2025-04-17 08:00:00
Sample data:
  start_station_name  rides_t-672  rides_t-671  rides_t-670  rides_t-669  \
0   11 Ave & W 41 St   

In [47]:

def fetch_hourly_rides(hours):
    current_hour = (pd.Timestamp.now(tz="Etc/UTC") - timedelta(hours=hours)).floor('h')

    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1
    )

    query = fg.select_all()
    query = query.filter(fg.pickup_hour >= current_hour)

    return query.read()

In [48]:
df = fetch_hourly_rides(12)

2025-03-04 21:27:30,576 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 21:27:30,583 INFO: Initializing external client
2025-03-04 21:27:30,584 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 21:27:31,373 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.34s) 


In [63]:
df.sort_values(['pickup_hour'])

Unnamed: 0,pickup_hour,pickup_location_id,rides
1151,2025-03-04 09:00:00+00:00,114,31
1045,2025-03-04 09:00:00+00:00,196,0
256,2025-03-04 09:00:00+00:00,53,1
2499,2025-03-04 09:00:00+00:00,54,0
1056,2025-03-04 09:00:00+00:00,20,0
...,...,...,...
4104,2025-03-05 01:00:00+00:00,52,0
4105,2025-03-05 01:00:00+00:00,6,0
4106,2025-03-05 01:00:00+00:00,235,0
4093,2025-03-05 01:00:00+00:00,119,1


In [50]:
df_pred = fetch_predictions(12)

2025-03-04 21:27:37,652 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 21:27:37,658 INFO: Initializing external client
2025-03-04 21:27:37,658 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 21:27:38,333 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.48s) 


In [51]:
df_pred

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour
0,229,30.0,2025-03-05 00:00:00+00:00
1,85,0.0,2025-03-05 00:00:00+00:00
2,36,0.0,2025-03-05 00:00:00+00:00
3,252,0.0,2025-03-05 00:00:00+00:00
4,133,0.0,2025-03-05 00:00:00+00:00
...,...,...,...
748,242,0.0,2025-03-05 03:00:00+00:00
749,144,18.0,2025-03-05 03:00:00+00:00
750,29,1.0,2025-03-05 03:00:00+00:00
751,91,0.0,2025-03-05 03:00:00+00:00


In [52]:
merged_df = pd.merge(df, df_pred, on=['pickup_location_id', 'pickup_hour'])

In [53]:
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand
0,2025-03-05 00:00:00+00:00,38,0,0.0
1,2025-03-05 00:00:00+00:00,47,0,0.0
2,2025-03-05 00:00:00+00:00,50,7,15.0
3,2025-03-05 00:00:00+00:00,260,1,0.0
4,2025-03-05 00:00:00+00:00,32,0,0.0
...,...,...,...,...
246,2025-03-05 00:00:00+00:00,86,0,0.0
247,2025-03-05 00:00:00+00:00,214,1,0.0
248,2025-03-05 00:00:00+00:00,96,0,0.0
249,2025-03-05 00:00:00+00:00,255,0,2.0


In [54]:
merged_df['difference'] = merged_df['predicted_demand'] - merged_df['rides']

In [55]:
merged_df.sort_values(["pickup_location_id", "pickup_hour"])

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
202,2025-03-05 00:00:00+00:00,2,0,0.0,0.0
231,2025-03-05 00:00:00+00:00,3,0,0.0,0.0
10,2025-03-05 00:00:00+00:00,4,5,31.0,26.0
93,2025-03-05 00:00:00+00:00,6,0,0.0,0.0
129,2025-03-05 00:00:00+00:00,7,1,2.0,1.0
...,...,...,...,...,...
169,2025-03-05 00:00:00+00:00,259,0,0.0,0.0
3,2025-03-05 00:00:00+00:00,260,1,0.0,-1.0
98,2025-03-05 00:00:00+00:00,261,4,12.0,8.0
174,2025-03-05 00:00:00+00:00,262,1,39.0,38.0


In [56]:
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
0,2025-03-05 00:00:00+00:00,38,0,0.0,0.0
1,2025-03-05 00:00:00+00:00,47,0,0.0,0.0
2,2025-03-05 00:00:00+00:00,50,7,15.0,8.0
3,2025-03-05 00:00:00+00:00,260,1,0.0,-1.0
4,2025-03-05 00:00:00+00:00,32,0,0.0,0.0
...,...,...,...,...,...
246,2025-03-05 00:00:00+00:00,86,0,0.0,0.0
247,2025-03-05 00:00:00+00:00,214,1,0.0,-1.0
248,2025-03-05 00:00:00+00:00,96,0,0.0,0.0
249,2025-03-05 00:00:00+00:00,255,0,2.0,2.0


In [57]:
df1 = df
df2 = df_pred

# Merge the DataFrames on 'pickup_location_id' and 'pickup_hour'  
merged_df = pd.merge(df1, df2, on=['pickup_location_id', 'pickup_hour'])  

# Calculate the absolute error  
merged_df['absolute_error'] = abs(merged_df['predicted_demand'] - merged_df['rides'])  

# Group by 'pickup_hour' and calculate the mean absolute error (MAE)  
mae_by_hour = merged_df.groupby('pickup_hour')['absolute_error'].mean().reset_index()  
mae_by_hour.rename(columns={'absolute_error': 'MAE'}, inplace=True)  

# Create a Plotly plot  
fig = px.line(  
    mae_by_hour,  
    x='pickup_hour',  
    y='MAE',  
    title='Mean Absolute Error (MAE) by Pickup Hour',  
    labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},  
    markers=True  
)  

# Show the plot  
fig.show()

In [58]:
mae_by_hour["MAE"].mean()

11.98804780876494