In [46]:
%load_ext autoreload
%autoreload 2
import sys
import os
import hopsworks
import pandas as pd
from datetime import timedelta
import plotly.express as px


# Put the parent of the current working directory on the import path so the
# `src` package imported below can be resolved.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import src.config as config
from src.inference import get_feature_store, fetch_predictions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:

def fetch_hourly_rides(hours):
    """Read ride records from the feature group, starting `hours` hours back.

    The window start is the current UTC time minus ``hours``, rounded down to
    the beginning of that hour. Only rows whose ``pickup_hour`` is at or after
    the window start are requested.

    Args:
        hours: Size of the look-back window, in hours.

    Returns:
        The result of ``read()`` on the filtered feature-group query (the
        notebook treats it as a pandas DataFrame).
    """
    now_utc = pd.Timestamp.now(tz="Etc/UTC")
    window_start = (now_utc - timedelta(hours=hours)).floor('h')

    feature_group = get_feature_store().get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1,
    )

    # Select every feature, restricted to the look-back window.
    recent_rides = feature_group.select_all().filter(
        feature_group.pickup_hour >= window_start
    )
    return recent_rides.read()

In [48]:
# Load rides from the last 12 hours (fetch_hourly_rides floors the window start to the hour).
df = fetch_hourly_rides(12)

2025-03-04 21:27:30,576 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 21:27:30,583 INFO: Initializing external client
2025-03-04 21:27:30,584 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 21:27:31,373 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (4.34s) 


In [63]:
# Preview the rides ordered by pickup hour; sort_values returns a sorted copy, so df is unchanged.
df.sort_values(['pickup_hour'])

Unnamed: 0,pickup_hour,pickup_location_id,rides
1151,2025-03-04 09:00:00+00:00,114,31
1045,2025-03-04 09:00:00+00:00,196,0
256,2025-03-04 09:00:00+00:00,53,1
2499,2025-03-04 09:00:00+00:00,54,0
1056,2025-03-04 09:00:00+00:00,20,0
...,...,...,...
4104,2025-03-05 01:00:00+00:00,52,0
4105,2025-03-05 01:00:00+00:00,6,0
4106,2025-03-05 01:00:00+00:00,235,0
4093,2025-03-05 01:00:00+00:00,119,1


In [50]:
# Fetch model predictions via src.inference.fetch_predictions.
# NOTE(review): the argument presumably is a look-back window in hours,
# mirroring fetch_hourly_rides(12) -- confirm against src/inference.py.
df_pred = fetch_predictions(12)

2025-03-04 21:27:37,652 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-04 21:27:37,658 INFO: Initializing external client
2025-03-04 21:27:37,658 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-04 21:27:38,333 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214689
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.48s) 


In [51]:
# Display the predictions frame.
df_pred

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour
0,229,30.0,2025-03-05 00:00:00+00:00
1,85,0.0,2025-03-05 00:00:00+00:00
2,36,0.0,2025-03-05 00:00:00+00:00
3,252,0.0,2025-03-05 00:00:00+00:00
4,133,0.0,2025-03-05 00:00:00+00:00
...,...,...,...
748,242,0.0,2025-03-05 03:00:00+00:00
749,144,18.0,2025-03-05 03:00:00+00:00
750,29,1.0,2025-03-05 03:00:00+00:00
751,91,0.0,2025-03-05 03:00:00+00:00


In [52]:
# Join actual rides with predictions on location and hour. pd.merge defaults to
# an inner join, so only (location, hour) pairs present in both frames are kept.
merged_df = pd.merge(df, df_pred, on=['pickup_location_id', 'pickup_hour'])

In [53]:
# Display the joined actuals/predictions frame.
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand
0,2025-03-05 00:00:00+00:00,38,0,0.0
1,2025-03-05 00:00:00+00:00,47,0,0.0
2,2025-03-05 00:00:00+00:00,50,7,15.0
3,2025-03-05 00:00:00+00:00,260,1,0.0
4,2025-03-05 00:00:00+00:00,32,0,0.0
...,...,...,...,...
246,2025-03-05 00:00:00+00:00,86,0,0.0
247,2025-03-05 00:00:00+00:00,214,1,0.0
248,2025-03-05 00:00:00+00:00,96,0,0.0
249,2025-03-05 00:00:00+00:00,255,0,2.0


In [54]:
# Signed error per row: positive when the prediction exceeds the observed ride count.
merged_df['difference'] = merged_df['predicted_demand'] - merged_df['rides']

In [55]:
# View the joined frame ordered by location, then hour (a sorted copy; merged_df is unchanged).
merged_df.sort_values(["pickup_location_id", "pickup_hour"])

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
202,2025-03-05 00:00:00+00:00,2,0,0.0,0.0
231,2025-03-05 00:00:00+00:00,3,0,0.0,0.0
10,2025-03-05 00:00:00+00:00,4,5,31.0,26.0
93,2025-03-05 00:00:00+00:00,6,0,0.0,0.0
129,2025-03-05 00:00:00+00:00,7,1,2.0,1.0
...,...,...,...,...,...
169,2025-03-05 00:00:00+00:00,259,0,0.0,0.0
3,2025-03-05 00:00:00+00:00,260,1,0.0,-1.0
98,2025-03-05 00:00:00+00:00,261,4,12.0,8.0
174,2025-03-05 00:00:00+00:00,262,1,39.0,38.0


In [56]:
# Display merged_df in its original row order, now including the 'difference' column.
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference
0,2025-03-05 00:00:00+00:00,38,0,0.0,0.0
1,2025-03-05 00:00:00+00:00,47,0,0.0,0.0
2,2025-03-05 00:00:00+00:00,50,7,15.0,8.0
3,2025-03-05 00:00:00+00:00,260,1,0.0,-1.0
4,2025-03-05 00:00:00+00:00,32,0,0.0,0.0
...,...,...,...,...,...
246,2025-03-05 00:00:00+00:00,86,0,0.0,0.0
247,2025-03-05 00:00:00+00:00,214,1,0.0,-1.0
248,2025-03-05 00:00:00+00:00,96,0,0.0,0.0
249,2025-03-05 00:00:00+00:00,255,0,2.0,2.0


In [57]:
# Short aliases for the observed rides and the predictions.
df1, df2 = df, df_pred

# Inner-join the two frames on location and hour.
merged_df = df1.merge(df2, on=['pickup_location_id', 'pickup_hour'])

# Per-row absolute error between predicted and observed demand.
merged_df['absolute_error'] = (
    merged_df['predicted_demand'] - merged_df['rides']
).abs()

# Average the absolute error within each pickup hour: one MAE value per hour.
mae_by_hour = (
    merged_df
    .groupby('pickup_hour')['absolute_error']
    .mean()
    .reset_index()
    .rename(columns={'absolute_error': 'MAE'})
)

# Line chart of the hourly MAE, with a marker on every data point.
fig = px.line(
    mae_by_hour,
    x='pickup_hour',
    y='MAE',
    title='Mean Absolute Error (MAE) by Pickup Hour',
    labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},
    markers=True,
)

fig.show()

In [58]:
# Unweighted average of the per-hour MAE values: every hour counts equally,
# regardless of how many rows contributed to it. This equals the overall
# row-level MAE only when each hour has the same number of rows.
mae_by_hour["MAE"].mean()

11.98804780876494