In [40]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

In [42]:
import hopsworks
import pandas as pd
from datetime import timedelta
from src.inference import get_feature_store, fetch_predictions

def fetch_hourly_rides(hours):
    current_hour = (pd.Timestamp.now(tz="Etc/UTC") - timedelta(hours=hours)).floor('h')

    fs = get_feature_store()
    fg = fs.get_feature_group(
        name=config.FEATURE_GROUP_NAME,
        version=1
    )
    print(fg)
    query = fg.select_all()
    query = query.filter(fg.pickup_hour >= current_hour)

    return query.read()

In [43]:
df = fetch_hourly_rides(12)

2025-03-05 14:48:24,264 INFO: Closing external client and cleaning up certificates.
2025-03-05 14:48:24,268 INFO: Initializing external client
2025-03-05 14:48:24,268 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-05 14:48:25,056 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214681
<hsfs.feature_group.FeatureGroup object at 0x00000288BEE75BD0>
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (5.78s) 


In [44]:
df

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2025-03-05 03:00:00+00:00,7,0
1,2025-03-05 06:00:00+00:00,141,58
2,2025-03-05 16:00:00+00:00,179,0
3,2025-03-05 11:00:00+00:00,111,0
4,2025-03-05 15:00:00+00:00,163,258
...,...,...,...
3775,2025-03-05 10:00:00+00:00,138,199
3776,2025-03-05 10:00:00+00:00,9,0
3777,2025-03-05 13:00:00+00:00,225,1
3778,2025-03-05 16:00:00+00:00,160,0


             pickup_hour  pickup_location_id  rides
804  2025-03-05 08:00:00                  27      0
967  2025-03-05 09:00:00                  27      0
1014 2025-03-05 15:00:00                  27      0
1132 2025-03-05 07:00:00                  27      0
1438 2025-03-05 03:00:00                  27      0
1882 2025-03-05 05:00:00                  27      0
1908 2025-03-05 04:00:00                  27      0
2498 2025-03-05 16:00:00                  27      0
2823 2025-03-05 13:00:00                  27      0
3002 2025-03-05 14:00:00                  27      0
3142 2025-03-05 12:00:00                  27      0
3351 2025-03-05 10:00:00                  27      0
3390 2025-03-05 02:00:00                  27      0
3624 2025-03-05 11:00:00                  27      0
3656 2025-03-05 06:00:00                  27      0


In [45]:
df_pred = fetch_predictions(12)

2025-03-05 14:48:33,093 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-03-05 14:48:33,105 INFO: Initializing external client
2025-03-05 14:48:33,105 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-05 14:48:33,709 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1214681
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.37s) 


In [46]:
df_pred

Unnamed: 0,pickup_location_id,predicted_demand,pickup_hour
0,176,0.0,2025-03-05 19:00:00+00:00
1,27,0.0,2025-03-05 19:00:00+00:00
2,152,3.0,2025-03-05 19:00:00+00:00
3,43,73.0,2025-03-05 19:00:00+00:00
4,262,7.0,2025-03-05 19:00:00+00:00
...,...,...,...
247,138,90.0,2025-03-05 19:00:00+00:00
248,200,0.0,2025-03-05 19:00:00+00:00
249,80,0.0,2025-03-05 19:00:00+00:00
250,78,1.0,2025-03-05 19:00:00+00:00


In [58]:
print(df.dtypes)
print(df_pred.dtypes)


pickup_hour           datetime64[us]
pickup_location_id             int32
rides                          int32
dtype: object
pickup_location_id             int32
predicted_demand             float64
pickup_hour           datetime64[us]
dtype: object


In [61]:
merged_df = pd.merge(df, df_pred, on=['pickup_hour'])

In [60]:
merged_df

Unnamed: 0,pickup_hour_x,pickup_location_id,rides,predicted_demand,pickup_hour_y
0,2025-03-05 03:00:00,7,0,1.0,2025-03-05 19:00:00
1,2025-03-05 07:00:00,7,1,1.0,2025-03-05 19:00:00
2,2025-03-05 11:00:00,7,2,1.0,2025-03-05 19:00:00
3,2025-03-05 10:00:00,7,3,1.0,2025-03-05 19:00:00
4,2025-03-05 16:00:00,7,4,1.0,2025-03-05 19:00:00
...,...,...,...,...,...
3775,2025-03-05 04:00:00,85,0,1.0,2025-03-05 19:00:00
3776,2025-03-05 09:00:00,85,0,1.0,2025-03-05 19:00:00
3777,2025-03-05 07:00:00,85,0,1.0,2025-03-05 19:00:00
3778,2025-03-05 12:00:00,85,0,1.0,2025-03-05 19:00:00


In [51]:
merged_df['difference'] = merged_df['predicted_demand'] - merged_df['rides']

In [52]:
merged_df.sort_values(["pickup_location_id", "pickup_hour"])

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference


In [53]:
merged_df

Unnamed: 0,pickup_hour,pickup_location_id,rides,predicted_demand,difference


In [54]:
import pandas as pd  
import plotly.express as px
df1 = df
df2 = df_pred

# Merge the DataFrames on 'pickup_location_id' and 'pickup_hour'  
merged_df = pd.merge(df1, df2, on=['pickup_location_id', 'pickup_hour'])  

# Calculate the absolute error  
merged_df['absolute_error'] = abs(merged_df['predicted_demand'] - merged_df['rides'])  

# Group by 'pickup_hour' and calculate the mean absolute error (MAE)  
mae_by_hour = merged_df.groupby('pickup_hour')['absolute_error'].mean().reset_index()  
mae_by_hour.rename(columns={'absolute_error': 'MAE'}, inplace=True)  

# Create a Plotly plot  
fig = px.line(  
    mae_by_hour,  
    x='pickup_hour',  
    y='MAE',  
    title='Mean Absolute Error (MAE) by Pickup Hour',  
    labels={'pickup_hour': 'Pickup Hour', 'MAE': 'Mean Absolute Error'},  
    markers=True  
)  

# Show the plot  
fig.show()

In [55]:
mae_by_hour["MAE"].mean()

nan