In [49]:
# Name of the Hopsworks project that this notebook logs in to.
HOPSWORKS_PROJECT_NAME = "nyc_taxi_demand_forecast"

In [50]:
import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

# load key-value pairs from the .env file located in PARENT_DIR
load_dotenv(PARENT_DIR / ".env")

# Read the API key from the environment instead of hard-coding it in the
# notebook; if the variable is not set this lookup raises KeyError.
HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

## Fetching raw NYC yellow taxi historical data

In [51]:
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

# First year of history to load; the upper bound is the current calendar
# year, so re-running the notebook later also picks up newer data.
from_year = 2022
to_year = datetime.now().year
print(f"Downloading raw data from {from_year} to {to_year}")

# Load one frame per year and concatenate once at the end. Calling pd.concat
# inside the loop re-copies every previously accumulated row on each
# iteration, which is needlessly expensive for a dataset of this size.
rides_by_year = [load_raw_data(year) for year in range(from_year, to_year + 1)]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when the year range is empty. The per-year indexes are kept as-is
# (no ignore_index).
rides = pd.concat(rides_by_year) if rides_by_year else pd.DataFrame()

# Drop the extra references to the per-year frames so the kernel does not
# keep a second copy of the data alive.
del rides_by_year

Downloading raw data from 2022 to 2025
File 2022-01 was already in local storage
File 2022-02 was already in local storage
File 2022-03 was already in local storage
File 2022-04 was already in local storage
File 2022-05 was already in local storage
File 2022-06 was already in local storage
File 2022-07 was already in local storage
File 2022-08 was already in local storage
File 2022-09 was already in local storage
File 2022-10 was already in local storage
File 2022-11 was already in local storage
File 2022-12 was already in local storage
File 2023-01 was already in local storage
File 2023-02 was already in local storage
File 2023-03 was already in local storage
File 2023-04 was already in local storage
File 2023-05 was already in local storage
File 2023-06 was already in local storage
File 2023-07 was already in local storage
File 2023-08 was already in local storage
File 2023-09 was already in local storage
File 2023-10 was already in local storage
File 2023-11 was already in local sto

In [52]:
# Sanity check: total number of raw ride rows loaded across all years.
print(f"{len(rides)=}")

len(rides)=122608634


In [53]:
# Display the combined raw rides frame (pandas renders a truncated view).
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68
...,...,...
3475221,2025-01-31 23:01:48,79
3475222,2025-01-31 23:50:29,161
3475223,2025-01-31 23:26:59,144
3475224,2025-01-31 23:14:34,142


In [54]:
from src.data import transform_raw_data_into_ts_data

# Convert the raw rides into time-series form. Judging by the displayed
# result, this yields one row per (pickup_hour, pickup_location_id) with a
# `rides` count -- NOTE(review): confirm against src.data.
ts_data = transform_raw_data_into_ts_data(rides)

100%|██████████| 265/265 [00:07<00:00, 34.97it/s]


In [55]:
# Parse pickup_hour into timezone-aware UTC datetimes.
ts_data['pickup_hour'] = pd.to_datetime(ts_data['pickup_hour'], utc=True)

# Add column with Unix epoch milliseconds.
# Computed as (timestamp - epoch) // 1 ms rather than `.astype(int) // 10**6`:
# the cast relies on the builtin `int` being 64-bit (it is not on every
# platform/NumPy combination) and on the column's unit being nanoseconds.
# The subtraction form is independent of both.
ts_data['pickup_ts'] = (
    ts_data['pickup_hour'] - pd.Timestamp("1970-01-01", tz="UTC")
) // pd.Timedelta(milliseconds=1)

In [56]:
# Inspect the time-series frame, now including the pickup_ts column.
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id,pickup_ts
0,2022-01-01 00:00:00+00:00,0,1,1640995200000
1,2022-01-01 01:00:00+00:00,0,1,1640998800000
2,2022-01-01 02:00:00+00:00,0,1,1641002400000
3,2022-01-01 03:00:00+00:00,0,1,1641006000000
4,2022-01-01 04:00:00+00:00,1,1,1641009600000
...,...,...,...,...
7167715,2025-01-31 19:00:00+00:00,2,265,1738350000000
7167716,2025-01-31 20:00:00+00:00,3,265,1738353600000
7167717,2025-01-31 21:00:00+00:00,0,265,1738357200000
7167718,2025-01-31 22:00:00+00:00,2,265,1738360800000


In [57]:
# Check the column dtypes before uploading to the feature store.
ts_data.dtypes

pickup_hour           datetime64[ns, UTC]
rides                               int64
pickup_location_id                  int64
pickup_ts                           int64
dtype: object

## Load time-series data into the Hopsworks feature store

In [58]:
import hopsworks

In [59]:
# Authenticate against Hopsworks with the API key read from the environment
# and open the project named by HOPSWORKS_PROJECT_NAME.
project = hopsworks.login(
    project=HOPSWORKS_PROJECT_NAME,
    api_key_value=HOPSWORKS_API_KEY
)

2025-04-05 11:26:26,203 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-04-05 11:26:26,208 INFO: Initializing external client
2025-04-05 11:26:26,208 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-04-05 11:26:26,771 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211558


In [60]:
# NOTE(review): the repr of this object includes the account owner's email
# address -- clear this cell's output before sharing or committing the notebook.
project

Project('nyc_taxi_demand_forecast', 'sushmithahs047@gmail.com', 'Default project')

In [61]:
# Get a handle to the project's feature store; all feature-group
# operations in the following cells go through this object.
feature_store = project.get_feature_store()
feature_store

<hsfs.feature_store.FeatureStore at 0x3272c1e50>

### Using a feature group (the write API) to save data to the feature store

In [62]:
# Name and version identifying the feature group that stores the hourly
# time-series data.
FEATURE_GROUP_NAME = "time_series_hourly_feature_group"
FEATURE_GROUP_VERSION = 1

In [63]:
# Obtain the feature group with this name/version, defining it if needed.
# A row is keyed by (pickup_location_id, pickup_ts); pickup_ts (Unix epoch
# milliseconds) also serves as the event time.
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    primary_key = ['pickup_location_id', 'pickup_ts'],
    event_time="pickup_ts",
)

In [64]:
# Upload the time-series rows to the feature group. With wait_for_job=False
# the call returns once the materialization job has been launched rather
# than blocking until it finishes; progress can be followed at the job URL
# printed in the output.
feature_group.insert(ts_data, write_options={"wait_for_job": False})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1211558/fs/1200206/fg/1434995


Uploading Dataframe: 100.00% |██████████| Rows 7167720/7167720 | Elapsed Time: 05:26 | Remaining Time: 00:00


Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1211558/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions


(Job('time_series_hourly_feature_group_1_offline_fg_materialization', 'SPARK'),
 None)