In [1]:
# Hopsworks project to log in to (passed as `project=` to hopsworks.login).
HOPSWORKS_PROJECT_NAME = "nyc_taxi_demand_forecast"

In [7]:
import os
from dotenv import load_dotenv
from src.paths import PARENT_DIR

# Load key-value pairs from the .env file located in PARENT_DIR.
load_dotenv(PARENT_DIR / ".env")

# Read the API key from the environment rather than hard-coding it in the
# notebook; os.environ[...] raises KeyError if the variable is not set.
HOPSWORKS_API_KEY = os.environ['HOPSWORKS_API_KEY']

## Fetching raw NYC yellow taxi historical data

In [8]:
from datetime import datetime
import pandas as pd
from src.data import load_raw_data

# Inclusive range of years of raw data to load.
from_year = 2022
# Pinned to 2024; use datetime.now().year instead to follow the current year.
to_year = 2024
print(f"Downloading raw data from {from_year} to {to_year}")

# Load each year once and concatenate in a single pass. Calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration.
yearly_rides = [load_raw_data(year) for year in range(from_year, to_year + 1)]

# pd.concat raises ValueError on an empty list, so keep the empty-frame result
# for an empty year range.
rides = pd.concat(yearly_rides) if yearly_rides else pd.DataFrame()

# Drop the list so the per-year frames are not kept alive next to `rides`.
del yearly_rides

Downloading raw data from 2022 to 2024
File 2022-01 was already in local storage
File 2022-02 was already in local storage
File 2022-03 was already in local storage
File 2022-04 was already in local storage
File 2022-05 was already in local storage
File 2022-06 was already in local storage
File 2022-07 was already in local storage
File 2022-08 was already in local storage
File 2022-09 was already in local storage
File 2022-10 was already in local storage
File 2022-11 was already in local storage
File 2022-12 was already in local storage
File 2023-01 was already in local storage
File 2023-02 was already in local storage
File 2023-03 was already in local storage
File 2023-04 was already in local storage
File 2023-05 was already in local storage
File 2023-06 was already in local storage
File 2023-07 was already in local storage
File 2023-08 was already in local storage
File 2023-09 was already in local storage
File 2023-10 was already in local storage
File 2023-11 was already in local sto

In [9]:
# Sanity check: total number of ride rows loaded across all years.
print(f"{len(rides)=}")

len(rides)=115465093


In [11]:
# Preview the combined raw rides (pandas truncates the display of large frames).
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68
...,...,...
3646364,2024-11-30 23:11:15,162
3646365,2024-11-30 23:49:30,132
3646366,2024-11-30 23:31:46,100
3646367,2024-11-30 23:41:21,42


In [12]:
from src.data import transform_raw_data_into_ts_data

# Convert the raw rides into the time-series frame shown below (columns
# pickup_hour, rides, pickup_location_id).
# NOTE(review): presumably hourly ride counts per pickup location — confirm
# against src.data.transform_raw_data_into_ts_data.
ts_data = transform_raw_data_into_ts_data(rides)

100%|██████████| 263/263 [00:07<00:00, 37.50it/s]


In [13]:
# Preview the transformed time-series data.
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2022-01-01 00:00:00,11,4
1,2022-01-01 01:00:00,15,4
2,2022-01-01 02:00:00,26,4
3,2022-01-01 03:00:00,8,4
4,2022-01-01 04:00:00,9,4
...,...,...,...
6722275,2024-11-30 19:00:00,0,110
6722276,2024-11-30 20:00:00,0,110
6722277,2024-11-30 21:00:00,0,110
6722278,2024-11-30 22:00:00,0,110


## Load time-series data into the Hopsworks feature store

In [14]:
import hopsworks

In [15]:
# Authenticate against Hopsworks with the API key read from the environment
# and open the project named in HOPSWORKS_PROJECT_NAME.
project = hopsworks.login(
    api_key_value=HOPSWORKS_API_KEY,
    project=HOPSWORKS_PROJECT_NAME,
)

2025-02-03 12:57:20,050 INFO: Initializing external client
2025-02-03 12:57:20,051 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-03 12:57:21,250 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211558


In [16]:
# Display the project handle returned by hopsworks.login.
# NOTE(review): the repr below includes the account e-mail address — clear this
# output before sharing or committing the notebook.
project

Project('nyc_taxi_demand_forecast', 'sushmithahs047@gmail.com', 'Default project')

In [17]:
# connect to the feature store - this gives pointer to the feature store
feature_store = project.get_feature_store()
feature_store

<hsfs.feature_store.FeatureStore at 0x110eeb9b0>

### Using a feature group (the write API) to save data to the feature store

In [18]:
# Name and version passed to get_or_create_feature_group to identify the
# feature group holding the hourly time series.
FEATURE_GROUP_NAME = "time_series_hourly_feature_group"
FEATURE_GROUP_VERSION = 1

In [19]:
# Fetch the feature group if it already exists, otherwise declare it. Rows are
# keyed by (pickup_location_id, pickup_hour), with pickup_hour as the event time.
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    primary_key=["pickup_location_id", "pickup_hour"],
    event_time="pickup_hour",
    description="Time-series data at hourly frequency",
)

In [20]:
# Upload ts_data to the feature group.
# NOTE(review): wait_for_job=False presumably returns without waiting for the
# materialization job launched by the insert to finish — confirm in the
# Hopsworks documentation.
feature_group.insert(ts_data, write_options={"wait_for_job": False})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1211558/fs/1200206/fg/1400282


Uploading Dataframe: 100.00% |██████████| Rows 6722280/6722280 | Elapsed Time: 04:37 | Remaining Time: 00:00


Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1211558/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions


(Job('time_series_hourly_feature_group_1_offline_fg_materialization', 'SPARK'),
 None)