In [1]:
from datetime import datetime, timedelta, timezone
import pandas as pd

# Current UTC time as a timezone-naive timestamp, truncated to the start of the hour.
utc_now_naive = datetime.now(timezone.utc).replace(tzinfo=None)
current_date = pd.to_datetime(utc_now_naive).floor('h')
print(f"{current_date=}")

current_date=Timestamp('2025-02-04 20:00:00')


In [2]:
# Fetch window: the 28 days leading up to the current hour.
LOOKBACK_DAYS = 28
fetch_data_to = current_date
fetch_data_from = fetch_data_to - timedelta(days=LOOKBACK_DAYS)

In [3]:
# Echo both bounds of the fetch window as a sanity check before downloading.
print(f"{fetch_data_to=}")
print(f"{fetch_data_from=}")

fetch_data_to=Timestamp('2025-02-04 20:00:00')
fetch_data_from=Timestamp('2025-01-07 20:00:00')


### Fetch raw batch data from the original source, i.e. the NYC yellow taxi data website

In [4]:
from src.data import load_raw_data

def fetch_batch_raw_data(from_date: datetime, to_date: datetime) -> pd.DataFrame:
    """
    Simulate production data by sampling historical data from 52 weeks ago.

    Recent rides are not available, so the requested window is moved back by
    exactly 52 weeks (364 days, a whole number of weeks, so the day of week is
    preserved), the historical rides inside that window are loaded, and their
    timestamps are then moved forward again so they look like recent data.

    Args:
        from_date: inclusive start of the requested window.
        to_date: exclusive end of the requested window.

    Returns:
        Rides with `from_date <= pickup_datetime < to_date` (after the shift),
        sorted by `pickup_location_id` and then by `pickup_datetime`.
    """
    shift = timedelta(days=7*52)
    from_date_ = from_date - shift
    to_date_ = to_date - shift

    # One (year, month) key per calendar month touched by the back-dated window.
    # Each month is listed exactly once, so a window that starts and ends in the
    # same month no longer loads (and concatenates) the same file twice, and a
    # window longer than two months no longer skips the months in between.
    month_keys = [(from_date_.year, from_date_.month)]
    last_key = (to_date_.year, to_date_.month)
    while month_keys[-1] < last_key:
        year, month = month_keys[-1]
        month_keys.append((year + 1, 1) if month == 12 else (year, month + 1))

    # download one file per month from the website
    rides = pd.concat(
        [load_raw_data(year=year, months=month) for year, month in month_keys]
    )

    # Apply BOTH bounds to the combined frame so nothing outside the window
    # survives, regardless of which monthly file a row came from.
    in_window = (rides.pickup_datetime >= from_date_) & (rides.pickup_datetime < to_date_)
    # Explicit copy: the column is reassigned below and must not touch a view.
    rides = rides.loc[in_window].copy()

    # shift the data to pretend this is recent data
    rides['pickup_datetime'] = rides['pickup_datetime'] + shift

    return rides.sort_values(by=['pickup_location_id', 'pickup_datetime'])

In [5]:
# Pull the raw rides for the fetch window defined above (fetch_data_from .. fetch_data_to).
rides = fetch_batch_raw_data(from_date=fetch_data_from, to_date=fetch_data_to)

File 2024-01 was already in local storage
File 2024-02 was already in local storage


### Transform the fetched raw data into time-series data

In [6]:
from src.data import transform_raw_data_into_ts_data
# Convert the raw rides into the time-series frame that is inserted into the feature group below.
ts_data = transform_raw_data_into_ts_data(rides)

100%|██████████| 257/257 [00:00<00:00, 763.30it/s]


In [7]:
# Inspect the dtype of the location id column before writing to the feature store.
ts_data["pickup_location_id"].dtype

dtype('int32')

In [8]:
# Widen the location id from int32 to int64 — presumably so it matches the
# feature group's 'bigint' type for this column; confirm against the feature group schema.
ts_data["pickup_location_id"] = ts_data["pickup_location_id"].astype("int64")
ts_data["pickup_location_id"].dtype

dtype('int64')

### Connect to the Hopsworks feature store to insert this data

In [9]:
import hopsworks
import src.config as config

# Log in to the Hopsworks project; the project name and API key are read
# from src.config rather than being hardcoded in the notebook.
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME,
    api_key_value=config.HOPSWORKS_API_KEY
)

2025-02-04 15:49:22,472 INFO: Initializing external client
2025-02-04 15:49:22,472 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-02-04 15:49:24,386 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211558


In [10]:
# Get a handle to the feature store of the project logged in to above.

feature_store = project.get_feature_store()
feature_store

<hsfs.feature_store.FeatureStore at 0x31f9eed80>

In [11]:
# Fetch the feature group named in src.config, creating it if it does not exist yet.
# Rows are keyed by (pickup_location_id, pickup_hour); pickup_hour is also the event time.
feature_group = feature_store.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
    description="Time-series data at hourly frequency",
    primary_key=["pickup_location_id", "pickup_hour"],
    event_time="pickup_hour",
)
feature_group

<hsfs.feature_group.FeatureGroup at 0x1204be3f0>

In [12]:
# Check the type the feature group holds for the location id, to compare with the dataframe's dtype.
feature_group.get_feature("pickup_location_id").type

'bigint'

In [13]:
# Upload the time-series rows to the feature group.
# NOTE(review): wait_for_job=False presumably returns without blocking on the
# materialization job — confirm against the Hopsworks insert() documentation.
feature_group.insert(ts_data, write_options={"wait_for_job": False})

Uploading Dataframe: 100.00% |██████████| Rows 172704/172704 | Elapsed Time: 01:24 | Remaining Time: 00:00


Launching job: time_series_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1211558/jobs/named/time_series_hourly_feature_group_1_offline_fg_materialization/executions


(Job('time_series_hourly_feature_group_1_offline_fg_materialization', 'SPARK'),
 None)