In [5]:
%load_ext autoreload
%autoreload 2
import sys
import os

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
import src.config as config

import logging
import os
import sys
from datetime import datetime, timedelta, timezone

import hopsworks
import pandas as pd

import src.config as config
from src.data_utils import fetch_batch_citibike_data, transform_raw_data_into_ts_data

# Configure logging for this notebook/script.
# force=True makes basicConfig replace any handlers already attached to the
# root logger. Without it, basicConfig silently does nothing when another
# library (or an earlier run of this cell) has already configured logging,
# and the format below is never applied.
logging.basicConfig(
    level=logging.INFO,  # Emit INFO and above
    format="%(asctime)s - %(levelname)s - %(message)s",  # timestamp - level - text
    handlers=[
        logging.StreamHandler(sys.stdout),  # Send records to stdout (not stderr)
    ],
    force=True,
)
logger = logging.getLogger(__name__)


# Step 1: Take the current timezone-aware UTC instant and round it up to the
# next whole hour, so the fetch window ends on an hour boundary.
now_utc = datetime.now(timezone.utc)
current_date = pd.to_datetime(now_utc).ceil("h")
logger.info(f"Current date and time (UTC): {current_date}")

# Step 2: The fetch window is the 28 days that end at that hour.
lookback = timedelta(days=28)
fetch_data_from, fetch_data_to = current_date - lookback, current_date
logger.info(f"Fetching data from {fetch_data_from} to {fetch_data_to}")

# Step 3: Load the raw ride records for the window via the project helper.
logger.info("Fetching raw data...")
rides = fetch_batch_citibike_data(fetch_data_from, fetch_data_to)
logger.info(f"Raw data fetched. Number of records: {len(rides)}")

# Step 4: Reshape the raw rides into the time-series table that gets uploaded.
logger.info("Transforming raw data into time-series data...")
ts_data = transform_raw_data_into_ts_data(rides)
logger.info(
    f"Transformation complete. Number of records in time-series data: {len(ts_data)}"
)

# Step 5: Connect to the Hopsworks project (credentials come from src.config)
logger.info("Connecting to Hopsworks project...")
project = hopsworks.login(
    project=config.HOPSWORKS_PROJECT_NAME, api_key_value=config.HOPSWORKS_API_KEY
)
logger.info("Connected to Hopsworks project.")

# Step 6: Connect to the feature store
logger.info("Connecting to the feature store...")
feature_store = project.get_feature_store()
logger.info("Connected to the feature store.")

# Step 7: Look up the existing feature group.
# get_feature_group only retrieves a feature group; nothing is created here,
# so the group (name + version from config) must already exist.
logger.info(
    f"Connecting to the feature group: {config.FEATURE_GROUP_NAME} (version {config.FEATURE_GROUP_VERSION})..."
)
feature_group = feature_store.get_feature_group(
    name=config.FEATURE_GROUP_NAME,
    version=config.FEATURE_GROUP_VERSION,
)
logger.info("Feature group ready.")

# Step 8: Insert data into the feature group.
# NOTE(review): insert() raises FeatureStoreException when ts_data has a column
# the feature group schema does not define (a recorded run failed on
# `event_hour`). Keep the transform output and the feature group schema in
# sync — the fix belongs in whichever of the two is wrong.
logger.info("Inserting data into the feature group...")
# wait_for_job=False: do not block until the Hopsworks job has finished, so a
# successful return means the insert was submitted, not that it is complete.
feature_group.insert(ts_data, write_options={"wait_for_job": False})
logger.info("Data insertion submitted (not waiting for the Hopsworks job to finish).")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2025-05-08 18:55:44,232 INFO: Current date and time (UTC): 2025-05-08 23:00:00
2025-05-08 18:55:44,233 INFO: Fetching data from 2025-04-10 23:00:00 to 2025-05-08 23:00:00
2025-05-08 18:55:44,234 INFO: Fetching raw data...
File already exists for 2024-04.
Loading Citi Bike data for 2024-04...
Total records: 79,116
Valid records: 78,948
Records dropped: 168 (0.21%)
Successfully processed data for 2024-04.
Combining all monthly Citi Bike data...
Citi Bike data loading and processing complete!
File already exists for 2024-05.
Loading Citi Bike data for 2024-05...
Total records: 97,479
Valid records: 97,225
Records dropped: 254 (0.26%)
Successfully processed data for 2024-05.
Combining all monthly Citi Bike data...
Citi Bike data loading and processing complete!
2025-05-08 18:55:45,329 INFO: Raw data fetched. Number of records: 82326
2025-05-08 18:55:45,330 INFO: Transforming raw data into time-series fo

FeatureStoreException: Features are not compatible with Feature Group schema: 
 - event_hour (type: 'timestamp') does not exist in feature group.
Note that feature (or column) names are case insensitive and spaces are automatically replaced with underscores.