Import Statements

In [1]:
from dotenv import load_dotenv
import os
load_dotenv()
import sys
from datetime import datetime
import pandas as pd
import hopsworks

# Add the parent directory to the Python path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.data_utils import load_and_process_citi_bike_data
from src.data_utils import transform_raw_data_into_ts_data
import src.config as config  # Import config to use FEATURE_GROUP_NAME and FEATURE_GROUP_VERSION

Loading data using load_and_process function and transforming

In [2]:
# Define the range of years to process
from_year = 2024
to_year = 2025
print(f"Download raw data from {from_year} to {to_year}")

rides = pd.DataFrame()
chunks = []
for year in range(from_year, to_year + 1):
    # For 2024, process all 12 months
    # For 2025, process only January to March (as per original data loading in 05_transform_raw_data_into_features_and_targets.ipynb)
    months = list(range(1, 13)) if year == 2024 else list(range(1, 4))
    rides_one_year = load_and_process_citi_bike_data(year=year, months=months)
    chunks.append(rides_one_year)

# Concatenate all chunks
rides = pd.concat(chunks, ignore_index=True)
print("Data loading complete.")

# Transform raw data into time series format
ts_data = transform_raw_data_into_ts_data(rides)

# Set up MLflow tracking for DagsHub
os.environ["MLFLOW_TRACKING_URI"] = os.getenv("MLFLOW_TRACKING_URI")
os.environ["MLFLOW_TRACKING_USERNAME"] = os.getenv("MLFLOW_TRACKING_USERNAME")
os.environ["MLFLOW_TRACKING_PASSWORD"] = os.getenv("MLFLOW_TRACKING_PASSWORD")

Download raw data from 2024 to 2025
File already exists for 2024-01.
Loading data for 2024-01...
Before filtering - 2024-01: 1888085 rows
After duration filter - 2024-01: 1884640 rows
After station filter - 2024-01: 1886925 rows
After date range filter - 2024-01: 1887675 rows
Total records: 1,888,085
Valid records: 1,883,159
Records dropped: 4,926 (0.26%)
Successfully processed data for 2024-01.
File already exists for 2024-02.
Loading data for 2024-02...
Before filtering - 2024-02: 2121501 rows
After duration filter - 2024-02: 2118148 rows
After station filter - 2024-02: 2119635 rows
After date range filter - 2024-02: 2121268 rows
Total records: 2,121,501
Valid records: 2,116,154
Records dropped: 5,347 (0.25%)
Successfully processed data for 2024-02.
File already exists for 2024-03.
Loading data for 2024-03...
Before filtering - 2024-03: 2663295 rows
After duration filter - 2024-03: 2656656 rows
After station filter - 2024-03: 2660499 rows
After date range filter - 2024-03: 2663057 ro

Logging into Hopsworks, and specifically into project CDA500P1

In [3]:
# Connect to Hopsworks
api_key = os.getenv('HOPSWORKS_API_KEY')
project_name = os.getenv('HOPSWORKS_PROJECT_NAME')

project = hopsworks.login(
    api_key_value=api_key,
    project=project_name
)
print(f"Successfully connected to Hopsworks project: {project_name}")

2025-05-10 03:01:09,611 INFO: Initializing external client
2025-05-10 03:01:09,621 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-10 03:01:10,608 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225907
Successfully connected to Hopsworks project: CDA500PF1


Setting up pointers, Feature groups

In [4]:
# Create or retrieve a feature group in Hopsworks
feature_store = project.get_feature_store()

feature_group = feature_store.get_or_create_feature_group(
    name=config.FEATURE_GROUP_NAME,  # Use config.FEATURE_GROUP_NAME
    version=config.FEATURE_GROUP_VERSION,  # Use config.FEATURE_GROUP_VERSION
    description="Time series data at hourly frequency",
    primary_key=["start_station_name", "pickup_hour"],
    event_time="pickup_hour"
)

In [5]:
# Upload the time series data to the feature group
feature_group.insert(ts_data, write_options={"wait_for_job": False})

# Print memory usage and data info for diagnostics
df_memory_mb = rides.memory_usage(deep=True).sum() / (1024 * 1024)
print(f"DataFrame size: {df_memory_mb:.2f} MB")
ts_data.info()

Uploading Dataframe: 100.00% |██████████| Rows 32832/32832 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: citi_bike_hourly_feature_group_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1225907/jobs/named/citi_bike_hourly_feature_group_1_offline_fg_materialization/executions
DataFrame size: 4960.43 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32832 entries, 0 to 32831
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   pickup_hour         32832 non-null  datetime64[ns]
 1   start_station_name  32832 non-null  string        
 2   rides               32832 non-null  int16         
dtypes: datetime64[ns](1), int16(1), string(1)
memory usage: 577.3 KB
