In [1]:
from dotenv import load_dotenv
import os
# Load settings from a local .env file into the process environment; the
# HOPSWORKS_* values read with os.getenv later in this notebook are expected
# to come from there.
load_dotenv() 

True

In [2]:
# Automatically reload imported modules before each cell runs, so edits to the
# local src/ package take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os

# Make the parent of the notebook's working directory importable so that the
# `src` package used in later cells can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append: re-running this cell would otherwise add the same entry to
# sys.path again on every execution.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [4]:
import pandas as pd
from src.data_utils import load_and_process_citibike_data

# Years of Citi Bike trip data to load. Add more years here to pull in extra
# history; the rest of the cell adapts automatically.
YEARS = [2024]

print(f"🚴 Loading Citi Bike data for {', '.join(map(str, YEARS))}...")

# One processed DataFrame per requested year.
chunks = [load_and_process_citibike_data(year) for year in YEARS]

# Stack the per-year frames into a single frame with a fresh 0..n-1 index.
rides = pd.concat(chunks, ignore_index=True)

print("✅ Data loading complete.")
print("Data shape:", rides.shape)


🚴 Loading Citi Bike data for 2024...
File already exists for 2024-01.
Loading Citi Bike data for 2024-01...
Total records: 50,661
Valid records: 50,589
Records dropped: 72 (0.14%)
Successfully processed data for 2024-01.
File already exists for 2024-02.
Loading Citi Bike data for 2024-02...
Total records: 55,613
Valid records: 55,532
Records dropped: 81 (0.15%)
Successfully processed data for 2024-02.
File already exists for 2024-03.
Loading Citi Bike data for 2024-03...
Total records: 65,581
Valid records: 65,383
Records dropped: 198 (0.30%)
Successfully processed data for 2024-03.
File already exists for 2024-04.
Loading Citi Bike data for 2024-04...
Total records: 79,116
Valid records: 78,948
Records dropped: 168 (0.21%)
Successfully processed data for 2024-04.
File already exists for 2024-05.
Loading Citi Bike data for 2024-05...
Total records: 97,479
Valid records: 97,225
Records dropped: 254 (0.26%)
Successfully processed data for 2024-05.
File already exists for 2024-06.
Loading

In [5]:
# Preview the combined raw rides; pandas abbreviates a frame this large to its
# first and last rows.
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2024-01-15 15:18:07.000,JC072
1,2024-01-13 15:32:50.000,JC110
2,2024-01-19 13:11:00.000,JC072
3,2024-01-23 07:03:49.000,JC072
4,2024-01-01 16:46:10.000,JC072
...,...,...
1050473,2024-12-28 09:45:30.704,JC013
1050474,2024-12-12 16:21:50.427,JC013
1050475,2024-12-11 19:23:24.109,JC115
1050476,2024-12-12 20:48:40.471,JC115


In [6]:
# (rows, columns) of the raw rides frame.
rides.shape

(1050478, 2)

In [7]:
from src.data_utils import transform_raw_data_into_ts_data

# Convert the raw ride records into an hourly time series. Judging by the
# outputs further down, the result has one row per
# (pickup_hour, pickup_location_id) pair with the ride count in `rides`
# (hours without rides appear as 0) — confirm against src.data_utils.
ts_data = transform_raw_data_into_ts_data(rides)

In [8]:
# (rows, columns) of the hourly time-series frame.
ts_data.shape

(1800720, 3)

In [9]:
# Column dtypes and approximate memory footprint of the hourly frame.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800720 entries, 0 to 1800719
Data columns (total 3 columns):
 #   Column              Dtype         
---  ------              -----         
 0   pickup_hour         datetime64[ns]
 1   pickup_location_id  object        
 2   rides               int32         
dtypes: datetime64[ns](1), int32(1), object(1)
memory usage: 34.3+ MB


In [10]:
# Preview the hourly frame (first and last rows only for a frame this large).
ts_data

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-01-01 00:00:00,4074.14,0
1,2024-01-01 01:00:00,4074.14,0
2,2024-01-01 02:00:00,4074.14,0
3,2024-01-01 03:00:00,4074.14,0
4,2024-01-01 04:00:00,4074.14,0
...,...,...,...
1800715,2024-12-31 19:00:00,JC116,0
1800716,2024-12-31 20:00:00,JC116,0
1800717,2024-12-31 21:00:00,JC116,0
1800718,2024-12-31 22:00:00,JC116,1


In [11]:
# The preview of ts_data shows ids of different forms (e.g. 4074.14 and JC116)
# in an object column; cast every id to str so the column holds a single type
# (presumably needed for consistent sorting and for the feature-store schema —
# confirm).
ts_data["pickup_location_id"] = ts_data["pickup_location_id"].astype(str)


In [12]:
# Order rows station by station, chronologically within each station, and
# renumber the index from 0.
sort_keys = ["pickup_location_id", "pickup_hour"]
ts_data = ts_data.sort_values(by=sort_keys, ignore_index=True)


In [13]:
# Confirm the column dtypes after the string cast and the sort.
ts_data.dtypes


pickup_hour           datetime64[ns]
pickup_location_id            object
rides                          int32
dtype: object

In [14]:
# Sanity check before uploading: the hourly frame should be a complete grid
# with exactly one row for every (station, hour) combination.
n_hours = ts_data["pickup_hour"].nunique()
n_stations = ts_data["pickup_location_id"].nunique()
expected_rows = n_hours * n_stations

print(f"Unique hours: {n_hours}")
print(f"Unique stations: {n_stations}")
print(f"Expected rows: {expected_rows}")
print(f"Actual rows: {ts_data.shape[0]}")

# Enforce the check instead of only printing it, so a "Run All" stops here
# rather than uploading an incomplete or duplicated grid.
key_columns = ["pickup_location_id", "pickup_hour"]
assert not ts_data.duplicated(subset=key_columns).any(), (
    "Duplicate (pickup_location_id, pickup_hour) rows found in ts_data."
)
assert ts_data.shape[0] == expected_rows, (
    f"Incomplete hourly grid: expected {expected_rows} rows, got {ts_data.shape[0]}."
)


Unique hours: 8784
Unique stations: 205
Expected rows: 1800720
Actual rows: 1800720


In [16]:
import os
import hopsworks

# Read the Hopsworks credentials from the environment; they are never
# hard-coded in the notebook.
api_key = os.getenv('HOPSWORKS_API_KEY')
project_name = os.getenv('HOPSWORKS_PROJECT_NAME')

# Report exactly which settings are absent or empty, so the fix is obvious
# from the error message alone.
missing = [
    name
    for name, value in (
        ("HOPSWORKS_API_KEY", api_key),
        ("HOPSWORKS_PROJECT_NAME", project_name),
    )
    if not value
]
if missing:
    raise ValueError(f"❌ Missing {' and '.join(missing)} in environment variables.")

# Connect to Hopsworks
project = hopsworks.login(
    api_key_value=api_key,
    project=project_name
)

print(f"✅ Successfully connected to Hopsworks project: {project_name}")


2025-05-06 21:17:20,468 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-06 21:17:20,472 INFO: Initializing external client
2025-05-06 21:17:20,473 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-06 21:17:21,619 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1213633
✅ Successfully connected to Hopsworks project: sp25_taxi


In [17]:
# Handle to the feature store of the project we just logged in to.
feature_store = project.get_feature_store()

In [18]:
# Name and version that identify the feature group the hourly data is written to.
FEATURE_GROUP_NAME = "citi_bike_hourly_features"
FEATURE_GROUP_VERSION = 1

In [19]:
# Fetch the feature group, or define it if it does not exist yet. Each row is
# keyed by (pickup_location_id, pickup_hour), and pickup_hour also serves as
# the event time.
feature_group = feature_store.get_or_create_feature_group(
    name=FEATURE_GROUP_NAME,
    version=FEATURE_GROUP_VERSION,
    description="Citi Bike time-series features (hourly)",
    primary_key=["pickup_location_id", "pickup_hour"],
    event_time="pickup_hour"
)

In [20]:
# Upload the hourly frame to the feature group. With wait_for_job=False the
# call appears to return once the rows are uploaded and the materialization
# job has been launched, without waiting for that job to finish (follow the
# job link printed in the output to check completion).
feature_group.insert(ts_data, write_options={"wait_for_job": False})

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1213633/fs/1201258/fg/1438471


Uploading Dataframe: 100.00% |█████████████████████| Rows 1800720/1800720 | Elapsed Time: 02:30 | Remaining Time: 00:00


Launching job: citi_bike_hourly_features_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1213633/jobs/named/citi_bike_hourly_features_1_offline_fg_materialization/executions


(Job('citi_bike_hourly_features_1_offline_fg_materialization', 'SPARK'), None)

In [21]:
# Deep memory footprint of the raw rides frame (deep=True also counts the
# contents of object columns), converted from bytes using 1 MB = 1024 * 1024 bytes.
rides_bytes = rides.memory_usage(deep=True).sum()
df_memory_mb = rides_bytes / (1024 ** 2)
print(f"DataFrame size: {df_memory_mb:.2f} MB")

DataFrame size: 78.14 MB


In [22]:
# Final look at the schema and approximate memory footprint of the uploaded
# hourly frame.
ts_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1800720 entries, 0 to 1800719
Data columns (total 3 columns):
 #   Column              Dtype         
---  ------              -----         
 0   pickup_hour         datetime64[ns]
 1   pickup_location_id  object        
 2   rides               int32         
dtypes: datetime64[ns](1), int32(1), object(1)
memory usage: 34.3+ MB
