Import statements and Autoreload

In [1]:
# Notebook setup: enable autoreload, configure cell display, and import the
# project helpers used by the cells below.

# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last
# one (the cells below rely on this to show both .head() and .shape).
InteractiveShell.ast_node_interactivity = "all"

# Add the parent of the current working directory to sys.path so that the
# `src` package can be imported (presumably the notebook is run from a
# sub-folder of the project root -- confirm when moving the notebook).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

# Project utilities: raw-data loader, time-series transform, sliding-window
# feature/target builder, and the output directory for transformed data.
from src.data_utils import load_and_process_citi_bike_data
from src.data_utils import transform_raw_data_into_ts_data
from src.data_utils import transform_ts_data_info_features_and_target_loop
from src.config import TRANSFORMED_DATA_DIR
from pathlib import Path  # pathlib.Path for filesystem path handling

Loading data for the training period (January 2024 – January 2025), transforming it into hourly time series, and building features and target

In [2]:
# Build the training set: load raw Citi Bike rides, restrict them to the
# training window, aggregate to a time series, and slice that series into
# sliding-window features plus a target.

# Tunable settings, kept together so the window can be changed in one place.
TRAIN_START = pd.Timestamp("2024-01-01")          # inclusive lower bound
TRAIN_END_EXCLUSIVE = pd.Timestamp("2025-02-01")  # exclusive upper bound -> window ends with Jan 2025
WINDOW_HOURS = 24 * 28                            # 28 days of hourly lag features per sample
STEP_HOURS = 1                                    # slide the window one hour at a time

# Load and process Citi Bike data for every month of 2024.
rides_2024 = load_and_process_citi_bike_data(year=2024, months=list(range(1, 13)))
print(f"Rows after loading 2024 data: {rides_2024.shape[0]}")

# Only January 2025 lies inside the training window; later months would be
# loaded and then thrown away by the date filter below, so they are not read.
# NOTE(review): assumes each monthly file only contributes rides that start in
# that month -- confirm against load_and_process_citi_bike_data.
rides_2025 = load_and_process_citi_bike_data(year=2025, months=[1])
print(f"Rows after loading 2025 data: {rides_2025.shape[0]}")

# Combine the data for 2024 and 2025
rides = pd.concat([rides_2024, rides_2025], ignore_index=True)
print(f"Rows after combining 2024 and 2025 data: {rides.shape[0]}")

# Filter for the training period: Jan 2024 to Jan 2025 (inclusive)
in_training_window = (
    (rides["pickup_datetime"] >= TRAIN_START)
    & (rides["pickup_datetime"] < TRAIN_END_EXCLUSIVE)
)
rides = rides[in_training_window]
print(f"Rows after filtering for training period (Jan 2024 - Jan 2025): {rides.shape[0]}")
assert not rides.empty, "No rides fall inside the training window"
rides.head(5)
rides.shape

# Transform raw data into time series format.
# Per the original author's note, the helper filters to the top 3 stations
# internally.
ts_data = transform_raw_data_into_ts_data(rides)
print(f"Rows after transforming to time series: {ts_data.shape[0]}")
assert not ts_data.empty, "Time-series transform returned no rows"
ts_data.head(5)
ts_data.shape

# Transform time series data into features and targets with a 28-day window
features, targets = transform_ts_data_info_features_and_target_loop(
    ts_data, window_size=WINDOW_HOURS, step_size=STEP_HOURS  # Hourly predictions
)
print(f"Rows after creating features and targets: {features.shape[0]}")
# Every feature row needs exactly one target value.
assert len(features) == len(targets), "features and targets differ in length"
features.head(5)
features.shape

File already exists for 2024-01.
Loading data for 2024-01...
Before filtering - 2024-01: 1888085 rows
After duration filter - 2024-01: 1884640 rows
After station filter - 2024-01: 1886925 rows
After date range filter - 2024-01: 1887675 rows
Total records: 1,888,085
Valid records: 1,883,159
Records dropped: 4,926 (0.26%)
Successfully processed data for 2024-01.
File already exists for 2024-02.
Loading data for 2024-02...
Before filtering - 2024-02: 2121501 rows
After duration filter - 2024-02: 2118148 rows
After station filter - 2024-02: 2119635 rows
After date range filter - 2024-02: 2121268 rows
Total records: 2,121,501
Valid records: 2,116,154
Records dropped: 5,347 (0.25%)
Successfully processed data for 2024-02.
File already exists for 2024-03.
Loading data for 2024-03...
Before filtering - 2024-03: 2663295 rows
After duration filter - 2024-03: 2656656 rows
After station filter - 2024-03: 2660499 rows
After date range filter - 2024-03: 2663057 rows
Total records: 2,663,295
Valid re

Unnamed: 0,pickup_datetime,ended_at,start_station_name,duration
0,2024-01-22 18:43:19.012,2024-01-22 18:48:10.708,Frederick Douglass Blvd & W 145 St,0 days 00:04:51.696000
1,2024-01-11 19:19:18.721,2024-01-11 19:47:36.007,W 54 St & 6 Ave,0 days 00:28:17.286000
2,2024-01-30 19:17:41.693,2024-01-30 19:32:49.857,E 11 St & Ave B,0 days 00:15:08.164000
3,2024-01-27 11:27:01.759,2024-01-27 11:38:01.213,W 54 St & 6 Ave,0 days 00:10:59.454000
4,2024-01-16 15:15:41.000,2024-01-16 15:29:26.156,Madison Ave & E 99 St,0 days 00:13:45.156000


(46269092, 4)

Inspecting rides DataFrame before transformation:
          pickup_datetime                ended_at  \
0 2024-01-22 18:43:19.012 2024-01-22 18:48:10.708   
1 2024-01-11 19:19:18.721 2024-01-11 19:47:36.007   
2 2024-01-30 19:17:41.693 2024-01-30 19:32:49.857   
3 2024-01-27 11:27:01.759 2024-01-27 11:38:01.213   
4 2024-01-16 15:15:41.000 2024-01-16 15:29:26.156   

                   start_station_name               duration  
0  Frederick Douglass Blvd & W 145 St 0 days 00:04:51.696000  
1                     W 54 St & 6 Ave 0 days 00:28:17.286000  
2                     E 11 St & Ave B 0 days 00:15:08.164000  
3                     W 54 St & 6 Ave 0 days 00:10:59.454000  
4               Madison Ave & E 99 St 0 days 00:13:45.156000  
Number of rows in rides: 46269092
pickup_datetime range: 2024-01-01 00:00:03.208000 to 2025-01-31 23:58:14.634000
Unique start_station_name values: 2281
pickup_datetime dtype: datetime64[ns]
start_station_name dtype: object
Sample start_station_name val

Unnamed: 0,pickup_hour,start_station_name,rides
0,2024-01-01 00:00:00,8 Ave & W 31 St,4
1,2024-01-01 01:00:00,8 Ave & W 31 St,9
2,2024-01-01 02:00:00,8 Ave & W 31 St,2
3,2024-01-01 03:00:00,8 Ave & W 31 St,0
4,2024-01-01 04:00:00,8 Ave & W 31 St,2


(28584, 3)

Rows after creating features and targets: 26568


Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,start_station_name
0,4,9,2,0,2,0,0,0,1,2,...,7,4,6,8,10,7,2,0,2024-01-29 00:00:00,8 Ave & W 31 St
1,9,2,0,2,0,0,0,1,2,3,...,4,6,8,10,7,2,0,4,2024-01-29 01:00:00,8 Ave & W 31 St
2,2,0,2,0,0,0,1,2,3,3,...,6,8,10,7,2,0,4,0,2024-01-29 02:00:00,8 Ave & W 31 St
3,0,2,0,0,0,1,2,3,3,8,...,8,10,7,2,0,4,0,1,2024-01-29 03:00:00,8 Ave & W 31 St
4,2,0,0,0,1,2,3,3,8,5,...,10,7,2,0,4,0,1,1,2024-01-29 04:00:00,8 Ave & W 31 St


(26568, 674)

Merging the features and target (along with shape check)

In [3]:
# Attach the target column to a copy of the feature frame; `features` itself
# is left untouched. The three shapes are displayed to confirm that the merged
# frame has the same rows as the inputs plus one extra column.
tabular_data = features.assign(target=targets)
features.shape
targets.shape
tabular_data.shape

(26568, 674)

(26568,)

(26568, 675)

Saving the merged tabular DataFrame to the transformed-data directory in Parquet format

In [4]:
# Persist the merged feature/target table as Parquet.
# The destination is built once so the file that is written and the path that
# is reported can never drift apart.
output_path = Path(TRANSFORMED_DATA_DIR) / "tabular_data.parquet"
# to_parquet does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)

tabular_data.to_parquet(output_path, engine="pyarrow")
print("Tabular data saved to:", output_path)

# Inspect the final tabular data to confirm additional columns
tabular_data.head(5)

Tabular data saved to: C:\Users\singh\Downloads\CDS500_Applied_ML_DS\Projects\CDA500Final\data\transformed\tabular_data.parquet


Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,start_station_name,target
0,4,9,3,0,2,0,0,0,1,2,...,4,6,8,10,7,2,0,2024-01-29 00:00:00,8 Ave & W 31 St,4
1,9,3,0,2,0,0,0,1,2,3,...,6,8,10,7,2,0,4,2024-01-29 01:00:00,8 Ave & W 31 St,0
2,3,0,2,0,0,0,1,2,3,3,...,8,10,7,2,0,4,0,2024-01-29 02:00:00,8 Ave & W 31 St,1
3,0,2,0,0,0,1,2,3,3,8,...,10,7,2,0,4,0,1,2024-01-29 03:00:00,8 Ave & W 31 St,1
4,2,0,0,0,1,2,3,3,8,5,...,7,2,0,4,0,1,1,2024-01-29 04:00:00,8 Ave & W 31 St,1
