## Steps
1. Download all raw data for a given year and month
1. For each raw data file, apply the filter before saving it
1. Transform the saved raw data into time-series (TS) data
1. Convert the TS data into features and targets
1. Save the transformed data


The main objective is to write utility functions for all of these steps so we can reuse them later.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the parent directory of the current working directory to the Python path
# so that the `src` package imported below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    sys.path.append(project_root)

# Now you can import the updated Citi Bike data function
from src.data_utils import load_and_process_citibike_data


In [3]:
# Load the 2024 Citi Bike rides. The log printed below shows one
# download + load pass per month, with a count of dropped records for each.
rides = load_and_process_citibike_data(year=2024)

Downloading Citi Bike data for 2024-01...
Successfully downloaded data for 2024-01.
Loading Citi Bike data for 2024-01...
Total records: 50,661
Valid records: 50,589
Records dropped: 72 (0.14%)
Successfully processed data for 2024-01.
Downloading Citi Bike data for 2024-02...
Successfully downloaded data for 2024-02.
Loading Citi Bike data for 2024-02...
Total records: 55,613
Valid records: 55,532
Records dropped: 81 (0.15%)
Successfully processed data for 2024-02.
Downloading Citi Bike data for 2024-03...
Successfully downloaded data for 2024-03.
Loading Citi Bike data for 2024-03...
Total records: 65,581
Valid records: 65,383
Records dropped: 198 (0.30%)
Successfully processed data for 2024-03.
Downloading Citi Bike data for 2024-04...
Successfully downloaded data for 2024-04.
Loading Citi Bike data for 2024-04...
Total records: 79,116
Valid records: 78,948
Records dropped: 168 (0.21%)
Successfully processed data for 2024-04.
Downloading Citi Bike data for 2024-05...
Successfully dow

In [4]:
# Inspect the loaded rides (columns shown: pickup_datetime, pickup_location_id).
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2024-01-15 15:18:07.000,JC072
1,2024-01-13 15:32:50.000,JC110
2,2024-01-19 13:11:00.000,JC072
3,2024-01-23 07:03:49.000,JC072
4,2024-01-01 16:46:10.000,JC072
...,...,...
1050473,2024-12-28 09:45:30.704,JC013
1050474,2024-12-12 16:21:50.427,JC013
1050475,2024-12-11 19:23:24.109,JC115
1050476,2024-12-12 20:48:40.471,JC115


In [5]:
from src.data_utils import transform_raw_data_into_ts_data

# Turn the raw rides into time-series data. The displayed frame has the
# columns pickup_hour, pickup_location_id and rides.
ts_data = transform_raw_data_into_ts_data(rides)
ts_data

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-01-01 00:00:00,4074.14,0
1,2024-01-01 01:00:00,4074.14,0
2,2024-01-01 02:00:00,4074.14,0
3,2024-01-01 03:00:00,4074.14,0
4,2024-01-01 04:00:00,4074.14,0
...,...,...,...
1800715,2024-12-31 19:00:00,JC116,0
1800716,2024-12-31 20:00:00,JC116,0
1800717,2024-12-31 21:00:00,JC116,0
1800718,2024-12-31 22:00:00,JC116,1


In [6]:
# Sanity-check the size of the time-series frame as (rows, columns).
ts_data.shape

(1800720, 3)

In [9]:
from src.data_utils import transform_ts_data_into_features_and_target_loop

# Build (features, targets) from windows covering one week of hourly values,
# sliding forward one step at a time.
one_week_in_hours = 24 * 7
features, targets = transform_ts_data_into_features_and_target_loop(
    ts_data,
    window_size=one_week_in_hours,
    step_size=1,
)


In [10]:
# Inspect the feature table: lag columns rides_t-168 .. rides_t-1 followed by
# pickup_hour and pickup_location_id.
features

Unnamed: 0,rides_t-168,rides_t-167,rides_t-166,rides_t-165,rides_t-164,rides_t-163,rides_t-162,rides_t-161,rides_t-160,rides_t-159,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-08 00:00:00,4074.14
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-08 01:00:00,4074.14
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-08 02:00:00,4074.14
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-08 03:00:00,4074.14
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-08 04:00:00,4074.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1766275,0,0,0,2,1,0,0,0,1,0,...,3,2,3,1,1,3,2,4,2024-12-31 19:00:00,JC116
1766276,0,0,2,1,0,0,0,1,0,0,...,2,3,1,1,3,2,4,0,2024-12-31 20:00:00,JC116
1766277,0,2,1,0,0,0,1,0,0,0,...,3,1,1,3,2,4,0,0,2024-12-31 21:00:00,JC116
1766278,2,1,0,0,0,1,0,0,0,0,...,1,1,3,2,4,0,0,0,2024-12-31 22:00:00,JC116


In [10]:
import numpy as np

# Toy sizes used to illustrate how sliding-window indices are built.
window_size, step_size, num_windows = 4, 1, 4

# Start offset of every window as a column vector: [[0], [1], [2], [3]].
window_starts = step_size * np.arange(num_windows)[:, None]

# Broadcasting the (1, window_size) row of within-window positions against the
# (num_windows, 1) column of starts yields one row of indices per window.
within_window = np.arange(window_size)[None, :]
indices = within_window + window_starts


In [11]:
# Store the location id as text, then confirm the dtypes of the trailing columns.
location_id_as_text = features["pickup_location_id"].astype(str)
features["pickup_location_id"] = location_id_as_text
features.dtypes.tail()  # last five columns only


rides_t-3                      int64
rides_t-2                      int64
rides_t-1                      int64
pickup_hour           datetime64[ns]
pickup_location_id            object
dtype: object

In [12]:
# Toy sizes for the step-by-step broadcasting walkthrough in the next cells.
window_size, step_size, num_windows = 4, 1, 4

In [13]:
import numpy as np
# One integer per window: 0 .. num_windows - 1.
np.arange(num_windows)

array([0, 1, 2, 3])

In [14]:
# `[:, None]` appends an axis, turning the 1-D array into a (num_windows, 1) column.
np.arange(num_windows)[:, None]

array([[0],
       [1],
       [2],
       [3]])

In [15]:
# Multiply by the stride to get the start offset of each window (still a column).
step_size * np.arange(num_windows)[:, None]

array([[0],
       [1],
       [2],
       [3]])

In [16]:
# Broadcast a (1, window_size) row against the (num_windows, 1) column of
# start offsets; the result has one row of positions per window.
indices = np.arange(window_size)[None, :] + step_size * np.arange(num_windows)[:, None]


In [17]:
# `[None, :]` prepends an axis, giving a (1, window_size) row of within-window positions.
np.arange(window_size)[None, :]

array([[0, 1, 2, 3]])

In [18]:
# Same expression that was assigned to `indices`, evaluated bare so the
# broadcast result is displayed.
np.arange(window_size)[None, :] + step_size * np.arange(num_windows)[:, None]

array([[0, 1, 2, 3],
       [1, 2, 3, 4],
       [2, 3, 4, 5],
       [3, 4, 5, 6]])

In [19]:
# Small demo of horizontal stacking: arrays that share a row count are joined
# side by side, column-wise.
array1 = np.array([[1, 2], [3, 4]])  # 2 rows x 2 columns
array2 = np.array([[5], [6]])        # 2 rows x 1 column
array3 = np.array([[7], [8]])        # 2 rows x 1 column

# Result is 2 rows x 4 columns.
np.hstack((array1, array2, array3))

array([[1, 2, 5, 7],
       [3, 4, 6, 8]])

In [21]:
from src.data_utils import transform_ts_data_into_features_and_target_loop

# Rebuild the windows with a longer history and a coarser stride.
windowing_options = {
    "feature_col": "rides",       # named explicitly rather than left implicit
    "window_size": 24 * 28 * 1,   # 4 weeks of hourly values (672 hours)
    "step_size": 24,              # advance one day between consecutive windows
}
features, targets = transform_ts_data_into_features_and_target_loop(
    ts_data, **windowing_options
)


In [22]:
# Combine features and targets into one table and persist it as parquet.
# Work on a copy: a plain `tabular_data = features` only aliases the frame, so
# adding the "target" column would also add it to `features` and leak the label
# into anything that later uses `features` as model input.
tabular_data = features.copy()
tabular_data["target"] = targets

from src.config import TRANSFORMED_DATA_DIR
tabular_data.to_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet", engine="pyarrow")

In [25]:
# List the distinct pickup location ids present in the time-series data.
ts_data["pickup_location_id"].unique()


array(['4074.14', '4298.05', '4461.07', '4762.05', '4977.03', '4993.02',
       '4993.15', '5024.10', '5033.01', '5105.01', '5105.09', '5184.08',
       '5187.03', '5210.01', '5219.05', '5235.05', '5257.01', '5282.02',
       '5297.02', '5308.04', '5340.01', '5351.03', '5359.11', '5368.03',
       '5369.01', '5374.01', '5414.06', '5422.09', '5430.08', '5431.01',
       '5445.02', '5470.10', '5470.12', '5570.04', '5616.01', '5626.15',
       '5712.12', '5730.08', '5746.02', '5746.14', '5772.05', '5785.05',
       '5805.05', '5847.01', '5869.04', '5905.12', '5905.14', '5914.08',
       '6030.06', '6098.10', '6098.12', '6101.11', '6115.06', '6131.12',
       '6140.05', '6157.04', '6215.07', '6224.05', '6233.05', '6239.08',
       '6289.06', '6313.10', '6322.01', '6331.01', '6425.04', '6432.11',
       '6459.04', '6492.08', '6551.02', '6551.11', '6560.01', '6566.01',
       '6569.07', '6575.03', '6599.01', '6602.03', '6659.03', '6667.04',
       '6676.02', '6726.01', '6740.01', '6747.06', 

In [28]:
# Total rides per pickup location, busiest first.
rides_per_location = ts_data.groupby("pickup_location_id")["rides"].sum()
rides_per_location.sort_values(ascending=False)


pickup_location_id
HB102      54972
JC115      47083
HB101      26198
HB105      25968
JC066      24423
           ...  
6740.01        1
6747.06        1
6753.08        1
6762.02        1
7253.04        1
Name: rides, Length: 205, dtype: int32

In [29]:
# Window only the HB102 series (the location with the most rides in the
# totals above), using 12-hour windows and a one-hour stride.
is_hb102 = ts_data["pickup_location_id"] == "HB102"
features, targets = transform_ts_data_into_features_and_target_loop(
    ts_data[is_hb102], window_size=12, step_size=1
)


In [30]:
# Inspect the HB102 feature table: lag columns rides_t-12 .. rides_t-1 plus
# pickup_hour and pickup_location_id.
features

Unnamed: 0,rides_t-12,rides_t-11,rides_t-10,rides_t-9,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,5,5,22,2,2,0,0,0,0,1,1,5,2024-01-01 12:00:00,HB102
1,5,22,2,2,0,0,0,0,1,1,5,5,2024-01-01 13:00:00,HB102
2,22,2,2,0,0,0,0,1,1,5,5,5,2024-01-01 14:00:00,HB102
3,2,2,0,0,0,0,1,1,5,5,5,4,2024-01-01 15:00:00,HB102
4,2,0,0,0,0,1,1,5,5,5,4,2,2024-01-01 16:00:00,HB102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8767,0,0,0,3,3,7,8,8,13,10,15,10,2024-12-31 19:00:00,HB102
8768,0,0,3,3,7,8,8,13,10,15,10,8,2024-12-31 20:00:00,HB102
8769,0,3,3,7,8,8,13,10,15,10,8,2,2024-12-31 21:00:00,HB102
8770,3,3,7,8,8,13,10,15,10,8,2,1,2024-12-31 22:00:00,HB102


In [31]:
# First 36 hourly rows for HB102, for comparing against the windowed features by eye.
hb102_rows = ts_data.loc[ts_data["pickup_location_id"] == "HB102"]
hb102_rows.head(36)

Unnamed: 0,pickup_hour,pickup_location_id,rides
1080432,2024-01-01 00:00:00,HB102,5
1080433,2024-01-01 01:00:00,HB102,5
1080434,2024-01-01 02:00:00,HB102,22
1080435,2024-01-01 03:00:00,HB102,2
1080436,2024-01-01 04:00:00,HB102,2
1080437,2024-01-01 05:00:00,HB102,0
1080438,2024-01-01 06:00:00,HB102,0
1080439,2024-01-01 07:00:00,HB102,0
1080440,2024-01-01 08:00:00,HB102,0
1080441,2024-01-01 09:00:00,HB102,1


In [32]:
from src.data_utils import transform_ts_data_into_features_and_target_loop

# Recompute the HB102 windows (12-hour window, one-hour stride).
# NOTE(review): same call and arguments as cell In [29]; presumably re-run
# after editing the helper under autoreload -- confirm whether both are needed.
features, targets = transform_ts_data_into_features_and_target_loop(
    ts_data[ts_data["pickup_location_id"] == "HB102"], 
    window_size=12, 
    step_size=1
)


In [33]:
# Inspect the recomputed HB102 feature table.
features

Unnamed: 0,rides_t-12,rides_t-11,rides_t-10,rides_t-9,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,5,5,22,2,2,0,0,0,0,1,1,5,2024-01-01 12:00:00,HB102
1,5,22,2,2,0,0,0,0,1,1,5,5,2024-01-01 13:00:00,HB102
2,22,2,2,0,0,0,0,1,1,5,5,5,2024-01-01 14:00:00,HB102
3,2,2,0,0,0,0,1,1,5,5,5,4,2024-01-01 15:00:00,HB102
4,2,0,0,0,0,1,1,5,5,5,4,2,2024-01-01 16:00:00,HB102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8767,0,0,0,3,3,7,8,8,13,10,15,10,2024-12-31 19:00:00,HB102
8768,0,0,3,3,7,8,8,13,10,15,10,8,2024-12-31 20:00:00,HB102
8769,0,3,3,7,8,8,13,10,15,10,8,2,2024-12-31 21:00:00,HB102
8770,3,3,7,8,8,13,10,15,10,8,2,1,2024-12-31 22:00:00,HB102


In [34]:
# Attach the targets to a fresh copy of the HB102 features (`assign` returns a
# new frame, leaving `features` untouched) and persist the result as parquet.
tabular_data = features.assign(target=targets)

from src.config import TRANSFORMED_DATA_DIR
output_file = TRANSFORMED_DATA_DIR / "hb102_tabular_data.parquet"
tabular_data.to_parquet(output_file, engine="pyarrow")


## Add-on 2: Include a name column for the top 10 pickup locations


In [28]:

import pandas as pd
from pathlib import Path

# Step 1 - Create a Copy
# Keep `ts_data` untouched; all add-on columns go on the copy.
ts_data_copy = ts_data.copy()

# Step 2 - Load and Merge Taxi Zone Lookup
# Resolve the file under the current user's Downloads folder instead of a
# hard-coded absolute path tied to one machine and one user account.
lookup_path = Path.home() / "Downloads" / "taxi_zone_lookup.csv"
taxi_zone_lookup = pd.read_csv(lookup_path)

# NOTE(review): `pickup_location_id` in the Citi Bike `ts_data` shown earlier holds
# strings such as 'JC072', while `LocationID` prints as integers -- confirm the key
# dtypes/values actually match before relying on this join.
# A left join keeps every time-series row; unmatched ids get a missing name.
ts_data_copy = (
    ts_data_copy
    .merge(
        taxi_zone_lookup[['LocationID', 'Zone']],
        left_on='pickup_location_id',
        right_on='LocationID',
        how='left',
    )
    .rename(columns={'Zone': 'pickup_location_name'})
    .drop(columns=['LocationID'])  # redundant with pickup_location_id after the join
)



In [29]:
# Number of distinct pickup location ids after the merge.
print(ts_data_copy['pickup_location_id'].nunique())


260


In [30]:
# Number of distinct location names, then rows per name. A name with more rows
# than the others is shared by more than one location id.
print(ts_data_copy['pickup_location_name'].nunique())
print(ts_data_copy['pickup_location_name'].value_counts())


259
pickup_location_name
Corona                       17520
Jamaica Bay                   8760
Pelham Bay                    8760
Murray Hill                   8760
Murray Hill-Queens            8760
                             ...  
Forest Hills                  8760
Forest Park/Highland Park     8760
Fort Greene                   8760
Fresh Meadows                 8760
Yorkville West                8760
Name: count, Length: 259, dtype: int64


In [31]:
# Peek at the lookup table and count its distinct zone names.
print(taxi_zone_lookup.head())
print(taxi_zone_lookup['Zone'].nunique())


   LocationID        Borough                     Zone service_zone
0           1            EWR           Newark Airport          EWR
1           2         Queens              Jamaica Bay    Boro Zone
2           3          Bronx  Allerton/Pelham Gardens    Boro Zone
3           4      Manhattan            Alphabet City  Yellow Zone
4           5  Staten Island            Arden Heights    Boro Zone
261


In [32]:
# Step 3 - Top 10 Pickup Locations
top_10_locations = ts_data_copy['pickup_location_name'].value_counts().head(10).index
print("Top 10 Locations:")
print(top_10_locations)

# Assign top 10 or 'Other'
ts_data_copy['top_10_location'] = ts_data_copy['pickup_location_name'].apply(lambda x: x if x in top_10_locations else 'Other')

# Check if it worked
print(ts_data_copy[['pickup_location_name', 'top_10_location']].drop_duplicates())

ts_data_copy['top_10_location']
# Group by location_id and location_name to get ride counts
location_counts = ts_data_copy.groupby(['pickup_location_id', 'pickup_location_name'])['rides'].sum().reset_index()

# Sort by total rides and get top 10
top_10_locations_df = location_counts.sort_values(by='rides', ascending=False).head(10)

# Rename for clarity
top_10_locations_df = top_10_locations_df.rename(columns={'rides': 'total_rides'})



Top 10 Locations:
Index(['Corona', 'Jamaica Bay', 'Pelham Bay', 'Murray Hill',
       'Murray Hill-Queens', 'New Dorp/Midland Beach', 'North Corona',
       'Norwood', 'Oakland Gardens', 'Oakwood'],
      dtype='object', name='pickup_location_name')
            pickup_location_name top_10_location
0                    Jamaica Bay     Jamaica Bay
8760     Allerton/Pelham Gardens           Other
17520              Alphabet City           Other
26280              Arden Heights           Other
35040    Arrochar/Fort Wadsworth           Other
...                          ...             ...
2233800       Woodlawn/Wakefield           Other
2242560                 Woodside           Other
2251320       World Trade Center           Other
2260080           Yorkville East           Other
2268840           Yorkville West           Other

[259 rows x 2 columns]


In [33]:
# Final Output
print(top_10_locations_df)

     pickup_location_id          pickup_location_name  total_rides
128                 132                   JFK Airport      1929204
233                 237         Upper East Side South      1775461
157                 161                Midtown Center      1747486
232                 236         Upper East Side North      1584408
158                 162                  Midtown East      1340317
182                 186  Penn Station/Madison Sq West      1291296
134                 138             LaGuardia Airport      1285498
226                 230     Times Sq/Theatre District      1252681
138                 142           Lincoln Square East      1244948
166                 170                   Murray Hill      1120312
