In [1]:
import pandas as pd
from pathlib import Path

In [3]:
import pandas as pd
from pathlib import Path

# Integer prefixes of the three start stations to keep.
# NOTE(review): station IDs are truncated to integers before matching, so
# every raw ID sharing a prefix is kept under the same integer. The sample
# output lists two station names under 5905 -- either distinct stations with
# the same prefix or a renamed station. Confirm that merging them is intended.
TOP_3_IDS = [6140, 6822, 5905]

# Path to raw parquet files
raw_dir = Path("..") / "data" / "raw"
all_dfs = []


def clean_station_ids(rides):
    """Return a copy of ``rides`` whose ``start_station_id`` is an integer.

    Rows are dropped when the ID is missing, cannot be parsed as a number
    (e.g. 'SYS016' or a malformed value such as '1.2.3'), is negative, or is
    not finite. Remaining IDs are truncated to their integer part.

    Parameters
    ----------
    rides : pandas.DataFrame
        Frame containing a ``start_station_id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with the invalid rows removed
        and ``start_station_id`` converted with ``astype(int)``.
    """
    # errors="coerce" turns anything unparseable into NaN instead of raising,
    # so a single malformed ID cannot abort the whole run.
    station_num = pd.to_numeric(rides["start_station_id"], errors="coerce")

    # Comparisons against NaN are False, so this one mask rejects missing,
    # unparseable, negative and infinite values.
    valid = (station_num >= 0) & (station_num < float("inf"))

    # .assign() builds a new frame, avoiding a write into a slice of ``rides``
    # (the pattern that triggers SettingWithCopyWarning).
    return rides.loc[valid].assign(
        start_station_id=station_num[valid].astype(int)
    )


# Loop through all months: May-Dec 2023, then Jan-Apr 2024.
for year in [2023, 2024]:
    months = range(5, 13) if year == 2023 else range(1, 5)
    for month in months:
        path = raw_dir / f"rides_{year}_{month:02}.parquet"
        print(f"📂 Reading {path.name} ...")

        # `df` keeps the cleaned month (integer station IDs); it stays bound
        # to the last month read once the loop finishes.
        df = clean_station_ids(pd.read_parquet(path))

        # Filter to only the top 3 stations
        df_filtered = df[df["start_station_id"].isin(TOP_3_IDS)]
        all_dfs.append(df_filtered)

# Combine all months
df_top3 = pd.concat(all_dfs, ignore_index=True)
print(f"\n✅ Filtered data shape: {df_top3.shape}")
print("📊 Sample data:")
print(df_top3[["start_station_id", "start_station_name"]].drop_duplicates())

# Save filtered data
processed_dir = Path("..") / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

output_path = processed_dir / "top_3_stations.parquet"
df_top3.to_parquet(output_path, engine="pyarrow", index=False)

print(f"💾 Saved filtered data to: {output_path}")


📂 Reading rides_2023_05.parquet ...
📂 Reading rides_2023_06.parquet ...
📂 Reading rides_2023_07.parquet ...
📂 Reading rides_2023_08.parquet ...
📂 Reading rides_2023_09.parquet ...
📂 Reading rides_2023_10.parquet ...
📂 Reading rides_2023_11.parquet ...
📂 Reading rides_2023_12.parquet ...
📂 Reading rides_2024_01.parquet ...
📂 Reading rides_2024_02.parquet ...
📂 Reading rides_2024_03.parquet ...
📂 Reading rides_2024_04.parquet ...

✅ Filtered data shape: (230278, 13)
📊 Sample data:
      start_station_id       start_station_name
0                 5905  University Pl & E 14 St
7                 6822          1 Ave & E 68 St
1656              6140          W 21 St & 6 Ave
4098              5905       Broadway & E 14 St
💾 Saved filtered data to: ..\data\processed\top_3_stations.parquet


In [4]:
# Sanity check on a single month.
# NOTE: `df`, `path` and `TOP_3_IDS` are not defined in this cell; they must
# already exist in the kernel. The captured output reports
# rides_2024_04.parquet, i.e. the last file read by the loading loop.
is_top_station = df["start_station_id"].isin(TOP_3_IDS)
df_filtered = df.loc[is_top_station]

if not df_filtered.empty:
    print(f"✅ Filtered {len(df_filtered)} rows from {path.name}")
else:
    print(f"⚠️  No matching rows found in {path.name} for top 3 stations.")


✅ Filtered 44405 rows from rides_2024_04.parquet
