In [1]:
"""
Pre-aggregate CitiBike data for efficient dashboard loading.
"""

import pandas as pd
import numpy as np
from pathlib import Path

# Paths
DATA_DIR = Path(r"C:\Users\magia\OneDrive\Desktop\NY_Citi_Bike\2.Data\Prepared Data") # Streamlit-Access: Check Google Drive Link for file
INPUT_FILE = DATA_DIR / "nyc_2022_essential_data.csv"
OUTPUT_DIR = DATA_DIR  # aggregates are written next to the input file
OUTPUT_DIR.mkdir(exist_ok=True)

# Report the input size before the (slow) load. INPUT_FILE must be defined
# first; stat() raises FileNotFoundError if the CSV is not where expected.
file_size = INPUT_FILE.stat().st_size  # bytes
print(f"File size: {file_size / 1024**3:.2f} GB")

print("Loading main dataset...")
df = pd.read_csv(INPUT_FILE, low_memory=False)

# Convert both timestamp columns to datetimes. Values that cannot be parsed
# are coerced to NaT, and any row missing either timestamp is then discarded.
for ts_col in ('started_at', 'ended_at'):
    df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce')
df = df.dropna(subset=['started_at', 'ended_at'])

print(f"Loaded {len(df):,} trips")

# ==================== Chart 1: Trip Duration Sample ====================
print("\n1. Creating trip duration sample...")
# Trip length in minutes, derived from the parsed start/end timestamps.
df['tripduration_min'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60

# Clean and filter: keep trips lasting between 1 and 75 minutes (inclusive).
# A duration >= 1 already guarantees ended_at >= started_at, and any looser
# upper bound (e.g. <= 120) would be redundant next to the 75-minute cap, so a
# single range check expresses the whole filter.
MIN_TRIP_MINUTES = 1
MAX_TRIP_MINUTES = 75
df_duration = df[df['tripduration_min'].between(MIN_TRIP_MINUTES, MAX_TRIP_MINUTES)].copy()

# Sample 100k trips for histogram (more than enough for visualization).
# min() guards against datasets smaller than the sample size; the fixed
# random_state keeps the sample reproducible between runs.
DURATION_SAMPLE_SIZE = 100000
duration_sample = df_duration['tripduration_min'].sample(
    n=min(DURATION_SAMPLE_SIZE, len(df_duration)),
    random_state=42,
)
duration_sample.to_csv(OUTPUT_DIR / "trip_durations.csv", index=False, header=['tripduration_min'])
print(f"   Saved {len(duration_sample):,} duration samples")

# ==================== Chart 2: Weekday vs Weekend Patterns ====================
print("\n2. Creating weekday/weekend hourly patterns...")
start_times = df['started_at']
df['hour'] = start_times.dt.hour
df['date'] = start_times.dt.date

# Label each trip "Weekend" or "Weekday" from the name of its start day.
weekend_days = {"Saturday", "Sunday"}
is_weekend = start_times.dt.day_name().isin(weekend_days)
df['period'] = is_weekend.map({True: "Weekend", False: "Weekday"})

# Step 1: count trips for every (period, date, hour) combination present.
trips_per_date_hour = df.groupby(["period", "date", "hour"]).size()
# Step 2: average those counts over dates, giving mean trips per day for each
# (period, hour). NOTE: a date/hour with no trips produces no row in step 1,
# so it does not pull the average down.
mean_trips_by_hour = trips_per_date_hour.groupby(["period", "hour"]).mean()
overlay = mean_trips_by_hour.reset_index(name="trips_per_day")

overlay.to_csv(OUTPUT_DIR / "hourly_patterns.csv", index=False)
print(f"   Saved {len(overlay)} hourly pattern records")

# ==================== Chart 3: Day of Week Totals ====================
print("\n3. Creating day of week totals...")
# Name the grouping key up front so the output column is already 'day_name'.
# Rows come out in groupby's default sorted (alphabetical) order of day names.
day_names = df['started_at'].dt.day_name().rename('day_name')
trips_by_day = df.groupby(day_names).size()
dow_counts = trips_by_day.reset_index(name='trips')
dow_counts.to_csv(OUTPUT_DIR / "day_of_week_totals.csv", index=False)
print(f"   Saved {len(dow_counts)} day-of-week records")

# ==================== Chart 4: Daily Trips and Temperature ====================
print("\n4. Creating daily aggregates (trips + temperature)...")
# One output row per calendar date: the number of trips that started that day
# and the mean of the avgTemp values attached to those trips.
# Relies on the 'date' column derived from started_at earlier in the script.
daily_specs = {
    'bike_rides_daily': pd.NamedAgg(column='date', aggfunc='size'),
    'avgTemp': pd.NamedAgg(column='avgTemp', aggfunc='mean'),
}
daily_groups = df.groupby('date')
df_daily = daily_groups.agg(**daily_specs)
df_daily = df_daily.reset_index()

df_daily.to_csv(OUTPUT_DIR / "daily_aggregates.csv", index=False)
print(f"   Saved {len(df_daily)} daily records")

# Final summary: where the files went and what each one contains.
# The record counts in these descriptions are fixed text, not computed values.
summary_lines = (
    f"\nAggregated files saved to: {OUTPUT_DIR}",
    "\nFiles created:",
    "  - trip_durations.csv (100k samples)",
    "  - hourly_patterns.csv (weekday/weekend by hour)",
    "  - day_of_week_totals.csv (7 records)",
    "  - daily_aggregates.csv (365 records)",
)
for summary_line in summary_lines:
    print(summary_line)

Loading main dataset...
Loaded 29,838,166 trips

1. Creating trip duration sample...
   Saved 100,000 duration samples

2. Creating weekday/weekend hourly patterns...
   Saved 48 hourly pattern records

3. Creating day of week totals...
   Saved 7 day-of-week records

4. Creating daily aggregates (trips + temperature)...
   Saved 365 daily records

Aggregated files saved to: C:\Users\magia\OneDrive\Desktop\NY_Citi_Bike\2.Data\Prepared Data

Files created:
  - trip_durations.csv (100k samples)
  - hourly_patterns.csv (weekday/weekend by hour)
  - day_of_week_totals.csv (7 records)
  - daily_aggregates.csv (365 records)
