In [8]:
import pandas as pd
from pathlib import Path

In [9]:
# === 1. Read the processed top_3_stations parquet file ===
# The location is relative to the notebook's working directory.
path = Path("..").joinpath("data", "processed", "top_3_stations.parquet")
df = pd.read_parquet(path)

In [10]:
# === 2. Parse start timestamps and derive the hour bucket ===
# Flooring to "h" discards everything below the hour, so each row is tagged
# with the start of the hour it falls in.
started_ts = pd.to_datetime(df["started_at"])
df["started_at"] = started_ts
df["start_hour"] = started_ts.dt.floor("h")

In [11]:
# === 3. Count rows per (hour, station) group ===
# The result has one row per group, with the group size stored in `rides`.
group_keys = ["start_hour", "start_station_id"]
agg = df.groupby(group_keys).size().rename("rides").reset_index()

In [14]:
# === 4. Fill missing hours and stations ===
def fill_missing_rides_full_range(df, hour_col, location_col, rides_col):
    """Expand hourly counts to a complete hour x location grid, filling gaps with 0.

    The grid spans every hour from the earliest to the latest value of
    ``hour_col`` (inclusive, hourly frequency) crossed with every distinct
    value of ``location_col``. Rows of ``df`` are left-joined onto that grid,
    so combinations that were not observed get ``rides_col == 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``hour_col``, ``location_col`` and
        ``rides_col``. It is not modified.
    hour_col : label
        Column holding the hour timestamps (anything ``pd.to_datetime``
        accepts).
    location_col : label
        Column identifying the location.
    rides_col : label
        Column holding the counts to fill.

    Returns
    -------
    pandas.DataFrame
        Columns ``hour_col``, ``location_col``, then the remaining columns of
        ``df``; ordered by hour, then by location in order of first
        appearance. ``rides_col`` is cast to ``int``. Any other columns are
        NaN on the rows that were filled in.

    Notes
    -----
    * Rows whose timestamp is NaT, or does not fall on the hourly grid
      anchored at the earliest timestamp, have no grid cell to match and are
      dropped by the left join.
    * If ``df`` has no usable timestamp (empty frame, or every value NaT) an
      empty frame with the output column layout is returned instead of
      failing inside ``pd.date_range``.
    """
    # Copy first: parsing the hour column must not leak into the caller's frame.
    observed = df.copy()
    observed[hour_col] = pd.to_datetime(observed[hour_col])

    key_cols = [hour_col, location_col]
    first_hour = observed[hour_col].min()
    last_hour = observed[hour_col].max()

    if pd.isna(first_hour):
        # No valid timestamp means there is no range to expand.
        other_cols = [col for col in observed.columns if col not in key_cols]
        empty = observed.iloc[0:0][key_cols + other_cols].reset_index(drop=True).copy()
        empty[rides_col] = empty[rides_col].astype(int)
        return empty

    full_hours = pd.date_range(first_hour, last_hour, freq="h")
    all_locations = observed[location_col].unique()

    # Cartesian product built by pandas; the hour varies slowest, which gives
    # the same row order as a nested hour/location loop.
    full_combinations = pd.MultiIndex.from_product(
        [full_hours, all_locations], names=key_cols
    ).to_frame(index=False)

    merged = full_combinations.merge(observed, on=key_cols, how="left")
    merged[rides_col] = merged[rides_col].fillna(0).astype(int)
    return merged

# Guard: stop before expanding the grid if there are no aggregated rows or
# if any hour bucket is missing.
no_rows = agg.empty
if no_rows or agg["start_hour"].isna().any():
    raise ValueError("❌ Aggregated data is empty or has missing timestamps!")

# Arguments are passed positionally: frame, hour column, location column, count column.
agg_filled = fill_missing_rides_full_range(
    agg, "start_hour", "start_station_id", "rides"
)

In [15]:
# === 5. Derive calendar features from the hour bucket ===
# Grab the datetime accessor once and read each component from it.
hour_parts = agg_filled["start_hour"].dt
agg_filled["hour"] = hour_parts.hour
agg_filled["day_of_week"] = hour_parts.weekday  # Monday is 0, Sunday is 6
agg_filled["is_weekend"] = agg_filled["day_of_week"].ge(5)  # 5 and 6 are Sat/Sun
agg_filled["month"] = hour_parts.month
agg_filled["date"] = hour_parts.date


In [16]:
# === 6. Write the filled hourly series to parquet ===
transformed_path = Path("..").joinpath("data", "transformed")
# Create the target directory (and any missing parents); no error if it exists.
transformed_path.mkdir(exist_ok=True, parents=True)

output_path = transformed_path.joinpath("top_3_hourly_timeseries.parquet")
# index=False keeps the row index out of the written file.
agg_filled.to_parquet(path=output_path, index=False, engine="pyarrow")

In [17]:
# Report where the parquet file was written.
saved_message = f"✅ Transformed time series saved to: {output_path}"
print(saved_message)

✅ Transformed time series saved to: ..\data\transformed\top_3_hourly_timeseries.parquet


In [19]:
# Read back the file that was just written. Reusing `output_path` (set in the
# save cell) instead of re-typing the location guarantees this check inspects
# the same file even if the output path is changed later.
df_final = pd.read_parquet(output_path)

# Round-trip sanity check: the saved file has the same rows and columns as the
# frame that was written.
assert df_final.shape == agg_filled.shape, "round-trip changed the frame's shape"

df_final.head(10)


Unnamed: 0,start_hour,start_station_id,rides,hour,day_of_week,is_weekend,month,date
0,2023-05-01 03:00:00,5905,1,3,0,False,5,2023-05-01
1,2023-05-01 03:00:00,6822,1,3,0,False,5,2023-05-01
2,2023-05-01 03:00:00,6140,0,3,0,False,5,2023-05-01
3,2023-05-01 04:00:00,5905,0,4,0,False,5,2023-05-01
4,2023-05-01 04:00:00,6822,0,4,0,False,5,2023-05-01
5,2023-05-01 04:00:00,6140,0,4,0,False,5,2023-05-01
6,2023-05-01 05:00:00,5905,1,5,0,False,5,2023-05-01
7,2023-05-01 05:00:00,6822,0,5,0,False,5,2023-05-01
8,2023-05-01 05:00:00,6140,0,5,0,False,5,2023-05-01
9,2023-05-01 06:00:00,5905,2,6,0,False,5,2023-05-01
