Import Statements

In [1]:
# Import required libraries
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from typing import Optional, List
import plotly.express as px

Loading and aggregation

In [2]:
# Reporting period whose processed rides file is read below
month = 1
year = 2024
path = Path("..", "data", "processed", f"rides_{year}_{month:02}.parquet")

# Read the processed rides and preview them
table = pq.read_table(path)
rides = table.to_pandas()
rides.head()
rides.shape

# Floor each ride's start timestamp to the beginning of its hour
rides['pickup_hour'] = rides['started_at'].dt.floor('h')

# Count rides per (pickup_hour, start_station_name) pair
agg_rides = (
    rides
    .groupby(['pickup_hour', 'start_station_name'])
    .size()
    .reset_index(name='rides')
)
agg_rides.tail()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,member_casual,duration
0,electric_bike,2024-01-22 18:43:19.012,2024-01-22 18:48:10.708,Frederick Douglass Blvd & W 145 St,St Nicholas Ave & W 126 St,member,0 days 00:04:51.696000
1,electric_bike,2024-01-11 19:19:18.721,2024-01-11 19:47:36.007,W 54 St & 6 Ave,E 74 St & 1 Ave,member,0 days 00:28:17.286000
2,electric_bike,2024-01-30 19:17:41.693,2024-01-30 19:32:49.857,E 11 St & Ave B,W 10 St & Washington St,casual,0 days 00:15:08.164000
3,electric_bike,2024-01-27 11:27:01.759,2024-01-27 11:38:01.213,W 54 St & 6 Ave,E 74 St & 1 Ave,member,0 days 00:10:59.454000
4,electric_bike,2024-01-16 15:15:41.000,2024-01-16 15:29:26.156,Madison Ave & E 99 St,E 74 St & 1 Ave,member,0 days 00:13:45.156000


(1883159, 7)

Unnamed: 0,pickup_hour,start_station_name,rides
605851,2024-01-31 23:00:00,Wyckoff Ave & Jefferson St,1
605852,2024-01-31 23:00:00,Wyckoff Ave & Stanhope St,2
605853,2024-01-31 23:00:00,Wyckoff St & 3 Ave,2
605854,2024-01-31 23:00:00,Wythe Ave & Metropolitan Ave,1
605855,2024-01-31 23:00:00,Wythe Ave & N 13 St,3


Filling the missing station and hour combinations

In [3]:
# Total the hourly counts per station and keep the three busiest stations
station_rides = agg_rides.groupby('start_station_name', as_index=False)['rides'].sum()
top_stations = station_rides.nlargest(3, 'rides')
print("Top 3 stations by total rides:")
print(top_stations)

# Write the top stations to CSV so the Streamlit app can read them later
top_stations_path = Path("..", "data", "top_stations.csv")
top_stations.to_csv(top_stations_path, index=False)
print(f"Top stations saved to: {top_stations_path}")

Top 3 stations by total rides:
     start_station_name  rides
1944    W 21 St & 6 Ave   8320
8       1 Ave & E 68 St   6565
440     8 Ave & W 31 St   6334
Top stations saved to: ..\data\top_stations.csv


In [4]:
# Function to fill missing rides for all hours and stations
def fill_missing_rides_full_range(df, hour_col, station_col, rides_col):
    """
    Fill in zero-ride rows for every (hour, station) pair missing from `df`.

    The hour axis spans every hour from the earliest to the latest value of
    `hour_col` (inclusive); the station axis is every distinct value of
    `station_col`. Pairs already present keep their ride count, all other
    pairs get 0.

    Parameters:
    - df: DataFrame with columns [hour_col, station_col, rides_col].
      It is NOT modified; the function works on a copy.
    - hour_col: Name of the column containing hourly timestamps
      (anything `pd.to_datetime` can parse)
    - station_col: Name of the column containing station names
    - rides_col: Name of the column containing ride counts

    Returns:
    - New DataFrame whose first two columns are hour_col and station_col,
      ordered hour-major (stations in order of first appearance within each
      hour), with rides_col cast to int. An empty input yields an empty
      result instead of raising.
    """
    # Copy first so the datetime conversion never leaks into the caller's frame.
    observed = df.copy()
    observed[hour_col] = pd.to_datetime(observed[hour_col])

    if observed.empty:
        # min()/max() would be NaT and date_range would raise; there is
        # nothing to fill, so return an empty frame with the usual layout.
        key_cols = [hour_col, station_col]
        other_cols = [col for col in observed.columns if col not in key_cols]
        observed[rides_col] = observed[rides_col].astype(int)
        return observed[key_cols + other_cols]

    # Every hour between the first and last observed hour, inclusive.
    full_hours = pd.date_range(
        start=observed[hour_col].min(),
        end=observed[hour_col].max(),
        freq="h"
    )

    # Distinct stations, in order of first appearance.
    all_stations = observed[station_col].unique()

    # Vectorised Cartesian product of hours x stations (hour-major order),
    # replacing a Python-level loop that materialised one tuple per pair.
    full_combinations = pd.MultiIndex.from_product(
        [full_hours, all_stations],
        names=[hour_col, station_col]
    ).to_frame(index=False)

    # Left-join the observed counts onto the full grid; absent pairs become NaN.
    merged_df = pd.merge(
        full_combinations, observed, on=[hour_col, station_col], how='left'
    )

    # NaN means "no rides recorded" for that pair, i.e. zero rides.
    merged_df[rides_col] = merged_df[rides_col].fillna(0).astype(int)

    return merged_df
# The function above creates an entry with 0 rides for every station/hour combination that is absent from the original dataset
# (i.e. combinations for which no rides were recorded).


# Column names shared by the aggregation and the gap-filling step
hour_col = "pickup_hour"
station_col = "start_station_name"
rides_col = "rides"

# Fill the missing station/hour slots, then order the rows per station over time
agg_data_filled = (
    fill_missing_rides_full_range(agg_rides, hour_col, station_col, rides_col)
    .sort_values(by=[station_col, hour_col])
    .reset_index(drop=True)
)

# Cap displayed rows at 10 and preview the first 10 rows
pd.set_option('display.max_rows', 10)
agg_data_filled.head(10)

Unnamed: 0,pickup_hour,start_station_name,rides
0,2024-01-01 00:00:00,1 Ave & E 110 St,0
1,2024-01-01 01:00:00,1 Ave & E 110 St,0
2,2024-01-01 02:00:00,1 Ave & E 110 St,0
3,2024-01-01 03:00:00,1 Ave & E 110 St,0
4,2024-01-01 04:00:00,1 Ave & E 110 St,0
5,2024-01-01 05:00:00,1 Ave & E 110 St,0
6,2024-01-01 06:00:00,1 Ave & E 110 St,2
7,2024-01-01 07:00:00,1 Ave & E 110 St,0
8,2024-01-01 08:00:00,1 Ave & E 110 St,1
9,2024-01-01 09:00:00,1 Ave & E 110 St,1


Plotting function

In [5]:
# Function to plot rides for specific stations
def plot_rides(
    rides: pd.DataFrame,
    stations: Optional[List[str]] = None
):
    """
    Draw an hourly ride-count line chart, one line per station.

    Parameters:
    - rides: DataFrame with columns [pickup_hour, start_station_name, rides]
    - stations: Optional list of station names to keep; when it is None
      (or an empty list) every station in `rides` is plotted

    The figure is shown immediately; nothing is returned.
    """
    # A falsy `stations` (None or []) means "no filter".
    if stations:
        keep = rides["start_station_name"].isin(stations)
        selected = rides.loc[keep]
    else:
        selected = rides

    figure = px.line(
        selected,
        x="pickup_hour",
        y="rides",
        color="start_station_name",
        template="none"
    )
    figure.show()

# Chart the hourly series of the three busiest stations
top_station_names = top_stations['start_station_name'].to_list()
plot_rides(agg_data_filled, stations=top_station_names)

# Write the gap-filled hourly series to the processed-data folder
ts_path = Path("..", "data", "processed", f"ts_data_{year}_{month:02}.parquet")
agg_data_filled.to_parquet(ts_path, engine="pyarrow", index=False)
print(f"Time series data saved to: {ts_path}")

Time series data saved to: ..\data\processed\ts_data_2024_01.parquet


Saving the processed data

In [5]:
# Persist the gap-filled hourly series for the period that was actually processed.
# `month` and `year` are intentionally NOT reassigned here: agg_data_filled was
# built from the rides file selected by the `month`/`year` set in the loading
# cell, so reusing them keeps the file name consistent with its contents.
# (Previously `year` was rebound to 2023, which stored the 2024 series under a
# 2023 file name and left `year` wrong for any cell executed afterwards.)
path = Path("..") / "data" / "processed" / f"ts_data_{year}_{month:02}.parquet"
agg_data_filled.to_parquet(path, engine="pyarrow", index=False)