Import Statements

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
from typing import Optional, List
import plotly.express as px
import pandas as pd

Loading and aggregation

In [2]:
month = 1
year = 2023
# Location of the processed rides file for the selected year/month
path = Path("..", "data", "processed", f"rides_{year}_{month:02}.parquet")

# Read the parquet file with pyarrow, then hand it over to pandas
table = pq.read_table(path)
rides = table.to_pandas()

# Round every pickup timestamp DOWN to the start of its hour
rides["pickup_hour"] = rides["pickup_datetime"].dt.floor('h')

# Many rows now share the same (hour, location) pair; counting the rows in each
# group gives the number of rides per location per hour.
agg_rides = (
    rides.groupby(["pickup_hour", "pickup_location_id"])
    .size()
    .reset_index(name="rides")
)
agg_rides.tail()

Unnamed: 0,pickup_hour,pickup_location_id,rides
69772,2023-01-31 23:00:00,249,137
69773,2023-01-31 23:00:00,256,2
69774,2023-01-31 23:00:00,261,5
69775,2023-01-31 23:00:00,262,11
69776,2023-01-31 23:00:00,263,41


Filling in the missing (location ID, hour) combinations

In [4]:
def fill_missing_rides_full_range(df, hour_col, location_col, rides_col):
    """
    Fill in zero-ride rows for every (hour, location) pair missing from ``df``.

    The hourly grid runs from the earliest to the latest timestamp found in
    ``hour_col`` and is crossed with every distinct value of ``location_col``.
    The input frame is never modified; all work happens on a copy.

    Parameters:
    - df: DataFrame with columns [hour_col, location_col, rides_col]
    - hour_col: Name of the column containing hourly timestamps
      (any values accepted by ``pd.to_datetime``)
    - location_col: Name of the column containing location IDs
    - rides_col: Name of the column containing ride counts

    Returns:
    - New DataFrame covering every (hour, location) pair of the grid, ordered
      by hour and then by the order in which locations first appear in ``df``.
      Pairs that are absent from ``df`` get 0 rides. An empty ``df`` yields an
      empty result instead of raising.
    """
    key_cols = [hour_col, location_col]

    # Work on a copy so the datetime conversion never leaks into the caller's frame
    frame = df.copy()
    frame[hour_col] = pd.to_datetime(frame[hour_col])

    if frame.empty:
        # No timestamps means there is no range to expand (min/max would be NaT,
        # which pd.date_range rejects). Return an empty frame with the same
        # column layout the merge below would produce.
        other_cols = [col for col in frame.columns if col not in key_cols]
        empty_result = frame[key_cols + other_cols].copy()
        empty_result[rides_col] = empty_result[rides_col].astype(int)
        return empty_result

    # Full range of hours (from min to max) with hourly frequency
    full_hours = pd.date_range(
        start=frame[hour_col].min(),
        end=frame[hour_col].max(),
        freq="h"
    )

    # All distinct location IDs, in order of first appearance
    all_locations = frame[location_col].unique()

    # Cartesian product of hours and locations, built in vectorised form
    # (hour-major order, i.e. all locations for the first hour come first)
    full_combinations = pd.MultiIndex.from_product(
        [full_hours, all_locations],
        names=key_cols
    ).to_frame(index=False)

    # Left-merge so every grid cell is kept, with NaN rides where df had no row
    merged_df = pd.merge(full_combinations, frame, on=key_cols, how='left')

    # Missing combinations mean "no rides recorded": store them as integer 0
    merged_df[rides_col] = merged_df[rides_col].fillna(0).astype(int)

    return merged_df
# Creates explicit 0-ride rows for every (location ID, hour) combination that is absent from the original dataset
# (no rides were recorded for that combination).


# Column names handed to the gap-filling function
hour_col = "pickup_hour"
location_col = "pickup_location_id"
rides_col = "rides"

# Run the fill function on the aggregated rides, then order the result by
# location and hour and renumber the rows from 0.
agg_data_filled = (
    fill_missing_rides_full_range(agg_rides, hour_col, location_col, rides_col)
    .sort_values(by=["pickup_location_id", "pickup_hour"])
    .reset_index(drop=True)
)

# Limit pandas to 5 displayed rows (this is a global display option)
pd.set_option("display.max_rows", 5)
agg_data_filled.head(10)

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2023-01-01 00:00:00,2,0
1,2023-01-01 01:00:00,2,0
...,...,...,...
8,2023-01-01 08:00:00,2,0
9,2023-01-01 09:00:00,2,0


Plotting function

In [5]:
def plot_rides(
    rides: pd.DataFrame,
    locations: Optional[List[int]] = None
):
    """
    Draw a line chart of ride counts over time, one line per pickup location.

    Parameters:
    - rides: DataFrame with the columns "pickup_hour", "rides" and
      "pickup_location_id"
    - locations: Location IDs to keep. When None (or an empty list), every
      location in ``rides`` is plotted.

    Returns:
    - None; the figure is displayed via ``fig.show()``.
    """
    # A falsy `locations` (None or empty list) means "no filtering"
    if locations:
        selected = rides[rides.pickup_location_id.isin(locations)]
    else:
        selected = rides

    chart_options = dict(
        x="pickup_hour",
        y="rides",
        color="pickup_location_id",
        template="none",
    )
    fig = px.line(selected, **chart_options)

    fig.show()

# Plots hourly ride counts over time from the given dataframe, optionally restricted to the given location IDs

# Plot the filled hourly series for location IDs 42 and 43 only
plot_rides(rides=agg_data_filled, locations=[42, 43])

Saving the processed data

In [5]:
# Year/month used to name the output file
# NOTE(review): these repeat the values set in the loading cell; keep both in sync
month = 1
year = 2023

# Write the gap-filled hourly series next to the other processed files
path = Path("..", "data", "processed", f"ts_data_{year}_{month:02}.parquet")
agg_data_filled.to_parquet(path)