In [1]:
import pandas as pd

rides = pd.read_parquet('/Users/yashwantsaikoneru/taxi_demand_predictor/data/raw/transformed/validated_rides_2023_10.parquet')

rides.head(10)
     

Unnamed: 0,pickup_datetime,pickup_location_id
0,2023-10-01 00:16:44,168
1,2023-10-01 00:23:24,168
2,2023-10-01 00:21:18,161
3,2023-10-01 00:17:39,255
4,2023-10-01 00:16:15,151
5,2023-10-01 00:33:14,238
6,2023-10-01 00:58:18,50
7,2023-10-01 00:11:29,208
8,2023-10-01 00:22:33,87
9,2023-10-01 00:39:47,125


In [2]:
rides['pickup_hour'] = rides['pickup_datetime'].dt.floor('H')
rides
     

Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2023-10-01 00:16:44,168,2023-10-01 00:00:00
1,2023-10-01 00:23:24,168,2023-10-01 00:00:00
2,2023-10-01 00:21:18,161,2023-10-01 00:00:00
3,2023-10-01 00:17:39,255,2023-10-01 00:00:00
4,2023-10-01 00:16:15,151,2023-10-01 00:00:00
...,...,...,...
3522280,2023-10-31 23:44:53,230,2023-10-31 23:00:00
3522281,2023-10-31 23:38:00,233,2023-10-31 23:00:00
3522282,2023-10-31 23:56:58,68,2023-10-31 23:00:00
3522283,2023-10-31 23:37:55,114,2023-10-31 23:00:00


In [3]:
agg_rides = rides.groupby(['pickup_hour', 'pickup_location_id']).size().reset_index()
agg_rides.rename(columns={0: 'rides'}, inplace=True)
agg_rides


Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2023-10-01 00:00:00,4,62
1,2023-10-01 00:00:00,7,3
2,2023-10-01 00:00:00,10,2
3,2023-10-01 00:00:00,13,3
4,2023-10-01 00:00:00,17,1
...,...,...,...
77548,2023-10-31 23:00:00,261,7
77549,2023-10-31 23:00:00,262,12
77550,2023-10-31 23:00:00,263,50
77551,2023-10-31 23:00:00,264,27


In [4]:
from tqdm import tqdm

def add_missing_slots(agg_rides: pd.DataFrame) -> pd.DataFrame:
    
    location_ids = agg_rides['pickup_location_id'].unique()
    full_range = pd.date_range(
        agg_rides['pickup_hour'].min(), agg_rides['pickup_hour'].max(), freq='H')
    output = pd.DataFrame()
    for location_id in tqdm(location_ids):

        # keep only rides for this 'location_id'
        agg_rides_i = agg_rides.loc[agg_rides.pickup_location_id == location_id, ['pickup_hour', 'rides']]
            
        # quick way to add missing dates with 0 in a Series
        # taken from https://stackoverflow.com/a/19324591
        agg_rides_i.set_index('pickup_hour', inplace=True)
        agg_rides_i.index = pd.DatetimeIndex(agg_rides_i.index)
        agg_rides_i = agg_rides_i.reindex(full_range, fill_value=0)
        
        # add back `location_id` columns
        agg_rides_i['pickup_location_id'] = location_id

        output = pd.concat([output, agg_rides_i])
    
    # move the purchase_day from the index to a dataframe column
    output = output.reset_index().rename(columns={'index': 'pickup_hour'})
    
    return output

In [5]:
agg_rides_all_slots = add_missing_slots(agg_rides)

100%|██████████| 256/256 [00:00<00:00, 675.08it/s]


In [6]:
from typing import Optional, List
import plotly.express as px

def plot_rides(
    rides: pd.DataFrame,
    locations: Optional[List[int]] = None
    ):
    """
    Plot time-series data
    """
    rides_to_plot = rides[rides.pickup_location_id.isin(locations)] if locations else rides

    fig = px.line(
        rides_to_plot,
        x="pickup_hour",
        y="rides",
        color='pickup_location_id',
        template='none',
    )

    fig.show()

In [7]:
plot_rides(agg_rides_all_slots, locations=[])

  v = v.dt.to_pydatetime()


In [8]:
agg_rides_all_slots.to_parquet('/Users/yashwantsaikoneru/taxi_demand_predictor/data/raw/transformed/validated_rides_2023_10.parquet')