In [1]:
import pandas as pd

rides  = pd.read_parquet('../data/transformed/validated_rides_2022_01.parquet')
rides.head(10)

Unnamed: 0,pickup_datetime,pickup_location_id
0,2022-01-01 00:35:40,142
1,2022-01-01 00:33:43,236
2,2022-01-01 00:53:21,166
3,2022-01-01 00:25:21,114
4,2022-01-01 00:36:48,68
5,2022-01-01 00:40:15,138
6,2022-01-01 00:20:50,233
7,2022-01-01 00:13:04,238
8,2022-01-01 00:30:02,166
9,2022-01-01 00:48:52,236


In [2]:
rides['pickup_hour'] = rides['pickup_datetime'].dt.floor('h')
rides

Unnamed: 0,pickup_datetime,pickup_location_id,pickup_hour
0,2022-01-01 00:35:40,142,2022-01-01 00:00:00
1,2022-01-01 00:33:43,236,2022-01-01 00:00:00
2,2022-01-01 00:53:21,166,2022-01-01 00:00:00
3,2022-01-01 00:25:21,114,2022-01-01 00:00:00
4,2022-01-01 00:36:48,68,2022-01-01 00:00:00
...,...,...,...
2463926,2022-01-31 23:36:53,90,2022-01-31 23:00:00
2463927,2022-01-31 23:44:22,107,2022-01-31 23:00:00
2463928,2022-01-31 23:39:00,113,2022-01-31 23:00:00
2463929,2022-01-31 23:36:42,148,2022-01-31 23:00:00


In [3]:
agg = rides.groupby(by=['pickup_hour', 'pickup_location_id']).size().reset_index()
agg.rename(columns={0:'rides'}, inplace=True)
agg.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2022-01-01,4,11
1,2022-01-01,7,6
2,2022-01-01,10,1
3,2022-01-01,12,2
4,2022-01-01,13,12


In [4]:
agg.columns

Index(['pickup_hour', 'pickup_location_id', 'rides'], dtype='object')

In [5]:
from tqdm import tqdm
def add_missing_timeslots(agg_rides: pd.DataFrame) -> pd.DataFrame:

    locations = agg_rides['pickup_location_id'].unique()
    full_range = pd.date_range(
        agg_rides['pickup_hour'].min(), agg_rides['pickup_hour'].max(), freq='h'
    )
    output = pd.DataFrame()
    print(agg_rides.columns)
    for location in tqdm(locations):
    
        agg_rides_i = agg_rides.loc[agg_rides.pickup_location_id == location, ['pickup_hour', 'rides']]

        agg_rides_i.set_index('pickup_hour', inplace=True)
        agg_rides_i.index = pd.DatetimeIndex(agg_rides_i.index)
        agg_rides_i = agg_rides_i.reindex(full_range, fill_value=0)

        agg_rides_i['pickup_location_id'] = location

        output = pd.concat([output, agg_rides_i])

    return output.reset_index().rename(columns={'index':'pickup_hour'})

In [6]:
agg_rides_all_slots = add_missing_timeslots(agg)

Index(['pickup_hour', 'pickup_location_id', 'rides'], dtype='object')


  0%|          | 0/257 [00:00<?, ?it/s]

100%|██████████| 257/257 [00:00<00:00, 445.73it/s]


In [7]:
from typing import Optional, List
import plotly.express as px

def plot_rides(
        rides: pd.DataFrame,
        locations: Optional[List[int]] = None
):
    """
    plot time-series data
    """

    rides_to_plot = rides[rides.pickup_location_id.isin(locations)] if locations else rides

    fig = px.line(
        rides_to_plot,
        x="pickup_hour",
        y='rides',
        color='pickup_location_id',
        template='none',
    )

    fig.show()

In [9]:
plot_rides(agg_rides_all_slots, [43])

In [10]:
agg_rides_all_slots.to_parquet('../data/transformed/ts_data_2022_01.parquet')