In [1]:
import pandas as pd

ts_data = pd.read_parquet('../data/transformed/ts_data_2024_01.parquet')
ts_data

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2024-01-01 00:00:00,25,4
1,2024-01-01 01:00:00,29,4
2,2024-01-01 02:00:00,34,4
3,2024-01-01 03:00:00,31,4
4,2024-01-01 04:00:00,32,4
...,...,...,...
193435,2024-01-31 19:00:00,0,245
193436,2024-01-31 20:00:00,0,245
193437,2024-01-31 21:00:00,0,245
193438,2024-01-31 22:00:00,0,245


### implementing data transformation only for 1 location for now

In [2]:
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 43, :].reset_index(drop=True)
ts_data_one_location.head(20)

Unnamed: 0,pickup_hour,rides,pickup_location_id
0,2024-01-01 00:00:00,162,43
1,2024-01-01 01:00:00,89,43
2,2024-01-01 02:00:00,38,43
3,2024-01-01 03:00:00,14,43
4,2024-01-01 04:00:00,5,43
5,2024-01-01 05:00:00,3,43
6,2024-01-01 06:00:00,5,43
7,2024-01-01 07:00:00,12,43
8,2024-01-01 08:00:00,10,43
9,2024-01-01 09:00:00,15,43


In [3]:
def get_cutoff_indices(
        data: pd.DataFrame,
        n_features: int,
        step_size: int
        ) -> list:
    
    stop_position = len(data) - 1

    # Start the first sub-sequence at index position 0
    subseq_first_idx = 0
    subseq_mid_idx = n_features
    subseq_last_idx = n_features + 1
    indices = []

    while subseq_last_idx <= stop_position:
        indices.append((subseq_first_idx, subseq_mid_idx, subseq_last_idx))

        subseq_first_idx += step_size
        subseq_mid_idx += step_size
        subseq_last_idx += step_size

    return indices

In [7]:
# to get the slicing windows for the tabular data creation
n_features = 24 # previous day 24 hours data
step_size = 1 # window moves 1 step, so each row when we use these slices will have previous 24 hour data as features and current hour rides data as target

indices = get_cutoff_indices(ts_data_one_location, n_features, step_size)
print(indices[:10])

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29), (5, 29, 30), (6, 30, 31), (7, 31, 32), (8, 32, 33), (9, 33, 34)]


In [9]:
# to get the values for each of these slice windows
import numpy as np

n_examples = len(indices)
x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)
y = np.ndarray(shape=(n_examples), dtype=np.float32)
pickup_hours = []

for i, idx in enumerate(indices):
    x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
    y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values.item()
    pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

In [12]:
print(f"{x.shape=}")

x.shape=(719, 24)


In [13]:
print(f"{x=}")

x=array([[162.,  89.,  38., ...,  28.,  13.,   5.],
       [ 89.,  38.,  14., ...,  13.,   5.,   3.],
       [ 38.,  14.,   5., ...,   5.,   3.,   0.],
       ...,
       [ 93.,  55.,  38., ..., 107., 120.,  81.],
       [ 55.,  38.,  12., ..., 120.,  81.,  52.],
       [ 38.,  12.,   3., ...,  81.,  52.,  54.]],
      shape=(719, 24), dtype=float32)


In [14]:
print(f"{pickup_hours[:5]=}")

pickup_hours[:5]=[Timestamp('2024-01-02 00:00:00'), Timestamp('2024-01-02 01:00:00'), Timestamp('2024-01-02 02:00:00'), Timestamp('2024-01-02 03:00:00'), Timestamp('2024-01-02 04:00:00')]


In [15]:
# convert x from numpy array to dataframe
features_one_location = pd.DataFrame(
    x,
    columns=[f"rides_previous_{i+1}_hour" for i in reversed(range(n_features))]
)
features_one_location

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,162.0,89.0,38.0,14.0,5.0,3.0,5.0,12.0,10.0,15.0,...,108.0,125.0,84.0,50.0,40.0,40.0,36.0,28.0,13.0,5.0
1,89.0,38.0,14.0,5.0,3.0,5.0,12.0,10.0,15.0,28.0,...,125.0,84.0,50.0,40.0,40.0,36.0,28.0,13.0,5.0,3.0
2,38.0,14.0,5.0,3.0,5.0,12.0,10.0,15.0,28.0,55.0,...,84.0,50.0,40.0,40.0,36.0,28.0,13.0,5.0,3.0,0.0
3,14.0,5.0,3.0,5.0,12.0,10.0,15.0,28.0,55.0,49.0,...,50.0,40.0,40.0,36.0,28.0,13.0,5.0,3.0,0.0,0.0
4,5.0,3.0,5.0,12.0,10.0,15.0,28.0,55.0,49.0,74.0,...,40.0,40.0,36.0,28.0,13.0,5.0,3.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
714,134.0,92.0,93.0,55.0,38.0,12.0,3.0,0.0,0.0,1.0,...,52.0,84.0,89.0,91.0,103.0,120.0,75.0,126.0,96.0,107.0
715,92.0,93.0,55.0,38.0,12.0,3.0,0.0,0.0,1.0,1.0,...,84.0,89.0,91.0,103.0,120.0,75.0,126.0,96.0,107.0,120.0
716,93.0,55.0,38.0,12.0,3.0,0.0,0.0,1.0,1.0,8.0,...,89.0,91.0,103.0,120.0,75.0,126.0,96.0,107.0,120.0,81.0
717,55.0,38.0,12.0,3.0,0.0,0.0,1.0,1.0,8.0,11.0,...,91.0,103.0,120.0,75.0,126.0,96.0,107.0,120.0,81.0,52.0


In [16]:
# convert y 
target_one_location = pd.DataFrame(
    y,
    columns=[f"target_rides_next_hour"]
)
target_one_location

Unnamed: 0,target_rides_next_hour
0,3.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
714,120.0
715,81.0
716,52.0
717,54.0


In [17]:
# function to convert timeseries data for all locations ids

from tqdm import tqdm

def transform_ts_data_into_features_and_target(
        ts_data: pd.DataFrame,
        input_seq_len: int,
        step_size: int
        ) -> pd.DataFrame:
    """
    Slices and transposes data from time-series format into a (features, target)
    format that we can use to train Supervised ML models
    """

    assert set(ts_data.columns) == {'pickup_hour', 'rides', 'pickup_location_id'}

    location_ids = ts_data['pickup_location_id'].unique()
    features = pd.DataFrame()
    targets = pd.DataFrame()

    for location_id in tqdm(location_ids):

        # keep only ts_data for this `location_id`
        ts_data_one_location = ts_data.loc[
            ts_data.pickup_location_id == location_id,
            ['pickup_hour', 'rides']
        ]

        # pre-compute cutoff indices to split dataframe rows
        indices = get_cutoff_indices(
            ts_data_one_location,
            input_seq_len,
            step_size
        )

        # slice and transpose data into numpy arrays for features and targets
        n_examples = len(indices)
        x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32)
        y = np.ndarray(shape=(n_examples), dtype=np.float32)
        pickup_hours = []
        for i, idx in enumerate(indices):
            x[i, :] = ts_data_one_location.iloc[idx[0]:idx[1]]['rides'].values
            y[i] = ts_data_one_location.iloc[idx[1]:idx[2]]['rides'].values.item()
            pickup_hours.append(ts_data_one_location.iloc[idx[1]]['pickup_hour'])

        # features: numpy array -> pandas dataframe
        features_one_location = pd.DataFrame(
            x,
            columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))]
        )
        features_one_location['pickup_hour'] = pickup_hours
        features_one_location['pickup_location_id'] = location_id

        # target: numpy array -> pandas dataframe
        target_one_location = pd.DataFrame(
            y,
            columns=[f'target_rides_next_hour']
        )

        # concatenate results
        features = pd.concat([features, features_one_location])
        targets = pd.concat([targets, target_one_location])

    features.reset_index(inplace=True, drop=True)
    targets.reset_index(inplace=True, drop=True)

    return features, targets['target_rides_next_hour']


In [18]:
features, targets = transform_ts_data_into_features_and_target(
    ts_data,
    input_seq_len=24*7*1, # one week of history
    step_size=24,
)

100%|██████████| 260/260 [00:00<00:00, 273.09it/s]


In [19]:
print(f"{features.shape=}")

features.shape=(6240, 170)


In [20]:
print(f"{targets.shape=}")

targets.shape=(6240,)


In [21]:
features

Unnamed: 0,rides_previous_168_hour,rides_previous_167_hour,rides_previous_166_hour,rides_previous_165_hour,rides_previous_164_hour,rides_previous_163_hour,rides_previous_162_hour,rides_previous_161_hour,rides_previous_160_hour,rides_previous_159_hour,...,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id
0,25.0,29.0,34.0,31.0,32.0,8.0,6.0,4.0,0.0,1.0,...,1.0,3.0,1.0,0.0,0.0,2.0,1.0,1.0,2024-01-08,4
1,1.0,2.0,1.0,0.0,0.0,1.0,0.0,3.0,6.0,3.0,...,3.0,1.0,1.0,0.0,3.0,0.0,0.0,4.0,2024-01-09,4
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,4.0,5.0,...,3.0,2.0,9.0,1.0,1.0,4.0,0.0,0.0,2024-01-10,4
3,1.0,1.0,2.0,1.0,0.0,1.0,1.0,3.0,2.0,1.0,...,3.0,2.0,3.0,1.0,2.0,4.0,3.0,7.0,2024-01-11,4
4,2.0,4.0,1.0,1.0,0.0,0.0,0.0,1.0,3.0,1.0,...,2.0,2.0,2.0,2.0,1.0,5.0,7.0,2.0,2024-01-12,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-27,245
6236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-28,245
6237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-29,245
6238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2024-01-30,245


In [22]:
targets

0        1.0
1        1.0
2        0.0
3        3.0
4       10.0
        ... 
6235     0.0
6236     0.0
6237     0.0
6238     0.0
6239     0.0
Name: target_rides_next_hour, Length: 6240, dtype: float32