Import Statements

In [1]:
# Import required libraries
import pandas as pd
from pathlib import Path
import pyarrow.parquet as pq
import numpy as np

Loading the processed data produced by notebook 03

In [2]:
# Month and year that identify which processed time-series file to load
month = 1
year = 2024
path = Path("..") / "data" / "processed" / f"ts_data_{year}_{month:02}.parquet"

# Load the time series data (Arrow table first, then converted to pandas)
table = pq.read_table(path)
ts_data = table.to_pandas()

# Load the top stations to select one for testing
top_stations_path = Path("..") / "data" / "top_stations.csv"
top_stations = pd.read_csv(top_stations_path)
# First row of the CSV; presumably the busiest station -- confirm the file is sorted that way
test_station = top_stations['start_station_name'].iloc[0]
print(f"Testing transformation with station: {test_station}")

# Preview the first 18 rows for the test station.
# This is the last expression of the cell, so it is what the notebook renders;
# a bare expression placed earlier in the cell would be evaluated and discarded.
ts_data[ts_data['start_station_name'] == test_station].head(18)

Testing transformation with station: W 21 St & 6 Ave


Unnamed: 0,pickup_hour,start_station_name,rides
1446336,2024-01-01 00:00:00,W 21 St & 6 Ave,2
1446337,2024-01-01 01:00:00,W 21 St & 6 Ave,2
1446338,2024-01-01 02:00:00,W 21 St & 6 Ave,5
1446339,2024-01-01 03:00:00,W 21 St & 6 Ave,2
1446340,2024-01-01 04:00:00,W 21 St & 6 Ave,0
1446341,2024-01-01 05:00:00,W 21 St & 6 Ave,1
1446342,2024-01-01 06:00:00,W 21 St & 6 Ave,0
1446343,2024-01-01 07:00:00,W 21 St & 6 Ave,0
1446344,2024-01-01 08:00:00,W 21 St & 6 Ave,1
1446345,2024-01-01 09:00:00,W 21 St & 6 Ave,3


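Function that reshapes a station's time series into a sliding-window table: each row holds `window_size` consecutive values as features and the value that immediately follows them (the lead value) as the target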

In [3]:
# Function to transform time series to tabular format
def transform_time_series_to_tabular(df, station_name, feature_col="rides", window_size=12, step_size=1):
    """
    Transform one station's time series into a supervised-learning table.

    Each output row holds `window_size` consecutive values of `feature_col`
    as features, followed by the value that comes immediately after them as
    the target. Consecutive output rows start `step_size` observations apart.

    Rows are consumed in the order in which they appear in `df` for the
    station. The function does not sort, so `df` must already be in
    chronological order for the windows to be meaningful.

    Parameters:
        df (pd.DataFrame): Input data with a 'start_station_name' column and
            the column named by `feature_col`.
        station_name (str): The station name to filter the data for.
        feature_col (str): The column whose values become features and target
            (default is "rides").
        window_size (int): Number of consecutive values used as features
            (default is 12). Must be at least 1.
        step_size (int): Number of observations to slide the window by between
            output rows (default is 1). Must be at least 1.

    Returns:
        pd.DataFrame: Columns `feature_1` ... `feature_{window_size}` followed
            by `target`. The frame owns its data (it is not a view of `df`).

    Raises:
        ValueError: If `window_size` or `step_size` is smaller than 1, or if
            the station has `window_size` or fewer rows (this includes a
            station name that does not occur in `df`).
    """
    # Reject sizes that would otherwise fail obscurely (step_size == 0) or
    # silently yield an empty / feature-less table (negative step, window < 1).
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}.")
    if step_size < 1:
        raise ValueError(f"step_size must be a positive integer, got {step_size}.")

    # Values of the requested column for this station, in their existing order
    station_mask = df['start_station_name'] == station_name
    values = df.loc[station_mask, feature_col].to_numpy()

    # One window needs `window_size` features plus one target value
    if len(values) <= window_size:
        raise ValueError(f"Not enough data to create even one window for {station_name}.")

    # Every run of window_size + 1 consecutive values is one (features, target)
    # row; keeping every `step_size`-th run applies the slide.
    windows = np.lib.stride_tricks.sliding_window_view(values, window_size + 1)[::step_size]

    column_names = [f"feature_{i+1}" for i in range(window_size)] + ["target"]
    # sliding_window_view returns a read-only view onto `values`; copy it so the
    # resulting DataFrame is writable and independent of the input data.
    transformed_df = pd.DataFrame(windows.copy(), columns=column_names)

    return transformed_df

In [4]:
# Build the feature/target table for the test station:
# 24 consecutive observations per row as features, sliding one row at a time
features_targets = transform_time_series_to_tabular(
    ts_data,
    station_name=test_station,
    feature_col="rides",
    window_size=24,
    step_size=1,
)
features_targets.head(10)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,target
0,2,2,5,2,0,1,0,0,1,3,...,11,9,7,5,5,2,2,0,2,0
1,2,5,2,0,1,0,0,1,3,3,...,9,7,5,5,2,2,0,2,0,0
2,5,2,0,1,0,0,1,3,3,8,...,7,5,5,2,2,0,2,0,0,0
3,2,0,1,0,0,1,3,3,8,20,...,5,5,2,2,0,2,0,0,0,0
4,0,1,0,0,1,3,3,8,20,8,...,5,2,2,0,2,0,0,0,0,0
5,1,0,0,1,3,3,8,20,8,9,...,2,2,0,2,0,0,0,0,0,1
6,0,0,1,3,3,8,20,8,9,11,...,2,0,2,0,0,0,0,0,1,6
7,0,1,3,3,8,20,8,9,11,9,...,0,2,0,0,0,0,0,1,6,0
8,1,3,3,8,20,8,9,11,9,7,...,2,0,0,0,0,0,1,6,0,0
9,3,3,8,20,8,9,11,9,7,5,...,0,0,0,0,0,1,6,0,0,0
