In [None]:
import polars as pl
from glob import glob
import os


def get_dfs_from_glob(glob_str: str) -> "pl.DataFrame":
    """
    Read every CSV file matching ``glob_str`` into one de-duplicated frame.

    The ``stopid`` and ``routeid`` columns are forced to strings while
    reading (presumably so identifier-like values are not parsed as
    numbers -- TODO confirm against the data).

    Args:
        glob_str: Glob pattern selecting the CSV files to load.

    Returns:
        All matching files stacked vertically, with duplicate rows removed.

    Raises:
        ValueError: If no file matches ``glob_str``.
    """
    # Sort so the files are always concatenated in the same order.
    paths = sorted(glob(glob_str))
    if not paths:
        # Fail with the offending pattern instead of an opaque concat error.
        raise ValueError(f"no files matched glob pattern {glob_str!r}")
    # NOTE(review): newer polars releases prefer `schema_overrides` over
    # `dtypes` -- confirm against the installed version before renaming.
    frames = [
        pl.read_csv(path, dtypes={'stopid': pl.Utf8, 'routeid': pl.Utf8})
        for path in paths
    ]
    return pl.concat(frames, how='vertical').unique()


def load_raw_data() -> "pl.DataFrame":
    """
    Load the raw data, preferring ``raw.parquet`` over the CSV files.

    If ``raw.parquet`` exists in the current working directory it is read
    directly; otherwise every file matching ``raw_data/raw_trip_*`` is read
    as CSV via ``get_dfs_from_glob``. This function only reads the parquet
    file; it does not create it.

    Returns:
        The raw frame with ``period``, ``routeid`` and ``stopid`` cast to
        categorical columns.
    """
    if os.path.exists('raw.parquet'):
        df = pl.read_parquet('raw.parquet')
    else:
        # get_dfs_from_glob already drops duplicate rows, so a second
        # .unique() pass over the full frame would only cost time.
        df = get_dfs_from_glob('raw_data/raw_trip_*')
    # Save memory by re-using strings for categorical data
    return df.cast({
        'period': pl.Categorical,
        'routeid': pl.Categorical,
        'stopid': pl.Categorical,
    })


def divide_by_interval(raw_dfs: "pl.DataFrame", interval: int) -> "pl.DataFrame":
    """
    Bucket observations into fixed-width time windows and summarise delays.

    ``lastupdate`` is floored to a multiple of ``interval`` and ``delay`` is
    replaced by its absolute value. Rows are then grouped by ``id``,
    ``routeid``, ``stopid``, the bucketed ``lastupdate`` and ``period``.

    Args:
        raw_dfs: Frame containing at least the columns ``id``, ``routeid``,
            ``stopid``, ``lastupdate``, ``period`` and ``delay``.
        interval: Bucket width, in the same unit as ``lastupdate`` (which is
            interpreted as epoch seconds below). Must be positive.

    Returns:
        One row per group with ``avgabsdelay`` (mean absolute delay),
        ``maxabsdelay`` (largest absolute delay) and ``interval`` (the bucket
        start converted from epoch seconds to a datetime).

    Raises:
        ValueError: If ``interval`` is zero or negative.
    """
    if interval <= 0:
        # A zero or negative width would otherwise reach the floor division
        # below and silently produce meaningless buckets.
        raise ValueError(
            f"interval must be a positive number of seconds, got {interval!r}"
        )
    return (
        raw_dfs
        .with_columns(
            # Floor each timestamp to the start of its bucket
            pl.col('lastupdate').floordiv(interval).mul(interval),
            # Being delayed and being early is the same
            pl.col('delay').abs().cast(pl.UInt64),
        )
        .group_by([
            'id',
            'routeid',
            'stopid',
            'lastupdate',
            'period',
        ])
        .agg(
            pl.col('delay').mean().alias('avgabsdelay'),
            # For determining outliers
            pl.col('delay').max().alias('maxabsdelay')
        )
        .with_columns(
            # Bucket start as a datetime, alongside the integer `lastupdate`
            pl.from_epoch('lastupdate', time_unit='s')
            .alias('interval')
        )
    )

In [13]:
# Load the raw observations once; the aggregation cell below reuses this frame
raw_dfs = load_raw_data()

In [None]:
# Aggregate delays into 10-minute buckets (the interval is given in seconds)
avg_delay = divide_by_interval(raw_dfs, 10 * 60)
# Display only: the data reportedly isn't in UTC, so the bucket timestamps are
# shifted back 7 hours for viewing. `avg_delay` itself is left unshifted.
avg_delay.with_columns(pl.col('interval').dt.offset_by("-7h"))

Average delay at a stop every 3 minutes
10 recordings of a bus