In [None]:
import polars as pl
from glob import glob

def get_dfs_from_glob(glob_str: str) -> pl.DataFrame:
    """Read every CSV file matching ``glob_str`` into one de-duplicated frame.

    Parameters
    ----------
    glob_str
        Glob pattern, as understood by ``glob.glob``, selecting the files to read.

    Returns
    -------
    pl.DataFrame
        The rows of all matching files stacked vertically, with exact
        duplicate rows removed. ``unique()`` is called without
        ``maintain_order``, so row order should not be relied on.

    Raises
    ------
    ValueError
        If no file matches ``glob_str``.
    """
    # Sort so the files are read in the same order on every run, independent
    # of how the filesystem happens to list them.
    paths = sorted(glob(glob_str))
    if not paths:
        # pl.concat([]) would raise ValueError too, but without saying which
        # pattern came back empty.
        raise ValueError(f'no files match glob pattern {glob_str!r}')

    frames = [
        # Read the identifier columns as strings instead of letting polars
        # infer a dtype for them.
        # NOTE(review): newer polars releases name this argument
        # `schema_overrides`; `dtypes` is kept for compatibility with the
        # installed version — confirm before switching.
        pl.read_csv(path, dtypes={'stopid': pl.Utf8, 'routeid': pl.Utf8})
        for path in paths
    ]
    return pl.concat(frames, how='vertical').unique()

# Combine every file matching the pattern below into one de-duplicated frame.
dfs = get_dfs_from_glob(glob_str='feb/feb26run*.txt')

In [None]:
# Build the raw trip frame from the files matching 'raw_data/raw_trip_*'.
# get_dfs_from_glob() already returns a de-duplicated frame, so no second
# .unique() pass is needed here.
raw_dfs = get_dfs_from_glob('raw_data/raw_trip_*').cast({
    'period': pl.Categorical,
    'routeid': pl.Categorical,
    'stopid': pl.Categorical,
})

In [None]:
# Load the raw trip frame from 'raw.parquet' and cast the period / routeid /
# stopid columns to categoricals.
# NOTE(review): this rebinds `raw_dfs` from the previous cell, and no cell
# visible here writes 'raw.parquet' — confirm where that file is produced.
raw_dfs = pl.read_parquet('raw.parquet').cast(
    {column: pl.Categorical for column in ('period', 'routeid', 'stopid')}
)

In [None]:
# Width of one aggregation bucket: 600, in the same unit as `lastupdate`
# (the bucket value is later interpreted as epoch seconds by pl.from_epoch).
INTERVAL = 10 * 60

# Mean and maximum absolute delay per (id, route, stop, period) and time bucket.
avg_delay = (
    raw_dfs
    # Round each `lastupdate` down to the nearest multiple of INTERVAL to get
    # the start of the bucket the record falls into.
    .with_columns(
        (pl.col('lastupdate') // INTERVAL * INTERVAL).alias('interval')
    )
    .group_by('id', 'routeid', 'stopid', 'interval', 'period')
    .agg(
        # Both statistics are cast to unsigned 64-bit integers.
        avgabsdelay=pl.col('delay').abs().mean().cast(pl.UInt64),
        maxabsdelay=pl.col('delay').abs().max().cast(pl.UInt64),
    )
    # Replace the numeric bucket start with a datetime, reading it as seconds
    # since the epoch; the column keeps the name 'interval'.
    .with_columns(
        pl.from_epoch(pl.col('interval'), time_unit='s')
    )
)

In [None]:
# Save the aggregated delay table to 'avg.parquet' (path is relative to the
# current working directory).
avg_delay.write_parquet(file='avg.parquet')

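Average delay at a stop every 3 minutes (note: the aggregation cell above buckets by a 600-second, i.e. 10-minute, `INTERVAL` — confirm which window is intended)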
10 recordings of a bus