In [None]:
import polars as pl
import seaborn as sb
import gtfs_delay_analysis as da

In [None]:
# Reload the local analysis module so that edits made to it on disk take effect
# in this kernel without a restart.
import importlib
importlib.reload(da)

In [None]:
# Load the GTFS stops table from a local CSV export.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine unless the location is made configurable.
stops = pl.read_csv(
    '/home/chrlz/dox/dl/ETS_Bus_Schedule_GTFS_Data_Feed_-_Stops_20240216.csv')

In [None]:
# Load the aggregated delay records used by the rest of the notebook. Later
# cells reference its stopid, routeid, day, period, meandelay, maxdelay, count
# and id columns.
aggregated = da.load_aggregate_data()

We know that every trip ID is unique within a single day, so there is no need to worry about overlaps.

Average delay in a stop every 3 minutes 10 recordings of a bus

AM: 7am-9am
PM: 4pm-7pm
OFF: 5am-7am, 9am-4pm, 7pm-10pm

In [None]:
# 1. Map delay average over week
# One da.agg_group aggregation keyed on stopid; later cells read its avgdelay
# and numtrips columns.
mean_stop = aggregated.pipe(da.agg_group, 'stopid')

# 2. Over week by day
# TODO: Filter by PEAK, OFF, and DAY
def by_day(df: pl.DataFrame):
    """Aggregate the delay records in ``df`` per day and per day/hour.

    Every aggregate is computed with ``da.agg_group`` from the frame passed
    in, so calling this on a filtered frame (for example a single stop and
    route) yields results restricted to that filter.

    Args:
        df: Delay records. Must contain a ``period`` column, in which the
            value ``'OFF'`` marks off-peak rows, plus the columns that
            ``da.agg_group`` needs for grouping by ``day`` and ``hour``.

    Returns:
        A 4-tuple ``(by_day_all, by_day_peak, by_day_off, by_hour)``:
        per-day aggregates over all rows, over rows whose period is not
        ``'OFF'``, over rows whose period is ``'OFF'``, and the aggregate
        grouped by ``day`` and ``hour``.
    """
    by_day_all = df.pipe(da.agg_group, 'day')
    by_day_peak = (
        df
        .filter(pl.col('period') != 'OFF')
        .pipe(da.agg_group, 'day')
    )
    by_day_off = (
        df
        .filter(pl.col('period') == 'OFF')
        .pipe(da.agg_group, 'day')
    )

    # 3. Over day by hour
    # Aggregate the frame that was passed in rather than the notebook-global
    # `aggregated`; otherwise a filtered input still gets network-wide hourly
    # numbers back.
    by_hour = df.pipe(da.agg_group, 'day', 'hour')
    return by_day_all, by_day_peak, by_day_off, by_hour


# Per-day (all / peak / off-peak) and per-day-per-hour aggregates for the whole
# `aggregated` frame.
by_day_all, by_day_peak, by_day_off, by_hour = by_day(aggregated)

# TODO: Select route
# 4. Time series graph of route by hour by day


# Records of the stops with the largest average delay: rank stop ids by
# avgdelay (descending), keep the first 100 ids, then join back onto
# `aggregated` to recover every record for those stops.
highest_delay_stops = (
    mean_stop
    .group_by('stopid')
    .agg(pl.col('avgdelay').max())
    .sort('avgdelay', descending=True)
    .select('stopid')
    .head(100)
    .unique()
    .join(aggregated, on='stopid')
)
# Every record of every route that appears at one of those stops.
routes_on_highest_delay_stops = (
    highest_delay_stops
    .select('routeid')
    .unique()
    .join(aggregated, on='routeid')
)
# The same records aggregated per route with da.agg_group, minus the
# `lastupdate` column.
agg_routes_on_highest_delay = (
    routes_on_highest_delay_stops
    .pipe(da.agg_group, 'routeid')
    .drop('lastupdate')
)


def plot_cols(df: pl.DataFrame, key: str):
    """Draw a grouped bar chart of every non-key column of ``df``.

    The frame is melted to long form with ``key`` as the identifier column,
    sorted by ``key``, and plotted with one hue per original column.

    Args:
        df: Wide frame holding ``key`` plus the columns to plot.
        key: Name of the column used for the x axis.

    Returns:
        ``(tidy, ax)``: the sorted long-form frame and the seaborn axes.
    """
    long_form = df.melt(id_vars=key)
    tidy = long_form.sort(key)
    bar_options = dict(
        x=key,
        y='value',
        hue='variable',
        width=1,
        # The sorted key column drives the category order on the x axis.
        order=tidy[key],
    )
    ax = sb.barplot(data=tidy, **bar_options)
    return tidy, ax


def plot_by_route(df: pl.DataFrame):
    """Plot the columns of ``df`` per route with rotated route-id tick labels.

    Delegates the drawing to ``plot_cols`` keyed on ``routeid`` and then
    relabels the existing x ticks with the distinct route ids, rotated by
    90 degrees.

    Args:
        df: Wide frame containing a ``routeid`` column plus the columns to
            plot.
    """
    tidy, ax = plot_cols(df, 'routeid')
    # `tidy` comes back sorted by routeid. Series.unique() does not keep input
    # order unless asked to, so request it explicitly; otherwise labels can be
    # attached to the wrong ticks.
    route_labels = tidy['routeid'].unique(maintain_order=True)
    ax.set_xticks(ax.get_xticks(), route_labels, rotation=90)


# 5. Mapping delay propagations within a route
# 6. TODO

def plot_stop_and_route(df: pl.DataFrame, stop: str, route: str):
    """Bar-plot the hourly average delay of one stop on one route, per weekday.

    Day numbers 1-5 are replaced by the letters M, T, W, R, F through an
    inner join, so rows with any other ``day`` value are not plotted.

    Args:
        df: Delay records handed to ``da.select_stop_and_route``.
        stop: Stop id to select.
        route: Route id to select.

    Returns:
        The frame that was plotted, with ``day`` holding the weekday letter.
    """
    weekday_letters = pl.DataFrame(
        {
            "day": list(range(1, 6)),
            "letter": ['M', 'T', 'W', 'R', 'F'],
        },
        schema={"day": pl.Int8, 'letter': pl.Categorical},
    )
    selected = da.select_stop_and_route(df, stop, route)
    # Swap the numeric day for its letter while keeping the column name `day`.
    lettered = selected.join(weekday_letters, on='day').drop('day')
    by_day_by_hour_for_stop = lettered.rename({'letter': 'day'})

    ax = sb.barplot(by_day_by_hour_for_stop, x='hour', y='avgdelay', hue='day')
    ax.set_title(f'Average delay for route {route} stop {stop}')
    return by_day_by_hour_for_stop


# select_stop(aggregated, "2260")

# sb.barplot(by_day_by_hour_for_stop, x='hour', y='maxdelay')


# highest_delay_stops.select('stopid').unique()

# plot_stop_and_route(aggregated, "2260", "637")
# da.select_stop(aggregated, "2260")

In [None]:
# Run by_day on the records of stop 1899 on route 560.
by_all_1899_560, by_peak_1899_560, by_off_1899_560, by_hour_1899_560= by_day(aggregated.filter(
    (pl.col('stopid')=='1899') 
    & (pl.col('routeid') == '560')
))
# Put the avg/max delay of the three per-day frames side by side, keyed on day.
# join() defaults to an inner join, so a day missing from any one frame is
# dropped from the result.
df_1899_560 = by_all_1899_560.select('day', all_avg=pl.col('avgdelay'), all_max=pl.col('maxdelay'), ).join(
    by_peak_1899_560.select('day', peak_avg=pl.col('avgdelay'), peak_max=pl.col('maxdelay'), ), on='day'
).join(
    by_off_1899_560.select('day', off_avg=pl.col('avgdelay'), off_max=pl.col('maxdelay'), ), on='day'
)
# Export both results; `lastupdate` is dropped from the hourly frame first.
by_hour_1899_560.drop('lastupdate').write_csv('1899-560-by-hour.csv')
df_1899_560.write_csv('1899-560-by-day.csv')

In [None]:
# Inspect the hourly frame returned by the by_day call in the previous cell.
by_hour_1899_560

In [None]:
# Per-route summary for stop 1899, written to CSV: mean of meandelay, max of
# maxdelay and total count.
# NOTE(review): avgdelay here is an unweighted mean of the per-row meandelay
# values; confirm that is intended rather than weighting by `count`.
aggregated.filter(pl.col('stopid')=='1899').group_by('routeid').agg(
    pl.col('meandelay').mean().alias('avgdelay'),
    pl.col('maxdelay').max().alias('maxdelay'),
    pl.col('count').sum()
).write_csv('1899-by-route.csv')

In [None]:
# Export the per-stop aggregates after da.add_coords has been applied with the
# stops table; `lastupdate` is dropped before writing.
mean_stop.pipe(da.add_coords, stops).drop('lastupdate').write_csv('mean-stop.csv')

# Network-wide avg/max delay per day with the all / peak / off-peak figures
# side by side, inner-joined on day.
df = by_day_all.select('day', all_avg=pl.col('avgdelay'), all_max=pl.col('maxdelay'), ).join(
    by_day_peak.select('day', peak_avg=pl.col('avgdelay'), peak_max=pl.col('maxdelay'), ), on='day'
).join(
    by_day_off.select('day', off_avg=pl.col('avgdelay'), off_max=pl.col('maxdelay'), ), on='day'
)

df.write_csv('network-by-day.csv')

In [None]:
# Export the network-wide day/hour aggregate without its `lastupdate` column.
by_hour.drop('lastupdate').write_csv('network-by-day-by-hour.csv')
# by_hour.drop('lastupdate')


In [None]:

# plot_cols(df, 'day') and None

# plot_cols(by_hour.select('hour', 'day', 'avgdelay'), 'hour') and None

# by_hour

# Hourly average delay as a line plot, one line per day.
to_plot = by_hour.select('day', 'hour', 'avgdelay')
ax = sb.lineplot(to_plot, x='hour', y='avgdelay', hue='day')
# One tick per hour from 5 to 22. The trailing `and None` makes the cell's last
# expression evaluate to None, so the value returned by set_xticks is not
# echoed as cell output.
ax.set_xticks([*range(5, 23)]) and None

In [None]:
# Writes 'mean-stop.csv' again; an earlier cell already writes a file with the
# same name. Here `lastupdate` is dropped before da.add_coords is applied.
(
    mean_stop.drop('lastupdate')
    .pipe(da.add_coords, stops)
    .write_csv('mean-stop.csv')
)

In [None]:
# Number of rows in mean_stop that do NOT pass the `numtrips > 200` filter.
len(mean_stop) - len(mean_stop.filter(pl.col('numtrips') > 200))
# len(mean_stop)

Some exploration of the `trips.json` file which DOES contain coordinates of the
trip

From initial observation of the data, each trip may have a unique path, even for
the same route (e.g. shorter route for off-peak hours, my bus route home was
like that at one point)

Other assumptions:
- There is only one type of geometry line: MultiLineString
- The actual coordinates are wrapped inside another JSON array

In [None]:
# Load the trips frame; presumably built from the `trips.json` file discussed
# in the preceding notes - TODO confirm.
trips = da.load_trips_df()

In [None]:
# Let string cells display up to 1000 characters, then show the trips ordered
# by route_id.
pl.Config.set_fmt_str_lengths(1000)
trips.sort('route_id')

In [None]:
# Plot the per-stop aggregates, with coordinates added, as a geopolars
# GeoDataFrame.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so the dependency is visible up front.
import geopolars as gp
gp.GeoDataFrame(mean_stop.pipe(da.add_coords, stops)).plot()

How many counts should we deem useful for visualization of data?

In [None]:
# Raise the number of list elements shown per table cell to 100, then display
# the routeid column of the aggregated records.
pl.Config.set_fmt_table_cell_list_len(100)
aggregated['routeid']
# 1_250_916

In [None]:
# da.select_stop(aggregated, '5281')
# plot_stop_and_route(aggregated, '1899', '560')
# plot_stop_and_route(aggregated, '1899', '413')
# plot_stop_and_route(aggregated, '5281', '002')
# plot_stop_and_route(aggregated, '5281', '004')
# plot_stop_and_route(aggregated, '5281', '904')

# For stop 5281: total `count` and summed `meandelay` per route.
aggregated.filter(
    (pl.col('stopid') == '5281') 
    # & (pl.col('routeid') == '904')
).group_by('routeid').agg(
    pl.col('count').sum(),
    pl.col('meandelay').sum(),
)
# 25529677

In [None]:
# Load the parsed shapes frame. Note that the next cell rebinds `shapes`.
shapes = da.trips.load_parsed_shapes_df()

In [None]:
# Rebinds both names: `trips` (assigned earlier from da.load_trips_df) and
# `shapes` (assigned in the previous cell) now refer to these frames instead.
trips = da.trips.load_trips_without_shapes_df()
shapes = da.trips.load_str_shapes_df()

In [None]:
# Export the shapes used by route 560 trips. The first join keeps only trips
# whose shape_id exists in `shapes`; after reducing to distinct
# (route_id, shape_id) pairs, the second join attaches the shape columns again.
trips.filter(pl.col('route_id') =="560") .join(
    shapes, 
    on='shape_id'
).select('route_id', 'shape_id').unique().join(
    shapes, on='shape_id'
).write_csv('560-shapes.csv')

In [None]:
# Find trip with highest delay on Tuesday for route 560
# Largest meandelay per trip id among route 560 records with day == 2, worst
# trips first.
aggregated.filter(
    (pl.col('routeid') == "560")
    & (pl.col('day') == 2)
).group_by('id').agg(
    pl.col('meandelay').max()
).sort('meandelay', descending=True)

In [None]:
# Find all points for that trip
def filter_by_trip(id: int, day:int):
    """Collect the records of one trip on one day and export them to CSV.

    Reads the notebook-level ``aggregated`` and ``stops`` frames. The matching
    rows are passed through ``da.add_coords``, sorted by ``meandelay``
    (ascending), stripped of the ``delay`` column and written to
    ``'{id}-{day}-stops.csv'``.

    Args:
        id: Trip id to match against the ``id`` column.
        day: Day number to match against the ``day`` column.

    Returns:
        The frame that was written to disk.
    """
    is_trip_on_day = (pl.col('id') == id) & (pl.col('day') == day)
    trip_stops = (
        aggregated
        .filter(is_trip_on_day)
        .pipe(da.add_coords, stops)
        .sort('meandelay')
        .drop('delay')
    )
    trip_stops.write_csv(f'{id}-{day}-stops.csv')
    return trip_stops
# Each call writes its own CSV; only the last call's frame is shown as the
# cell output.
filter_by_trip(25527827, 2)
filter_by_trip(25527777, 2)
filter_by_trip(25527796, 2)

In [None]:
# Load the raw data via the analysis module. `raw_dfs` is not used again in the
# visible part of the notebook.
raw_dfs = da.load_raw_data()

In [None]:
# Display the aggregated frame (polars truncates the rendered table).
aggregated