In [None]:
import os

import polars as pl
import seaborn as sb

import gtfs_delay_analysis as da

In [None]:
import importlib
# Re-execute the already-imported `gtfs_delay_analysis` module so that edits to
# its source take effect without restarting the kernel.
importlib.reload(da)

In [None]:
# Location of the ETS GTFS "Stops" CSV export. The default is the path used when
# the notebook was written; set the GTFS_STOPS_CSV environment variable to point
# at a copy elsewhere instead of editing this cell.
STOPS_CSV = os.environ.get(
    'GTFS_STOPS_CSV',
    '/home/chrlz/dox/dl/ETS_Bus_Schedule_GTFS_Data_Feed_-_Stops_20240216.csv',
)

# Aggregated delay records, the stop table, and the raw records.
aggregated = da.load_aggregate_data()
stops = pl.read_csv(STOPS_CSV)
raw_dfs = da.load_raw_data()

Some exploration of the `trips.json` file, which DOES contain the coordinates
of each trip.

From an initial look at the data, each trip may have its own path, even within
the same route (e.g. a shorter variant during off-peak hours; my bus route home
was like that at one point).

Other assumptions:

- There is only one type of geometry line: MultiLineString
- The actual coordinates are wrapped inside another JSON array


In [None]:
# Load the trip table and the shape geometries through the project helpers.
# NOTE(review): judging by the loader names, `str_shapes` and `shapes` hold the
# same shapes in unparsed-string and parsed form respectively — confirm.
trips = da.trips.load_trips_without_shapes_df()
str_shapes = da.trips.load_str_shapes_df()
shapes = da.trips.load_parsed_shapes_df()

In [None]:
# Widen string cells in the table display so long values are not cut short.
pl.Config.set_fmt_str_lengths(1000)
# Display the trips ordered by route.
trips.sort('route_id')

How many counts should we require before the data is useful to visualize?


In [None]:
# Show up to 100 elements of list-valued cells in the table display.
pl.Config.set_fmt_table_cell_list_len(100)
aggregated['routeid']
# 1_250_916
# NOTE(review): the figure above is presumably the length of this column when
# the cell was last run — confirm.

In [None]:
# Trip attributes keyed by `id` so they line up with the aggregated records.
trip_attrs = trips.select('trip_headsign', 'shape_id', 'route_id', id='trip_id')

# One row per (route, headsign, shape): mean of the mean delays, total count,
# and the number of distinct trips (stored in `id`).
by_variant = (
    aggregated
    .join(trip_attrs, on='id')
    .group_by('route_id', 'trip_headsign', 'shape_id')
    .agg(
        pl.col('meandelay').mean(),
        pl.col('count').sum(),
        pl.col('id').n_unique(),
    )
)

# Worst mean delay first; keep only variants backed by more than 30 trips.
rev_1 = (
    by_variant
    .sort('meandelay', descending=True)
    .filter(pl.col('id') > 30)
)
# rev_1.write_csv('avg-delay-by-trip-headsign.csv')
"""
1. Join `trips` to `aggregated` to new dataframe, only adding `trip_headsign` column. 

 *THEN* GroupBy(`trip_headsign`) + Aggregate(average the delay). 
Drop all columns except the Heading, Aggregated Average Delay

-> Sort by descending meandelay
"""

# First row of the sorted frame, i.e. the variant with the highest mean delay.
max_trip_headsign = rev_1[0]

In [None]:
# Per-trip mean delay for the worst (route, shape, headsign) variant found in
# `rev_1`.
rev_2_1 = (
    aggregated.join(
        trips.select('trip_headsign', 'shape_id', 'route_id', id='trip_id'),
        on='id',
    )
    .filter(
        # Compare like with like: the reference value is read from the trips
        # column `route_id`, so filter on that same column rather than on the
        # aggregated frame's `routeid`, which make_sequence has to cast before
        # it can be matched against `route_id`.
        (pl.col('route_id') == max_trip_headsign['route_id'][0]) &
        (pl.col('shape_id') == max_trip_headsign['shape_id'][0]) &
        (pl.col('trip_headsign') == max_trip_headsign['trip_headsign'][0])
    )
    .group_by('id')
    .agg(
        pl.col('route_id').first(),
        pl.col('trip_headsign').first(),
        pl.col('meandelay').mean(),
        pl.col('count').sum(),
    )
    .sort('meandelay', 'route_id', descending=True)
    # .write_csv('avg-by-trip-id-508-Meadows.csv')
)
"""
2.1 Based on a given **Heading**: Select all associated rows, GroupBy(TripId) + 
Aggregate(Average the delay). 
-> Sort by descending meandelay. Select highest meandelay of the few. 
(tripid is the identifier)
"""
# Trailing None keeps the string above from being echoed as the cell output.
None

In [None]:
from typing import Optional
def join_stops(df: pl.DataFrame, stops: pl.DataFrame):
    """Attach the columns of ``stops`` to ``df`` by stop identifier.

    ``df.stopid`` is cast to a string before being matched against
    ``stops.stop_id``. The join uses polars' default strategy, so rows of
    ``df`` without a matching stop are dropped.
    """
    stopid_as_text = pl.col('stopid').cast(pl.Utf8)
    keyed = df.with_columns(stopid_as_text)
    return keyed.join(stops, left_on='stopid', right_on='stop_id')


def make_sequence(
    df: pl.DataFrame,
    trip_id: Optional[int] = None,
    shape_id: Optional[str] = None,
    trips: Optional[pl.DataFrame] = None,
    shapes: Optional[pl.DataFrame] = None,
):
    """Order the stops in ``df`` along a shape and attach their mean delay.

    Each stop is matched to the nearest point of the selected shape's
    ``geometry_line``; stops are then numbered by the position of that point
    along the line.

    Args:
        df: Delay records with at least ``stopid``, ``meandelay``,
            ``stop_lon``, ``stop_lat`` and ``routeid`` columns (i.e. records
            already passed through ``join_stops``).
        trip_id: If given, only the shape of this trip is used.
        shape_id: If given (and ``trip_id`` is not), only this shape is used.
            With neither, every trip in ``trips`` is considered.
        trips: Trip table; loaded via ``da.trips`` when omitted.
        shapes: Parsed shape table with a ``geometry_line`` list column whose
            elements have ``lon`` and ``lat`` fields; loaded via ``da.trips``
            when omitted.

    Returns:
        A frame with columns ``index``, ``route_id``, ``trip_headsign``,
        ``stopid``, ``shape_id``, ``meandelay``, ``stop_lon``, ``stop_lat``,
        sorted by position along the shape.
    """
    if trips is None:
        trips = da.trips.load_trips_without_shapes_df()
    if shapes is None:
        shapes = da.trips.load_parsed_shapes_df()

    # One row per stop: mean of the mean delays plus the stop's coordinates.
    # `routeid` is cast to a string so it can be joined against `route_id`.
    by_stop = df.group_by('stopid').agg(
        pl.col('meandelay').mean(),
        pl.col('stop_lon').first(),
        pl.col('stop_lat').first(),
        pl.col('routeid').first().cast(pl.Utf8),
    )

    # Test against None explicitly: a truthiness check would silently treat a
    # falsy id (e.g. 0) as "no filter" and select every trip.
    if trip_id is not None:
        pred = pl.col('trip_id').eq(trip_id)
    elif shape_id is not None:
        pred = pl.col('shape_id').eq(shape_id)
    else:
        pred = pl.lit(True)

    # One row per distinct point of each selected shape, numbered in line order.
    trip_points = (
        trips
        .filter(pred)
        .join(shapes, on='shape_id')
        .unique('shape_id')
        .explode('geometry_line')
        .unique('geometry_line', keep='first', maintain_order=True)
        .with_row_index()
    )
    return (
        # Pair every shape point with every stop of the same route.
        trip_points.join(
            by_stop,
            left_on='route_id',
            right_on='routeid',
        )
        # Coordinate offsets between shape point and stop; the resulting
        # columns are named `lon` and `lat` after the struct fields.
        .with_columns(
            pl.col('geometry_line').struct.field(
                'lon').sub(pl.col('stop_lon')),
            pl.col('geometry_line').struct.field(
                'lat').sub(pl.col('stop_lat')),
        )
        # Straight-line distance computed directly on the lon/lat offsets.
        # NOTE(review): this is a distance in degrees, not metres — confirm
        # that is accurate enough for nearest-point matching.
        .with_columns(
            pl.col('lon').pow(2).add(pl.col('lat').pow(2))
            .sqrt().alias('euclidean')
        )
        # Keep, for each stop, the shape point(s) closest to it.
        .filter(pl.col('euclidean').eq(pl.col('euclidean').min().over('stopid')))
        # Renumber so the index counts stops rather than shape points.
        .sort('index')
        .drop('index')
        .with_row_index()
        .select([
            'index',
            'route_id',
            'trip_headsign',
            'stopid',
            'shape_id',
            'meandelay',
            'stop_lon',
            'stop_lat',
        ])
    )



In [None]:
selected_trip = 25536592
# Records of the single selected trip, with stop coordinates attached. Kept
# under its own name so `agg_points` below always means the shape-level frame.
selected_trip_points = (
    aggregated
    .filter(pl.col('id').eq(selected_trip))
    .pipe(join_stops, stops)
)
# Pass the in-memory tables so make_sequence does not reload them from disk.
rev_2_2 = make_sequence(
    selected_trip_points,
    trip_id=selected_trip,
    trips=trips,
    shapes=shapes,
)
"""
2.2 Based on the selected `trip_headsign` **AND** `trip_id`: Select all the 
associated rows, join co-ordinates to StopId, join sequence #
"""
# rev_2_2.write_csv(f'rev-2.2-508-meadows-{selected_trip}-stops.csv')

selected_shape = "508-1-East"
# Every trip that follows the selected shape.
all_trips_for_route = (
    trips
    .filter(pl.col('shape_id').eq(selected_shape))
    .select('trip_id', 'shape_id')
)
# Records of all those trips, with stop coordinates attached. Later cells reuse
# this frame.
agg_points = (
    aggregated
    .join(all_trips_for_route, left_on='id', right_on='trip_id')
    .pipe(join_stops, stops)
)
rev_2_3 = make_sequence(
    agg_points,
    shape_id=selected_shape,
    trips=trips,
    shapes=shapes,
)
"""
2.3 Based on the selected **Heading**: Select all the associated rows, 
GroupBy(Sequence #) + Aggregate(Average the delay) 

-> Output df: **Sequence # | AverageAverageDelay**
"""
# rev_2_3.write_csv(f'rev-2.3-{selected_shape}-stops.csv')
rev_2_2

In [None]:
# Distinct (trip, shape) pairs used to tag each aggregated record with its shape.
trip_shape_pairs = trips.select('trip_id', 'shape_id').unique()

# Every aggregated record with its shape id and stop attributes attached.
all_points = join_stops(
    aggregated.join(trip_shape_pairs, left_on='id', right_on='trip_id'),
    stops,
)

In [None]:
# One stop sequence per trip of the selected shape. Each call receives only
# that trip's records; passing the whole of `agg_points` would average delays
# over every trip of the shape and give each trip identical values.
seq_508 = pl.concat([
    make_sequence(
        agg_points.filter(pl.col('id').eq(t)),
        t,
        trips=trips,
        shapes=shapes,
    )
    .with_columns(trip_id=t)
    for t in all_trips_for_route['trip_id']
])

In [None]:
import tqdm

In [None]:
# One stop sequence per trip across the whole network. Each call receives only
# that trip's records, so the per-stop means describe that trip rather than
# every record in `all_points`, and make_sequence aggregates far fewer rows.
network_so_bad = pl.concat([
    make_sequence(
        all_points.filter(pl.col('id').eq(t)),
        t,
        trips=trips,
        shapes=shapes,
    )
    .with_columns(trip_id=t)
    for t in tqdm.tqdm(all_points['id'].unique())
])
# Progress line from a run made before the per-trip filter was added:
# 100%|██████████| 12987/12987 [10:23<00:00, 20.83it/s]


In [None]:
# Persist the per-trip sequences as Parquet; a later cell reads this file back.
network_so_bad.write_parquet('sequence.parquet')

In [None]:
# Same table exported as CSV.
network_so_bad.write_csv('sequence.csv')

In [None]:
# Reload the sequences from disk, replacing whatever `network_so_bad` currently
# holds.
network_so_bad = pl.read_parquet('sequence.parquet')

In [None]:
# Display the sequence table.
network_so_bad

In [None]:
# Distinct shapes present in the sequence table.
network_so_bad['shape_id'].unique()

In [None]:
# NOTE(review): repeats the display made two cells earlier; can probably be
# dropped.
network_so_bad

In [None]:
# Aggregated records tagged with shape id and stop attributes, sequenced without
# a trip or shape filter (make_sequence then considers every trip).
records_with_shape = aggregated.join(
    trips.select('trip_id', 'shape_id'),
    left_on='id',
    right_on='trip_id',
)
network_seq = make_sequence(join_stops(records_with_shape, stops))

In [None]:
# Export the per-trip sequences of the selected shape.
seq_508.write_csv('seq-508-meadows.csv')

In [None]:
# Split the shape-level records on `period`: 'OFF' versus everything else.
off_records = agg_points.filter(pl.col('period').eq('OFF'))
peak_records = agg_points.filter(pl.col('period').ne('OFF'))

rev_2_3_off = make_sequence(off_records, shape_id=selected_shape)
rev_2_3_peak = make_sequence(peak_records, shape_id=selected_shape)

In [None]:


# ddelay_rev_2_3_off.write_csv('508-Meadows-Delay-OFF-PEAK.csv')
# ddelay_rev_2_3_peak.write_csv('508-Meadows-Delay-PEAK.csv')


In [None]:
from gtfs_delay_analysis.ddelay import get_ddelay, plot_ddelay, plot_mean

# Each frame is given a `trip_id` column before being handed to get_ddelay.
# Only rev_2_2 describes one concrete trip, so only it carries a real trip id.
ddelay_rev_2_2 = rev_2_2.with_columns(trip_id=selected_trip).pipe(get_ddelay)

# The shape-level aggregates span many trips and all use the placeholder id 0.
# The off-peak frame was previously labelled with `selected_trip`, which
# attributed the aggregate to a single trip.
ddelay_rev_2_3 = rev_2_3.with_columns(trip_id=0).pipe(get_ddelay)
ddelay_rev_2_3_off = rev_2_3_off.with_columns(trip_id=0).pipe(get_ddelay)
ddelay_rev_2_3_peak = rev_2_3_peak.with_columns(trip_id=0).pipe(get_ddelay)


In [None]:
from gtfs_delay_analysis.ddelay import get_ddelay, plot_ddelay, plot_mean

In [None]:
# Split each `line` value on '-' and keep its first two parts as columns `a`
# and `b`, which are matched against `stops.stop_id` below.
ddelay_by_segment = (
    get_ddelay(network_so_bad)
    .with_columns(pl.col('line').str.split('-'))
    .with_columns(
        a=pl.col('line').list.get(0),
        b=pl.col('line').list.get(1),
    )
    .drop('line')
)

# Stop coordinates, renamed once for each end of the segment.
a_coords = stops.select('stop_id', a_lon='stop_lon', a_lat='stop_lat')
b_coords = stops.select('stop_id', b_lon='stop_lon', b_lat='stop_lat')

all_ddelay = (
    ddelay_by_segment
    .join(a_coords, left_on='a', right_on='stop_id')
    .join(b_coords, left_on='b', right_on='stop_id')
)

In [None]:
# Persist the segment-level table, including both endpoints' coordinates.
all_ddelay.write_parquet('all_ddelay.parquet')

In [None]:
# Tag each get_ddelay row with its trip's shape; `trip_id` is cast to Int64
# before being joined against the trips table.
trip_to_shape = trips.select('trip_id', 'shape_id')
with_shape = (
    get_ddelay(network_so_bad)
    .with_columns(pl.col('trip_id').cast(pl.Int64))
    .join(trip_to_shape, on='trip_id')
)

In [None]:
# Attach the shape columns, then hide the three columns not wanted in the view.
(
    with_shape
    .join(shapes, on='shape_id')
    .drop('shape_id', 'line_length', 'line')
)

In [None]:
# Run get_ddelay over the per-trip sequences of the selected shape and export
# the result.
get_ddelay(seq_508).write_csv('seq-508-ddelay.csv')

In [None]:
# Plot the peak and off-peak frames separately.
# NOTE(review): the second argument is presumably a label for the plot —
# confirm against plot_ddelay.
plot_ddelay(ddelay_rev_2_3_peak, 'PEAK')
plot_ddelay(ddelay_rev_2_3_off, 'OFF')

In [None]:
# Same plot for the single selected trip.
plot_ddelay(ddelay_rev_2_2)

In [None]:
# Plot the selected trip's stop sequence with plot_mean (takes the sequence
# frame itself, not the get_ddelay output).
plot_mean(rev_2_2)

In [None]:
# Allow up to 100 rows in the table display.
pl.Config.set_tbl_rows(100)
# Difference in mean delay between each stop and the one before it in the
# sequence.
rev_2_2.with_columns(ddelay=pl.col('meandelay').diff())

In [None]:
# Export the in-memory trip and shape tables as CSV.
# NOTE(review): make_sequence explodes `shapes.geometry_line` and reads struct
# fields from it, so `shapes` carries a nested column; confirm that write_csv
# accepts it, or export a flattened / string form instead.
trips.write_csv('trips-new.csv')
shapes.write_csv('shapes-new.csv')

In [None]:
# Inspect the aggregated records of one specific trip in `lastupdate` order.
aggregated.filter(pl.col('id') == 25536770).sort('lastupdate')

In [None]:
# NOTE(review): repeats the raw-data load already done in the data-loading cell
# near the top of the notebook.
raw_dfs = da.load_raw_data()

In [None]:
# Trips of route "004" (route ids are compared as strings here).
trips.filter(pl.col('route_id') == "004")