# Best bus routes

Find the best bus routes from PTV data.


Many bus routes don't run night services, so I'd like to find the bus routes with the latest final arrival time on any day of the week.

In [1]:
import components as comp
import pandas as pd
import numpy as np

In [2]:
# Use the custom `components` module to convert PTV's gtfs.zip (given by URL) into pandas DataFrames.
# The next cell treats the result as a DataFrame with 'branch_id', 'table_name' and 'df' columns;
# presumably each row carries one GTFS table in its 'df' column.
# NOTE(review): the meaning of the empty-string second argument is not visible here --
# confirm against components.process_gtfs_zip.

df = comp.process_gtfs_zip('http://data.ptv.vic.gov.au/downloads/gtfs.zip', '')
# 2m30s (presumably the time this cell took to run)

In [3]:
# Key the 'df' column by (branch_id, table_name) so a table can be looked up as ptv[branch][table]
table_key = ['branch_id', 'table_name']
ptv = df.set_index(table_key)['df']

In [4]:
# Pull every table of branch '4' (used as the bus branch throughout this notebook)
# out of the GTFS data, one DataFrame per table.
bus_tables = ptv['4']

df_bus_agency: pd.DataFrame = bus_tables['agency']
df_bus_calendar: pd.DataFrame = bus_tables['calendar']
df_bus_calendar_dates: pd.DataFrame = bus_tables['calendar_dates']
df_bus_routes: pd.DataFrame = bus_tables['routes']
df_bus_shapes: pd.DataFrame = bus_tables['shapes']
df_bus_stops: pd.DataFrame = bus_tables['stops']
df_bus_stop_times: pd.DataFrame = bus_tables['stop_times']
df_bus_trips: pd.DataFrame = bus_tables['trips']

In [5]:
def get_max_row(df : pd.DataFrame, by: str) -> pd.Series:
    """
    Return the row holding the maximum value of the 'by' column in the given dataframe.

    df.max() only returns the maximum of each column instead of the whole row
    that contains it, hence this helper.

    The row is selected by position rather than by index label, so exactly one
    row (the first one holding the maximum) is returned even when the index
    contains duplicate labels.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to search.
    by : str
        Name of the column whose maximum selects the row.

    Returns
    -------
    pd.Series
        The selected row.

    Raises
    ------
    ValueError
        If the column holds no value that can be compared (empty or all missing).
    """
    position = df[by].argmax()
    if position < 0:
        # Some pandas versions report "nothing to compare" as -1 instead of
        # raising; without this guard iloc[-1] would silently return the last row.
        raise ValueError(f"column {by!r} has no non-missing values")
    return df.iloc[position]

def get_min_row(df : pd.DataFrame, by: str) -> pd.Series:
    """
    Return the row holding the minimum value of the 'by' column in the given dataframe.

    df.min() only returns the minimum of each column instead of the whole row
    that contains it, hence this helper.

    The row is selected by position rather than by index label, so exactly one
    row (the first one holding the minimum) is returned even when the index
    contains duplicate labels.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to search.
    by : str
        Name of the column whose minimum selects the row.

    Returns
    -------
    pd.Series
        The selected row.

    Raises
    ------
    ValueError
        If the column holds no value that can be compared (empty or all missing).
    """
    position = df[by].argmin()
    if position < 0:
        # Some pandas versions report "nothing to compare" as -1 instead of
        # raising; without this guard iloc[-1] would silently return the last row.
        raise ValueError(f"column {by!r} has no non-missing values")
    return df.iloc[position]

In [6]:
# Preview the first rows of the bus stop_times table
df_bus_stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,43-477--1-MF1-8728614,05:56:00,05:56:00,6725,1,,0,0,0.0
1,43-477--1-MF1-8728614,05:56:00,05:56:00,6726,2,,0,0,289.29
2,43-477--1-MF1-8728614,06:00:00,06:00:00,9095,3,,0,0,1442.47
3,43-477--1-MF1-8728614,06:01:00,06:01:00,27586,4,,0,0,4975.39
4,43-477--1-MF1-8728614,06:02:00,06:02:00,27587,5,,0,0,5339.19


In [7]:
# Find the maximum arrival_time value of each trip (one row per trip_id).
# arrival_time appears to hold 'HH:MM:SS' text, so max() compares the values as strings.
dfb1 = (
    df_bus_stop_times
    .groupby('trip_id', as_index=False)['arrival_time']
    .max()
)
# 20s

In [8]:
# Attach each trip's maximum arrival_time to its record in the trips table
dfb2: pd.DataFrame = df_bus_trips.merge(dfb1, how='left', on='trip_id')

In [9]:
# Maximum arrival_time for every (route_id, service_id) pair
dfb3: pd.DataFrame = (
    dfb2
    .groupby(['route_id', 'service_id'], as_index=False)['arrival_time']
    .max()
)

In [10]:
# Bring in the calendar columns (including the per-weekday flags) for each service_id
dfb4 = dfb3.merge(df_bus_calendar, how='left', on='service_id')

In [11]:
# Unpivot the seven weekday flag columns into (day_of_week, service_operating) pairs,
# then keep only the days on which the flag equals 1.
weekdays = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
dfb5 = dfb4.melt(
    id_vars=['route_id', 'arrival_time'],
    value_vars=weekdays,
    var_name='day_of_week',
    value_name='service_operating',
)
runs_that_day = dfb5['service_operating'] == 1
dfb5 = dfb5.loc[runs_that_day, ['route_id', 'arrival_time', 'day_of_week']]

In [12]:
# Maximum arrival_time for every (route_id, day_of_week) pair
dfb6 = (
    dfb5
    .groupby(['route_id', 'day_of_week'], as_index=False)['arrival_time']
    .max()
)

In [13]:
# One row per route, one column per day of the week, with route_id restored as a regular column
dfb7 = (
    dfb6
    .pivot(index='route_id', columns='day_of_week', values='arrival_time')
    .reset_index()
)

In [14]:
# Reorder columns so the days run from Monday to Sunday after route_id
column_order = ['route_id', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
dfb7 = dfb7.loc[:, column_order]

In [15]:
# Keep only routes with a value in every day column, i.e. drop any route
# that has at least one day of the week without a final arrival time.
required_days = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
dfb7 = dfb7.dropna(axis=0, how='any', subset=required_days)

In [16]:
# For each route, the smallest of its seven per-day final arrival times
day_columns = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
dfb7 = dfb7.assign(earliest_final=dfb7[day_columns].min(axis=1))

In [17]:
# Add the route table's columns for each route_id
dfb8 = dfb7.merge(df_bus_routes, how='left', on='route_id')

In [18]:
# Largest earliest_final first
dfb8 = dfb8.sort_values(by='earliest_final', ascending=False)

In [19]:
# Show the top rows of the sorted result (largest earliest_final first)
dfb8.head(20)

Unnamed: 0,route_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,earliest_final,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color
39,14-907-aus-1,24:52:00,24:52:00,24:52:00,24:52:00,26:42:00,27:45:00,24:43:00,24:43:00,,907,Mitcham - City (King/Lonsdale Sts),3,FF8200,FFFFFF
40,14-908-aus-1,24:50:00,24:50:00,24:50:00,24:50:00,26:40:00,27:35:00,24:42:00,24:42:00,,908,The Pines SC - City (King/Lonsdale Sts),3,FF8200,FFFFFF
37,14-905-aus-1,24:32:00,24:32:00,24:32:00,24:32:00,26:20:00,27:15:00,24:37:00,24:32:00,,905,The Pines SC - City (King/Lonsdale Sts),3,FF8200,FFFFFF
38,14-906-aus-1,24:44:00,24:44:00,24:44:00,24:44:00,24:44:00,24:30:00,24:29:00,24:29:00,,906,Warrandyte - City (King/Lonsdale Sts),3,FF8200,FFFFFF
176,35-426-aus-1,24:51:00,24:51:00,24:51:00,24:51:00,25:22:00,25:22:00,24:20:00,24:20:00,,426,Caroline Springs - Sunshine Station,3,FF8200,FFFFFF
295,82-190-aus-1,31:03:00,24:37:00,24:37:00,24:37:00,30:58:00,31:03:00,24:18:00,24:18:00,,190,Wyndham Vale Station - Werribee Station,3,FF8200,FFFFFF
170,35-216-aus-1,24:27:00,24:27:00,24:27:00,24:27:00,24:55:00,24:55:00,24:16:00,24:16:00,,216,Sunshine Station - City (Queen St),3,FF8200,FFFFFF
171,35-220-aus-1,24:45:00,24:45:00,24:45:00,24:45:00,24:50:00,24:50:00,24:09:00,24:09:00,,220,Sunshine Station - Sunshine Station,3,FF8200,FFFFFF
160,32-604-aus-1,24:24:00,24:24:00,24:24:00,24:24:00,24:24:00,24:04:00,23:59:00,23:59:00,,604,Gardenvale - Alfred Hospital,3,FF8200,FFFFFF
172,35-223-aus-1,24:29:00,24:29:00,24:29:00,24:29:00,24:33:00,24:33:00,23:57:00,23:57:00,,223,Yarraville - Highpoint SC,3,FF8200,FFFFFF


In [20]:
# Attach the trips table's columns to every stop_times row via trip_id
dfb_stop_times = df_bus_stop_times.merge(df_bus_trips, how='left', on='trip_id')