# Best bus routes frequency

Find the bus routes with the highest service frequency, using PTV (Public Transport Victoria) GTFS timetable data.


In [1]:
import pyptvgtfs
import pandas as pd
import numpy as np

In [2]:
# Fetch PTV's GTFS archive and let the custom pyptvgtfs module unpack it into
# pandas DataFrames.
df = pyptvgtfs.process_gtfs_zip(
    'http://data.ptv.vic.gov.au/downloads/gtfs.zip',
    '',
)
# Takes about 2m30s.

In [3]:
# Key the 'df' column by (branch_id, table_name) so a table can be looked up
# as ptv[branch_id][table_name].
ptv = df.set_index(['branch_id', 'table_name'])['df']

In [4]:
# Pull every bus GTFS table out of branch '4' into its own DataFrame.
bus_tables = ptv['4']

df_bus_agency: pd.DataFrame = bus_tables['agency']
df_bus_calendar: pd.DataFrame = bus_tables['calendar']
df_bus_calendar_dates: pd.DataFrame = bus_tables['calendar_dates']
df_bus_routes: pd.DataFrame = bus_tables['routes']
df_bus_shapes: pd.DataFrame = bus_tables['shapes']
df_bus_stops: pd.DataFrame = bus_tables['stops']
df_bus_stop_times: pd.DataFrame = bus_tables['stop_times']
df_bus_trips: pd.DataFrame = bus_tables['trips']

In [5]:
def get_max_row(df: pd.DataFrame, by: str) -> pd.Series:
    """
    Return the single row of ``df`` that holds the largest value in column ``by``.

    ``df.max()`` only yields column-wise maxima, not the row they came from.
    The row is selected by position rather than by index label, so exactly one
    row comes back even when the index contains duplicate labels. Missing
    values are skipped and ties resolve to the first occurrence.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to search.
    by : str
        Name of the column whose maximum decides which row is returned.

    Returns
    -------
    pd.Series
        The winning row; its ``name`` is that row's index label.

    Raises
    ------
    ValueError
        If the column is empty or contains only missing values.
    """
    values = df[by]
    # Without this guard an all-NaN column could make argmax report "not found"
    # as -1, and iloc[-1] would then silently return the last row.
    if not values.notna().any():
        raise ValueError(f"column {by!r} has no non-missing values to take a maximum of")
    return df.iloc[values.argmax()]

def get_min_row(df: pd.DataFrame, by: str) -> pd.Series:
    """
    Return the single row of ``df`` that holds the smallest value in column ``by``.

    ``df.min()`` only yields column-wise minima, not the row they came from.
    The row is selected by position rather than by index label, so exactly one
    row comes back even when the index contains duplicate labels. Missing
    values are skipped and ties resolve to the first occurrence.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to search.
    by : str
        Name of the column whose minimum decides which row is returned.

    Returns
    -------
    pd.Series
        The winning row; its ``name`` is that row's index label.

    Raises
    ------
    ValueError
        If the column is empty or contains only missing values.
    """
    values = df[by]
    # Without this guard an all-NaN column could make argmin report "not found"
    # as -1, and iloc[-1] would then silently return the last row.
    if not values.notna().any():
        raise ValueError(f"column {by!r} has no non-missing values to take a minimum of")
    return df.iloc[values.argmin()]

In [11]:
# Count trips per (route_id, service_id), busiest combinations first.
# NOTE: `dfb1` is reassigned by a later cell, so this is only an intermediate look.
dfb1 = (
    df_bus_trips
    .groupby(['route_id', 'service_id'])['trip_id']
    .count()
    .sort_values(ascending=False)
    .rename('trip_count')
    .reset_index()
)

In [24]:
# List the columns available on the bus trips table.
df_bus_trips.columns

Index(['route_id', 'service_id', 'trip_id', 'shape_id', 'trip_headsign',
       'direction_id'],
      dtype='object')

In [26]:
# Peek at the raw bus stop_times table (the rendered output shows only its first and last rows).
df_bus_stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,43-477--1-MF3-2005014,05:56:00,05:56:00,6725,1,,0,0,0.00
1,43-477--1-MF3-2005014,05:56:00,05:56:00,6726,2,,0,0,289.29
2,43-477--1-MF3-2005014,06:00:00,06:00:00,9095,3,,0,0,1442.47
3,43-477--1-MF3-2005014,06:01:00,06:01:00,27586,4,,0,0,4975.39
4,43-477--1-MF3-2005014,06:02:00,06:02:00,27587,5,,0,0,5339.19
...,...,...,...,...,...,...,...,...,...
4096610,23-959--1-Sat3-6,30:12:00,30:12:00,20471,49,,0,0,21583.92
4096611,23-959--1-Sat3-6,30:13:00,30:13:00,20472,50,,0,0,21818.76
4096612,23-959--1-Sat3-6,30:13:00,30:13:00,20473,51,,0,0,21964.56
4096613,23-959--1-Sat3-6,30:14:00,30:14:00,20474,52,,0,0,22064.42


In [52]:
# Collapse stop_times to one row per trip: earliest arrival, latest departure,
# highest stop_sequence and largest shape_dist_traveled.
# NOTE(review): the times appear to be zero-padded "HH:MM:SS" strings, so min/max
# would compare them lexicographically — confirm the column dtype.
# NOTE(review): 'stop_count' is the maximum stop_sequence, which equals the number
# of stops only if sequences run 1..n without gaps — confirm.
df_trips_min_max = (
    df_bus_stop_times
    .groupby('trip_id')
    .agg({
        'arrival_time': 'min',
        'departure_time': 'max',
        'stop_sequence': 'max',
        'shape_dist_traveled': 'max',
    })
    .rename(columns={
        'arrival_time': 'start_time',
        'departure_time': 'end_time',
        'stop_sequence': 'stop_count',
    })
    .reset_index()
)
# Takes about 10s.

In [53]:
# Attach the per-trip start/end summary to every trip; the left join keeps
# trips that have no matching stop_times rows.
df_bus_trips_1 = df_bus_trips.merge(df_trips_min_max, how='left', on='trip_id')

In [72]:
# For each (route, service pattern, direction): how many trips run, and when the
# first and last of them start. Named aggregation yields flat, final column names
# directly, so no MultiIndex flattening or renaming is needed afterwards.
df_bus_routes_frequency = (
    df_bus_trips_1
    .groupby(['route_id', 'service_id', 'direction_id'])
    .agg(
        trip_count=('trip_id', 'count'),
        earliest=('start_time', 'min'),
        latest=('start_time', 'max'),
    )
    .reset_index()
)

In [75]:
# Add the human-readable route names to the frequency table (inner join on route_id).
route_names = df_bus_routes[['route_id', 'route_short_name', 'route_long_name']]
dfb1 = df_bus_routes_frequency.merge(route_names, how='inner', on='route_id')

In [80]:
# Inspect the per-route/service/direction frequency table joined with the route names.
dfb1

Unnamed: 0,route_id,service_id,direction_id,trip_count,earliest,latest,route_short_name,route_long_name
0,12-831-aus-1,MF1-12-831-aus,0,23,06:36:00,21:21:00,831,Kingsmere Estate - Berwick Station
1,12-831-aus-1,MF1-12-831-aus,1,24,06:12:00,21:00:00,831,Kingsmere Estate - Berwick Station
2,12-831-aus-1,MF2-12-831-aus,0,23,06:36:00,21:21:00,831,Kingsmere Estate - Berwick Station
3,12-831-aus-1,MF2-12-831-aus,1,24,06:12:00,21:00:00,831,Kingsmere Estate - Berwick Station
4,12-831-aus-1,Sat1-12-831-aus,0,14,08:15:00,21:42:00,831,Kingsmere Estate - Berwick Station
...,...,...,...,...,...,...,...,...
5188,82-498-aus-1,MF6-82-498-aus,1,28,05:50:00,21:39:00,498,Hoppers Crossing Station - Laverton Station
5189,82-498-aus-1,Sat3-82-498-aus,0,15,07:22:00,21:29:00,498,Hoppers Crossing Station - Laverton Station
5190,82-498-aus-1,Sat3-82-498-aus,1,15,07:35:00,21:21:00,498,Hoppers Crossing Station - Laverton Station
5191,82-498-aus-1,Sun4-82-498-aus,0,15,07:40:00,21:29:00,498,Hoppers Crossing Station - Laverton Station


In [85]:
# For every (route_short_name, direction_id) keep the service pattern with the
# most trips (dfb1_max) and the one with the fewest trips (dfb1_min).
#
# idxmax/idxmin return the dfb1 row label of each group's extreme, and .loc then
# pulls those complete rows, ordered by group key. This replaces
# groupby(...).apply(<row-returning lambda>), which depends on the grouping
# columns being handed to the applied function; newer pandas versions deprecate
# that and then drop those columns from the result.
# The lookup relies on dfb1 having unique row labels, which holds because it is
# the direct output of a merge.
trip_count_by_route_direction = dfb1.groupby(['route_short_name', 'direction_id'])['trip_count']
dfb1_max = dfb1.loc[trip_count_by_route_direction.idxmax()].reset_index(drop=True)
dfb1_min = dfb1.loc[trip_count_by_route_direction.idxmin()].reset_index(drop=True)

In [88]:
# Rank route/direction pairs by their least-served service pattern, highest trip count first.
dfb1_min.sort_values(by='trip_count', ascending=False)

Unnamed: 0,route_id,service_id,direction_id,trip_count,earliest,latest,route_short_name,route_long_name
133,53-401-aus-1,MF1-53-401-aus,1,178,06:30:00,21:54:00,401,North Melbourne Station - University of Melbourne
375,49-601-aus-1,MF2-49-601-aus,0,111,07:01:00,21:41:00,601,Huntingdale Station - Monash University (Clayton)
376,49-601-aus-1,MF2-49-601-aus,1,110,07:00:00,21:32:00,601,Huntingdale Station - Monash University (Clayton)
35,17-202-aus-1,MF1-17-202-aus,1,75,06:40:00,19:03:00,202,Yarra Bend - Melbourne University
82,39-301-aus-1,MF1-39-301-aus,0,73,06:58:00,18:54:00,301,Reservoir - La Trobe University
...,...,...,...,...,...,...,...,...
565,26-812-aus-1,Sat1-26-812-aus,0,1,10:08:00,10:08:00,812,Brighton - Dandenong
444,19-695-F-aus-1,Sun1-19-695-F-aus,1,1,18:08:00,18:08:00,695F,Gembrook - Belgrave
474,29-736-aus-1,MF1-29-736-aus,1,1,08:47:00,08:47:00,736,Blackburn - Mitcham
566,26-812-aus-1,Sat1-26-812-aus,1,1,13:01:00,13:01:00,812,Brighton - Dandenong
