In [1]:
from hashlib import sha1
import hmac
import requests
import pandas as pd
import geopandas as gpd
import json
import os
import numpy as np
import time
import pyptvgtfs
import csv
# One shared HTTP session, used by get_data() for every request.
SESSION = requests.Session()

# Local settings file; get_data() reads the PTV developer id and API key from it.
# Opened with a context manager so the file handle is closed right after parsing
# instead of being left open until garbage collection.
with open('../local-env.json') as _env_file:
    ENV = json.load(_env_file)

def get_ptv_api_url(
        endpoint : str,
        dev_id : str | int, 
        api_key : str | int,
    ):
    """
    Returns the URL to use PTV TimeTable API.

    Generates a signature from dev id (user id), API key, and endpoint.

    See the following for more information:
    - Home page: https://www.ptv.vic.gov.au/footer/data-and-reporting/datasets/ptv-timetable-api/
    - Swagger UI: https://timetableapi.ptv.vic.gov.au/swagger/ui/index
    - Swagger Docs JSON: https://timetableapi.ptv.vic.gov.au/swagger/docs/v3 (You can use this to find the endpoints you want to use.)
    """
    assert endpoint.startswith('/'), f'Endpoint must start with /, got {endpoint}'
    raw = f'{endpoint}{'&' if '?' in endpoint else '?'}devid={dev_id}'
    hashed = hmac.new(api_key.encode('utf-8'), raw.encode('utf-8'), sha1)  # Encode the raw string to bytes
    signature = hashed.hexdigest()
    return f'https://timetableapi.ptv.vic.gov.au{raw}&signature={signature}'


def get_data(endpoint : str, need_auth : bool = True, timeout : float | None = 30):
    """
    Returns the decoded JSON body of a GET request to the PTV TimeTable API.

    Parameters:
    - endpoint: path on https://timetableapi.ptv.vic.gov.au, starting with '/'.
    - need_auth: when True the URL is signed via get_ptv_api_url() using the
      developer id and API key from ENV; when False the endpoint is requested
      unsigned.
    - timeout: seconds passed to `SESSION.get`; without one a stalled
      connection would block forever. Pass None to wait indefinitely.

    Raises whatever `response.raise_for_status()` raises for an error status,
    plus any exception raised by the request itself (e.g. on timeout).
    """
    if need_auth:
        url = get_ptv_api_url(endpoint, ENV['PTV_TIMETABLE_DEV_ID'], ENV['PTV_TIMETABLE_API_KEY'])
    else:
        url = f'https://timetableapi.ptv.vic.gov.au{endpoint}'
    response = SESSION.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


In [2]:
# Swagger description of the API; the only request in this cell made unsigned.
API_DOCS = get_data('/swagger/docs/v3', need_auth=False)
# Paths that contain no `{...}` template segment.
STATIC_API_ENDPOINTS = [k for k in API_DOCS['paths'].keys() if '{' not in k]
# API_ROUTES is looped over below as a sequence of per-route dicts, so it is
# annotated as a list of dicts rather than as a dict.
API_ROUTES : list[dict] = get_data('/v3/routes')['routes']
# NOTE(review): the `dict` annotations below are kept from the original; three of
# these payloads are handed straight to pd.DataFrame - confirm their actual shape.
API_ROUTE_TYPES : dict = get_data('/v3/route_types')['route_types']
API_DISRUPTIONS : dict = get_data('/v3/disruptions')['disruptions']
API_DISRUPTION_MODES : dict = get_data('/v3/disruptions/modes')['disruption_modes']
API_OUTLETS : dict = get_data('/v3/outlets')['outlets']
API_DF_ROUTE_TYPES = pd.DataFrame(API_ROUTE_TYPES)
API_DF_DISRUPTION_MODES = pd.DataFrame(API_DISRUPTION_MODES)
API_DF_OUTLETS = pd.DataFrame(API_OUTLETS)
# There are some faulty data in the outlets data. In particular, the latitude is > 0, which is not possible in Victoria.
# Positive latitudes are flipped to negative; everything else is left as-is.
API_DF_OUTLETS['outlet_latitude'] = API_DF_OUTLETS['outlet_latitude'].apply(lambda x: -x if x > 0 else x)
# Flatten each route record: copy the entries of its nested
# 'route_service_status' mapping to the top level, then drop the nested mapping.
for route in API_ROUTES:
    for mid, v in route['route_service_status'].items():
        # Guard against a nested key silently overwriting a top-level field.
        assert mid not in route, f'Key {mid} already exists in route'
        route[mid] = v
    del route['route_service_status']

API_DF_ROUTES = pd.DataFrame(API_ROUTES) 
assert API_DF_ROUTES['route_id'].is_unique, 'route_id is not unique'
assert API_DF_ROUTES['route_gtfs_id'].is_unique, 'route_gtfs_id is not unique'
# Ids are kept as strings; a missing route_type stays missing instead of becoming 'nan'.
API_DF_ROUTES['route_id'] = API_DF_ROUTES['route_id'].apply(str)
API_DF_ROUTES['route_type'] = API_DF_ROUTES['route_type'].apply(lambda x: str(int(x)) if not pd.isna(x) else x)

# Keep only the identifying columns.
API_DF_ROUTES = API_DF_ROUTES[['route_type', 'route_id', 'route_name', 'route_number']]

In [4]:
# Parse the GTFS bundle; the 'df' column holds one table per (mode_id, table_name) row.
GTFS = pyptvgtfs.process_gtfs_zip('../downloads/20240312_113156/gtfs.zip', '')
GTFS.drop(columns=['version_id'], inplace=True)
# Flat lookup keyed by the (mode_id, table_name) pair ...
GTFS_DFS = GTFS.set_index(['mode_id', 'table_name'])['df'].to_dict()
# ... regrouped into mode_id -> table_name -> table.
new_GTFS_DFS = {}
for (mode_key, table_key), table_df in GTFS_DFS.items():
    new_GTFS_DFS.setdefault(mode_key, {})[table_key] = table_df
GTFS_DFS : dict[str, dict[str, pd.DataFrame]] = new_GTFS_DFS
# Stamp every table with the mode it belongs to.
for mode_key, mode_tables in GTFS_DFS.items():
    for table_df in mode_tables.values():
        table_df['mode_id'] = mode_key
# 45s - 1m

In [81]:
# Stack the `routes` table of every mode into a single frame.
GTFS_DF_ROUTES = pd.concat([mode_tables['routes'] for mode_tables in GTFS_DFS.values()])
# 1m - 2m
# route_id is '-'-separated; keep the pieces both as a list and as individual columns.
GTFS_DF_ROUTES['route_idx'] = GTFS_DF_ROUTES['route_id'].map(lambda rid: rid.split('-'))
GTFS_DF_ROUTES['route_id0'] = [parts[0] for parts in GTFS_DF_ROUTES['route_idx']]
GTFS_DF_ROUTES['route_id1'] = [parts[1] for parts in GTFS_DF_ROUTES['route_idx']]
# The third piece is only taken when the id has more than four pieces.
GTFS_DF_ROUTES['route_id2'] = [parts[2] if len(parts) > 4 else np.nan for parts in GTFS_DF_ROUTES['route_idx']]
GTFS_DF_ROUTES['route_id3'] = [parts[-2] for parts in GTFS_DF_ROUTES['route_idx']]
GTFS_DF_ROUTES['route_id4'] = [parts[-1] for parts in GTFS_DF_ROUTES['route_idx']]
GTFS_DF_ROUTES['route_id01'] = GTFS_DF_ROUTES['mode_id'] + '-' + GTFS_DF_ROUTES['route_id1']
# '<mode_id>-<route_id1>', with route_id2 appended for mode '4' when it is present.
GTFS_DF_ROUTES['route_gtfs_id'] = [
    f'{mode}-{id1}' + (id2 if mode == '4' and pd.notna(id2) else '')
    for mode, id1, id2 in zip(GTFS_DF_ROUTES['mode_id'], GTFS_DF_ROUTES['route_id1'], GTFS_DF_ROUTES['route_id2'])
]

In [None]:
# Trips of every mode, each carrying the columns of its route (left join on route_id).
# The route side's duplicate mode_id column is discarded.
GTFS_DF_TRIPS = (
    pd.concat([mode_tables['trips'] for mode_tables in GTFS_DFS.values()])
    .merge(GTFS_DF_ROUTES, on='route_id', how='left', suffixes=('', '_route'))
    .drop(columns=['mode_id_route'])
)

# Stop times joined to their trip (1m 30s), then reduced to the distinct
# stop / sequence / direction / route combinations (20s).
# (A join against GTFS_DF_STOPS on 'stop_id' was tried here and left disabled.)
GTFS_DF_STOP_TIMES = (
    pd.concat([mode_tables['stop_times'] for mode_tables in GTFS_DFS.values()])
    .merge(GTFS_DF_TRIPS, on='trip_id', suffixes=('', '_trip'))
    [['stop_id', 'stop_sequence', 'mode_id', 'direction_id', 'route_gtfs_id', 'route_short_name', 'route_long_name']]
    .drop_duplicates()
)
GTFS_DF_ROUTESTOPS = GTFS_DF_STOP_TIMES.reset_index(drop=True)

In [479]:
# Stack the `stops` table of every mode and convert the coordinates to float64.
GTFS_DF_STOPS = pd.concat([mode_tables['stops'] for mode_tables in GTFS_DFS.values()])
for coord_col in ('stop_lat', 'stop_lon'):
    GTFS_DF_STOPS[coord_col] = GTFS_DF_STOPS[coord_col].apply(np.float64)
# Keep the untouched name; 'stop_name' is rewritten further down.
GTFS_DF_STOPS['stop_full_name'] = GTFS_DF_STOPS['stop_name']

def get_suburb(stop_full_name):
    """
    Split a full stop name into `(stop_name, stop_suburb)`.

    The suburb is the content of the LAST parenthesised group, which must close
    the name (trailing whitespace is ignored). Nested parentheses inside that
    group are kept, e.g. 'Stop (Suburb (3081))' -> ('Stop', 'Suburb (3081)').

    Returns `(stop_full_name, np.nan)` unchanged when there is no such group:
    no parentheses at all, text after the last ')', or unbalanced parentheses.
    """
    trimmed = stop_full_name.rstrip()
    # Only a group that closes the name counts as the suburb; starting the
    # backwards scan on any other character would "match" at depth 0 at once.
    if not trimmed.endswith(')'):
        return (stop_full_name, np.nan)
    depth = 0
    # Walk backwards to find the '(' that balances the final ')'.
    for i in range(len(trimmed) - 1, -1, -1):
        c = trimmed[i]
        if c == ')':
            depth += 1
        elif c == '(':
            depth -= 1
            if depth == 0:
                stop_name = trimmed[:i].strip()
                # Drop exactly the one enclosing pair of parentheses.
                stop_suburb = trimmed[i + 1:-1]
                return (stop_name, stop_suburb)
    # Unbalanced parentheses: no opening bracket matches the final ')'.
    return (stop_full_name, np.nan)
        
# Split every full name once, then distribute the two halves over their columns.
_gtfs_name_suburb_pairs = [get_suburb(full_name) for full_name in GTFS_DF_STOPS['stop_full_name']]
GTFS_DF_STOPS['stop_name'] = [pair[0] for pair in _gtfs_name_suburb_pairs]
GTFS_DF_STOPS['stop_suburb'] = [pair[1] for pair in _gtfs_name_suburb_pairs]

# A name never uses both conventions: a parenthesised suburb and a ','.
_gtfs_has_comma = GTFS_DF_STOPS['stop_full_name'].map(lambda full_name: ',' in full_name)
assert not (GTFS_DF_STOPS['stop_suburb'].notna() & _gtfs_has_comma).any()

# At most one ',' per full name, and that ',' is always followed by a space.
assert GTFS_DF_STOPS['stop_full_name'].map(
    lambda full_name: full_name.count(',') == full_name.count(', ') and full_name.count(',') <= 1
).all()

# For '<first>, <second>' names the part before the comma becomes the suburb ...
GTFS_DF_STOPS['stop_suburb'] = [
    name.split(', ')[0] if ',' in name else suburb
    for name, suburb in zip(GTFS_DF_STOPS['stop_name'], GTFS_DF_STOPS['stop_suburb'])
]

# ... and the part after it becomes the stop name.
GTFS_DF_STOPS['stop_name'] = [
    name.split(', ')[1] if ',' in name else name
    for name in GTFS_DF_STOPS['stop_name']
]

# Inspect the stops that still have no suburb
GTFS_DF_STOPS[GTFS_DF_STOPS['stop_suburb'].isna()]



# Custom stop_name and stop_suburb

def custom_stop_name(x):
    """
    Return the stop name for row `x`, with a manual override for stop '5588'.

    Stop '5588' must currently be named 'Rosemary St' (asserted) and is renamed
    to 'Rosemary St/High St'; every other row keeps its 'stop_name'.
    """
    if x['stop_id'] != '5588':
        return x['stop_name']
    assert x['stop_name'] == 'Rosemary St'
    return 'Rosemary St/High St'

def custom_stop_suburb(x):
    """
    Return the suburb for row `x`, with manual overrides for a few stop ids.

    Each override lists the stop names it expects to see; a different name
    fails an assertion whose message is the offending name. Rows without an
    override keep their 'stop_suburb'.
    """
    # stop_id -> (accepted stop names, suburb to use)
    overrides = {
        '5588': (('Rosemary St', 'Rosemary St/High St'), 'Templestowe Lower'),
        '28185': (('Keysborough South Shopping Centre/Braeside-Dandenong Rd',), 'Keysborough'),
        '35117': (('Ascot St/Sturt St',), 'Ballarat Central'),
    }
    override = overrides.get(x['stop_id'])
    if override is None:
        return x['stop_suburb']
    accepted_names, suburb = override
    assert x['stop_name'] in accepted_names, x['stop_name']
    return suburb

# Apply the manual overrides row by row: names first, then suburbs.
GTFS_DF_STOPS['stop_name'] = GTFS_DF_STOPS.apply(custom_stop_name, axis=1)
GTFS_DF_STOPS['stop_suburb'] = GTFS_DF_STOPS.apply(custom_stop_suburb, axis=1)


In [481]:

# One row per stop_id; every other column becomes the array of its distinct values.
GTFS_DF_STOPS = (
    GTFS_DF_STOPS
    .groupby('stop_id')
    .agg({value_col: 'unique' for value_col in ['stop_name', 'stop_suburb', 'stop_full_name', 'stop_lat', 'stop_lon']})
    .reset_index()
)
# 5s - 7s

# Temporary '<col>_len' columns: how many distinct values each stop has per column.
for value_col in [c for c in GTFS_DF_STOPS.columns if c != 'stop_id']:
    GTFS_DF_STOPS[f'{value_col}_len'] = GTFS_DF_STOPS[value_col].apply(len)

# Inspect len columns to find stops with multiple names, suburbs, etc.
# GTFS_DF_STOPS[['stop_name_len', 'stop_suburb_len', 'stop_full_name_len', 'stop_lat_len', 'stop_lon_len']].max()

# Every stop has exactly one suburb ...
assert GTFS_DF_STOPS['stop_suburb_len'].max() == 1

# ... and a stop with several names still has a single latitude and longitude.
assert (
    (GTFS_DF_STOPS['stop_name_len'] == 1)
    | ((GTFS_DF_STOPS['stop_lat_len'] == 1) & (GTFS_DF_STOPS['stop_lon_len'] == 1))
).all()

# Unwrap the single suburb; the other columns stay as arrays.
GTFS_DF_STOPS['stop_suburb'] = GTFS_DF_STOPS['stop_suburb'].apply(lambda values: values[0])

GTFS_DF_STOPS = GTFS_DF_STOPS.drop(columns=[c for c in GTFS_DF_STOPS.columns if c.endswith('_len')])

# Attach the comma-joined route_gtfs_id values serving each stop (inner join on stop_id).
GTFS_DF_STOPS = GTFS_DF_STOPS.merge(
    GTFS_DF_ROUTESTOPS.groupby('stop_id')['route_gtfs_id'].unique().apply(','.join).reset_index(),
    on='stop_id',
)

# Distinct route identity rows.
GTFS_DF_ROUTES_MIN = GTFS_DF_ROUTES[['route_gtfs_id', 'route_short_name', 'route_long_name', 'mode_id']].drop_duplicates()

# # Measure the offset between the multiple latitudes and longitudes values
# k = GTFS_DF_STOPS[GTFS_DF_STOPS['stop_lat'].apply(len) > 1]
# k['stop_offset_lat'] = k['stop_lat'].apply(lambda x: x[0] - x[1])
# k['stop_offset_lon'] = k['stop_lon'].apply(lambda x: x[0] - x[1])
# k['stop_offset_lat_m'] = k['stop_offset_lat'].apply(lambda x: np.abs(x * 111139))
# k['stop_offset_lon_m'] = k.apply(lambda x: np.abs(x['stop_offset_lon'] * 111139 * np.cos(x['stop_lat'][0])), axis=1)
# k.sort_values(['stop_offset_lat_m', 'stop_offset_lon_m'], ascending=False)
# GTFS_DF_STOPS[GTFS_DF_STOPS['stop_name'].apply(lambda x: any(['28-Middle St' in i for i in x]))]

In [488]:
# Parentheses inside the suburb text must be balanced ...
assert all(suburb.count('(') == suburb.count(')') for suburb in GTFS_DF_STOPS['stop_suburb'])

GTFS_DF_STOPS['stop_suburb_parentheses'] = [suburb.count('(') for suburb in GTFS_DF_STOPS['stop_suburb']]

# ... and there is at most one group per suburb.
assert GTFS_DF_STOPS['stop_suburb_parentheses'].le(1).all()

# Text before '(' is the suburb name; the parenthesised part (when present) is the qualifier.
# NOTE(review): the name keeps any whitespace that preceded the '(' - confirm that is intended.
GTFS_DF_STOPS['stop_suburb_name'] = [suburb.split('(')[0] for suburb in GTFS_DF_STOPS['stop_suburb']]
GTFS_DF_STOPS['stop_suburb_postcode'] = [
    suburb.split('(')[1].removesuffix(')') if '(' in suburb else np.nan
    for suburb in GTFS_DF_STOPS['stop_suburb']
]

In [498]:
# Distinct qualifiers taken from the parenthesised part of stop_suburb (NaN where there was none).
GTFS_DF_STOPS['stop_suburb_postcode'].unique()

array([nan, '3081', 'NSW', '3350', 'SA', 'ACT', '3037', '3220', '3219',
       'Albury - NSW', '3381'], dtype=object)

In [508]:
# Fetch the signed /v3/runs/route/15810 response.
d = get_data('/v3/runs/route/15810')

[{'run_id': 74148,
  'run_ref': '74148',
  'route_id': 15810,
  'route_type': 2,
  'final_stop_id': 4064,
  'destination_name': 'Sunbury Station/Brook St ',
  'status': 'scheduled',
  'direction_id': 102,
  'run_sequence': 0,
  'express_stop_count': 0,
  'vehicle_position': None,
  'vehicle_descriptor': None,
  'geopath': []},
 {'run_id': 74149,
  'run_ref': '74149',
  'route_id': 15810,
  'route_type': 2,
  'final_stop_id': 4064,
  'destination_name': 'Sunbury Station/Brook St ',
  'status': 'scheduled',
  'direction_id': 102,
  'run_sequence': 0,
  'express_stop_count': 0,
  'vehicle_position': None,
  'vehicle_descriptor': None,
  'geopath': []},
 {'run_id': 74150,
  'run_ref': '74150',
  'route_id': 15810,
  'route_type': 2,
  'final_stop_id': 4064,
  'destination_name': 'Sunbury Station/Brook St ',
  'status': 'scheduled',
  'direction_id': 102,
  'run_sequence': 0,
  'express_stop_count': 0,
  'vehicle_position': None,
  'vehicle_descriptor': None,
  'geopath': []},
 {'run_id': 7

In [511]:
# Tabulate the 'runs' entry of the response fetched into `d`.
pd.DataFrame(d['runs'])

Unnamed: 0,run_id,run_ref,route_id,route_type,final_stop_id,destination_name,status,direction_id,run_sequence,express_stop_count,vehicle_position,vehicle_descriptor,geopath
0,74148,74148,15810,2,4064,Sunbury Station/Brook St,scheduled,102,0,0,,,[]
1,74149,74149,15810,2,4064,Sunbury Station/Brook St,scheduled,102,0,0,,,[]
2,74150,74150,15810,2,4064,Sunbury Station/Brook St,scheduled,102,0,0,,,[]
3,74151,74151,15810,2,4064,Sunbury Station/Brook St,scheduled,102,0,0,,,[]
4,74152,74152,15810,2,4064,Sunbury Station/Brook St,scheduled,102,0,0,,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,74665,74665,15810,2,4563,Diggers Rest Station/Old Calder Hwy,scheduled,103,0,0,,,[]
518,74666,74666,15810,2,4563,Diggers Rest Station/Old Calder Hwy,scheduled,103,0,0,,,[]
519,74667,74667,15810,2,4563,Diggers Rest Station/Old Calder Hwy,scheduled,103,0,0,,,[]
520,74668,74668,15810,2,4563,Diggers Rest Station/Old Calder Hwy,scheduled,103,0,0,,,[]


In [None]:
# Look up the Swagger entry of the pattern-by-run endpoint.
API_DOCS['paths']['/v3/pattern/run/{run_ref}/route_type/{route_type}']

In [513]:
# Fetch the pattern endpoint for run_ref 74666 and route_type 2.
d2 = get_data('/v3/pattern/run/74666/route_type/2')

In [526]:
# Distinct trip_headsign values of the mode-'2' trips whose route_id contains '2-ccl'.
GTFS_DFS['2']['trips'][GTFS_DFS['2']['trips']['route_id'].str.contains('2-ccl')]['trip_headsign'].unique()

array(['Clockwise'], dtype=object)

In [436]:
SHP_DIR = '../local/ptv-spatial-datasets'
# Every *.shp file in SHP_DIR, keyed by the file name up to its first '.'.
# The container is a dict of GeoDataFrames (it was annotated as a single GeoDataFrame).
SHP_GDFS : dict[str, gpd.GeoDataFrame] = { f.split('.')[0]: gpd.read_file(os.path.join(SHP_DIR, f)) for f in os.listdir(SHP_DIR) if f.endswith('.shp') }
# Each *.txt file lists '<column> = <long name>' pairs for one layer; use them
# to rename that layer's columns.
for f in os.listdir(SHP_DIR):
    if f.endswith('.txt'):
        gdf_name = f.removesuffix('_column_names.txt').upper()
        with open(os.path.join(SHP_DIR, f), 'r') as file:
            # The first four lines are skipped (presumably a header - confirm against the files).
            gdf_column_names = [line.strip() for line in file.readlines()][4:]
        assert gdf_name in SHP_GDFS, f'{gdf_name} not in GDFS'
        for line in gdf_column_names:
            assert ' = ' in line, f'Invalid line: {line}'
        # Split on the FIRST ' = ' only, so a long name that itself contains
        # ' = ' is kept whole instead of being cut at the second separator.
        gdf_column_names = dict(line.split(' = ', 1) for line in gdf_column_names)
        SHP_GDFS[gdf_name].rename(columns=gdf_column_names, inplace=True)

# Which stop layers carry route information after the rename.
assert SHP_GDFS['PTV_METRO_BUS_STOP']['ROUTES_USING_STOP'].notna().all()
assert SHP_GDFS['PTV_METRO_TRAM_STOP']['ROUTES_USING_STOP'].notna().all()
assert SHP_GDFS['PTV_METRO_TRAIN_STATION']['ROUTES_USING_STOP'].notna().all()
# Check if a column exists in a GeoDataFrame
assert 'ROUTES_USING_STOP' not in SHP_GDFS['PTV_REGIONAL_COACH_STOP'].columns
assert SHP_GDFS['PTV_SKYBUS_STOP']['ROUTES_USING_STOP'].isna().all()
assert SHP_GDFS['PTV_REGIONAL_BUS_STOP']['ROUTES_USING_STOP'].notna().any()


In [437]:

# Collect every layer whose name contains 'ROUTE', tagging its rows with the
# layer name, and stack them into one frame.
shp_gdf_routes = []
for layer_name in SHP_GDFS:
    if 'ROUTE' not in layer_name:
        continue
    route_layer = SHP_GDFS[layer_name]
    route_layer['SHP_FILE'] = layer_name
    shp_gdf_routes.append(route_layer)
SHP_DF_ROUTES : pd.DataFrame = pd.concat(shp_gdf_routes)


In [438]:


# ROUTE_ID is '-'-separated; keep the pieces both as a list and as individual columns.
SHP_DF_ROUTES['route_idx'] = SHP_DF_ROUTES['ROUTE_ID'].apply(lambda x: x.split('-'))
SHP_DF_ROUTES['route_id0'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[0])
SHP_DF_ROUTES['route_id1'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[1])
# The third piece is only taken when the id has more than four pieces.
SHP_DF_ROUTES['route_id2'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[2] if len(x) > 4 else np.nan)
SHP_DF_ROUTES['route_id3'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[-2])
SHP_DF_ROUTES['route_id4'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[-1])
SHP_DF_ROUTES['route_id01'] = SHP_DF_ROUTES['route_id0'] + '-' + SHP_DF_ROUTES['route_id1']

# Within one route_id01, route_id2 and route_id4 never both take several values.
assert SHP_DF_ROUTES.groupby('route_id01').aggregate({'route_id2': 'unique', 'route_id3': 'unique', 'route_id4': 'unique'}).apply(lambda x: len(x['route_id2']) <= 1 or len(x['route_id4']) <= 1, axis=1).all()

# Since ROUTE_LONG_NAME is all not null, when we merge dataframes, we can use ROUTE_LONG_NAME isna() to check if the other dataframe has the equivalent data in SHP_DF_ROUTES_MIN
assert SHP_DF_ROUTES['ROUTE_LONG_NAME'].notna().all()

# Inspect the route_id0 values of SHP_DF_ROUTES
SHP_DF_ROUTES['route_id0'].sort_values(key=lambda x: x.apply(int)).unique() # array(['3', '4', '5', '6', '7', '11'], dtype=object)

# Assert that all metro buses route_short_name are 3 or 4 characters long
assert SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id0'] == '4')]['ROUTE_SHORT_NAME'].apply(lambda x: len(x) in [3, 4]).all()
# Proof that route_id1 is unique for each ROUTE_SHORT_NAME for route_id0 == 4
odd_bus_id1_names = SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id1'] != SHP_DF_ROUTES['ROUTE_SHORT_NAME']) & (SHP_DF_ROUTES['route_id0'] == '4')]['ROUTE_SHORT_NAME'].unique()
# Element-wise check reduced with .all(): asserting on `unique() == [1]` tested the
# truth value of an array, which is ambiguous (or falsy) when there are no such
# names or when the counts are not all identical.
assert (SHP_DF_ROUTES[(SHP_DF_ROUTES['ROUTE_SHORT_NAME'].apply(lambda x: x in odd_bus_id1_names)) & (SHP_DF_ROUTES['route_id0'] == '4')].groupby('ROUTE_SHORT_NAME')['route_id1'].nunique() == 1).all()

def get_gtfs_id(x):
    """
    Build the route_gtfs_id for one shapefile route row `x`.

    - route_id0 == '4': '4-<ROUTE_SHORT_NAME>'.
    - route_id0 == '7': '7-B<nn>', where <nn> is route_id1 without its leading
      'TB', left-padded with zeros to at least two characters
      (e.g. 'TB1' -> '7-B01').
    - anything else: '<route_id0>-<route_id1>'.

    Raises AssertionError when a route_id0 == '7' row has no 'TB' in route_id1.
    """
    if x['route_id0'] == '4':
        return f'4-{x["ROUTE_SHORT_NAME"]}'
    elif x['route_id0'] == '7':
        assert 'TB' in x["route_id1"], f'7-TeleBus route_id1 {x["route_id1"]} does not contain TB'
        # removeprefix drops the literal 'TB' once; lstrip('TB') would strip ANY
        # run of leading 'T'/'B' characters.
        route_number = x["route_id1"].removeprefix('TB')
        # Left-pad with zeros to two characters
        route_number = route_number.zfill(2)
        return f'7-B{route_number}'
    else:
        return f'{x["route_id0"]}-{x["route_id1"]}'

# Shapefile-side id: '<route_id0>-<route_id1>', with route_id2 appended for route_id0 == '4' when present.
SHP_DF_ROUTES['route_shp_id'] = [
    f'{id0}-{id1}' + (id2 if id0 == '4' and pd.notna(id2) else '')
    for id0, id1, id2 in zip(SHP_DF_ROUTES['route_id0'], SHP_DF_ROUTES['route_id1'], SHP_DF_ROUTES['route_id2'])
]
SHP_DF_ROUTES['route_gtfs_id'] = SHP_DF_ROUTES.apply(get_gtfs_id, axis=1)

# Distinct identity rows, then one row per route_gtfs_id with the array of
# distinct values in every other column.
SHP_DF_ROUTES_MIN = (
    SHP_DF_ROUTES[['route_gtfs_id', 'route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'ROUTE_LONG_NAME', 'SHP_FILE']]
    .drop_duplicates()
    .groupby('route_gtfs_id')
    .agg({value_col: 'unique' for value_col in ['route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'ROUTE_LONG_NAME', 'SHP_FILE']})
    .reset_index()
)

# Everything except ROUTE_LONG_NAME is single-valued per route_gtfs_id.
for value_col in ('route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'SHP_FILE'):
    assert SHP_DF_ROUTES_MIN[value_col].apply(lambda values: len(values) == 1).all()

# Inspect the route_gtfs_id rows that do not have exactly one ROUTE_LONG_NAME
SHP_DF_ROUTES_MIN[SHP_DF_ROUTES_MIN['ROUTE_LONG_NAME'].apply(lambda values: len(values) != 1)]

# Unwrap the single-valued columns; ROUTE_LONG_NAME stays an array.
for value_col in ('route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'SHP_FILE'):
    SHP_DF_ROUTES_MIN[value_col] = SHP_DF_ROUTES_MIN[value_col].apply(lambda values: values[0])

# Inspect SHP_FILE and route_id0
SHP_DF_ROUTES_MIN[['SHP_FILE', 'route_id0']].drop_duplicates().sort_values('SHP_FILE')


# STOP_ID is a key in every stop layer except the metro bus one.
for stop_layer in ('PTV_METRO_TRAM_STOP', 'PTV_METRO_TRAIN_STATION', 'PTV_REGIONAL_BUS_STOP', 'PTV_REGIONAL_COACH_STOP', 'PTV_SKYBUS_STOP'):
    assert SHP_GDFS[stop_layer]['STOP_ID'].is_unique

# STOP ID in PTV_METRO_BUS_STOP is not unique, however it's only because of the addition of TeleBus routes:
# every duplicated STOP_ID has exactly two distinct ROUTES_USING_STOP values, exactly one of them mentioning 'TeleBus'.
metro_bus_stops = SHP_GDFS['PTV_METRO_BUS_STOP']
shp_metro_bus_stop_duplicated_ids = (
    metro_bus_stops[metro_bus_stops['STOP_ID'].duplicated(keep=False)]
    .groupby('STOP_ID')['ROUTES_USING_STOP']
    .unique()
)
assert shp_metro_bus_stop_duplicated_ids.apply(lambda routes: len(routes) == 2).all()
assert shp_metro_bus_stop_duplicated_ids.apply(lambda routes: sum('TeleBus' in r for r in routes) == 1).all()

# Split PTV_METRO_BUS_STOP into PTV_METROBUS_STOP and PTV_TELEBUS_STOP
is_telebus_row = metro_bus_stops['ROUTES_USING_STOP'].apply(lambda routes: 'TeleBus' in routes)
SHP_GDFS['PTV_METROBUS_STOP'] = metro_bus_stops[~is_telebus_row].reset_index(drop=True)
SHP_GDFS['PTV_TELEBUS_STOP'] = metro_bus_stops[is_telebus_row].reset_index(drop=True)

# After the split STOP_ID is a key in both halves.
for stop_layer in ('PTV_METROBUS_STOP', 'PTV_TELEBUS_STOP'):
    assert SHP_GDFS[stop_layer]['STOP_ID'].is_unique

# Stop layers keyed by the mode id they are later tagged with.
# The regional coach layer ('5') is left out: it has no ROUTES_USING_STOP column.
# The values are the same objects as in SHP_GDFS, so the 'ROUTE' column added
# in the first loop below also appears on those SHP_GDFS frames.
SHP_DFS_STOPS = {
    '2': SHP_GDFS['PTV_METRO_TRAIN_STATION'],
    '3': SHP_GDFS['PTV_METRO_TRAM_STOP'],
    '4': SHP_GDFS['PTV_METROBUS_STOP'],
    # '5': SHP_GDFS['PTV_REGIONAL_COACH_STOP'],
    '6': SHP_GDFS['PTV_REGIONAL_BUS_STOP'],
    '7': SHP_GDFS['PTV_TELEBUS_STOP'],
    '11': SHP_GDFS['PTV_SKYBUS_STOP']
}

# Turn the comma-separated ROUTES_USING_STOP text into one (STOP_ID, ROUTE) row
# per route. A missing value becomes an empty list, which explode() turns into
# a single row with a missing ROUTE.
for mid in SHP_DFS_STOPS:
    SHP_DFS_STOPS[mid]['ROUTE'] = SHP_DFS_STOPS[mid]['ROUTES_USING_STOP'].apply(lambda x: x.split(',') if pd.notna(x) else [])
    SHP_DFS_STOPS[mid] = SHP_DFS_STOPS[mid][['STOP_ID', 'ROUTE']].explode('ROUTE').reset_index(drop=True)

# For every mode except '2', look the ROUTE up by ROUTE_SHORT_NAME among the
# shapefile routes with the same route_id0 (left join: unmatched rows are kept).
for mid in ['3', '4', '6', '7', '11']:
    SHP_DFS_STOPS[mid] = SHP_DFS_STOPS[mid].merge(SHP_DF_ROUTES_MIN[SHP_DF_ROUTES_MIN['route_id0'] == mid], left_on='ROUTE', right_on='ROUTE_SHORT_NAME', how='left')
        

# Assert that there is no odd ROUTE in SHP_DFS_STOPS
assert SHP_DFS_STOPS['3']['ROUTE_SHORT_NAME'].notna().all()
assert SHP_DFS_STOPS['4']['ROUTE_SHORT_NAME'].notna().all()
assert SHP_DFS_STOPS['7']['ROUTE_SHORT_NAME'].notna().all()
# assert 'ROUTE' not in SHP_DFS_STOPS['5'].columns
# For modes '6' and '11' the only unmatched non-missing ROUTE value allowed is the empty string.
assert (SHP_DFS_STOPS['6'][SHP_DFS_STOPS['6']['ROUTE'].notna() & SHP_DFS_STOPS['6']['ROUTE_SHORT_NAME'].isna()]['ROUTE'] == '').all()
assert (SHP_DFS_STOPS['11'][SHP_DFS_STOPS['11']['ROUTE'].notna() & SHP_DFS_STOPS['11']['ROUTE_SHORT_NAME'].isna()]['ROUTE'] == '').all()


# Mode '2' is matched against the GTFS routes instead of the shapefile routes:
# its ROUTE values are joined on the GTFS route_short_name.
gtfs_df_routes_metrotrains = GTFS_DF_ROUTES_MIN[GTFS_DF_ROUTES_MIN['mode_id'] == '2'][['route_gtfs_id', 'route_short_name']].drop_duplicates().sort_values('route_short_name')

SHP_DFS_STOPS['2'] = SHP_DFS_STOPS['2'][['STOP_ID', 'ROUTE']].drop_duplicates()
SHP_DFS_STOPS['2'] = pd.merge(SHP_DFS_STOPS['2'], gtfs_df_routes_metrotrains, left_on='ROUTE', right_on='route_short_name', how='left')
# Every mode-'2' ROUTE must have found a GTFS route.
assert SHP_DFS_STOPS['2']['route_gtfs_id'].notna().all()

# Collapse back to one row per STOP_ID with comma-joined ROUTE / route_gtfs_id lists.
for mid in ['2', '3', '4', '6', '7', '11']:
    # Rows whose ROUTE is missing (stops without route information) are dropped first.
    SHP_DFS_STOPS[mid] = SHP_DFS_STOPS[mid].dropna(subset=['ROUTE'])
    # Where no route was matched, fall back to the raw ROUTE text as the id.
    SHP_DFS_STOPS[mid]['route_gtfs_id'] = SHP_DFS_STOPS[mid].apply(lambda x: x['route_gtfs_id'] if pd.notna(x['route_gtfs_id']) else x['ROUTE'], axis=1)
    SHP_DFS_STOPS[mid] = SHP_DFS_STOPS[mid].groupby('STOP_ID').aggregate({'ROUTE': 'unique', 'route_gtfs_id': 'unique'}).reset_index()
    SHP_DFS_STOPS[mid]['ROUTE'] = SHP_DFS_STOPS[mid]['ROUTE'].apply(lambda x: ','.join(x))
    SHP_DFS_STOPS[mid]['route_gtfs_id'] = SHP_DFS_STOPS[mid]['route_gtfs_id'].apply(lambda x: ','.join(x))


# mode id -> stop layer that receives the per-stop route_gtfs_id list.
stop_layer_by_mode = {
    '2': 'PTV_METRO_TRAIN_STATION',
    '3': 'PTV_METRO_TRAM_STOP',
    '4': 'PTV_METROBUS_STOP',
    '6': 'PTV_REGIONAL_BUS_STOP',
    '7': 'PTV_TELEBUS_STOP',
    '11': 'PTV_SKYBUS_STOP',
}

for mode_key, stop_layer in stop_layer_by_mode.items():
    # Left join, so stops without any route keep their row (route_gtfs_id missing).
    SHP_GDFS[stop_layer] = pd.merge(
        SHP_GDFS[stop_layer],
        SHP_DFS_STOPS[mode_key][['STOP_ID', 'route_gtfs_id']],
        on='STOP_ID',
        how='left',
    )
    SHP_GDFS[stop_layer]['mode_id'] = mode_key

# The coach layer gets no route list, only its mode id.
SHP_GDFS['PTV_REGIONAL_COACH_STOP']['mode_id'] = '5'




  SHP_DF_STOPS['geometry'] = SHP_DF_STOPS['geometry'].apply(lambda x: x.coords[0])


In [499]:
# Stack every stop layer. The result is converted to a plain pandas DataFrame
# before 'geometry' is overwritten with coordinate tuples below: assigning
# non-geometry values to the geometry column of a GeoDataFrame is what geopandas
# warns about (presumably the warning echoed under this cell - TODO confirm).
SHP_DF_STOPS : pd.DataFrame = pd.DataFrame(pd.concat([
    SHP_GDFS['PTV_METRO_TRAIN_STATION'],
    SHP_GDFS['PTV_METRO_TRAM_STOP'],
    SHP_GDFS['PTV_METROBUS_STOP'],
    SHP_GDFS['PTV_REGIONAL_COACH_STOP'],
    SHP_GDFS['PTV_REGIONAL_BUS_STOP'],
    SHP_GDFS['PTV_TELEBUS_STOP'],
    SHP_GDFS['PTV_SKYBUS_STOP']
]))

# Replace each geometry by its first coordinate tuple.
SHP_DF_STOPS['geometry'] = SHP_DF_STOPS['geometry'].apply(lambda x: x.coords[0])
# Keep the untouched name; 'STOP_NAME' is rewritten further down.
SHP_DF_STOPS['STOP_FULL_NAME'] = SHP_DF_STOPS['STOP_NAME']


# Count occurrences of '(' and ')' in STOP_FULL_NAME
assert SHP_DF_STOPS['STOP_FULL_NAME'].apply(lambda x: x.count('(') == x.count(')')).all()
# Temporary helper column, used only for the inspection on the next line.
SHP_DF_STOPS['parentheses_count'] = SHP_DF_STOPS['STOP_FULL_NAME'].apply(lambda x: x.count('('))
SHP_DF_STOPS[SHP_DF_STOPS['parentheses_count'] != 1]
SHP_DF_STOPS.drop(columns='parentheses_count', inplace=True)

# Get only the last pair of parentheses
def get_suburb(stop_full_name):
    """
    Split a full stop name into `(stop_name, stop_suburb)`.

    The suburb is the content of the LAST parenthesised group, which must close
    the name (trailing whitespace is ignored). Nested parentheses inside that
    group are kept, e.g. 'Stop (Suburb (3081))' -> ('Stop', 'Suburb (3081)').

    Returns `(stop_full_name, np.nan)` unchanged when there is no such group:
    no parentheses at all, text after the last ')', or unbalanced parentheses.
    """
    trimmed = stop_full_name.rstrip()
    # Only a group that closes the name counts as the suburb; starting the
    # backwards scan on any other character would "match" at depth 0 at once.
    if not trimmed.endswith(')'):
        return (stop_full_name, np.nan)
    depth = 0
    # Walk backwards to find the '(' that balances the final ')'.
    for i in range(len(trimmed) - 1, -1, -1):
        c = trimmed[i]
        if c == ')':
            depth += 1
        elif c == '(':
            depth -= 1
            if depth == 0:
                stop_name = trimmed[:i].strip()
                # Drop exactly the one enclosing pair of parentheses.
                stop_suburb = trimmed[i + 1:-1]
                return (stop_name, stop_suburb)
    # Unbalanced parentheses: no opening bracket matches the final ')'.
    return (stop_full_name, np.nan)

# Split every full name once, then distribute the two halves over their columns.
_shp_name_suburb_pairs = [get_suburb(full_name) for full_name in SHP_DF_STOPS['STOP_FULL_NAME']]
SHP_DF_STOPS['STOP_NAME'] = [pair[0] for pair in _shp_name_suburb_pairs]
SHP_DF_STOPS['STOP_SUBURB'] = [pair[1] for pair in _shp_name_suburb_pairs]


# Inspection of the data shows that there are some stops with no suburb. We will manually fill these in.
SHP_DF_STOPS[SHP_DF_STOPS['STOP_SUBURB'].isna()]

def custom_stop_suburb(x):
    """
    Return the suburb to use for one stop row.

    Every row keeps its own ``STOP_SUBURB`` except stop ``'35117'``, which is
    hard-coded to ``'Ballarat Central'`` after asserting that its name is
    ``'Ascot St/Sturt St'``.
    """
    if x['STOP_ID'] != '35117':
        return x['STOP_SUBURB']
    # Guard the manual override against the id being reused for another stop.
    assert x['STOP_NAME'] == 'Ascot St/Sturt St', x['STOP_NAME']
    return 'Ballarat Central'

# Row-wise pass that applies the manual suburb overrides.
SHP_DF_STOPS['STOP_SUBURB'] = SHP_DF_STOPS.apply(custom_stop_suburb, axis='columns')


  SHP_DF_STOPS['geometry'] = SHP_DF_STOPS['geometry'].apply(lambda x: x.coords[0])


In [500]:
# Suburbs may carry one parenthesised qualifier, e.g. 'Hillside (3037)'.
# First make sure the parentheses are balanced ...
assert SHP_DF_STOPS['STOP_SUBURB'].apply(lambda x: x.count('(') == x.count(')')).all()

# ... and that there is at most one pair per suburb.
SHP_DF_STOPS['STOP_SUBURB_PARENTHESES'] = SHP_DF_STOPS['STOP_SUBURB'].apply(lambda x: x.count('('))

assert (SHP_DF_STOPS['STOP_SUBURB_PARENTHESES'] <= 1).all()


def _split_suburb_postcode(suburb):
    """
    Split ``'Name (qualifier)'`` into ``(name, qualifier)``.

    The name is stripped of surrounding whitespace, so ``'Hillside (3037)'``
    gives ``('Hillside', '3037')`` rather than ``('Hillside ', '3037')``.
    The qualifier is ``np.nan`` when the suburb has no parentheses.
    """
    name, opening, rest = suburb.partition('(')
    return (name.strip(), rest.removesuffix(')') if opening else np.nan)


_suburb_parts = SHP_DF_STOPS['STOP_SUBURB'].apply(_split_suburb_postcode)
SHP_DF_STOPS['STOP_SUBURB_NAME'] = _suburb_parts.apply(lambda part: part[0])
SHP_DF_STOPS['STOP_SUBURB_POSTCODE'] = _suburb_parts.apply(lambda part: part[1])

In [502]:
# List the distinct values captured in the "postcode" column.
# As the recorded outputs show, not all of them are numeric postcodes (e.g. 'NSW', 'SA').
SHP_DF_STOPS['STOP_SUBURB_POSTCODE'].unique()

# Output recorded from one run:
# array([nan, '3081', '3037', '3381', '3350', 'NSW', '3220', '3219', 'SA', 'Albury - NSW'], dtype=object)

# Output recorded from another run (additionally contains 'ACT'):
# array([nan, '3081', 'NSW', '3350', 'SA', 'ACT', '3037', '3220', '3219', 'Albury - NSW', '3381'], dtype=object)

array([nan, '3081', '3037', '3381', '3350', 'NSW', '3220', '3219', 'SA',
       'Albury - NSW'], dtype=object)

In [None]:

# Coerce both coordinate columns to numpy float64, value by value.
for coord_col in ('LATITUDE', 'LONGITUDE'):
    SHP_DF_STOPS[coord_col] = SHP_DF_STOPS[coord_col].apply(np.float64)


# Collapse to one row per STOP_ID; every listed column becomes the array of its distinct values.
# 7s - 10s
SHP_DF_STOPS = (
    SHP_DF_STOPS
    .groupby('STOP_ID')
    .aggregate({
        column: 'unique'
        for column in (
            'STOP_NAME', 'STOP_SUBURB', 'STOP_FULL_NAME', 'LATITUDE',
            'LONGITUDE', 'TICKETZONE', 'route_gtfs_id', 'geometry',
        )
    })
    .reset_index()
)

# Helper '<column>_len' columns: number of distinct values each stop carries per column.
value_columns = [column for column in SHP_DF_STOPS.columns if column != 'STOP_ID']
for column in value_columns:
    SHP_DF_STOPS[f'{column}_len'] = SHP_DF_STOPS[column].apply(len)

# These attributes must be single-valued per stop, so unwrap the one-element arrays.
for column in ['STOP_NAME', 'STOP_SUBURB', 'STOP_FULL_NAME', 'LATITUDE', 'LONGITUDE', 'geometry']:
    assert SHP_DF_STOPS[f'{column}_len'].max() == 1
    SHP_DF_STOPS[column] = SHP_DF_STOPS[column].apply(lambda values: values[0])

# Ticket zones and routes may be multi-valued: join the non-null values with commas.
for multi_col in ('TICKETZONE', 'route_gtfs_id'):
    SHP_DF_STOPS[multi_col] = SHP_DF_STOPS[multi_col].apply(
        lambda values: ','.join(str(value) for value in values if not pd.isna(value))
    )

# Remove the helper columns again.
SHP_DF_STOPS = SHP_DF_STOPS.drop(
    columns=[name for name in SHP_DF_STOPS.columns if name.endswith('_len')]
)

In [482]:
# Outer-join the GTFS stops (lower-case columns) with the shapefile stops (upper-case columns)
# on the stop id, so stops present in only one source are kept with NaN on the other side.
GS_DF_STOPS = pd.merge(GTFS_DF_STOPS, SHP_DF_STOPS, left_on='stop_id', right_on='STOP_ID', suffixes=('_gtfs', '_shp'), how='outer')

# The full-name columns of both sources are not needed past this point.
GS_DF_STOPS.drop(columns=['stop_full_name', 'STOP_FULL_NAME'], inplace=True)

# Assert that all GTFS stops with multiple names have an equivalent in SHP
assert GS_DF_STOPS[GS_DF_STOPS['stop_name'].apply(lambda x: isinstance(x, np.ndarray) and len(x) > 1)]['STOP_ID'].notna().all()

# Assert that, for stops present in both sources, a multi-valued GTFS name contains the SHP name.
# NOTE(review): len(x['stop_name']) assumes stop_name is still an array here; on a plain string it would count characters.
assert GS_DF_STOPS[GS_DF_STOPS['STOP_ID'].notna() & GS_DF_STOPS['stop_id'].notna()].apply(lambda x: x['STOP_NAME'] in x['stop_name'] if len(x['stop_name']) > 1 else True, axis=1).all()

# Reduce stop_name to a scalar: choose the name from SHP if it is among the GTFS names, else the first GTFS name.
# Given the assertions above, x['stop_name'][0] is only reached when x['stop_name'] has a single element.
# Values that are not arrays (e.g. NaN for SHP-only rows) are left unchanged.
GS_DF_STOPS['stop_name'] = GS_DF_STOPS.apply(lambda x: (x['STOP_NAME'] if (x['STOP_NAME'] in x['stop_name']) else x['stop_name'][0]) if isinstance(x['stop_name'], np.ndarray) else x['stop_name'], axis=1)

# Assert that all GTFS stops with multiple latitudes have an equivalent in SHP
assert GS_DF_STOPS[GS_DF_STOPS['stop_lat'].apply(lambda x: isinstance(x, np.ndarray) and len(x) > 1)]['LATITUDE'].notna().all()

# Assert that all GTFS stops with multiple longitudes have an equivalent in SHP
assert GS_DF_STOPS[GS_DF_STOPS['stop_lon'].apply(lambda x: isinstance(x, np.ndarray) and len(x) > 1)]['LONGITUDE'].notna().all()

# Disabled experiment, kept for reference:
# # Latitude and longitude are completely different between GTFS and SHP
# k = GS_DF_STOPS[GS_DF_STOPS['LATITUDE'].notna() & GS_DF_STOPS['stop_lat'].notna()]
# len(k[k.apply(lambda x: x['geometry'][0] == x['stop_lat'][1] if len(x['stop_lat']) > 1 else False, axis=1)])

# assert GS_DF_STOPS[GS_DF_STOPS['LONGITUDE'].notna() & GS_DF_STOPS['stop_lon'].notna()].apply(lambda x: x['LONGITUDE'] in x['stop_lon'] if len(x['stop_lon']) > 1 else True, axis=1).all()

In [483]:
# Stops matched in both sources: rows where neither the GTFS id nor the SHP id is missing.
GS_DF_STOPS_FULL = GS_DF_STOPS.dropna(subset=['stop_id', 'STOP_ID'])

In [484]:
# Matched stops whose name AND suburb both differ between GTFS and SHP.
# Single-criterion variants, kept for reference:
# GS_DF_STOPS_FULL[GS_DF_STOPS_FULL['stop_suburb'] != GS_DF_STOPS_FULL['STOP_SUBURB']]
# GS_DF_STOPS_FULL[GS_DF_STOPS_FULL['stop_name'] != GS_DF_STOPS_FULL['STOP_NAME']]
GS_DF_STOPS_FULL.loc[
    GS_DF_STOPS_FULL['stop_name'].ne(GS_DF_STOPS_FULL['STOP_NAME'])
    & GS_DF_STOPS_FULL['stop_suburb'].ne(GS_DF_STOPS_FULL['STOP_SUBURB'])
]

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_lat,stop_lon,route_gtfs_id_gtfs,STOP_ID,STOP_NAME,STOP_SUBURB,LATITUDE,LONGITUDE,TICKETZONE,route_gtfs_id_shp,geometry
1218,11380,Navan Park/Opp 284 Coburns Rd,Melton,[-37.6716077893165],[144.570689039386],4-453,11380,Navan Park/Coburns Rd,Harkness,-37.671608,144.570689,2,4-453,"(144.5706949696913, -37.671594795876835)"
2645,13005,Tucker Rd/Mawby Rd,Bentleigh,[-37.9288844259794],[145.051446659906],4-701,13005,Corolla Ave/Mawby Rd,Bentleigh East,-37.928884,145.051447,2,4-701,"(145.05145288116285, -37.9288708135009)"
2893,13273,Tucker Rd/Mawby Rd,Bentleigh,[-37.9289745038198],[145.05144428165],4-701,13273,Corolla Ave/Mawby Rd,Bentleigh East,-37.928975,145.051444,2,4-701,"(145.05145000127632, -37.928961893472824)"
10861,23370,Cana Catholic Primary School/Banchory Ave,Hillside,[-37.6886506993103],[144.746362690131],4-460,23370,Cana Catholic PS/Banchory Ave,Hillside (3037),-37.688651,144.746363,2,4-460,"(144.74636917731445, -37.68863783016752)"
10862,23371,Cana Catholic Primary School/Banchory Ave,Hillside,[-37.6887205834372],[144.746247175734],4-460,23371,Cana Catholic PS/Banchory Ave,Hillside (3037),-37.688721,144.746247,2,4-460,"(144.7462528974455, -37.68870785016954)"
18992,44354,Silverbush Way/Sayers Rd,Williams Landing,[-37.8526725083357],[144.730136572865],"4-150,4-151",44354,Forsyth Rd/Sayers Rd,Truganina,-37.852673,144.730137,2,"4-151,4-150","(144.73014288010182, -37.85265986607712)"
20582,46758,Hillside Recreation Reserve/Royal Cres,Hillside,[-37.6855406909971],[144.739721256821],4-463,46758,Hillside Rec Reserve/Royal Cres,Hillside (3037),-37.685541,144.739721,2,4-463,"(144.739727179593, -37.68552779019742)"
24321,5540,Bulleen Terminus/Thompsons Rd,Bulleen,[-37.7698542902203],[145.10009803421],4-905,5540,Manningham Rd/Thompsons Rd,Templestowe Lower,-37.769854,145.100098,2,4-905,"(145.10010398045483, -37.76984080991194)"


In [485]:
# Inspect the matched row for GTFS stop id '5588'.
GS_DF_STOPS_FULL.loc[GS_DF_STOPS_FULL['stop_id'].eq('5588')]

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_lat,stop_lon,route_gtfs_id_gtfs,STOP_ID,STOP_NAME,STOP_SUBURB,LATITUDE,LONGITUDE,TICKETZONE,route_gtfs_id_shp,geometry
24373,5588,Rosemary St/High St,Templestowe Lower,[-37.7604947207492],[145.118874510881],"4-309,4-281",5588,MacRobertson St/High St,Templestowe Lower,-37.760495,145.118875,2,"4-309,4-281","(145.11888087300423, -37.76048189116718)"


In [430]:

# Stops found in only one of the two sources (either id is missing after the outer join).
unmatched_mask = GS_DF_STOPS['stop_id'].isna() | GS_DF_STOPS['STOP_ID'].isna()
GS_DF_STOPS_MISSING = GS_DF_STOPS[unmatched_mask].copy()

# "union" columns: take the GTFS value when the SHP id is missing, otherwise the SHP value.
for union_col, gtfs_col, shp_col in (
    ('union_stop_name', 'stop_name', 'STOP_NAME'),
    ('union_stop_suburb', 'stop_suburb', 'STOP_SUBURB'),
):
    GS_DF_STOPS_MISSING[union_col] = GS_DF_STOPS_MISSING.apply(
        lambda row, gtfs_col=gtfs_col, shp_col=shp_col: row[gtfs_col] if pd.isna(row['STOP_ID']) else row[shp_col],
        axis=1,
    )

# Sorting by the union columns puts same-named stops from both sources next to each other.
GS_DF_STOPS_MISSING = GS_DF_STOPS_MISSING.sort_values(['union_stop_name', 'union_stop_suburb'])
# Wrap in " "
GS_DF_STOPS_MISSING.to_csv('../local/gs-stop-missing.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

In [295]:
# Hand-entered stop ids (kept as strings), one list per source.
# NOTE(review): presumably GTFS-only and SHP-only ids set aside for manual cross-checking — confirm;
# neither list is referenced in the nearby cells.
gtfs_custom_stopid_check = ["49276","49278","49275","49279","49280","28205","22251","28341","28342","28343","28344","28345","28346","28347","28348","27745","27746","27747","48879","22694","28165","28166","28167","17245","48471","48474","49602","49605","49468","28537","28538","28539","28540","28541","28542","28543"]
shp_custom_stopid_check = ["18074","18558","18082","18561","18562","40072","48382","27882","2507","3348","41652","18859","18860","18861","44707","19806","19808","37491","37475","8637","27985","27986","50255","50256","50257","50258","50259","50260"]

In [296]:
# Find this stop name on either side of the join (SHP name or GTFS name).
GS_DF_STOPS.loc[
    GS_DF_STOPS['STOP_NAME'].eq('33-South Daly St/Dawson St')
    | GS_DF_STOPS['stop_name'].eq('33-South Daly St/Dawson St')
]

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_lat,stop_lon,STOP_ID,STOP_NAME,STOP_SUBURB,LATITUDE,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,geometry
6692,,,,,,18074.0,33-South Daly St/Dawson St,Brunswick West,-37.769399,144.944169,1.0,58.0,"(144.94417509159285, -37.76938584520886)"
22221,49276.0,33-South Daly St/Dawson St,Brunswick West,[-37.7695279063077],[144.945357459904],,,,,,,,


In [302]:
# Attach the GTFS stop attributes to every stop served by route '3-58' ...
mid = GTFS_DF_ROUTESTOPS[GTFS_DF_ROUTESTOPS['route_gtfs_id'] == '3-58'].merge(
    GTFS_DF_STOPS,
    on='stop_id',
    how='left',
)
# ... and keep the rows where any of the stop's names contains the name under investigation.
mid[mid['stop_name'].apply(lambda names: any('33-South Daly St/Dawson St' in name for name in names))]

Unnamed: 0,stop_id,stop_sequence,mode_id,direction_id,route_gtfs_id,route_short_name,route_long_name,stop_name,stop_suburb,stop_full_name,stop_lat,stop_lon
112,49276,12,3,1,3-58,58,Toorak - West Coburg,[33-South Daly St/Dawson St],Brunswick West,[33-South Daly St/Dawson St (Brunswick West)],[-37.7695279063077],[144.945357459904]


In [None]:
# Display the whole SHP_GDFS collection (indexed by layer name elsewhere in this notebook).
SHP_GDFS

In [7]:
# Load the saved routes table; dtype=str keeps every column (ids included) as text.
GSA_DF_ROUTES = pd.read_csv('../local/gsa-routes.csv', dtype=str)

In [None]:

# For every API route id: the array of associated route_gs_id values and the count of distinct non-null ones.
gs_ids_per_route = GSA_DF_ROUTES.groupby('route_id')['route_gs_id']
df_api2gs_nunique = gs_ids_per_route.nunique().rename('route_gs_nunique').reset_index()
DF_API2GS = (
    gs_ids_per_route.unique().rename('route_gs_id').reset_index()
    .merge(df_api2gs_nunique, on='route_id')
)
DF_API2GS['route_gs_len'] = DF_API2GS['route_gs_id'].apply(len)
DF_API2GS['route_gs_nunique'] = DF_API2GS['route_gs_nunique'].apply(int)
# unique() keeps NaN while nunique() ignores it, so a length mismatch flags routes with a missing route_gs_id.
DF_API2GS['gs_na'] = DF_API2GS['route_gs_len'].ne(DF_API2GS['route_gs_nunique'])
DF_API2GS = DF_API2GS.merge(API_DF_ROUTES, on='route_id')

# Routes with no route_gs_id at all.
DF_API2GS.loc[DF_API2GS['gs_na'] & DF_API2GS['route_gs_nunique'].eq(0), 'route_gtfs_id0'].unique() # array(['4', '1', '5'], dtype=object)
# Routes mapped to two or more route_gs_id values.
DF_API2GS.loc[DF_API2GS['route_gs_nunique'].ge(2), 'route_gtfs_id0'].unique() # array(['1', '5'], dtype=object)

In [9]:
# One directions endpoint per distinct route id known to the API.
api_directions_endpoints = API_DF_ROUTES.apply(lambda x: f'/v3/directions/route/{x["route_id"]}', axis=1).unique()

# Retry policy: HTTP errors are retried after a pause, but only a bounded number of
# times, so a permanent failure surfaces as an exception instead of looping forever.
DIRECTIONS_MAX_ATTEMPTS = 10
DIRECTIONS_RETRY_DELAY_S = 30

api_all_directions = {}
for endpoint in api_directions_endpoints:
    # The route id is the fifth '/'-separated piece of '/v3/directions/route/<id>'.
    route_id = int(endpoint.split('/')[4])
    for attempt in range(1, DIRECTIONS_MAX_ATTEMPTS + 1):
        try:
            directions = get_data(endpoint)
            break
        except requests.exceptions.HTTPError:
            if attempt == DIRECTIONS_MAX_ATTEMPTS:
                # Out of attempts: let the last error propagate with its traceback.
                raise
            time.sleep(DIRECTIONS_RETRY_DELAY_S)
    api_all_directions[route_id] = directions
# Re-key by string route id and keep only the list under 'directions' of each response.
api_all_directions = { str(k): v['directions'] for k, v in api_all_directions.items() }
# 2m - 3m

In [10]:
# Every direction record must carry the route id it is filed under.
assert all(
    str(direction['route_id']) == str(route_key)
    for route_key, route_directions in api_all_directions.items()
    for direction in route_directions
)

# Flatten the per-route lists into one list of direction records.
API_DIRECTIONS_LIST = [
    direction
    for route_directions in api_all_directions.values()
    for direction in route_directions
]

API_DF_DIRECTIONS = pd.DataFrame(API_DIRECTIONS_LIST)[['route_id', 'route_type', 'direction_id', 'direction_name']]
API_DF_DIRECTIONS.to_csv('../local/ptv-api/all_directions.csv', index=False)
# No (route, route type) pair may have more than two directions.
assert API_DF_DIRECTIONS[['route_id', 'route_type']].value_counts().max() <= 2

# Reload from disk so later cells work from the saved file.
API_DF_DIRECTIONS = pd.read_csv('../local/ptv-api/all_directions.csv')

# (route_id, route_type, direction_id) triples as strings.
API_route_rtds = [
    (str(route), str(rtype), str(direction))
    for route, rtype, direction in API_DF_DIRECTIONS[['route_id', 'route_type', 'direction_id']].values
]

In [12]:
# Retry policy: HTTP errors are retried after a pause, but only a bounded number of
# times, so a permanent failure surfaces as an exception instead of looping forever.
STOPS_MAX_ATTEMPTS = 10
STOPS_RETRY_DELAY_S = 30

# route_id -> direction_id -> raw response of the stops-on-route endpoint (geopath included).
API_STOPS_dict = {}
for route_id, route_type, direction_id in API_route_rtds:
    endpoint = f'/v3/stops/route/{route_id}/route_type/{route_type}?direction_id={direction_id}&include_geopath=true'
    for attempt in range(1, STOPS_MAX_ATTEMPTS + 1):
        try:
            stops = get_data(endpoint)
            break
        except requests.exceptions.HTTPError:
            if attempt == STOPS_MAX_ATTEMPTS:
                # Out of attempts: let the last error propagate with its traceback.
                raise
            time.sleep(STOPS_RETRY_DELAY_S)
    API_STOPS_dict.setdefault(route_id, {})[direction_id] = stops
# 3m - 5m

# Force string keys on both levels before serialising.
API_STOPS_dict = { str(k): { str(k2): v2 for k2, v2 in v.items()} for k, v in API_STOPS_dict.items() }
with open('../local/ptv-api/all_stops_by_direction.json', 'w') as f:
    f.write(json.dumps(API_STOPS_dict))

In [None]:
# Flatten the per-route/direction responses into two record lists: stops and geopaths.
# Records are shallow-copied before being extended, so API_STOPS_dict is left untouched
# and this cell can be re-run without tripping the "already exists" assertion.
API_STOPS_STOPS = []
API_STOPS_GEOPATHS = []
for route_id, route_type, direction_id in API_route_rtds:
    route_direction = API_STOPS_dict[route_id][direction_id]
    # Identifying fields that are added to a record only when it does not already carry them.
    context = {'route_id': route_id, 'route_type': route_type, 'direction_id': direction_id}

    for raw_stop in route_direction['stops']:
        ticket = raw_stop.get('stop_ticket')
        if ticket is None:
            # Stops with no ticket information (key absent or None) are skipped entirely.
            continue
        stop = dict(raw_stop)
        # Promote each ticket field to a top-level 'stop_<field>' key.
        for ticket_key, ticket_value in ticket.items():
            flat_key = f'stop_{ticket_key}'
            assert flat_key not in stop, f'Key {flat_key} already exists in stop'
            stop[flat_key] = ticket_value
        for key, value in context.items():
            stop.setdefault(key, value)
        API_STOPS_STOPS.append(stop)

    for raw_path in route_direction['geopath']:
        path = dict(raw_path)
        for key, value in context.items():
            path.setdefault(key, value)
        API_STOPS_GEOPATHS.append(path)

with open('../local/ptv-api/all_stops_stops.json', 'w') as f:
    f.write(json.dumps(API_STOPS_STOPS))
with open('../local/ptv-api/all_stops_geopaths.json', 'w') as f:
    f.write(json.dumps(API_STOPS_GEOPATHS))


In [41]:

# Reload the flattened records from disk; `with` closes the file handles deterministically.
with open('../local/ptv-api/all_stops_stops.json') as f:
    API_STOPS_STOPS = json.load(f)
with open('../local/ptv-api/all_stops_geopaths.json') as f:
    API_STOPS_GEOPATHS = json.load(f)

API_DF_STOPS = pd.DataFrame(API_STOPS_STOPS)
# Ticket zones arrive as lists; store them as a ', '-joined string and leave non-list values as they are.
API_DF_STOPS['stop_zones'] = API_DF_STOPS['stop_ticket_zones'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
# A stop is flagged regional when its zone label mentions 'Regional'; non-string labels count as not regional.
API_DF_STOPS['stop_is_regional'] = API_DF_STOPS['stop_zone'].apply(lambda x: isinstance(x, str) and 'Regional' in x)
# Selecting the final columns also discards the raw 'stop_ticket', 'stop_ticket_zones',
# 'stop_zone' and 'disruption_ids' columns.
API_DF_STOPS = API_DF_STOPS[['stop_id', 'stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude', 'stop_sequence', 'route_id', 'direction_id',  'route_type',  'stop_landmark', 'stop_zones', 'stop_ticket_type', 'stop_is_free_fare_zone', 'stop_is_regional', 'stop_ticket_machine', 'stop_ticket_checks', 'stop_vline_reservation']]

# Round-trip through CSV so later cells see exactly what is stored on disk.
API_DF_STOPS.to_csv('../local/ptv-api/all_stops_stops.csv', index=False)
API_DF_STOPS = pd.read_csv('../local/ptv-api/all_stops_stops.csv')

In [47]:
# Distinct combinations of stop id, name, suburb and position, renumbered from 0.
API_DF_STOPS_MIN = (
    API_DF_STOPS
    .loc[:, ['stop_id', 'stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [54]:
# Largest number of distinct values any single stop id has, per attribute.
(
    API_DF_STOPS_MIN
    .groupby('stop_id')
    .agg({
        column: 'nunique'
        for column in ('stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude')
    })
    .max()
)

stop_name         2
stop_suburb       1
stop_latitude     1
stop_longitude    1
dtype: int64

In [55]:
# One row per stop id; each attribute holds the array of its distinct values.
API_DF_STOPS_MIN_GROUP = API_DF_STOPS_MIN.groupby('stop_id').agg({
    column: 'unique'
    for column in ('stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude')
})
# 7s - 9s
# Suburb and position must be single-valued per stop; the name is allowed to vary.
for single_valued in ('stop_suburb', 'stop_latitude', 'stop_longitude'):
    assert API_DF_STOPS_MIN_GROUP[single_valued].apply(len).eq(1).all()

In [62]:
# Stops that appear under more than one name.
API_DF_STOPS_MIN_GROUP.loc[API_DF_STOPS_MIN_GROUP['stop_name'].apply(len).gt(1)]

Unnamed: 0_level_0,stop_name,stop_suburb,stop_latitude,stop_longitude
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1021,"[Berwick Station, Berwick Railway Station]",[Berwick],[-38.04041],[145.345718]
1028,"[Broadmeadows Station, Broadmeadows Railway St...",[Broadmeadows],[-37.6830521],[144.919617]
1036,"[Caulfield Station, Caulfield Railway Station]",[Caulfield East],[-37.8774567],[145.042526]
1040,"[Clayton Station, Clayton Railway Station]",[Clayton],[-37.9246826],[145.120529]
1044,"[Craigieburn Station, Craigieburn Railway Stat...",[Craigieburn],[-37.6019249],[144.943314]
1049,"[Dandenong Station, Dandenong Railway Station]",[Dandenong],[-37.9899673],[145.209732]
1064,"[Essendon Station, Essendon Railway Station]",[Essendon],[-37.75601],[144.9162]
1071,"[Flinders Street Station, Flinders Street Rail...",[Melbourne City],[-37.81831],[144.966965]
1072,"[Footscray Station, Footscray Railway Station]",[Footscray],[-37.8010864],[144.9032]
1144,"[North Melbourne Station, North Melbourne Rail...",[West Melbourne],[-37.807415],[144.942566]


In [70]:
# Rows that use the 'Railway Station' spelling for Clayton.
API_DF_STOPS.loc[API_DF_STOPS['stop_name'].eq('Clayton Railway Station')]

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_latitude,stop_longitude,stop_sequence,route_id,direction_id,route_type,stop_landmark,stop_zones,stop_ticket_type,stop_is_free_fare_zone,stop_is_regional,stop_ticket_machine,stop_ticket_checks,stop_vline_reservation
60197,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,41,1721,0,3,,2,,False,True,False,False,False
60235,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,0,1721,23,3,,2,,False,True,False,False,False
61822,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,25,1823,0,3,,2,,False,True,False,False,False
61852,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,6,1823,11,3,,2,,False,True,False,False,False
61883,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,18,1824,0,3,,2,,False,True,False,False,False
61911,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,6,1824,39,3,,2,,False,True,False,False,False
62430,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,22,5838,0,3,,2,,False,True,False,False,False
62455,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,0,5838,43,3,,2,,False,True,False,False,False


In [535]:
# Export the routes whose mode_id or route_gtfs_id0 is '1' or '5'.
GSA_DF_ROUTES[
    GSA_DF_ROUTES['mode_id'].isin(['1', '5'])
    | GSA_DF_ROUTES['route_gtfs_id0'].isin(['1', '5'])
].to_csv('../local/gsa-vline-trains.csv', index=False)

In [537]:
# Same selection as the CSV export: mode_id or route_gtfs_id0 equal to '1' or '5'.
GSA_VLINE_ROUTES = GSA_DF_ROUTES[
    GSA_DF_ROUTES['mode_id'].isin(['1', '5'])
    | GSA_DF_ROUTES['route_gtfs_id0'].isin(['1', '5'])
]
# Distinct API route ids among them; rows without an API id are ignored.
vline_api_ids = GSA_VLINE_ROUTES['route_id'].dropna().unique()
len(vline_api_ids)

In [546]:
# Keep the stop rows whose route id, compared as a string, is one of the selected API route ids.
API_DF_VLINE_STOPS = API_DF_STOPS[API_DF_STOPS['route_id'].map(str).isin(vline_api_ids)]
API_DF_VLINE_STOPS.to_csv('../local/api-vline-stops.csv', index=False)

In [550]:
# One row per route with the arrays of distinct stop ids and stop names it serves.
API_DF_VLINES = (
    API_DF_VLINE_STOPS
    .groupby('route_id')
    .agg({column: 'unique' for column in ('stop_id', 'stop_name')})
    .reset_index()
)

In [554]:
# Routes on which every stop name contains "Railway Station", expanded to one row per stop.
API_DF_VLINES.loc[
    API_DF_VLINES['stop_name'].apply(lambda names: all("Railway Station" in name for name in names))
].explode(['stop_id', 'stop_name'])

Unnamed: 0,route_id,stop_id,stop_name
13,1727,1500,Albury Railway Station
13,1727,1508,Benalla Railway Station
13,1727,1555,Shepparton Railway Station
13,1727,1567,Wangaratta Railway Station
57,7601,1517,Colac Railway Station
57,7601,1527,Geelong Railway Station
57,7601,1570,Winchelsea Railway Station


In [571]:
# Distinct (stop id, route type) pairs among the selected stops.
API_DF_VLINE_STOPS.loc[:, ['stop_id', 'route_type']].drop_duplicates()

Unnamed: 0,stop_id,route_type
59820,1501,3
59821,4125,3
59822,4130,3
59823,4539,3
59824,4156,3
...,...,...
62323,4609,3
62375,28302,3
62387,1591,3
62401,1594,3


In [556]:
# Look up the Swagger description of the GET operation on the stop-details path.
API_DOCS['paths']['/v3/stops/{stop_id}/route_type/{route_type}']['get']

{'tags': ['Stops'],
 'summary': 'View facilities at a specific stop (Metro and V/Line stations only)',
 'operationId': 'Stops_StopDetails',
 'consumes': [],
 'produces': ['application/json', 'text/json', 'text/html'],
 'parameters': [{'name': 'stop_id',
   'in': 'path',
   'description': 'Identifier of stop; values returned by Stops API',
   'required': True,
   'type': 'integer',
   'format': 'int32'},
  {'name': 'route_type',
   'in': 'path',
   'description': 'Number identifying transport mode; values returned via RouteTypes API',
   'required': True,
   'type': 'integer',
   'format': 'int32',
   'enum': [0, 1, 2, 3, 4]},
  {'name': 'stop_location',
   'in': 'query',
   'description': 'Indicates if stop location information will be returned (default = false)',
   'required': False,
   'type': 'boolean'},
  {'name': 'stop_amenities',
   'in': 'query',
   'description': 'Indicates if stop amenity information will be returned (default = false)',
   'required': False,
   'type': 'boole

In [561]:
# Fetch the detail record for id 1071 on route_type 0 with gtfs=false, requesting location,
# amenities, accessibility, contact, ticket and staffing data but no disruptions.
get_data('/v3/stops/1071/route_type/0?gtfs=false&stop_location=true&stop_amenities=true&stop_accessibility=true&stop_contact=true&stop_ticket=true&stop_staffing=true&stop_disruptions=false')

{'stop': {'point_id': 19854,
  'operating_hours': 'Monday to Sunday, from first train to last train',
  'mode_id': 2,
  'station_details_id': 0,
  'flexible_stop_opening_hours': '',
  'stop_contact': {'lost_property_contact_number': None,
   'phone': '(03) 9610 3711',
   'lost_property': '(03) 9610 7512',
   'feedback': '1800 800 007'},
  'stop_ticket': {'ticket_type': '',
   'zone': 'Zone 1',
   'is_free_fare_zone': False,
   'ticket_machine': True,
   'ticket_checks': True,
   'vline_reservation': False,
   'ticket_zones': [1]},
  'disruption_ids': [],
  'station_type': 'Premium Station',
  'station_description': 'The customer service centre is staffed from first to last train, 7 days a week. Protective Services Officers are generally present from 6pm to last train Sunday to Thursday and overnight on Fridays and Saturdays.',
  'route_type': 0,
  'stop_location': {'postcode': 3000,
   'municipality': 'Melbourne',
   'municipality_id': 30,
   'primary_stop_name': 'Swanston',
   'road_t

In [564]:
# Same request with gtfs=true and 19854 as the id (the point_id shown in the recorded output of the
# gtfs=false call for 1071); the response is kept in `k` for inspection.
k = get_data('/v3/stops/19854/route_type/0?gtfs=true&stop_location=true&stop_amenities=true&stop_accessibility=true&stop_contact=true&stop_ticket=true&stop_staffing=true&stop_disruptions=false')

In [567]:
# Show the point_id and stop_id carried by the response stored in `k`.
k['stop']['point_id'], k['stop']['stop_id']

(19854, 1071)

In [577]:
# Fetch the detail record for id 11569 on route_type 2 with gtfs=true and all optional sections except disruptions.
get_data('/v3/stops/11569/route_type/2?gtfs=true&stop_location=true&stop_amenities=true&stop_accessibility=true&stop_contact=true&stop_ticket=true&stop_staffing=true&stop_disruptions=false')

{'stop': {'point_id': 11569,
  'operating_hours': 'N',
  'mode_id': 1,
  'station_details_id': 0,
  'flexible_stop_opening_hours': '',
  'stop_contact': {'lost_property_contact_number': None,
   'phone': '',
   'lost_property': '',
   'feedback': ''},
  'stop_ticket': {'ticket_type': '',
   'zone': 'Zone 2',
   'is_free_fare_zone': False,
   'ticket_machine': False,
   'ticket_checks': False,
   'vline_reservation': False,
   'ticket_zones': [2]},
  'disruption_ids': [],
  'station_type': None,
  'station_description': None,
  'route_type': 2,
  'stop_location': {'postcode': 3178,
   'municipality': 'Knox',
   'municipality_id': 24,
   'primary_stop_name': 'Woodside',
   'road_type_primary': 'Dr',
   'second_stop_name': 'Wentworth',
   'road_type_second': 'Ave',
   'bay_nbr': 0,
   'suburb': 'Rowville',
   'gps': {'latitude': -37.9213066, 'longitude': 145.263382}},
  'stop_amenities': {'seat_type': '',
   'pay_phone': False,
   'indoor_waiting_area': False,
   'sheltered_waiting_area':

In [578]:
# Same request for id 16495 on route_type 2.
# NOTE(review): the recorded output starts with point_id 11569, like the request for id 11569 — confirm whether both ids refer to the same stop.
get_data('/v3/stops/16495/route_type/2?gtfs=true&stop_location=true&stop_amenities=true&stop_accessibility=true&stop_contact=true&stop_ticket=true&stop_staffing=true&stop_disruptions=false')

{'stop': {'point_id': 11569,
  'operating_hours': 'N',
  'mode_id': 1,
  'station_details_id': 0,
  'flexible_stop_opening_hours': '',
  'stop_contact': {'lost_property_contact_number': None,
   'phone': '',
   'lost_property': '',
   'feedback': ''},
  'stop_ticket': {'ticket_type': '',
   'zone': 'Zone 2',
   'is_free_fare_zone': False,
   'ticket_machine': False,
   'ticket_checks': False,
   'vline_reservation': False,
   'ticket_zones': [2]},
  'disruption_ids': [],
  'station_type': None,
  'station_description': None,
  'route_type': 2,
  'stop_location': {'postcode': 3178,
   'municipality': 'Knox',
   'municipality_id': 24,
   'primary_stop_name': 'Woodside',
   'road_type_primary': 'Dr',
   'second_stop_name': 'Wentworth',
   'road_type_second': 'Ave',
   'bay_nbr': 0,
   'suburb': 'Rowville',
   'gps': {'latitude': -37.9213066, 'longitude': 145.263382}},
  'stop_amenities': {'seat_type': '',
   'pay_phone': False,
   'indoor_waiting_area': False,
   'sheltered_waiting_area':