In [2]:
from hashlib import sha1
import hmac
import requests
import pandas as pd
import geopandas as gpd
import json
import os
import numpy as np
import logging
import sys
import time
import pyptvgtfs

In [3]:
# Directory (relative to the notebook's working directory) holding the JSON/CSV
# artefacts that later cells read and write.
DATA_DIR = '../local/ptv-api/data'

In [11]:
# Load the GTFS archive through pyptvgtfs; each row of the result is addressed by
# (mode_id, table_name) and carries its table in the 'df' column.
GTFS = pyptvgtfs.process_gtfs_zip('../downloads/20240315_193134/gtfs.zip', '20240315_193134')
# 1m - 3m
GTFS.drop(columns=['version_id'], inplace=True)
GTFS_DFS = GTFS.set_index(['mode_id', 'table_name'])['df'].to_dict()

# Regroup the flat {(mode_id, table_name): df} mapping into {mode_id: {table_name: df}}.
new_GTFS_DFS = {}
for (mode_key, table_key), table_df in GTFS_DFS.items():
    new_GTFS_DFS.setdefault(mode_key, {})[table_key] = table_df
GTFS_DFS : dict[str, dict[str, pd.DataFrame]] = new_GTFS_DFS

# Stamp every table with the mode it came from so tables can be stacked across modes.
for mode_key, tables in GTFS_DFS.items():
    for table_df in tables.values():
        table_df['mode_id'] = mode_key
GTFS_DF_STOPS = pd.concat([tables['stops'] for tables in GTFS_DFS.values()], ignore_index=True)

In [9]:
# The PTV Timetable API credentials are read from a local JSON file rather than
# being written into the notebook. A context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('../local-env.json') as env_file:
    ENV = json.load(env_file)

def get_ptv_api_url(
        endpoint : str,
        dev_id : str | int, 
        api_key : str | int,
    ):
    """
    Returns the URL to use PTV TimeTable API.

    Generates a signature from dev id (user id), API key, and endpoint.

    See the following for more information:
    - Home page: https://www.ptv.vic.gov.au/footer/data-and-reporting/datasets/ptv-timetable-api/
    - Swagger UI: https://timetableapi.ptv.vic.gov.au/swagger/ui/index
    - Swagger Docs JSON: https://timetableapi.ptv.vic.gov.au/swagger/docs/v3 (You can use this to find the endpoints you want to use.)
    """
    assert endpoint.startswith('/'), f'Endpoint must start with /, got {endpoint}'
    raw = f'{endpoint}{'&' if '?' in endpoint else '?'}devid={dev_id}'
    hashed = hmac.new(api_key.encode('utf-8'), raw.encode('utf-8'), sha1)  # Encode the raw string to bytes
    signature = hashed.hexdigest()
    return f'https://timetableapi.ptv.vic.gov.au{raw}&signature={signature}'


class PTVAPIClient:
    def __init__(self, dev_id : str | int, api_key : str | int):
        self.dev_id = dev_id
        self.api_key = api_key
        self.session = requests.Session()

    def get_data(self, endpoint : str, need_auth : bool = True):
        """
        Returns the data from the URL.
        """
        if need_auth:
            url = get_ptv_api_url(endpoint, self.dev_id, self.api_key)
        else:
            url = f'https://timetableapi.ptv.vic.gov.au{endpoint}'
        response = self.session.get(url)
        response.raise_for_status()
        return response.json()
    

class PTVAPI3(PTVAPIClient):
    """Named accessors for the parameter-free v3 endpoints used in this notebook."""

    def __init__(self, dev_id : str | int, api_key : str | int):
        super().__init__(dev_id, api_key)

    def _member(self, endpoint : str, key : str):
        """Fetch ``endpoint`` (signed) and return the ``key`` member of its JSON body."""
        body = self.get_data(endpoint)
        return body[key]

    def get_docs(self) -> dict:
        """Return the Swagger description; fetched without a signature."""
        return self.get_data('/swagger/docs/v3', need_auth=False)

    def get_routes(self) -> dict:
        """Return the 'routes' member of the /v3/routes response."""
        return self._member('/v3/routes', 'routes')

    def get_route_types(self) -> dict:
        """Return the 'route_types' member of the /v3/route_types response."""
        return self._member('/v3/route_types', 'route_types')

    def get_disruptions(self) -> dict:
        """Return the 'disruptions' member of the /v3/disruptions response."""
        return self._member('/v3/disruptions', 'disruptions')

    def get_disruption_modes(self) -> dict:
        """Return the 'disruption_modes' member of the /v3/disruptions/modes response."""
        return self._member('/v3/disruptions/modes', 'disruption_modes')

    def get_outlets(self) -> dict:
        """Return the 'outlets' member of the /v3/outlets response."""
        return self._member('/v3/outlets', 'outlets')


# Shared API client for the rest of the notebook; the developer id and API key
# are taken from the ENV mapping loaded above.
CLIENT = PTVAPI3(ENV['PTV_TIMETABLE_DEV_ID'], ENV['PTV_TIMETABLE_API_KEY'])

In [10]:
# Fetch the Swagger description and the static collections through the client's
# named helpers, so each endpoint/key pair is spelled out in one place only.
API_DOCS = CLIENT.get_docs()

# Endpoints whose path has no '{...}' parameter.
STATIC_API_ENDPOINTS = [k for k in API_DOCS['paths'].keys() if '{' not in k]

# Iterated below as a sequence of route records, hence a list rather than a dict.
API_ROUTES : list[dict] = CLIENT.get_routes()

# NOTE(review): the `dict` annotations below are carried over unchanged; the payload
# shapes are not visible from this notebook — confirm against the Swagger docs.
API_ROUTE_TYPES : dict = CLIENT.get_route_types()

API_DISRUPTIONS : dict = CLIENT.get_disruptions()

API_DISRUPTION_MODES : dict = CLIENT.get_disruption_modes()

API_OUTLETS : dict = CLIENT.get_outlets()

# Create dataframes from the data

API_DF_ROUTE_TYPES = pd.DataFrame(API_ROUTE_TYPES)

API_DF_DISRUPTION_MODES = pd.DataFrame(API_DISRUPTION_MODES)

API_DF_OUTLETS = pd.DataFrame(API_OUTLETS)

# Some outlet records are faulty: their latitude is > 0, which is not possible in
# Victoria (southern hemisphere). Flip the sign of those values.
API_DF_OUTLETS['outlet_latitude'] = API_DF_OUTLETS['outlet_latitude'].apply(lambda x: -x if x > 0 else x)

# Hoist the fields nested under 'route_service_status' onto the route record itself,
# refusing to overwrite an existing key, then drop the nested mapping.
for route in API_ROUTES:
    for k, v in route['route_service_status'].items():
        assert k not in route, f'Key {k} already exists in route'
        route[k] = v
    del route['route_service_status']


API_DF_ROUTES = pd.DataFrame(API_ROUTES, dtype=str)


In [484]:
# One-off migration of stops_info.json: every {stop_id: {route_type: payload}} entry is
# replaced by payload['stop'] and the file is rewritten in place.
with open(f'{DATA_DIR}/stops_info.json') as f:
    API_STOPS_INFO = json.load(f)
with open(f'{DATA_DIR}/failed_stops.json') as f:
    FAILED_STOPS = json.load(f)
with open(f'{DATA_DIR}/missing_stops_real.json') as f:
    GA_MISSING_STOPS = json.load(f)
with open(f'{DATA_DIR}/failed_stops_missing.json') as f:
    FAILED_STOPS_MISSING = json.load(f)

for stop_id in API_STOPS_INFO:
    for route_type in API_STOPS_INFO[stop_id]:
        payload = API_STOPS_INFO[stop_id][route_type]
        # Unwrap only payloads that still carry the 'stop' envelope. This makes the
        # cell safe to re-run: the original raised KeyError on the second run because
        # the file had already been rewritten without the envelope.
        if 'stop' in payload:
            API_STOPS_INFO[stop_id][route_type] = payload['stop']

with open(f'{DATA_DIR}/stops_info.json', 'w') as f:
    json.dump(API_STOPS_INFO, f)

In [491]:
# Inputs are read through context managers so every file handle is closed.
with open(f'{DATA_DIR}/stops_info.json') as f:
    API_STOPS_INFO = json.load(f)
with open(f'{DATA_DIR}/failed_stops.json') as f:
    FAILED_STOPS = json.load(f)
with open(f'{DATA_DIR}/missing_stops_real.json') as f:
    GA_MISSING_STOPS = json.load(f)
with open(f'{DATA_DIR}/failed_stops_missing.json') as f:
    FAILED_STOPS_MISSING = json.load(f)

# {stop_id: {route_type: stop record}}, each record gaining a 'gtfs_stop_id' list.
API_STOPS_DATA = {}

# Pass 1: records from stops_info.json. A record's own point_id seeds its
# 'gtfs_stop_id' list; records without a point_id are skipped.
# (Keys loaded from JSON are already strings, so no str() conversion is needed.)
for stop_id, stops_by_route_type in API_STOPS_INFO.items():
    for route_type, stop in stops_by_route_type.items():
        assert str(stop['route_type']) == str(route_type)
        assert str(stop['stop_id']) == str(stop_id)
        assert 'gtfs_stop_id' not in stop
        if 'point_id' not in stop:
            continue
        stop['gtfs_stop_id'] = [stop['point_id']]
        API_STOPS_DATA.setdefault(stop_id, {})[route_type] = stop

# Pass 2: records keyed by GTFS stop id. If the (stop_id, route_type) pair is already
# known, only the GTFS id is appended; otherwise the record is added with that id.
for gtfs_stop_id, stops_by_route_type in GA_MISSING_STOPS.items():
    for route_type, stop in stops_by_route_type.items():
        assert str(stop['route_type']) == str(route_type), (gtfs_stop_id, stop['route_type'], route_type)
        assert 'gtfs_stop_id' not in stop
        stop_id = str(stop['stop_id'])
        known_for_stop = API_STOPS_DATA.get(stop_id, {})
        if route_type in known_for_stop:
            known_for_stop[route_type]['gtfs_stop_id'].append(gtfs_stop_id)
        else:
            stop['gtfs_stop_id'] = [gtfs_stop_id]
            API_STOPS_DATA.setdefault(stop_id, {})[route_type] = stop

# Pass 3: flatten to a list, turning top-level integers (but not booleans) into
# strings. Only existing keys are reassigned, so mutating while iterating is safe.
API_STOPS_DATA_LIST = []
for stops_by_route_type in API_STOPS_DATA.values():
    for stop_info in stops_by_route_type.values():
        for k, v in stop_info.items():
            if isinstance(v, int) and not isinstance(v, bool):
                stop_info[k] = str(v)
        API_STOPS_DATA_LIST.append(stop_info)

with open(f'{DATA_DIR}/stops_data.json', 'w') as f:
    json.dump(API_STOPS_DATA, f)

with open(f'{DATA_DIR}/stops_data_list.json', 'w') as f:
    json.dump(API_STOPS_DATA_LIST, f)

# 45s - 1m

In [490]:
# Preview the merged stop records (one per stop_id / route_type pair) as a table.
pd.DataFrame(API_STOPS_DATA_LIST)

Unnamed: 0,point_id,operating_hours,mode_id,station_details_id,flexible_stop_opening_hours,stop_contact,stop_ticket,disruption_ids,station_type,station_description,route_type,stop_location,stop_amenities,stop_accessibility,stop_staffing,routes,stop_id,stop_name,stop_landmark,gtfs_stop_id
0,19847,N,2,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Zone 1', 'is_free...",[],Unstaffed Station,This station is unstaffed. Protective Services...,0,"{'postcode': 3147, 'municipality': 'Boroondara...","{'seat_type': 'Seat', 'pay_phone': True, 'indo...","{'lighting': True, 'platform_number': None, 'a...","{'fri_am_from': '', 'fri_am_to': '', 'fri_pm_f...","[{'route_type': 0, 'route_id': 1, 'route_name'...",1002,Alamein,,[19847]
1,19848,"Monday to Sunday, from first train to last train",2,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Zone 1', 'is_free...",[],Premium Station,The customer service centre is staffed from fi...,0,"{'postcode': 3147, 'municipality': 'Boroondara...","{'seat_type': 'Seat', 'pay_phone': True, 'indo...","{'lighting': True, 'platform_number': None, 'a...","{'fri_am_from': '', 'fri_am_to': '', 'fri_pm_f...","[{'route_type': 0, 'route_id': 1, 'route_name'...",1010,Ashburton,,[19848]
2,19903,AM peak,2,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Zone 1', 'is_free...",[],Host Station,Customer Service staff are at the station duri...,0,"{'postcode': 3123, 'municipality': 'Boroondara...","{'seat_type': 'Seat', 'pay_phone': True, 'indo...","{'lighting': True, 'platform_number': None, 'a...","{'fri_am_from': '0630', 'fri_am_to': '1030', '...","[{'route_type': 0, 'route_id': 1, 'route_name'...",1012,Auburn,,[19903]
3,19906,"Monday to Sunday, from first train to last train",2,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Zone 1', 'is_free...",[],Premium Station,The customer service centre is staffed from fi...,0,"{'postcode': 3121, 'municipality': 'Yarra', 'm...","{'seat_type': 'Seat', 'pay_phone': True, 'indo...","{'lighting': True, 'platform_number': None, 'a...","{'fri_am_from': '', 'fri_am_to': '', 'fri_pm_f...","[{'route_type': 0, 'route_id': 1, 'route_name'...",1030,Burnley,,[19906]
4,19849,N,2,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Zone 1', 'is_free...",[],Unstaffed Station,This station is unstaffed. Protective Services...,0,"{'postcode': 3146, 'municipality': 'Boroondara...","{'seat_type': 'Seat', 'pay_phone': True, 'indo...","{'lighting': True, 'platform_number': None, 'a...","{'fri_am_from': '', 'fri_am_to': '', 'fri_pm_f...","[{'route_type': 0, 'route_id': 1, 'route_name'...",1031,Burwood,,[19849]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20311,46434,,6,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Regional', 'is_fr...",[],,,2,"{'postcode': 3875, 'municipality': 'East Gipps...","{'seat_type': '', 'pay_phone': False, 'indoor_...","{'lighting': False, 'platform_number': None, '...","{'fri_am_from': '', 'fri_am_to': '', 'fri_pm_f...","[{'route_type': 2, 'route_id': 1495, 'route_na...",30925,150 Princes Hwy,,[46434]
20312,46441,,6,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Regional', 'is_fr...",[],,,2,"{'postcode': 3903, 'municipality': 'East Gipps...","{'seat_type': '', 'pay_phone': False, 'indoor_...","{'lighting': False, 'platform_number': None, '...","{'fri_am_from': '', 'fri_am_to': '', 'fri_pm_f...","[{'route_type': 2, 'route_id': 1495, 'route_na...",30926,Forest Tech/Princes Hwy,Forest Tech,"[46441, 46449]"
20313,46444,,6,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Regional', 'is_fr...",[],,,2,"{'postcode': 3909, 'municipality': 'East Gipps...","{'seat_type': '', 'pay_phone': False, 'indoor_...","{'lighting': False, 'platform_number': None, '...","{'fri_am_from': '', 'fri_am_to': '', 'fri_pm_f...","[{'route_type': 2, 'route_id': 1495, 'route_na...",22237,Rules Rd/Princes Hwy,,[46444]
20314,46446,,6,0,,"{'lost_property_contact_number': None, 'phone'...","{'ticket_type': '', 'zone': 'Regional', 'is_fr...",[],,,2,"{'postcode': 3909, 'municipality': 'East Gipps...","{'seat_type': '', 'pay_phone': False, 'indoor_...","{'lighting': False, 'platform_number': None, '...","{'fri_am_from': '', 'fri_am_to': '', 'fri_pm_f...","[{'route_type': 2, 'route_id': 1495, 'route_na...",27517,Veldens Rd/Princes Hwy,,[46446]


In [158]:
with open(f'{DATA_DIR}/stops_info.json') as f:
    API_STOPS_INFO = json.load(f)
with open(f'{DATA_DIR}/failed_stops.json') as f:
    FAILED_STOPS = json.load(f)
with open(f'{DATA_DIR}/missing_stops_real.json') as f:
    GA_MISSING_STOPS = json.load(f)
with open(f'{DATA_DIR}/failed_stops_missing.json') as f:
    FAILED_STOPS_MISSING = json.load(f)

# NOTE(review): the original cell reset API_STOPS_INFO_LIST to [] and never refilled it,
# so on a fresh kernel the loop below did nothing, the DataFrame had no columns and the
# first column access raised KeyError. The records are rebuilt here from
# stops_data_list.json, one record per (stop, GTFS stop id) with a scalar string
# 'gtfs_stop_id', which is the shape the groupby/assert statements below require.
# This source is inferred from the notebook's recorded outputs — confirm it is the
# intended one.
with open(f'{DATA_DIR}/stops_data_list.json') as f:
    API_STOPS_DATA_LIST = json.load(f)

API_STOPS_INFO_LIST = []
for stop_record in API_STOPS_DATA_LIST:
    # A record without any GTFS id is kept once, with gtfs_stop_id set to None.
    for gtfs_stop_id in (stop_record.get('gtfs_stop_id') or [None]):
        exploded_record = dict(stop_record)  # shallow copy: nested values are shared
        exploded_record['gtfs_stop_id'] = None if gtfs_stop_id is None else str(gtfs_stop_id)
        API_STOPS_INFO_LIST.append(exploded_record)


# Flatten one level of nesting: {'a': {'b': 1}} becomes {'a_b': 1}; then turn
# top-level integers (but not booleans) into strings.
for stop_info in API_STOPS_INFO_LIST:
    stop_info : dict
    kv_is_dict = [(k, v) for k, v in stop_info.items() if isinstance(v, dict)]
    for k, v in kv_is_dict:
        for k2, v2 in v.items():
            new_key = f'{k}_{k2}'
            assert new_key not in stop_info, f'{new_key} already exists in {stop_info}'
            stop_info[new_key] = v2
        del stop_info[k]
    for k, v in stop_info.items():
        if isinstance(v, int) and not isinstance(v, bool):
            stop_info[k] = str(v)


API_DF_STOPS_INFO = pd.DataFrame(API_STOPS_INFO_LIST, dtype=str)

# station_details_id carries a single value across all records, so it is dropped below.
assert API_DF_STOPS_INFO['station_details_id'].nunique() == 1, 'station_details_id is not unique'

print(f'Station details id: {API_DF_STOPS_INFO["station_details_id"].dropna().unique()}')

API_DF_STOPS_INFO.drop(columns=['station_details_id', 'disruption_ids'], inplace=True)

# Each (stop_id, route_type) pair maps to exactly one point_id ...
assert API_DF_STOPS_INFO.groupby(['stop_id', 'route_type'])['point_id'].unique().apply(lambda x: len(x) == 1).all()

# ... and every point_id is itself one of the GTFS stop ids recorded for it.
assert API_DF_STOPS_INFO.groupby('point_id')['gtfs_stop_id'].unique().reset_index().apply(lambda x: x['point_id'] in x['gtfs_stop_id'], axis=1).all()

API_DF_STOPS_GTFS = API_DF_STOPS_INFO[['stop_id', 'route_type', 'gtfs_stop_id', 'stop_name']].drop_duplicates()
# A GTFS stop id never belongs to more than one API stop_id.
assert API_DF_STOPS_GTFS.groupby('gtfs_stop_id')['stop_id'].unique().apply(lambda x: len(x) == 1).all()

# Inspect gtfs_stop_id with multiple stop_name
# (not the cell's last expression, so the result is not displayed)
k = API_DF_STOPS_GTFS.groupby('gtfs_stop_id')['stop_name'].unique()
k[k.apply(lambda x: len(x) != 1)]
# Most of them are just Railway Station and No-"Station", which are the same thing.

API_DF_STOPS_GTFS_GROUP = API_DF_STOPS_GTFS.groupby('gtfs_stop_id').aggregate({'stop_id': 'first', 'route_type': 'unique', 'stop_name': 'first'}).reset_index()


# Every record lacking a mode_id must be a known failed [stop_id, route_type] pair.
assert API_DF_STOPS_INFO[API_DF_STOPS_INFO['mode_id'].isna()][['stop_id', 'route_type']].apply(lambda x: [x['stop_id'], x['route_type']] in FAILED_STOPS, axis=1).all()

API_DF_STOPS_INFO.dropna(subset=['mode_id'], inplace=True)

API_DF_STOPS_INFO_MIN = API_DF_STOPS_INFO.drop_duplicates(subset=['stop_id', 'route_type'])

Station details id: ['0']


In [137]:
# Collapse the GTFS stops to one row per stop_id; every other column becomes the
# array of distinct values seen for that stop.
GTFS_DF_STOPS_GROUP = GTFS_DF_STOPS.groupby('stop_id').aggregate({col: 'unique' for col in GTFS_DF_STOPS.columns if col != 'stop_id'}).reset_index()

In [167]:
# Outer-join the API-side and GTFS-side stop groups on the GTFS stop id, so rows
# without a counterpart on either side are kept. Clashing columns get the
# '_api' / '_gtfs' suffixes.
GA_DF_STOPS = pd.merge(API_DF_STOPS_GTFS_GROUP, GTFS_DF_STOPS_GROUP, how='outer', left_on='gtfs_stop_id', right_on='stop_id', suffixes=('_api', '_gtfs'))

In [168]:
# (rows with no GTFS-side stop, rows with no API-side stop) in the merged frame.
len(GA_DF_STOPS[GA_DF_STOPS['stop_id_gtfs'].isna()]), len(GA_DF_STOPS[GA_DF_STOPS['stop_id_api'].isna()])

(40, 105)

In [170]:
# Rows whose GTFS stop id is linked to more than one API route_type AND more than
# one GTFS mode_id. The isinstance checks skip rows where the column is not an array.
GA_DF_STOPS[GA_DF_STOPS.apply(lambda x: isinstance(x['route_type'], np.ndarray) and len(x['route_type']) > 1 and isinstance(x['mode_id'], np.ndarray) and len(x['mode_id']) > 1, axis=1)]

Unnamed: 0,gtfs_stop_id,stop_id_api,route_type,stop_name_api,stop_id_gtfs,stop_name_gtfs,stop_lat,stop_lon,mode_id
4335,15128,4462,"[3, 2]",Fraser St/Wyndham St,15128,[Fraser St/Wyndham St (Shepparton)],[-36.3795002869314],[145.399762624587],"[5, 6]"
5902,17135,4219,"[3, 2]",Raglan St/Albert St,17135,[Raglan St/Albert St (Creswick)],[-37.4245467375203],[143.894452810868],"[5, 6]"
6551,17980,4216,"[3, 2]",Service Station/Sunraysia Hwy,17980,[Service Station/Sunraysia Hwy (Speed)],[-35.4017845021593],[142.438659301823],"[5, 6]"
8476,20362,1573,"[3, 2]",Yarragon Railway Station,20362,[Yarragon Railway Station (Yarragon)],[-38.2031575351187],[146.063062795324],"[1, 5]"
8801,20816,1187,"[3, 0]",Sunbury Railway Station,20816,[Sunbury Railway Station (Sunbury)],[-37.5791237309197],[144.728552008081],"[4, 6]"
9833,22250,1049,"[3, 0]",Dandenong Railway Station,22250,[Dandenong Railway Station (Dandenong)],[-37.9899679153852],[145.209725368166],"[1, 5]"
9835,22252,1153,"[3, 0]",Pakenham Railway Station,22252,[Pakenham Railway Station (Pakenham)],[-38.0806138269752],[145.486367676332],"[1, 5]"


In [175]:
# Read stops.json through a context manager so the file handle is closed.
with open(f'{DATA_DIR}/stops.json') as f:
    API_STOPS_STOPS = json.load(f)

API_DF_STOPS = pd.DataFrame(API_STOPS_STOPS, dtype=str)
API_DF_STOPS.drop(columns=['disruption_ids'], inplace=True)
# Render list-valued ticket zones as a comma-separated string; other values pass through.
API_DF_STOPS['stop_ticket_zones'] = API_DF_STOPS['stop_ticket_zones'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
API_DF_STOPS.drop(columns=['stop_ticket'], inplace=True)
# A stop counts as regional when its zone text mentions 'Regional'; non-string zones count as not regional.
API_DF_STOPS['stop_is_regional'] = API_DF_STOPS['stop_zone'].apply(lambda x: 'Regional' in x if isinstance(x, str) else False)
API_DF_STOPS['stop_zones'] = API_DF_STOPS['stop_ticket_zones']
API_DF_STOPS.drop(columns=['stop_ticket_zones', 'stop_zone'], inplace=True)
# Fix the column selection and order.
API_DF_STOPS = API_DF_STOPS[['stop_id', 'stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude', 'stop_sequence', 'route_id', 'direction_id',  'route_type',  'stop_landmark', 'stop_zones', 'stop_ticket_type', 'stop_is_free_fare_zone', 'stop_is_regional', 'stop_ticket_machine', 'stop_ticket_checks', 'stop_vline_reservation']]

In [181]:
# One row per (stop_id, route_type), keeping only the identifying and location columns.
API_DF_STOPS_MIN = API_DF_STOPS[['stop_id', 'stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude', 'route_type']].drop_duplicates(subset=['stop_id', 'route_type'])

In [214]:
# Attach to each API stop the array of distinct non-null GTFS stop ids recorded for its
# stop_id. The outer join keeps stops that appear on only one side.
API_DF_STOPS_FULL_GTFS = pd.merge(API_DF_STOPS_MIN, API_DF_STOPS_GTFS.dropna(subset=['gtfs_stop_id']).groupby('stop_id')['gtfs_stop_id'].unique(), how='outer', left_on='stop_id', right_index=True, suffixes=('_api', '_gtfs'))

In [227]:
# GTFS stops that found no API counterpart in the merge.
GTFS_WITH_NO_API_ID_DF = GA_DF_STOPS[GA_DF_STOPS['stop_id_api'].isna()].copy(deep=True)

# They must be exactly the stops recorded in FAILED_STOPS_MISSING.
assert len(GTFS_WITH_NO_API_ID_DF) == len(FAILED_STOPS_MISSING)
assert GTFS_WITH_NO_API_ID_DF['stop_id_gtfs'].map(lambda gtfs_id: gtfs_id in FAILED_STOPS_MISSING).all()

# These columns hold arrays of distinct values; check each array has a single
# element, then replace the array by that element.
for array_column in ('stop_name_gtfs', 'stop_lat', 'stop_lon'):
    assert GTFS_WITH_NO_API_ID_DF[array_column].map(lambda values: len(values) == 1).all()
for array_column in ('stop_name_gtfs', 'stop_lat', 'stop_lon'):
    GTFS_WITH_NO_API_ID_DF[array_column] = GTFS_WITH_NO_API_ID_DF[array_column].map(lambda values: values[0])

GTFS_WITH_NO_API_ID_DF = GTFS_WITH_NO_API_ID_DF[['stop_id_gtfs', 'stop_name_gtfs', 'stop_lat', 'stop_lon', 'mode_id']]

In [217]:
# API stops that ended up without any GTFS stop id.
API_WITH_NO_GTFS_ID_DF = API_DF_STOPS_FULL_GTFS[API_DF_STOPS_FULL_GTFS['gtfs_stop_id'].isna()].copy(deep=True)
# Build a "Name (Suburb)" label for each of them.
API_WITH_NO_GTFS_ID_DF['stop_full_name'] = API_WITH_NO_GTFS_ID_DF.apply(lambda x: f"{x['stop_name']} ({x['stop_suburb']})", axis=1)

In [248]:
# Look up GTFS stops sharing a name. NOTE: this is not the cell's last expression,
# so its result is computed but not displayed.
GTFS_WITH_NO_API_ID_DF[GTFS_WITH_NO_API_ID_DF['stop_name_gtfs'].duplicated(keep=False)]
# The API-side "Name (Suburb)" labels must be unique.
assert API_WITH_NO_GTFS_ID_DF['stop_full_name'].is_unique

In [250]:
# Try to pair the leftover API stops with the leftover GTFS stops by comparing the
# API "Name (Suburb)" label with the GTFS stop name (outer join keeps both sides).
df_test_merge_by_name = pd.merge(API_WITH_NO_GTFS_ID_DF, GTFS_WITH_NO_API_ID_DF, how='outer', left_on='stop_full_name', right_on='stop_name_gtfs', suffixes=('_api', '_gtfs'))
assert df_test_merge_by_name['stop_name_gtfs'].notna().all() # Success. Luckily all API stops that cannot be retrieved via API have an equivalent GTFS stop with the same name.

In [258]:
# Number of GTFS stops still without an API match after the name-based merge.
len(df_test_merge_by_name[df_test_merge_by_name['stop_full_name'].isna()]['stop_name_gtfs'])

47

In [278]:
# Peek at the last rows of the name-based merge.
df_test_merge_by_name.tail()

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_latitude,stop_longitude,route_type,gtfs_stop_id,stop_full_name,stop_id_gtfs,stop_name_gtfs,stop_lat,stop_lon,mode_id
100,23376.0,White St/Hamilton - Port Fairy Rd,Byaduk North,-37.8942757,141.954437,3.0,,White St/Hamilton - Port Fairy Rd (Byaduk North),22957,White St/Hamilton - Port Fairy Rd (Byaduk North),-37.894271,141.954437,[5]
101,4127.0,Willaura Kindergarten/Willaura-Wickliffe Rd,Willaura,-37.54469,142.742371,2.0,,Willaura Kindergarten/Willaura-Wickliffe Rd (W...,17759,Willaura Kindergarten/Willaura-Wickliffe Rd (W...,-37.54469,142.742366,[6]
102,33508.0,Wodonga Middle Years College/Wilson St,Wodonga,-36.1320152,146.892258,2.0,,Wodonga Middle Years College/Wilson St (Wodonga),48276,Wodonga Middle Years College/Wilson St (Wodonga),-36.132021,146.892262,[6]
103,31511.0,Wood Rd/Greaves Rd,Narre Warren South,-38.0568848,145.315155,2.0,,Wood Rd/Greaves Rd (Narre Warren South),46548,Wood Rd/Greaves Rd (Narre Warren South),-38.056889,145.315163,[4]
104,,,,,,,,,14370,Yarra Road Primary School/Yarra Rd (Croydon No...,-37.766345,145.271951,[4]


In [283]:
# Per API stop_id: its first route_type, the distinct matched GTFS stop ids, and the
# distinct first entries of the matched rows' mode_id values.
df_test_merge_by_name.groupby('stop_id').aggregate({'route_type': 'first', 'stop_id_gtfs': 'unique', 'mode_id': lambda x: list(set(i[0] for i in x))})

Unnamed: 0_level_0,route_type,stop_id_gtfs,mode_id
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
15682,2,[14833],[4]
1595,2,"[20618, 21708]","[10, 6]"
21122,3,[22955],[5]
21454,2,[4594],[4]
21861,3,[22956],[5]
23071,2,[10481],[4]
23376,3,[22957],[5]
2506,2,[18565],[4]
25321,2,[45045],[6]
25864,2,[38008],[6]


In [267]:
# Search the stop details for a stop whose name contains both 'Adelaide' and 'Station'
# (the recorded output shows column headers only).
API_DF_STOPS_INFO[API_DF_STOPS_INFO['stop_name'].notna() & API_DF_STOPS_INFO['stop_name'].str.contains('Adelaide') & API_DF_STOPS_INFO['stop_name'].str.contains('Station')]
# Adelaide Railway Station

Unnamed: 0,point_id,operating_hours,mode_id,flexible_stop_opening_hours,station_type,station_description,route_type,routes,stop_id,stop_name,...,stop_staffing_thu_pm_from,stop_staffing_thu_pm_to,stop_staffing_tue_am_from,stop_staffing_tue_am_to,stop_staffing_tue_pm_from,stop_staffing_tue_pm_to,stop_staffing_wed_am_from,stop_staffing_wed_am_to,stop_staffing_wed_pm_from,stop_staffing_wed_pm_To


In [432]:
# Within one (route_type, stop_id) pair the stop name and the mode never disagree.
assert (API_DF_STOPS_INFO_MIN.groupby(['route_type', 'stop_id'])['stop_name'].nunique() <= 1).all()
assert (API_DF_STOPS_INFO_MIN.groupby(['route_type', 'stop_id'])['mode_id'].nunique() <= 1).all()

# Collect the route records nested in each stop, visiting every (stop_id, route_type)
# pair once and tagging each route with the stop it was found under.
stop_found_check = {}
data_route_stops = []
for stop_info in API_STOPS_INFO_LIST:
    stop_id = stop_info['stop_id']
    route_type = stop_info['route_type']
    already_visited = route_type in stop_found_check.get(stop_id, {})
    if already_visited or 'routes' not in stop_info:
        continue
    stop_found_check.setdefault(stop_id, {})[route_type] = True
    for route in stop_info['routes']:
        route.update({
            'stop_id': stop_id,
            'stop_route_type': route_type,
            'stop_name': stop_info['stop_name'],
            'stop_mode_id': stop_info['mode_id'],
        })
        data_route_stops.append(route)

API_DF_ROUTE_STOPS = pd.DataFrame(data_route_stops, dtype=str)
# route_gtfs_id is '<mode>-<code>'; keep both halves as separate columns.
API_DF_ROUTE_STOPS['route_gtfs_mode'] = API_DF_ROUTE_STOPS['route_gtfs_id'].map(lambda gtfs_id: gtfs_id.split('-')[0])
API_DF_ROUTE_STOPS['route_gtfs_code'] = API_DF_ROUTE_STOPS['route_gtfs_id'].map(lambda gtfs_id: gtfs_id.split('-')[1])

API_DF_ROUTESS = API_DF_ROUTE_STOPS[[
    'route_id', 'route_name', 'route_number', 'route_type',
    'route_gtfs_id', 'route_gtfs_mode', 'route_gtfs_code',
]].drop_duplicates()
assert API_DF_ROUTESS['route_gtfs_id'].is_unique

# One row per route_id; every other column becomes the array of its distinct values.
API_DF_ROUTEX = (
    API_DF_ROUTESS
    .groupby('route_id')
    .aggregate({col: 'unique' for col in API_DF_ROUTESS.columns if col != 'route_id'})
    .reset_index()
)

# These columns must be single-valued per route_id; unwrap each array to its element.
for single_valued in ('route_name', 'route_number', 'route_type', 'route_gtfs_code'):
    assert API_DF_ROUTEX[single_valued].map(lambda values: len(values) == 1).all()
    API_DF_ROUTEX[single_valued] = API_DF_ROUTEX[single_valued].map(lambda values: values[0])

# At least one route_id carries several GTFS ids / modes; join the modes with '-'.
assert API_DF_ROUTEX['route_gtfs_id'].map(lambda values: len(values) != 1).any()
assert API_DF_ROUTEX['route_gtfs_mode'].map(lambda values: len(values) != 1).any()
API_DF_ROUTEX['route_gtfs_mode'] = API_DF_ROUTEX['route_gtfs_mode'].map(lambda values: '-'.join(sorted(values)))


# Proof that API_DF_ROUTEX covers exactly all route_id of API_DF_ROUTES
API_DF_ROUTES_FULL = pd.merge(
    API_DF_ROUTES,
    API_DF_ROUTEX,
    how='outer',
    left_on='route_id',
    right_on='route_id',
    suffixes=('_routes', '_stops'),
)
assert API_DF_ROUTES_FULL['route_name_routes'].notna().all()

API_DF_ROUTES_FULL.drop(columns=['geopath', 'description', 'timestamp'], inplace=True)
# Both sources agree on name, number and type of every route ...
for shared_column in ('route_name', 'route_number', 'route_type'):
    assert (API_DF_ROUTES_FULL[f'{shared_column}_routes'] == API_DF_ROUTES_FULL[f'{shared_column}_stops']).all()
# ... and the routes-side GTFS id is one of the ids collected from the stops.
assert API_DF_ROUTES_FULL.apply(lambda row: row['route_gtfs_id_routes'] in row['route_gtfs_id_stops'], axis=1).all()

del API_DF_ROUTES_FULL
API_DF_ROUTEX.drop(columns=['route_gtfs_id'], inplace=True)
API_DF_ROUTEX.to_csv(f'{DATA_DIR}/routesx.csv', index=False)

In [466]:
# Distinct (mode_id, route_type) pairs present in the stop details.
API_DF_STOPS_INFO_MIN[['mode_id', 'route_type']].drop_duplicates().sort_values(by=['mode_id', 'route_type'])

Unnamed: 0,mode_id,route_type
1382,1,2
0,2,0
238,3,1
68,4,3
9,5,3
3930,6,2


In [476]:
# Number of route records per GTFS mode prefix.
API_DF_ROUTESS['route_gtfs_mode'].value_counts()

route_gtfs_mode
4     388
6     322
5      58
1      56
3      24
2      17
11      2
10      2
Name: count, dtype: int64

In [479]:
# Route records whose GTFS mode prefix is exactly '1' or '5'.
API_DF_ROUTESS[API_DF_ROUTESS['route_gtfs_mode'].apply(lambda x: x in ['1', '5'])]

Unnamed: 0,route_id,route_name,route_number,route_type,route_gtfs_id,route_gtfs_mode,route_gtfs_code
48,1717,Batemans Bay - Melbourne via Bairnsdale,,3,5-V09,5,V09
49,1717,Batemans Bay - Melbourne via Bairnsdale,,3,1-V09,1,V09
50,1718,Canberra - Melbourne via Bairnsdale,,3,5-V13,5,V13
51,1718,Canberra - Melbourne via Bairnsdale,,3,1-V13,1,V13
52,1719,Sale - Melbourne via Maffra & Traralgon,,3,5-V43,5,V43
...,...,...,...,...,...,...,...
15315,1761,Deniliquin - Melbourne via Moama & Echuca & He...,,3,1-V20,1,V20
17174,7601,Geelong - Colac via Winchelsea and Birregurra,,3,5-GCL,5,GCL
30373,2055,Alexandra - Seymour via Yea,,3,5-als,5,als
32606,1727,Shepparton - Sydney via Benalla,,3,5-V42,5,V42


In [450]:
# Swagger definitions declaring a property whose name contains 'point_id'
# (the recorded output is an empty list).
[k for k in API_DOCS['definitions'] if any('point_id' in t for t in API_DOCS['definitions'][k]['properties'])]

[]

In [482]:
# Routes whose joined GTFS mode string contains '1' but not '5'. This is a substring
# test on the joined string, so modes such as '11' also match '1'.
API_DF_ROUTEX[API_DF_ROUTEX['route_gtfs_mode'].apply(lambda x: '5' not in x and '1' in x)]

Unnamed: 0,route_id,route_name,route_number,route_type,route_gtfs_mode,route_gtfs_code
34,1123,Skybus - Melbourne Airport - Melbourne City,,2,11,SKY
178,13621,Skybus - Melbourne Airport - Frankston,,2,11,SK4


In [483]:
# Display the flattened stop records for inspection.
# NOTE(review): this renders the whole list; consider slicing before sharing the notebook.
API_STOPS_INFO_LIST

[{'point_id': '19847',
  'operating_hours': 'N',
  'mode_id': '2',
  'station_details_id': '0',
  'flexible_stop_opening_hours': '',
  'disruption_ids': [],
  'station_type': 'Unstaffed Station',
  'station_description': 'This station is unstaffed. Protective Services Officers may be present from 6pm to last train Sunday to Thursday and until around 1am on Fridays and Saturdays.',
  'route_type': '0',
  'routes': [{'route_type': 0,
    'route_id': 1,
    'route_name': 'Alamein',
    'route_number': '',
    'route_gtfs_id': '2-ALM',
    'geopath': [],
    'stop_id': '1002',
    'stop_route_type': '0',
    'stop_name': 'Alamein ',
    'stop_mode_id': '2'}],
  'stop_id': '1002',
  'stop_name': 'Alamein ',
  'stop_landmark': '',
  'gtfs_stop_id': '19847',
  'stop_contact_lost_property_contact_number': None,
  'stop_contact_phone': '',
  'stop_contact_lost_property': '(03) 9610 7513',
  'stop_contact_feedback': '1800 800 007',
  'stop_ticket_ticket_type': '',
  'stop_ticket_zone': 'Zone 1',

In [403]:
# Which joined GTFS mode strings occur under each API route_type.
API_DF_ROUTEX.groupby('route_type')['route_gtfs_mode'].unique()

route_type
0                 [2]
1                 [3]
2          [4, 6, 11]
3    [1-5, 1-10-5, 5]
Name: route_gtfs_mode, dtype: object

In [354]:
# Routes that map to more than one GTFS route id.
# NOTE(review): the cell that builds API_DF_ROUTEX drops 'route_gtfs_id' in place after its
# checks, so running this cell after that one in a fresh kernel raises KeyError.
API_DF_ROUTEX[API_DF_ROUTEX['route_gtfs_id'].apply(lambda x: len(x) != 1)]

Unnamed: 0,route_id,route_name,route_number,route_type,route_gtfs_id
129,1512,Warrnambool - Melbourne via Ararat & Hamilton,,3,"[5-995, 1-995]"
154,1706,Albury - Melbourne via Seymour,,3,"[5-V01, 1-V01]"
155,1710,Seymour - Melbourne via Broadmeadows,,3,"[5-V40, 1-V40]"
156,1717,Batemans Bay - Melbourne via Bairnsdale,,3,"[5-V09, 1-V09]"
157,1718,Canberra - Melbourne via Bairnsdale,,3,"[5-V13, 1-V13]"
158,1719,Sale - Melbourne via Maffra & Traralgon,,3,"[5-V43, 1-V43]"
159,1720,Cowes and Inverloch - Melbourne via Dandenong ...,,3,"[5-V15, 1-V15]"
160,1721,Marlo - Lake Tyers Beach - Melbourne via Bairn...,,3,"[5-V25, 1-V25]"
161,1722,Yarram - Melbourne via Koo Wee Rup & Dandenong,,3,"[5-V52, 1-V52]"
162,1723,Griffith - Melbourne via Shepparton,,3,"[5-V41, 1-V41]"


False

In [283]:
# Stack the 'routes' table of every GTFS mode.
GTFS_DF_ROUTES = pd.concat([tables['routes'] for tables in GTFS_DFS.values()])
# 1m - 2m
# Split the dash-separated route_id and expose its parts as columns: the first two,
# the third (only when the id has more than four parts) and the last two.
GTFS_DF_ROUTES['route_idx'] = GTFS_DF_ROUTES['route_id'].map(lambda route_id: route_id.split('-'))
GTFS_DF_ROUTES['route_id0'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[0])
GTFS_DF_ROUTES['route_id1'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[1])
GTFS_DF_ROUTES['route_id2'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[2] if len(parts) >= 5 else np.nan)
GTFS_DF_ROUTES['route_id3'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[-2])
GTFS_DF_ROUTES['route_id4'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[-1])
GTFS_DF_ROUTES['route_id01'] = GTFS_DF_ROUTES['mode_id'] + '-' + GTFS_DF_ROUTES['route_id1']
# '<mode_id>-<route_id1>', with route_id2 appended directly for mode '4' rows that have one.
GTFS_DF_ROUTES['route_gtfs_id'] = GTFS_DF_ROUTES.apply(
    lambda row: f'{row["mode_id"]}-{row["route_id1"]}' + (
        row['route_id2'] if row['mode_id'] == '4' and pd.notna(row['route_id2']) else ''
    ),
    axis=1,
)

GTFS_DF_ROUTES_MIN = GTFS_DF_ROUTES[['route_gtfs_id', 'route_short_name', 'route_long_name', 'mode_id']].drop_duplicates()


SHP_DIR = '../local/ptv-spatial-datasets'
# {shapefile name up to its first '.': GeoDataFrame}. The annotation is a dict of
# frames; the original annotated the mapping as a single GeoDataFrame.
SHP_GDFS : dict[str, gpd.GeoDataFrame] = { f.split('.')[0]: gpd.read_file(os.path.join(SHP_DIR, f)) for f in os.listdir(SHP_DIR) if f.endswith('.shp') }

# Each '<name>_column_names.txt' file lists 'SHORT = Long name' pairs used to rename
# the columns of the matching shapefile. Other .txt files in the directory are
# ignored (the original tripped the assertion below on them).
for f in os.listdir(SHP_DIR):
    if f.endswith('_column_names.txt'):
        gdf_name = f.removesuffix('_column_names.txt').upper()
        with open(os.path.join(SHP_DIR, f), 'r') as file:
            # The first four lines are skipped — presumably a header; verify against the files.
            gdf_column_names = [line.strip() for line in file.readlines()][4:]
        # Blank lines carry no mapping, so drop them before validating.
        gdf_column_names = [line for line in gdf_column_names if line]
        assert gdf_name in SHP_GDFS, f'{gdf_name} not in GDFS'
        for line in gdf_column_names:
            assert ' = ' in line, f'Invalid line: {line}'
        gdf_column_names = { parts[0]: parts[1] for parts in (line.split(' = ') for line in gdf_column_names) }
        SHP_GDFS[gdf_name].rename(columns=gdf_column_names, inplace=True)

# Availability of ROUTES_USING_STOP per stop layer:
# metro bus, tram and train stops always have it filled in ...
assert SHP_GDFS['PTV_METRO_BUS_STOP']['ROUTES_USING_STOP'].notna().all()
assert SHP_GDFS['PTV_METRO_TRAM_STOP']['ROUTES_USING_STOP'].notna().all()
assert SHP_GDFS['PTV_METRO_TRAIN_STATION']['ROUTES_USING_STOP'].notna().all()
# ... regional coach stops do not have the column at all ...
assert 'ROUTES_USING_STOP' not in SHP_GDFS['PTV_REGIONAL_COACH_STOP'].columns
# ... SkyBus stops have the column but it is entirely null ...
assert SHP_GDFS['PTV_SKYBUS_STOP']['ROUTES_USING_STOP'].isna().all()
# ... and regional bus stops have it filled in for at least one stop.
assert SHP_GDFS['PTV_REGIONAL_BUS_STOP']['ROUTES_USING_STOP'].notna().any()



# Layers whose name mentions 'ROUTE' are the route geometries: tag each with the
# layer it came from, then stack them into one frame.
route_layer_names = [layer_name for layer_name in SHP_GDFS if 'ROUTE' in layer_name]
for layer_name in route_layer_names:
    SHP_GDFS[layer_name]['SHP_FILE'] = layer_name
SHP_DF_ROUTES : pd.DataFrame = pd.concat([SHP_GDFS[layer_name] for layer_name in route_layer_names], ignore_index=True)



# Split the dash-separated ROUTE_ID and expose its parts as columns: the first two,
# the third (only when the id has more than four parts) and the last two.
SHP_DF_ROUTES['route_idx'] = SHP_DF_ROUTES['ROUTE_ID'].map(lambda route_id: route_id.split('-'))
SHP_DF_ROUTES['route_id0'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[0])
SHP_DF_ROUTES['route_id1'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[1])
SHP_DF_ROUTES['route_id2'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[2] if len(parts) >= 5 else np.nan)
SHP_DF_ROUTES['route_id3'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[-2])
SHP_DF_ROUTES['route_id4'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[-1])
SHP_DF_ROUTES['route_id01'] = SHP_DF_ROUTES['route_id0'] + '-' + SHP_DF_ROUTES['route_id1']

# For every '<id0>-<id1>' prefix, route_id2 or route_id4 (or both) has at most one distinct value.
assert SHP_DF_ROUTES.groupby('route_id01').aggregate({'route_id2': 'unique', 'route_id3': 'unique', 'route_id4': 'unique'}).apply(lambda x: len(x['route_id2']) <= 1 or len(x['route_id4']) <= 1, axis=1).all()

# Since ROUTE_LONG_NAME is all not null, when we merge dataframes, we can use ROUTE_LONG_NAME isna() to check if the other dataframe has the equivalent data in SHP_DF_ROUTES_MIN
assert SHP_DF_ROUTES['ROUTE_LONG_NAME'].notna().all()

# Inspect the route_id0 values of SHP_DF_ROUTES
# (not the cell's last expression, so the result is not displayed)
SHP_DF_ROUTES['route_id0'].sort_values(key=lambda x: x.apply(int)).unique() # array(['3', '4', '5', '6', '7', '11'], dtype=object)

# Assert that all metro buses route_short_name are 3 or 4 characters long
assert SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id0'] == '4')]['ROUTE_SHORT_NAME'].apply(lambda x: len(x) in [3, 4]).all()
# Proof that route_id1 is unique for each ROUTE_SHORT_NAME for route_id0 == 4
odd_bus_id1_names = SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id1'] != SHP_DF_ROUTES['ROUTE_SHORT_NAME']) & (SHP_DF_ROUTES['route_id0'] == '4')]['ROUTE_SHORT_NAME'].unique()
odd_bus_rows = SHP_DF_ROUTES[(SHP_DF_ROUTES['ROUTE_SHORT_NAME'].apply(lambda x: x in odd_bus_id1_names)) & (SHP_DF_ROUTES['route_id0'] == '4')]
# Compare element-wise and reduce with .all(). The original asserted on `ndarray == [1]`,
# which is falsy when there are no such rows and raises ValueError (ambiguous truth
# value) as soon as the array holds more than one distinct count.
assert (odd_bus_rows.groupby('ROUTE_SHORT_NAME')['route_id1'].nunique() == 1).all()

def get_gtfs_id(x):
    """Build the GTFS-style route ID for one shapefile route row.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'route_id0' and 'route_id1'; 'ROUTE_SHORT_NAME' is read
        when route_id0 is '4'.

    Returns
    -------
    str
        * route_id0 == '4' (metro bus): '4-<ROUTE_SHORT_NAME>'
        * route_id0 == '7' (TeleBus):   '7-B<nn>', where <nn> is route_id1 with its
          'TB' prefix removed and left-padded with zeros to at least two characters
        * anything else:                '<route_id0>-<route_id1>'

    Raises
    ------
    AssertionError
        If route_id0 is '7' and route_id1 does not contain 'TB'.
    """
    if x['route_id0'] == '4':
        return f'4-{x["ROUTE_SHORT_NAME"]}'
    elif x['route_id0'] == '7':
        assert 'TB' in x["route_id1"], f'7-TeleBus route_id1 {x["route_id1"]} does not contain TB'
        # removeprefix drops exactly one leading 'TB'; lstrip('TB') would strip every
        # leading 'T'/'B' character and could eat part of the route number.
        route_number = x["route_id1"].removeprefix('TB')
        # Left-pad with zeros to two characters
        route_number = route_number.zfill(2)
        return f'7-B{route_number}'
    else:
        return f'{x["route_id0"]}-{x["route_id1"]}'

# Shapefile-side key: '<route_id0>-<route_id1>', with route_id2 appended (no separator)
# for route_id0 == '4' rows that have one.
SHP_DF_ROUTES['route_shp_id'] = SHP_DF_ROUTES.apply(
    lambda row: f'{row["route_id0"]}-{row["route_id1"]}'
    + ('' if row['route_id0'] != '4' or pd.isna(row['route_id2']) else row['route_id2']),
    axis=1,
)
# GTFS-side key, derived row by row.
SHP_DF_ROUTES['route_gtfs_id'] = SHP_DF_ROUTES.apply(get_gtfs_id, axis=1)

# One row per route_gtfs_id, each remaining column holding the array of its distinct values.
SHP_DF_ROUTES_MIN = (
    SHP_DF_ROUTES[['route_gtfs_id', 'route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'ROUTE_LONG_NAME', 'SHP_FILE']]
    .drop_duplicates()
    .groupby('route_gtfs_id')
    .aggregate({
        'route_shp_id': 'unique',
        'route_id0': 'unique',
        'ROUTE_SHORT_NAME': 'unique',
        'ROUTE_LONG_NAME': 'unique',
        'SHP_FILE': 'unique',
    })
    .reset_index()
)

# These columns must be single-valued per route_gtfs_id (ROUTE_LONG_NAME is allowed to vary).
for col in ['route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'SHP_FILE']:
    assert SHP_DF_ROUTES_MIN[col].map(len).eq(1).all()

# Inspect multiple ROUTE_LONG_NAME of the same route_gtfs_id
SHP_DF_ROUTES_MIN[SHP_DF_ROUTES_MIN['ROUTE_LONG_NAME'].map(len).ne(1)]

# Unwrap the one-element arrays into scalars.
for col in ['route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'SHP_FILE']:
    SHP_DF_ROUTES_MIN[col] = SHP_DF_ROUTES_MIN[col].map(lambda values: values[0])

# Inspect SHP_FILE and route_id0
SHP_DF_ROUTES_MIN[['SHP_FILE', 'route_id0']].drop_duplicates().sort_values('SHP_FILE')


# Every stop layer other than the metro bus one must already have unique STOP_IDs.
for layer_name in ['PTV_METRO_TRAM_STOP', 'PTV_METRO_TRAIN_STATION', 'PTV_REGIONAL_BUS_STOP', 'PTV_REGIONAL_COACH_STOP', 'PTV_SKYBUS_STOP']:
    assert SHP_GDFS[layer_name]['STOP_ID'].is_unique

# STOP_ID repeats in PTV_METRO_BUS_STOP, but only because TeleBus routes were added:
# each duplicated STOP_ID has exactly two distinct ROUTES_USING_STOP values, exactly one of
# which mentions 'TeleBus'.
metro_bus_stops = SHP_GDFS['PTV_METRO_BUS_STOP']
shp_metro_bus_stop_duplicated_ids = (
    metro_bus_stops[metro_bus_stops['STOP_ID'].duplicated(keep=False)]
    .groupby('STOP_ID')['ROUTES_USING_STOP']
    .unique()
)
assert shp_metro_bus_stop_duplicated_ids.map(len).eq(2).all()
assert shp_metro_bus_stop_duplicated_ids.map(lambda routes: sum('TeleBus' in entry for entry in routes) == 1).all()

# Split PTV_METRO_BUS_STOP into PTV_METROBUS_STOP and PTV_TELEBUS_STOP
uses_telebus = metro_bus_stops['ROUTES_USING_STOP'].map(lambda routes: 'TeleBus' in routes)
SHP_GDFS['PTV_METROBUS_STOP'] = metro_bus_stops[~uses_telebus].reset_index(drop=True)
SHP_GDFS['PTV_TELEBUS_STOP'] = metro_bus_stops[uses_telebus].reset_index(drop=True)

# After the split each half is keyed uniquely by STOP_ID.
for layer_name in ['PTV_METROBUS_STOP', 'PTV_TELEBUS_STOP']:
    assert SHP_GDFS[layer_name]['STOP_ID'].is_unique

# Stop layers keyed by mode id (the same ids that are later written to the 'mode_id' column).
SHP_DFS_STOPS = {
    stop_mode: SHP_GDFS[layer_name]
    for stop_mode, layer_name in [
        ('2', 'PTV_METRO_TRAIN_STATION'),
        ('3', 'PTV_METRO_TRAM_STOP'),
        ('4', 'PTV_METROBUS_STOP'),
        # ('5', 'PTV_REGIONAL_COACH_STOP'),
        ('6', 'PTV_REGIONAL_BUS_STOP'),
        ('7', 'PTV_TELEBUS_STOP'),
        ('11', 'PTV_SKYBUS_STOP'),
    ]
}

# Turn the comma-separated ROUTES_USING_STOP string into one (STOP_ID, ROUTE) row per route.
for mid in SHP_DFS_STOPS:
    # NOTE: this column assignment lands on the SHP_GDFS layer object itself, because the
    # dict entry still refers to it until the next statement replaces the entry.
    SHP_DFS_STOPS[mid]['ROUTE'] = SHP_DFS_STOPS[mid]['ROUTES_USING_STOP'].map(
        lambda routes: [] if pd.isna(routes) else routes.split(',')
    )
    SHP_DFS_STOPS[mid] = SHP_DFS_STOPS[mid][['STOP_ID', 'ROUTE']].explode('ROUTE').reset_index(drop=True)

# Attach the route metadata of the same mode by matching ROUTE against ROUTE_SHORT_NAME
# (mode '2' is matched against GTFS instead, further down).
for mid in ['3', '4', '6', '7', '11']:
    routes_of_mode = SHP_DF_ROUTES_MIN[SHP_DF_ROUTES_MIN['route_id0'] == mid]
    SHP_DFS_STOPS[mid] = pd.merge(
        SHP_DFS_STOPS[mid],
        routes_of_mode,
        how='left',
        left_on='ROUTE',
        right_on='ROUTE_SHORT_NAME',
    )


# Assert that there is no odd ROUTE in SHP_DFS_STOPS
for mid in ['3', '4', '7']:
    assert SHP_DFS_STOPS[mid]['ROUTE_SHORT_NAME'].notna().all()
# assert 'ROUTE' not in SHP_DFS_STOPS['5'].columns
# For modes '6' and '11' an unmatched ROUTE is tolerated only when it is the empty string.
for mid in ['6', '11']:
    unmatched_rows = SHP_DFS_STOPS[mid][SHP_DFS_STOPS[mid]['ROUTE'].notna() & SHP_DFS_STOPS[mid]['ROUTE_SHORT_NAME'].isna()]
    assert (unmatched_rows['ROUTE'] == '').all()


# GTFS routes of mode '2', reduced to the (route_gtfs_id, route_short_name) pairs.
gtfs_df_routes_metrotrains = (
    GTFS_DF_ROUTES_MIN.loc[GTFS_DF_ROUTES_MIN['mode_id'] == '2', ['route_gtfs_id', 'route_short_name']]
    .drop_duplicates()
    .sort_values('route_short_name')
)

# Mode '2' stops are matched to GTFS by route short name; every row must find a match.
SHP_DFS_STOPS['2'] = (
    SHP_DFS_STOPS['2'][['STOP_ID', 'ROUTE']]
    .drop_duplicates()
    .merge(gtfs_df_routes_metrotrains, how='left', left_on='ROUTE', right_on='route_short_name')
)
assert SHP_DFS_STOPS['2']['route_gtfs_id'].notna().all()

# Collapse each mode to one row per STOP_ID with comma-joined ROUTE / route_gtfs_id lists.
for mid in ['2', '3', '4', '6', '7', '11']:
    stop_routes = SHP_DFS_STOPS[mid].dropna(subset=['ROUTE'])
    # Where no route_gtfs_id was matched, fall back to the raw ROUTE value.
    stop_routes['route_gtfs_id'] = [
        gtfs_id if pd.notna(gtfs_id) else route
        for gtfs_id, route in zip(stop_routes['route_gtfs_id'], stop_routes['ROUTE'])
    ]
    stop_routes = stop_routes.groupby('STOP_ID').aggregate({'ROUTE': 'unique', 'route_gtfs_id': 'unique'}).reset_index()
    for joined_col in ['ROUTE', 'route_gtfs_id']:
        stop_routes[joined_col] = stop_routes[joined_col].apply(','.join)
    SHP_DFS_STOPS[mid] = stop_routes


# Bring the per-stop route_gtfs_id list back onto each stop layer (left join on STOP_ID).
for layer_name, layer_mode in [
    ('PTV_METRO_TRAIN_STATION', '2'),
    ('PTV_METRO_TRAM_STOP', '3'),
    ('PTV_METROBUS_STOP', '4'),
    ('PTV_REGIONAL_BUS_STOP', '6'),
    ('PTV_TELEBUS_STOP', '7'),
    ('PTV_SKYBUS_STOP', '11'),
]:
    SHP_GDFS[layer_name] = pd.merge(
        SHP_GDFS[layer_name],
        SHP_DFS_STOPS[layer_mode][['STOP_ID', 'route_gtfs_id']],
        on='STOP_ID',
        how='left',
    )


# Tag every stop layer with its mode id. The regional coach layer ('5') is tagged too,
# although it took no part in the route merge above.
for layer_name, layer_mode in [
    ('PTV_METRO_TRAIN_STATION', '2'),
    ('PTV_METRO_TRAM_STOP', '3'),
    ('PTV_METROBUS_STOP', '4'),
    ('PTV_REGIONAL_COACH_STOP', '5'),
    ('PTV_REGIONAL_BUS_STOP', '6'),
    ('PTV_TELEBUS_STOP', '7'),
    ('PTV_SKYBUS_STOP', '11'),
]:
    SHP_GDFS[layer_name]['mode_id'] = layer_mode


# Stack all stop layers (original row indices are kept, so the index is not unique).
SHP_DF_STOPS : pd.DataFrame = pd.concat([
    SHP_GDFS[layer_name]
    for layer_name in [
        'PTV_METRO_TRAIN_STATION',
        'PTV_METRO_TRAM_STOP',
        'PTV_METROBUS_STOP',
        'PTV_REGIONAL_COACH_STOP',
        'PTV_REGIONAL_BUS_STOP',
        'PTV_TELEBUS_STOP',
        'PTV_SKYBUS_STOP',
    ]
])

# Replace each geometry object by its first coordinate tuple.
SHP_DF_STOPS['geometry'] = SHP_DF_STOPS['geometry'].apply(lambda geom: geom.coords[0])
# Keep the untouched name; STOP_NAME itself is rewritten later without the suburb part.
SHP_DF_STOPS['STOP_FULL_NAME'] = SHP_DF_STOPS['STOP_NAME']


# Every full name must contain as many '(' as ')'.
assert SHP_DF_STOPS['STOP_FULL_NAME'].map(lambda full_name: full_name.count('(') == full_name.count(')')).all()
# Temporary column used only to inspect names that do not have exactly one '('.
SHP_DF_STOPS['parentheses_count'] = SHP_DF_STOPS['STOP_FULL_NAME'].map(lambda full_name: full_name.count('('))
SHP_DF_STOPS[SHP_DF_STOPS['parentheses_count'] != 1]
del SHP_DF_STOPS['parentheses_count']

# Get only the last pair of parentheses
def get_suburb(stop_full_name):
    """Split a full stop name into ``(stop_name, stop_suburb)``.

    The suburb is the content of the last, outermost parenthesised group at the
    end of the name, so nested parentheses stay inside the suburb:
    ``'Main St (Ballarat (3350))' -> ('Main St', 'Ballarat (3350)')``.

    Parameters
    ----------
    stop_full_name : str
        Name of the form ``'<stop name> (<suburb>)'``.

    Returns
    -------
    tuple
        ``(stop_name, stop_suburb)``. When the name has no trailing parenthesised
        group, the name is returned unchanged and the suburb is ``np.nan``.

    Raises
    ------
    ValueError
        If the trailing ``)`` has no matching ``(``.
    """
    trimmed = stop_full_name.rstrip()
    # Only a group that closes at the very end of the name counts as a suburb. Without
    # this guard, any trailing character after the last ')' would be returned as the suburb.
    if '(' not in trimmed or not trimmed.endswith(')'):
        return (stop_full_name, np.nan)
    depth = 0
    # Walk backwards from the final ')' to the '(' that balances it.
    for i in range(len(trimmed) - 1, -1, -1):
        c = trimmed[i]
        if c == ')':
            depth += 1
        elif c == '(':
            depth -= 1
            if depth == 0:
                # Slice off exactly one bracket on each side; stripping a character set
                # would also remove brackets that belong to the suburb text.
                return (trimmed[:i].strip(), trimmed[i + 1:-1])
    raise ValueError(f'Unbalanced parentheses in stop name: {stop_full_name!r}')

# Split every full name once, then spread the (name, suburb) pairs over the two columns.
name_suburb_pairs = SHP_DF_STOPS['STOP_FULL_NAME'].map(get_suburb)
SHP_DF_STOPS['STOP_SUBURB'] = name_suburb_pairs.map(lambda pair: pair[1])
SHP_DF_STOPS['STOP_NAME'] = name_suburb_pairs.map(lambda pair: pair[0])


# Inspection of the data shows that there are some stops with no suburb. We will manually fill these in.
SHP_DF_STOPS[SHP_DF_STOPS['STOP_SUBURB'].isna()]

def custom_stop_suburb(x):
    """Return the suburb for a stop row, applying a manual override for stop '35117'.

    ``x`` is a mapping (e.g. a DataFrame row) with 'STOP_ID', 'STOP_NAME' and
    'STOP_SUBURB'. Every stop except '35117' keeps its existing STOP_SUBURB; stop
    '35117' is set to 'Ballarat Central' after asserting that its STOP_NAME is
    still 'Ascot St/Sturt St'.
    """
    if x['STOP_ID'] != '35117':
        return x['STOP_SUBURB']
    # Guard the hard-coded override against the stop being renamed or re-numbered.
    assert x['STOP_NAME'] == 'Ascot St/Sturt St', x['STOP_NAME']
    return 'Ballarat Central'

# Apply the manual suburb override row by row.
SHP_DF_STOPS['STOP_SUBURB'] = SHP_DF_STOPS.apply(
    custom_stop_suburb,
    axis=1,
)

# Suburbs must have balanced parentheses ...
assert SHP_DF_STOPS['STOP_SUBURB'].map(lambda suburb: suburb.count('(') == suburb.count(')')).all()

# ... and contain at most one '('.
SHP_DF_STOPS['STOP_SUBURB_PARENTHESES'] = SHP_DF_STOPS['STOP_SUBURB'].map(lambda suburb: suburb.count('('))

assert SHP_DF_STOPS['STOP_SUBURB_PARENTHESES'].le(1).all()

# Text before the '(' is the suburb name; text inside the parentheses (if any) is stored as
# the postcode, NaN otherwise.
SHP_DF_STOPS['STOP_SUBURB_NAME'] = SHP_DF_STOPS['STOP_SUBURB'].map(lambda suburb: suburb.split('(')[0])
SHP_DF_STOPS['STOP_SUBURB_POSTCODE'] = SHP_DF_STOPS['STOP_SUBURB'].map(
    lambda suburb: suburb.split('(')[1].removesuffix(')') if '(' in suburb else np.nan
)


# Coordinates as float64.
for coord_col in ['LATITUDE', 'LONGITUDE']:
    SHP_DF_STOPS[coord_col] = SHP_DF_STOPS[coord_col].apply(np.float64)


# Collapse to one row per STOP_ID; each kept column becomes the array of its distinct values.
SHP_DF_STOPS = SHP_DF_STOPS.groupby('STOP_ID').aggregate({'STOP_NAME': 'unique', 'STOP_SUBURB': 'unique', 'STOP_FULL_NAME': 'unique', 'LATITUDE': 'unique', 'LONGITUDE': 'unique', 'TICKETZONE': 'unique', 'route_gtfs_id': 'unique', 'geometry': 'unique'})
# 7s - 10s
SHP_DF_STOPS = SHP_DF_STOPS.reset_index()

# These columns must be single-valued per STOP_ID; check that, then unwrap the arrays.
for col in ['STOP_NAME', 'STOP_SUBURB', 'STOP_FULL_NAME', 'LATITUDE', 'LONGITUDE', 'geometry']:
    assert SHP_DF_STOPS[col].apply(len).max() == 1, f'{col} has more than one value for some STOP_ID'
    SHP_DF_STOPS[col] = SHP_DF_STOPS[col].apply(lambda x: x[0])

# TICKETZONE and route_gtfs_id may hold several values per stop: join the non-null ones with commas.
for col in ['TICKETZONE', 'route_gtfs_id']:
    SHP_DF_STOPS[col] = SHP_DF_STOPS[col].apply(lambda x: ','.join([str(i) for i in x if not pd.isna(i)]))

# NOTE: 'geometry' already holds coordinate tuples at this point (it was converted with
# `.coords[0]` before the groupby), so it must not be converted a second time here:
# tuples have no `.coords` attribute.


In [285]:
# Spot check: stop '10697' as it appears in the shapefile-derived stops table.
SHP_DF_STOPS.loc[SHP_DF_STOPS['STOP_ID'] == '10697']

Unnamed: 0,STOP_ID,STOP_NAME,STOP_SUBURB,STOP_FULL_NAME,LATITUDE,LONGITUDE,TICKETZONE,route_gtfs_id,geometry
608,10697,VicRoads/Hartnett Dr,Seaford,VicRoads/Hartnett Dr (Seaford),-38.117843,145.140596,2,"4-779,4-778","(145.1406018143186, -38.1178298666463)"


In [286]:
# Same stop, looked up by its GTFS stop id in GA_DF_STOPS_FULL.
GA_DF_STOPS_FULL.loc[GA_DF_STOPS_FULL['stop_id_gtfs'] == '10697']

Unnamed: 0,stop_id_gtfs,stop_name_gtfs,stop_lat,stop_lon,mode_id_gtfs,point_id,operating_hours,mode_id_api,flexible_stop_opening_hours,station_type,...,stop_staffing_thu_pm_to,stop_staffing_tue_am_from,stop_staffing_tue_am_to,stop_staffing_tue_pm_from,stop_staffing_tue_pm_to,stop_staffing_wed_am_from,stop_staffing_wed_am_to,stop_staffing_wed_pm_from,stop_staffing_wed_pm_To,gtfs_stop_id
608,10697,VicRoads Customer Service Center/Hartnett Dr (...,-38.117843,145.140596,4,10697,N,1,,,...,,,,,,,,,,10697


In [287]:
# Same stop in the raw GTFS stops table.
GTFS_DF_STOPS.loc[GTFS_DF_STOPS['stop_id'] == '10697']

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,mode_id
2600,10697,VicRoads Customer Service Center/Hartnett Dr (...,-38.117843,145.140596,4


In [323]:
# Rows of GA_DF_STOPS_FULL that have no stop_id_api value.
GA_DF_STOPS_FULL.loc[GA_DF_STOPS_FULL['stop_id_api'].isna()]

Unnamed: 0,stop_id_gtfs,stop_name_gtfs,stop_lat,stop_lon,mode_id_gtfs,point_id,operating_hours,mode_id_api,flexible_stop_opening_hours,station_type,...,stop_staffing_thu_pm_to,stop_staffing_tue_am_from,stop_staffing_tue_am_to,stop_staffing_tue_pm_from,stop_staffing_tue_pm_to,stop_staffing_wed_am_from,stop_staffing_wed_am_to,stop_staffing_wed_pm_from,stop_staffing_wed_pm_To,gtfs_stop_id
429,10481,St Bedes College/Naples Rd (Mentone),-37.990184,145.068511,4,,,,,,...,,,,,,,,,,
916,11016,Torbay St/Greensborough Hwy (Macleod),-37.718029,145.081354,4,,,,,,...,,,,,,,,,,
1470,11657,Rowellyn Park Primary School/Tattler St (Carru...,-38.103490,145.192132,4,,,,,,...,,,,,,,,,,
1737,11953,VicRoads Customer Service Center/Hartnett Dr (...,-38.117827,145.140699,4,,,,,,...,,,,,,,,,,
2008,12335,Oatlands Primary School/Kurrajong Rd (Narre Wa...,-38.006027,145.316499,4,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25207,6451,West Gate Fwy/Williamstown Rd (Yarraville),-37.825415,144.881106,4,,,,,,...,,,,,,,,,,
27609,9137,Organ Pipes National Park/Calder Fwy (Keilor N...,-37.671687,144.764630,4,,,,,,...,,,,,,,,,,
27610,9138,Calder Park/Calder Fwy (Calder Park),-37.666316,144.754180,4,,,,,,...,,,,,,,,,,
27611,9139,Duncans Lane/Calder Fwy (Diggers Rest),-37.655018,144.740647,4,,,,,,...,,,,,,,,,,


In [4]:
# Load the cached stops info JSON. The context manager closes the file handle, which the
# bare json.load(open(...)) form left open.
with open(f'{DATA_DIR}/stops_info.json', 'r') as stops_info_file:
    API_STOPS_INFO = json.load(stops_info_file)

In [48]:
# Load the cached stops info JSON. The context manager closes the file handle, which the
# bare json.load(open(...)) form left open.
with open(f'{DATA_DIR}/stops_info.json', 'r') as stops_info_file:
    API_STOPS_INFO = json.load(stops_info_file)
API_STOPS_INFO_LIST = []

# The JSON is nested as {stop_id: {route_type: {'stop': {...}}}}; collect every 'stop' record
# after checking that it agrees with the keys it is filed under.
for stop_id in API_STOPS_INFO:
    for route_type in API_STOPS_INFO[stop_id]:
        stop_record = API_STOPS_INFO[stop_id][route_type]['stop']
        assert str(stop_record['route_type']) == str(route_type)
        assert str(stop_record['stop_id']) == str(stop_id)
        API_STOPS_INFO_LIST.append(stop_record)

# Flatten each record in place (these are the same dict objects held by API_STOPS_INFO).
for stop_info in API_STOPS_INFO_LIST:
    stop_info : dict
    # Snapshot the dict-valued entries first so keys can be added/removed while processing them.
    kv_is_dict = [(k, v) for k, v in stop_info.items() if isinstance(v, dict)]
    for k, v in kv_is_dict:
        # Hoist one level of nesting: {'a': {'b': 1}} becomes {'a_b': 1}.
        for k2, v2 in v.items():
            new_key = f'{k}_{k2}'
            assert new_key not in stop_info, f'{new_key} already exists in {stop_info}'
            stop_info[new_key] = v2
        del stop_info[k]
    # Integers (but not booleans, which are also ints in Python) are stored as strings.
    for k, v in stop_info.items():
        if isinstance(v, int) and not isinstance(v, bool):
            stop_info[k] = str(v)

API_DF_STOPS_INFO = pd.DataFrame(API_STOPS_INFO_LIST, dtype=str)

In [49]:
# Keys of the first flattened record that still hold a non-boolean int.
[
    key
    for key, value in API_STOPS_INFO_LIST[0].items()
    if isinstance(value, int) and not isinstance(value, bool)
]

[]

In [51]:
# Which stops have no stop_location_municipality_id.
API_DF_STOPS_INFO['stop_location_municipality_id'].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
20578     True
20579     True
20580     True
20581     True
20582     True
Name: stop_location_municipality_id, Length: 20583, dtype: bool