In [1]:
from hashlib import sha1
import hmac
import requests
import pandas as pd
import geopandas as gpd
import json
import os
import numpy as np
import logging
import sys
import time
import pyptvgtfs

In [3]:
# Base directory of the JSON cache files that the cells below read from and write to.
DATA_DIR = '../local/ptv-api/data'

In [3]:
# Commented-out code that writes the two caches loaded below:
# with open(f'{DATA_DIR}/all_stops.json', 'w') as f:
#     f.write(json.dumps(API_STOPS_STOPS))
# with open(f'{DATA_DIR}/all_stops_geopaths.json', 'w') as f:
#     f.write(json.dumps(API_STOPS_GEOPATHS))

# Read the caches inside context managers so every file handle is closed
# as soon as its JSON has been parsed.
with open(f'{DATA_DIR}/all_stops.json') as f:
    API_STOPS_STOPS = json.load(f)
with open(f'{DATA_DIR}/all_stops_geopaths.json') as f:
    API_STOPS_GEOPATHS = json.load(f)

API_DF_STOPS = pd.DataFrame(API_STOPS_STOPS, dtype=str)
API_DF_STOPS = API_DF_STOPS.drop(columns=['disruption_ids'])
# A list of zones becomes one comma-separated string; any other value is kept as it is.
API_DF_STOPS['stop_ticket_zones'] = API_DF_STOPS['stop_ticket_zones'].apply(lambda x: ', '.join(map(str, x)) if isinstance(x, list) else x)
API_DF_STOPS = API_DF_STOPS.drop(columns=['stop_ticket'])
# Flag stops whose stop_zone text mentions 'Regional'.
API_DF_STOPS['stop_is_regional'] = API_DF_STOPS['stop_zone'].apply(lambda x: 'Regional' in x)
API_DF_STOPS['stop_zones'] = API_DF_STOPS['stop_ticket_zones']
API_DF_STOPS = API_DF_STOPS.drop(columns=['stop_ticket_zones', 'stop_zone'])
# Keep only the columns of interest, in a fixed order.
API_DF_STOPS = API_DF_STOPS[['stop_id', 'stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude', 'stop_sequence', 'route_id', 'direction_id',  'route_type',  'stop_landmark', 'stop_zones', 'stop_ticket_type', 'stop_is_free_fare_zone', 'stop_is_regional', 'stop_ticket_machine', 'stop_ticket_checks', 'stop_vline_reservation']]
# 2s - 4s

# Distinct (stop_id, route_type) pairs, both as strings, used as lookup keys into API_STOPS_INFO.
API_all_stop_route_types = [
    (str(stop_id), str(route_type))
    for stop_id, route_type in API_DF_STOPS[['stop_id', 'route_type']].drop_duplicates().itertuples(index=False, name=None)
]

with open(f'{DATA_DIR}/stops_info.json') as f:
    API_STOPS_INFO = json.load(f)
# 1s - 3s

In [23]:
# Collect the detailed 'stop' record of every (stop_id, route_type) pair.
API_STOPS_INFO_LIST = []

for stop_id, route_type in API_all_stop_route_types:
    stop_record = API_STOPS_INFO[stop_id][route_type]['stop']
    # The cached record must describe the pair it is filed under.
    assert str(stop_record['route_type']) == str(route_type)
    assert str(stop_record['stop_id']) == str(stop_id)
    API_STOPS_INFO_LIST.append(stop_record)

# Flatten one level of nesting in place: {'a': {'b': 1}} becomes {'a_b': 1}.
for stop_info in API_STOPS_INFO_LIST:
    stop_info : dict
    # Snapshot the nested entries first, because the dict is modified while flattening.
    kv_is_dict = [(k, v) for k, v in stop_info.items() if isinstance(v, dict)]
    for k, v in kv_is_dict:
        for k2, v2 in v.items():
            new_key = f'{k}_{k2}'
            assert new_key not in stop_info, f'{new_key} already exists in {stop_info}'
            stop_info[new_key] = v2
        del stop_info[k]

API_DF_STOPS_INFO = pd.DataFrame(API_STOPS_INFO_LIST, dtype=str)

# Compare element-wise and reduce with .all(): comparing the array returned by
# .unique() raises an ambiguous-truth ValueError once there is more than one distinct value.
assert (API_DF_STOPS_INFO['station_details_id'] == '0').all(), 'station_details_id is expected to be "0" for every stop'

API_DF_STOPS_INFO.drop(columns=['station_details_id', 'disruption_ids'], inplace=True)

In [34]:
# Show every column name of the flattened stop-info frame.
API_DF_STOPS_INFO.columns.tolist()

['point_id',
 'operating_hours',
 'mode_id',
 'flexible_stop_opening_hours',
 'station_type',
 'station_description',
 'route_type',
 'routes',
 'stop_id',
 'stop_name',
 'stop_landmark',
 'stop_contact_lost_property_contact_number',
 'stop_contact_phone',
 'stop_contact_lost_property',
 'stop_contact_feedback',
 'stop_ticket_ticket_type',
 'stop_ticket_zone',
 'stop_ticket_is_free_fare_zone',
 'stop_ticket_ticket_machine',
 'stop_ticket_ticket_checks',
 'stop_ticket_vline_reservation',
 'stop_ticket_ticket_zones',
 'stop_location_postcode',
 'stop_location_municipality',
 'stop_location_municipality_id',
 'stop_location_primary_stop_name',
 'stop_location_road_type_primary',
 'stop_location_second_stop_name',
 'stop_location_road_type_second',
 'stop_location_bay_nbr',
 'stop_location_suburb',
 'stop_location_gps',
 'stop_amenities_seat_type',
 'stop_amenities_pay_phone',
 'stop_amenities_indoor_waiting_area',
 'stop_amenities_sheltered_waiting_area',
 'stop_amenities_bicycle_rack',
 

In [37]:
# Parse the GTFS archive; the result has one row per (mode_id, table_name) with the table in 'df'.
GTFS = pyptvgtfs.process_gtfs_zip('../downloads/20240315_193134/gtfs.zip', '20240315_193134')
# 1m - 3m
GTFS = GTFS.drop(columns=['version_id'])
GTFS_DFS = GTFS.set_index(['mode_id', 'table_name'])['df'].to_dict()
# Regroup the flat {(mode_id, table_name): df} mapping as {mode_id: {table_name: df}}.
new_GTFS_DFS = {}
for (mode_key, table_key), table_df in GTFS_DFS.items():
    new_GTFS_DFS.setdefault(mode_key, {})[table_key] = table_df
GTFS_DFS : dict[str, dict[str, pd.DataFrame]] = new_GTFS_DFS
# Stamp every table with the mode it belongs to, so tables can be concatenated across modes.
for mid, mode_tables in GTFS_DFS.items():
    for table_df in mode_tables.values():
        table_df['mode_id'] = mid

In [41]:
# Stack the 'stops' table of every mode into one frame with a fresh index.
GTFS_DF_STOPS = pd.concat([mode_tables['stops'] for mode_tables in GTFS_DFS.values()], ignore_index=True)

In [44]:
# One shared HTTP session for every API call made by get_data.
SESSION = requests.Session()
# Local settings file; get_data reads PTV_TIMETABLE_DEV_ID and PTV_TIMETABLE_API_KEY from it.
# Opened in a context manager so the file handle is closed right after parsing.
with open('../local-env.json') as f:
    ENV = json.load(f)

def get_ptv_api_url(
        endpoint : str,
        dev_id : str | int, 
        api_key : str | int,
    ):
    """
    Returns the URL to use PTV TimeTable API.

    Generates a signature from dev id (user id), API key, and endpoint.

    See the following for more information:
    - Home page: https://www.ptv.vic.gov.au/footer/data-and-reporting/datasets/ptv-timetable-api/
    - Swagger UI: https://timetableapi.ptv.vic.gov.au/swagger/ui/index
    - Swagger Docs JSON: https://timetableapi.ptv.vic.gov.au/swagger/docs/v3 (You can use this to find the endpoints you want to use.)
    """
    assert endpoint.startswith('/'), f'Endpoint must start with /, got {endpoint}'
    raw = f'{endpoint}{'&' if '?' in endpoint else '?'}devid={dev_id}'
    hashed = hmac.new(api_key.encode('utf-8'), raw.encode('utf-8'), sha1)  # Encode the raw string to bytes
    signature = hashed.hexdigest()
    return f'https://timetableapi.ptv.vic.gov.au{raw}&signature={signature}'


def get_data(endpoint : str, need_auth : bool = True, timeout : float | None = None):
    """
    Returns the data from the URL.

    Args:
        endpoint: API path starting with ``/``, optionally with a query string.
        need_auth: When true, the request URL is signed with the developer id
            and API key stored in ``ENV``; when false, the endpoint is
            requested unsigned.
        timeout: Seconds to wait for the server, passed straight to
            ``requests``. The default ``None`` waits indefinitely, which is the
            previous behaviour; pass a number to stop a stalled connection
            from blocking the notebook forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the response status is an error status.
    """
    if need_auth:
        url = get_ptv_api_url(endpoint, ENV['PTV_TIMETABLE_DEV_ID'], ENV['PTV_TIMETABLE_API_KEY'])
    else:
        url = f'https://timetableapi.ptv.vic.gov.au{endpoint}'
    response = SESSION.get(url, timeout=timeout)
    # Turn 4xx/5xx responses into exceptions so callers can decide whether to retry.
    response.raise_for_status()
    return response.json()

In [47]:
# The Swagger document is fetched unsigned.
API_DOCS = get_data('/swagger/docs/v3', need_auth=False)

# Paths without a '{...}' placeholder.
STATIC_API_ENDPOINTS = [path for path in API_DOCS['paths'] if '{' not in path]

# Annotated as lists of dicts: API_ROUTES is iterated record by record below, and
# API_ROUTE_TYPES is iterated the same way when API_ROUTE_TYPE_IDS is built.
API_ROUTES : list[dict] = get_data('/v3/routes')['routes']

API_ROUTE_TYPES : list[dict] = get_data('/v3/route_types')['route_types']

# NOTE(review): the container types of the next three payloads are not
# established by this notebook; the original `dict` annotations are kept.
API_DISRUPTIONS : dict = get_data('/v3/disruptions')['disruptions']

API_DISRUPTION_MODES : dict = get_data('/v3/disruptions/modes')['disruption_modes']

API_OUTLETS : dict = get_data('/v3/outlets')['outlets']

# Create dataframes from the data

API_DF_ROUTE_TYPES = pd.DataFrame(API_ROUTE_TYPES)

API_DF_DISRUPTION_MODES = pd.DataFrame(API_DISRUPTION_MODES)

API_DF_OUTLETS = pd.DataFrame(API_OUTLETS)

# There are some faulty data in the outlets data. In particular, the latitude is > 0, which is not possible in Victoria.
# Positive latitudes are negated; all other values are left untouched.
API_DF_OUTLETS['outlet_latitude'] = API_DF_OUTLETS['outlet_latitude'].apply(lambda x: -x if x > 0 else x)

# Lift the keys of each route's nested 'route_service_status' dict to the top level of the route.
for route in API_ROUTES:
    service_status = route.pop('route_service_status')
    for k, v in service_status.items():
        assert k not in route, f'Key {k} already exists in route'
        route[k] = v


API_DF_ROUTES = pd.DataFrame(API_ROUTES, dtype=str)


In [None]:
# String-to-string lookup table. Judging by its name, the keys are GTFS mode ids
# and the values are PTV API route types - TODO confirm against the API route types.
GTFS_MODE_TO_API_ROUTE_TYPE = {
    '1': '3',
    '2': '0',
    '3': '1',
    '4': '2',
    '5': '3',
    '6': '3',
    '7': '3',
    '8': '4',
    '11': '3',
}

In [78]:
# Distinct (stop_id, route_type, point_id) combinations from the detailed stop records.
API_DF_STOPS_GTFS_MAP = API_DF_STOPS_INFO.loc[:, ['stop_id', 'route_type', 'point_id']].drop_duplicates()
# Every (stop_id, point_id) pair must map to exactly one route_type.
route_types_per_pair = API_DF_STOPS_GTFS_MAP.groupby(['stop_id', 'point_id'])['route_type'].nunique()
assert route_types_per_pair.eq(1).all()

In [88]:
# Outer-join GTFS stops with the API mapping, matching GTFS stop_id against API point_id;
# columns present on both sides get the '_gtfs' / '_api' suffix.
GA_DF_STOPS = GTFS_DF_STOPS.merge(
    API_DF_STOPS_GTFS_MAP,
    how='outer',
    left_on='stop_id',
    right_on='point_id',
    suffixes=('_gtfs', '_api'),
)

In [133]:
# GA_STOPS[GA_STOPS['stop_id_gtfs'].isna()]
# Unique (stop_id_gtfs, mode_id) tuples of the rows that have no stop_id_api after the merge.
gtfs_only_rows = GA_DF_STOPS.loc[GA_DF_STOPS['stop_id_api'].isna(), ['stop_id_gtfs', 'mode_id']]
point_gtfs_mode_id_list = gtfs_only_rows.apply(tuple, axis=1).unique()

# Per mode_id: the route_types observed, ordered from most to least frequent.
rows_with_route_type = GA_DF_STOPS.dropna(subset='route_type')
gtfs_mode_2_route_type_api = rows_with_route_type.groupby('mode_id')['route_type'].apply(lambda route_types: route_types.value_counts().index.tolist())


In [136]:
# The 'route_type' value of every entry returned by the route_types endpoint.
API_ROUTE_TYPE_IDS = [route_type_entry['route_type'] for route_type_entry in API_ROUTE_TYPES]

In [231]:
# Results of the per-stop lookups: {point_id: {route_type: stop record}}.
GA_MISSING_STOPS = {}
# For each GTFS mode_id, the API route_types to query, in the order they are tried.
priority_route_types = {
    '1': ['3', '0', '2', '1', '4'],
    '2': ['0', '3', '2', '1', '4'],
    '3': ['1', '2', '3', '0', '4'],
    '4': ['2', '3', '1', '0', '4'],
    '5': ['3', '2', '1', '0', '4'],
    '6': ['2', '3', '1', '0', '4'],
    '7': ['2', '3', '1', '0', '4'],
    '8': ['2', '3', '1', '0', '4'],
    '10': ['0', '3', '2', '1', '4'],
    '11': ['2', '3', '1', '0', '4'],
}
for i, (point_id, mode_id) in enumerate(point_gtfs_mode_id_list):
    for route_type in priority_route_types[mode_id]:
        endpoint = (
            f'/v3/stops/{point_id}/route_type/{route_type}'
            '?gtfs=true&stop_location=true&stop_amenities=true&stop_accessibility=true'
            '&stop_contact=true&stop_ticket=true&stop_staffing=true&stop_disruptions=false'
        )
        data = None
        # Keep requesting until a response arrives or a non-retryable error is returned.
        while data is None:
            try:
                data = get_data(endpoint)
            except requests.exceptions.HTTPError as e:
                status = e.response.status_code
                if status not in [403, 503, 504]:
                    # Any other HTTP error: give up on this route_type.
                    print(f'[{i}] [{point_id} {route_type}] Got error {status}.')
                    break
                print(f'[{i}] [{point_id} {route_type}] Timeout. Error {status}. Retrying in 30 seconds...')
                time.sleep(30)
            else:
                GA_MISSING_STOPS.setdefault(point_id, {})[route_type] = data['stop']
                print(f'[{i}] [{point_id} {route_type}] Got data for stop {point_id}')
        # Every route_type is always tried (no early exit after a success): the
        # assumption that each stop has only one route type was tested by querying
        # all route types for a stop and seems to be false - some stops have several.
    if point_id not in GA_MISSING_STOPS:
        print(f'[{i}] ERROR: Failed to get data for stop {point_id}')

# Full 5 route_types: 1m - 2m per 100. 86 mins total for 7694 stops
# Select 1 route_type only: 17m - 20m total
with open(f'{DATA_DIR}/missing_stops_real.json', 'w') as f:
    json.dump(GA_MISSING_STOPS, f)


[0] [10019 2] Got data for stop 10019
[0] [10019 3] Got error 400.
[0] [10019 1] Got error 400.
[0] [10019 0] Got error 400.
[0] [10019 4] Got error 400.
[0] ERROR: Failed to get data for stop 10019
[1] [10020 2] Got data for stop 10020
[1] [10020 3] Got error 400.
[1] [10020 1] Got error 400.
[1] [10020 0] Got error 400.
[1] [10020 4] Got error 400.
[1] ERROR: Failed to get data for stop 10020
[2] [10021 2] Got data for stop 10021
[2] [10021 3] Got error 400.
[2] [10021 1] Got error 400.
[2] [10021 0] Got error 400.
[2] [10021 4] Got error 400.
[2] ERROR: Failed to get data for stop 10021
[3] [10022 2] Got data for stop 10022
[3] [10022 3] Got error 400.
[3] [10022 1] Got error 400.
[3] [10022 0] Got error 400.
[3] [10022 4] Got error 400.
[3] ERROR: Failed to get data for stop 10022
[4] [10023 2] Got data for stop 10023
[4] [10023 3] Got error 400.
[4] [10023 1] Got error 400.
[4] [10023 0] Got error 400.
[4] [10023 4] Got error 400.
[4] ERROR: Failed to get data for stop 10023
[5] [

In [250]:
# Reload the scraped stops from disk; the context manager closes the file after parsing.
with open(f'{DATA_DIR}/missing_stops_real.json') as f:
    GA_MISSING_STOPS = json.load(f)
GA_MISSING_STOPS_LIST = []
# Distinct GTFS stop ids that were looked up (a stop can appear under several modes).
point_gtfs_list = list({point_id for point_id, mode_id in point_gtfs_mode_id_list})
for point_id in point_gtfs_list:
    if point_id not in GA_MISSING_STOPS:
        # No lookup succeeded for this stop.
        continue
    for route_type, stop in GA_MISSING_STOPS[point_id].items():
        # assert all(['gtfs' not in k for k in GA_MISSING_STOPS[point_id][route_type].keys()])
        assert 'gtfs_stop_id' not in stop, f'{point_id}'
        assert str(stop['route_type']) == str(route_type), f'{point_id}'
        # Remember which GTFS stop id this API record was fetched for.
        stop['gtfs_stop_id'] = point_id
        GA_MISSING_STOPS_LIST.append(stop)

# Flatten one level of nesting in place: {'a': {'b': 1}} becomes {'a_b': 1}.
for stop_info in GA_MISSING_STOPS_LIST:
    stop_info : dict
    # Snapshot the nested entries first, because the dict is modified while flattening.
    kv_is_dict = [(k, v) for k, v in stop_info.items() if isinstance(v, dict)]
    for k, v in kv_is_dict:
        for k2, v2 in v.items():
            new_key = f'{k}_{k2}'
            assert new_key not in stop_info, f'{new_key} already exists in {stop_info}'
            stop_info[new_key] = v2
        del stop_info[k]

GA_DF_MISSING_STOPS = pd.DataFrame(GA_MISSING_STOPS_LIST, dtype=str)

# Compare element-wise and reduce with .all(): comparing the array returned by
# .unique() raises an ambiguous-truth ValueError once there is more than one distinct value.
assert (GA_DF_MISSING_STOPS['station_details_id'] == '0').all(), 'station_details_id is expected to be "0" for every stop'

GA_DF_MISSING_STOPS.drop(columns=['station_details_id', 'disruption_ids'], inplace=True)

# Narrow view holding only the identifier columns.
GA_DF_MISSING_STOPS_MIN = GA_DF_MISSING_STOPS[['gtfs_stop_id', 'stop_id', 'point_id', 'route_type']]

assert len(GA_DF_MISSING_STOPS) == len(GA_DF_MISSING_STOPS_MIN)

In [246]:
# Rows whose GTFS stop id equals the API point_id.
GA_DF_MISSING_STOPS_MIN.loc[GA_DF_MISSING_STOPS_MIN['gtfs_stop_id'].eq(GA_DF_MISSING_STOPS_MIN['point_id'])]

# Route types seen per API stop_id, then the stops that have more than one.
k = GA_DF_MISSING_STOPS_MIN.groupby('stop_id')['route_type'].unique()
k.loc[k.map(len).gt(1)]
# How many stops have 1, 2, ... distinct route types.
GA_DF_MISSING_STOPS_MIN.groupby('stop_id')['route_type'].nunique().value_counts()

# Rows whose gtfs_stop_id occurs more than once.
GA_DF_MISSING_STOPS_MIN.loc[GA_DF_MISSING_STOPS_MIN['gtfs_stop_id'].duplicated(keep=False)]
1

1

In [259]:
# Copy point_id into a gtfs_stop_id column, the column name GA_DF_MISSING_STOPS
# uses for the GTFS stop id each of its records was fetched for.
API_DF_STOPS_INFO['gtfs_stop_id'] = API_DF_STOPS_INFO['point_id']

In [260]:
# Stack the cached stop details and the separately fetched missing stops into one frame with a fresh index.
API_DF_STOPS_INFO_FULL = pd.concat([API_DF_STOPS_INFO, GA_DF_MISSING_STOPS], ignore_index=True)

In [265]:
# Distribution of the number of distinct API stop_ids per gtfs_stop_id.
stop_ids_per_gtfs_stop = API_DF_STOPS_INFO_FULL.groupby('gtfs_stop_id')['stop_id'].nunique()
stop_ids_per_gtfs_stop.value_counts()

stop_id
1    27721
Name: count, dtype: int64

In [268]:
# Outer-join GTFS stops with the complete API stop details on the GTFS stop id;
# columns present on both sides get the '_gtfs' / '_api' suffix.
GA_DF_STOPS_FULL = GTFS_DF_STOPS.merge(
    API_DF_STOPS_INFO_FULL,
    how='outer',
    left_on='stop_id',
    right_on='gtfs_stop_id',
    suffixes=('_gtfs', '_api'),
)

In [298]:
# Look up the API route with route_id '5747'.
API_DF_ROUTES.loc[API_DF_ROUTES['route_id'].eq('5747')]

Unnamed: 0,route_type,route_id,route_name,route_number,route_gtfs_id,geopath,description,timestamp
253,2,5747,Dandenong - Brighton via Parkmore Shopping Centre,812,4-812,[],Good Service,2024-03-16T12:20:48.3830121+00:00


In [326]:
# df1 = pd.DataFrame(get_data('/v3/stops/route/5747/route_type/2')['stops'])
# Stops of route 5747 (route_type 2), requested for direction 261 with geopaths included.
route_5747_stops = get_data('/v3/stops/route/5747/route_type/2?direction_id=261&include_geopath=true')['stops']
df1 = pd.DataFrame(route_5747_stops)

In [327]:
# Number of stops returned by the API for the route/direction queried above.
df1.shape[0]

198

In [333]:
# Load the cached stops list; the context manager closes the file after parsing.
with open(f'{DATA_DIR}/stops.json') as f:
    datak = json.load(f)

In [336]:
dfdatak = pd.DataFrame(datak, dtype=str)
# Count the cached rows for route 5747 in direction 261.
is_route_5747 = dfdatak['route_id'].eq('5747')
is_direction_261 = dfdatak['direction_id'].eq('261')
len(dfdatak.loc[is_route_5747 & is_direction_261])

197

In [318]:
# Stops whose name contains 'College'.
name_mentions_college = df1['stop_name'].str.contains('College')
df1.loc[name_mentions_college]

Unnamed: 0,disruption_ids,stop_suburb,route_type,stop_latitude,stop_longitude,stop_sequence,stop_ticket,stop_id,stop_name,stop_landmark
83,[],Brighton East,2,-37.929645,145.006088,121,"{'ticket_type': '', 'zone': 'Zone 2', 'is_free...",20367,Haileybury College Junior School/118 South Rd,Haileybury College Junior School
102,[],Mentone,2,-37.98406,145.066238,71,"{'ticket_type': '', 'zone': 'Zone 2', 'is_free...",20329,Kilbreda College/Como Pde,Kilbreda College
167,[],Mentone,2,-37.990185,145.068512,64,,23071,St Bedes College/Naples Rd,St Bedes College
168,[],Brighton East,2,-37.92957,145.007019,0,"{'ticket_type': '', 'zone': 'Zone 2', 'is_free...",20486,St Leonards College/South Rd,St Leonards College


In [320]:
# Request stop 23071 on route_type 2 directly. The recorded output of this cell is an
# HTTP 400 (Bad Request) error.
get_data('/v3/stops/23071/route_type/2')

HTTPError: 400 Client Error: Bad Request for url: https://timetableapi.ptv.vic.gov.au/v3/stops/23071/route_type/2?devid=3002773&signature=330c768dc3d30214544b17fbab3253d68abf2ba5

In [293]:
# Rows whose GTFS stop name contains the given text (missing names never match).
GA_DF_STOPS_FULL.loc[GA_DF_STOPS_FULL['stop_name_gtfs'].str.contains('VicRoads Customer Service Center/Hartnett Dr', na=False)]
# Rows whose API stop id contains '23071' (missing ids never match).
GA_DF_STOPS_FULL.loc[GA_DF_STOPS_FULL['stop_id_api'].str.contains('23071', na=False)]

Unnamed: 0,stop_id_gtfs,stop_name_gtfs,stop_lat,stop_lon,mode_id_gtfs,point_id,operating_hours,mode_id_api,flexible_stop_opening_hours,station_type,...,stop_staffing_thu_pm_to,stop_staffing_tue_am_from,stop_staffing_tue_am_to,stop_staffing_tue_pm_from,stop_staffing_tue_pm_to,stop_staffing_wed_am_from,stop_staffing_wed_am_to,stop_staffing_wed_pm_from,stop_staffing_wed_pm_To,gtfs_stop_id
36,10040,Gilbert Rd/Bell St (Preston),-37.743975,144.989530,4,9039,N,1,,,...,,,,,,,,,,10040
37,10041,Erin St/Bell St (Preston),-37.744277,144.992926,4,10041,N,1,,,...,,,,,,,,,,10041
38,10042,Penola St/Bell St (Preston),-37.744553,144.995938,4,10042,N,1,,,...,,,,,,,,,,10042
40,10044,Plenty Rd/Bell St (Preston),-37.745483,145.005037,4,9043,N,1,,,...,,,,,,,,,,10044
46,10050,Plunkett St/Bell St (Bellfield (3081)),-37.750313,145.042668,4,10050,N,1,,,...,,,,,,,,,,10050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28054,9630,Savona St/Warrigal Rd (Mentone),-37.980604,145.071746,4,9630,N,1,,,...,,,,,,,,,,9630
28055,9631,Cervara St/Warrigal Rd (Mentone),-37.978591,145.072128,4,9631,N,1,,,...,,,,,,,,,,9631
28212,982,Malpas St/Albert St (Preston),-37.734210,145.024369,4,982,N,1,,,...,,,,,,,,,,982
28278,9884,Sherbourne Ave/Blandford Cres (Bayswater North),-37.829825,145.293469,4,9884,N,1,,,...,,,,,,,,,,9884


In [199]:
stop_id_api_list = GA_DF_STOPS['stop_id_api'].unique()
# Missing stops whose API stop_id does not occur in GA_DF_STOPS. isin() replaces a
# per-row scan of the whole array. Missing values are dropped from the lookup set so
# that a missing stop_id still counts as "not present", as it did with the scan.
known_api_stop_ids = pd.Series(stop_id_api_list).dropna()
df1 = GA_DF_MISSING_STOPS_MIN[~GA_DF_MISSING_STOPS_MIN['stop_id'].isin(known_api_stop_ids)]

In [242]:
# GTFS rows of stop 13762.
GTFS_DF_STOPS.loc[GTFS_DF_STOPS['stop_id'].eq('13762')]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,mode_id
20584,13762,Hudson Park/Sydney St (Kilmore),-37.295707,144.952093,5
21461,13762,Hudson Park/Sydney St (Kilmore),-37.295707,144.952093,6


In [202]:
# GTFS rows of stop 10481.
GTFS_DF_STOPS.loc[GTFS_DF_STOPS['stop_id'].eq('10481')]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,mode_id
2422,10481,St Bedes College/Naples Rd (Mentone),-37.990184,145.068511,4


In [209]:
# GA_DF_MISSING_STOPS is built with dtype=str, so stop_id holds strings; comparing
# against the integer 23071 can never match and always yields an empty frame.
GA_DF_MISSING_STOPS[GA_DF_MISSING_STOPS['stop_id'] == '23071']

Unnamed: 0,point_id,operating_hours,mode_id,flexible_stop_opening_hours,station_type,station_description,route_type,routes,stop_id,stop_name,...,stop_staffing_thu_pm_from,stop_staffing_thu_pm_to,stop_staffing_tue_am_from,stop_staffing_tue_am_to,stop_staffing_tue_pm_from,stop_staffing_tue_pm_to,stop_staffing_wed_am_from,stop_staffing_wed_am_to,stop_staffing_wed_pm_from,stop_staffing_wed_pm_To


In [283]:
# Stack the 'routes' table of every mode (the original row labels are kept).
GTFS_DF_ROUTES = pd.concat([mode_tables['routes'] for mode_tables in GTFS_DFS.values()])
# 1m - 2m
# Split route_id on '-' and expose selected pieces as separate columns.
route_id_parts = GTFS_DF_ROUTES['route_id'].map(lambda route_id: route_id.split('-'))
GTFS_DF_ROUTES['route_idx'] = route_id_parts
part_selectors = (
    ('route_id0', lambda parts: parts[0]),
    ('route_id1', lambda parts: parts[1]),
    # The third piece is only taken when the id has more than four pieces.
    ('route_id2', lambda parts: parts[2] if len(parts) > 4 else np.nan),
    ('route_id3', lambda parts: parts[-2]),
    ('route_id4', lambda parts: parts[-1]),
)
for part_column, select_part in part_selectors:
    GTFS_DF_ROUTES[part_column] = route_id_parts.map(select_part)
GTFS_DF_ROUTES['route_id01'] = GTFS_DF_ROUTES['mode_id'] + '-' + GTFS_DF_ROUTES['route_id1']
# '<mode_id>-<route_id1>', with route_id2 appended for mode '4' when it is present.
GTFS_DF_ROUTES['route_gtfs_id'] = [
    f'{mode}-{id1}' + (id2 if mode == '4' and pd.notna(id2) else '')
    for mode, id1, id2 in zip(GTFS_DF_ROUTES['mode_id'], GTFS_DF_ROUTES['route_id1'], GTFS_DF_ROUTES['route_id2'])
]

min_route_columns = ['route_gtfs_id', 'route_short_name', 'route_long_name', 'mode_id']
GTFS_DF_ROUTES_MIN = GTFS_DF_ROUTES.loc[:, min_route_columns].drop_duplicates()


SHP_DIR = '../local/ptv-spatial-datasets'
# One GeoDataFrame per '.shp' file in SHP_DIR, keyed by the file name up to its first '.'.
# Annotated as a dict of GeoDataFrames (it was annotated as a single GeoDataFrame).
SHP_GDFS : dict[str, gpd.GeoDataFrame] = { f.split('.')[0]: gpd.read_file(os.path.join(SHP_DIR, f)) for f in os.listdir(SHP_DIR) if f.endswith('.shp') }
# Each '<name>_column_names.txt' file supplies a column rename mapping for the matching layer.
for f in os.listdir(SHP_DIR):
    if f.endswith('.txt'):
        gdf_name = f.removesuffix('_column_names.txt').upper()
        with open(os.path.join(SHP_DIR, f), 'r') as file:
            # The first four lines of the file are skipped.
            gdf_column_names = [line.strip() for line in file][4:]
        assert gdf_name in SHP_GDFS, f'{gdf_name} not in GDFS'
        for line in gdf_column_names:
            assert ' = ' in line, f'Invalid line: {line}'
        # Each remaining line reads '<current column> = <new column>'.
        gdf_column_names = { line.split(' = ')[0]: line.split(' = ')[1] for line in gdf_column_names }
        SHP_GDFS[gdf_name].rename(columns=gdf_column_names, inplace=True)

# Expectations about ROUTES_USING_STOP per layer.
assert SHP_GDFS['PTV_METRO_BUS_STOP']['ROUTES_USING_STOP'].notna().all()
assert SHP_GDFS['PTV_METRO_TRAM_STOP']['ROUTES_USING_STOP'].notna().all()
assert SHP_GDFS['PTV_METRO_TRAIN_STATION']['ROUTES_USING_STOP'].notna().all()
# Check if a column exists in a GeoDataFrame
assert 'ROUTES_USING_STOP' not in SHP_GDFS['PTV_REGIONAL_COACH_STOP'].columns
assert SHP_GDFS['PTV_SKYBUS_STOP']['ROUTES_USING_STOP'].isna().all()
assert SHP_GDFS['PTV_REGIONAL_BUS_STOP']['ROUTES_USING_STOP'].notna().any()



# Tag every route layer with the name of the layer it came from, then stack them.
for mid, gdf in SHP_GDFS.items():
    if 'ROUTE' in mid:
        gdf['SHP_FILE'] = mid
SHP_DF_ROUTES : pd.DataFrame = pd.concat([gdf for mid, gdf in SHP_GDFS.items() if 'ROUTE' in mid], ignore_index=True)



# Split ROUTE_ID on '-' and expose selected pieces as separate columns.
SHP_DF_ROUTES['route_idx'] = SHP_DF_ROUTES['ROUTE_ID'].apply(lambda x: x.split('-'))
SHP_DF_ROUTES['route_id0'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[0])
SHP_DF_ROUTES['route_id1'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[1])
# The third piece is only taken when the id has more than four pieces.
SHP_DF_ROUTES['route_id2'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[2] if len(x) > 4 else np.nan)
SHP_DF_ROUTES['route_id3'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[-2])
SHP_DF_ROUTES['route_id4'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[-1])
SHP_DF_ROUTES['route_id01'] = SHP_DF_ROUTES['route_id0'] + '-' + SHP_DF_ROUTES['route_id1']

# Within one route_id01, at most one of route_id2 / route_id4 may take several values.
assert SHP_DF_ROUTES.groupby('route_id01').aggregate({'route_id2': 'unique', 'route_id3': 'unique', 'route_id4': 'unique'}).apply(lambda x: len(x['route_id2']) <= 1 or len(x['route_id4']) <= 1, axis=1).all()

# Since ROUTE_LONG_NAME is all not null, when we merge dataframes, we can use ROUTE_LONG_NAME isna() to check if the other dataframe has the equivalent data in SHP_DF_ROUTES_MIN
assert SHP_DF_ROUTES['ROUTE_LONG_NAME'].notna().all()

# Inspect the route_id0 values of SHP_DF_ROUTES
SHP_DF_ROUTES['route_id0'].sort_values(key=lambda x: x.apply(int)).unique() # array(['3', '4', '5', '6', '7', '11'], dtype=object)

# Assert that all metro buses route_short_name are 3 or 4 characters long
assert SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id0'] == '4')]['ROUTE_SHORT_NAME'].apply(lambda x: len(x) in [3, 4]).all()
# Proof that route_id1 is unique for each ROUTE_SHORT_NAME for route_id0 == 4
odd_bus_id1_names = SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id1'] != SHP_DF_ROUTES['ROUTE_SHORT_NAME']) & (SHP_DF_ROUTES['route_id0'] == '4')]['ROUTE_SHORT_NAME'].unique()
odd_bus_rows = SHP_DF_ROUTES[SHP_DF_ROUTES['ROUTE_SHORT_NAME'].isin(odd_bus_id1_names) & (SHP_DF_ROUTES['route_id0'] == '4')]
# Compare per group and reduce with .all(): comparing the array from .unique() with [1]
# fails when there are no odd names at all and raises when there are several distinct counts.
assert (odd_bus_rows.groupby('ROUTE_SHORT_NAME')['route_id1'].nunique() == 1).all()

def get_gtfs_id(x):
    """
    Build the GTFS-style route id for one shapefile route row.

    Args:
        x: Mapping-like row providing 'route_id0' and 'route_id1', plus
            'ROUTE_SHORT_NAME' when 'route_id0' is '4'.

    Returns:
        - '4-<ROUTE_SHORT_NAME>' when route_id0 is '4';
        - '7-B<nn>' when route_id0 is '7', where <nn> is route_id1 without its
          leading 'TB' prefix, left-padded with zeros to at least two characters;
        - '<route_id0>-<route_id1>' otherwise.

    Raises:
        AssertionError: If route_id0 is '7' and route_id1 does not contain 'TB'.
    """
    if x['route_id0'] == '4':
        return f'4-{x["ROUTE_SHORT_NAME"]}'
    elif x['route_id0'] == '7':
        assert 'TB' in x["route_id1"], f'7-TeleBus route_id1 {x["route_id1"]} does not contain TB'
        # removeprefix drops exactly one leading 'TB'. lstrip('TB') treats its argument
        # as a set of characters and would also eat any further leading 'T' or 'B'.
        route_number = x["route_id1"].removeprefix('TB')
        # Left-pad with zeros to at least two characters
        route_number = route_number.zfill(2)
        return f'7-B{route_number}'
    else:
        return f'{x["route_id0"]}-{x["route_id1"]}'

# '<route_id0>-<route_id1>', with route_id2 appended for route_id0 '4' when it is present.
SHP_DF_ROUTES['route_shp_id'] = [
    f'{id0}-{id1}' + (id2 if id0 == '4' and pd.notna(id2) else '')
    for id0, id1, id2 in zip(SHP_DF_ROUTES['route_id0'], SHP_DF_ROUTES['route_id1'], SHP_DF_ROUTES['route_id2'])
]
SHP_DF_ROUTES['route_gtfs_id'] = SHP_DF_ROUTES.apply(get_gtfs_id, axis=1)

shp_min_columns = ['route_gtfs_id', 'route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'ROUTE_LONG_NAME', 'SHP_FILE']
SHP_DF_ROUTES_MIN = SHP_DF_ROUTES[shp_min_columns].drop_duplicates()

# One row per route_gtfs_id; every other column becomes the array of its distinct values.
distinct_values_per_column = {column: 'unique' for column in shp_min_columns[1:]}
SHP_DF_ROUTES_MIN = SHP_DF_ROUTES_MIN.groupby('route_gtfs_id').aggregate(distinct_values_per_column).reset_index()

# These columns must hold exactly one distinct value per route_gtfs_id.
single_valued_columns = ['route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'SHP_FILE']
for column in single_valued_columns:
    assert SHP_DF_ROUTES_MIN[column].apply(lambda values: len(values) == 1).all()

# Inspect multiple ROUTE_LONG_NAME of the same route_gtfs_id
SHP_DF_ROUTES_MIN[SHP_DF_ROUTES_MIN['ROUTE_LONG_NAME'].apply(lambda values: len(values) != 1)]

# Unwrap the single-element arrays into plain values.
for column in single_valued_columns:
    SHP_DF_ROUTES_MIN[column] = SHP_DF_ROUTES_MIN[column].apply(lambda values: values[0])

# Inspect SHP_FILE and route_id0
SHP_DF_ROUTES_MIN[['SHP_FILE', 'route_id0']].drop_duplicates().sort_values('SHP_FILE')


# STOP_ID must be unique within each of these layers.
for layer_name in ['PTV_METRO_TRAM_STOP', 'PTV_METRO_TRAIN_STATION', 'PTV_REGIONAL_BUS_STOP', 'PTV_REGIONAL_COACH_STOP', 'PTV_SKYBUS_STOP']:
    assert SHP_GDFS[layer_name]['STOP_ID'].is_unique

# STOP ID in PTV_METRO_BUS_STOP is not unique, however it's only because of the addition of TeleBus routes
metro_bus_stops = SHP_GDFS['PTV_METRO_BUS_STOP']
repeated_stop_rows = metro_bus_stops[metro_bus_stops['STOP_ID'].duplicated(keep=False)]
shp_metro_bus_stop_duplicated_ids = repeated_stop_rows.groupby('STOP_ID')['ROUTES_USING_STOP'].unique()
# Every repeated STOP_ID has exactly two distinct route strings, exactly one of which mentions TeleBus.
assert shp_metro_bus_stop_duplicated_ids.apply(lambda routes: len(routes) == 2).all()
assert shp_metro_bus_stop_duplicated_ids.apply(lambda routes: sum('TeleBus' in route for route in routes) == 1).all()

# Split PTV_METRO_BUS_STOP into PTV_METROBUS_STOP and PTV_TELEBUS_STOP
uses_telebus = metro_bus_stops['ROUTES_USING_STOP'].apply(lambda routes: 'TeleBus' in routes)
SHP_GDFS['PTV_METROBUS_STOP'] = metro_bus_stops[~uses_telebus].reset_index(drop=True)
SHP_GDFS['PTV_TELEBUS_STOP'] = metro_bus_stops[uses_telebus].reset_index(drop=True)

for layer_name in ['PTV_METROBUS_STOP', 'PTV_TELEBUS_STOP']:
    assert SHP_GDFS[layer_name]['STOP_ID'].is_unique

# Stop layers keyed by mode id.
SHP_DFS_STOPS = {
    '2': SHP_GDFS['PTV_METRO_TRAIN_STATION'],
    '3': SHP_GDFS['PTV_METRO_TRAM_STOP'],
    '4': SHP_GDFS['PTV_METROBUS_STOP'],
    # '5': SHP_GDFS['PTV_REGIONAL_COACH_STOP'],
    '6': SHP_GDFS['PTV_REGIONAL_BUS_STOP'],
    '7': SHP_GDFS['PTV_TELEBUS_STOP'],
    '11': SHP_GDFS['PTV_SKYBUS_STOP']
}

for mid, stops_layer in list(SHP_DFS_STOPS.items()):
    # Adds a ROUTE column (list of route names, empty when missing) to the layer itself,
    # then keeps one (STOP_ID, ROUTE) row per listed route.
    stops_layer['ROUTE'] = stops_layer['ROUTES_USING_STOP'].apply(lambda routes: routes.split(',') if pd.notna(routes) else [])
    SHP_DFS_STOPS[mid] = stops_layer[['STOP_ID', 'ROUTE']].explode('ROUTE').reset_index(drop=True)

# Attach the route details of the same mode by matching ROUTE against ROUTE_SHORT_NAME.
for mid in ['3', '4', '6', '7', '11']:
    routes_of_mode = SHP_DF_ROUTES_MIN[SHP_DF_ROUTES_MIN['route_id0'] == mid]
    SHP_DFS_STOPS[mid] = SHP_DFS_STOPS[mid].merge(routes_of_mode, left_on='ROUTE', right_on='ROUTE_SHORT_NAME', how='left')


# Assert that there is no odd ROUTE in SHP_DFS_STOPS
for mid in ['3', '4', '7']:
    assert SHP_DFS_STOPS[mid]['ROUTE_SHORT_NAME'].notna().all()
# assert 'ROUTE' not in SHP_DFS_STOPS['5'].columns
# For these modes the only unmatched ROUTE values allowed are empty strings.
for mid in ['6', '11']:
    stop_routes = SHP_DFS_STOPS[mid]
    unmatched_routes = stop_routes[stop_routes['ROUTE'].notna() & stop_routes['ROUTE_SHORT_NAME'].isna()]
    assert (unmatched_routes['ROUTE'] == '').all()


# Mode '2' routes from GTFS: distinct (route_gtfs_id, route_short_name), sorted by short name.
gtfs_df_routes_metrotrains = (
    GTFS_DF_ROUTES_MIN.loc[GTFS_DF_ROUTES_MIN['mode_id'] == '2', ['route_gtfs_id', 'route_short_name']]
    .drop_duplicates()
    .sort_values('route_short_name')
)

# Mode '2' stops get their route_gtfs_id by matching ROUTE against the GTFS short name.
train_stop_routes = SHP_DFS_STOPS['2'][['STOP_ID', 'ROUTE']].drop_duplicates()
SHP_DFS_STOPS['2'] = train_stop_routes.merge(gtfs_df_routes_metrotrains, left_on='ROUTE', right_on='route_short_name', how='left')
assert SHP_DFS_STOPS['2']['route_gtfs_id'].notna().all()

for mid in ['2', '3', '4', '6', '7', '11']:
    stop_routes = SHP_DFS_STOPS[mid].dropna(subset=['ROUTE'])
    # Fall back to the raw ROUTE text where no route_gtfs_id was matched.
    stop_routes['route_gtfs_id'] = [
        gtfs_id if pd.notna(gtfs_id) else route
        for gtfs_id, route in zip(stop_routes['route_gtfs_id'], stop_routes['ROUTE'])
    ]
    # One row per STOP_ID, with the distinct values joined into comma-separated strings.
    stop_routes = stop_routes.groupby('STOP_ID').aggregate({'ROUTE': 'unique', 'route_gtfs_id': 'unique'}).reset_index()
    stop_routes['ROUTE'] = stop_routes['ROUTE'].map(','.join)
    stop_routes['route_gtfs_id'] = stop_routes['route_gtfs_id'].map(','.join)
    SHP_DFS_STOPS[mid] = stop_routes


# Stop layers in the order they are stacked, with the mode id assigned to each.
stop_layer_modes = {
    'PTV_METRO_TRAIN_STATION': '2',
    'PTV_METRO_TRAM_STOP': '3',
    'PTV_METROBUS_STOP': '4',
    'PTV_REGIONAL_COACH_STOP': '5',
    'PTV_REGIONAL_BUS_STOP': '6',
    'PTV_TELEBUS_STOP': '7',
    'PTV_SKYBUS_STOP': '11',
}

for layer_name, layer_mode in stop_layer_modes.items():
    # The regional coach layer gets no route_gtfs_id column; every other layer is
    # left-joined with the per-stop route ids of its mode.
    if layer_name != 'PTV_REGIONAL_COACH_STOP':
        SHP_GDFS[layer_name] = pd.merge(SHP_GDFS[layer_name], SHP_DFS_STOPS[layer_mode][['STOP_ID', 'route_gtfs_id']], on='STOP_ID', how='left')
    SHP_GDFS[layer_name]['mode_id'] = layer_mode


SHP_DF_STOPS : pd.DataFrame = pd.concat([SHP_GDFS[layer_name] for layer_name in stop_layer_modes])

# Replace each geometry with its first coordinate tuple and keep the untouched name as STOP_FULL_NAME.
SHP_DF_STOPS['geometry'] = SHP_DF_STOPS['geometry'].apply(lambda geom: geom.coords[0])
SHP_DF_STOPS['STOP_FULL_NAME'] = SHP_DF_STOPS['STOP_NAME']


# Count occurrences of '(' and ')' in STOP_FULL_NAME
assert SHP_DF_STOPS['STOP_FULL_NAME'].apply(lambda name: name.count('(') == name.count(')')).all()
# Inspect the names that do not contain exactly one '(' (the count is kept out of the frame).
opening_parentheses = SHP_DF_STOPS['STOP_FULL_NAME'].apply(lambda name: name.count('('))
SHP_DF_STOPS[opening_parentheses != 1]

# Get only the last pair of parentheses
def get_suburb(stop_full_name):
    """
    Split a full stop name into (stop name, suburb).

    The suburb is the content of the last, outermost pair of parentheses at the
    end of the name, so 'A St (Suburb (3000))' yields ('A St', 'Suburb (3000)').
    Trailing whitespace after the closing parenthesis is ignored.

    Args:
        stop_full_name: Full stop name, optionally ending in '(<suburb>)'.

    Returns:
        A ``(stop_name, stop_suburb)`` tuple. ``stop_suburb`` is ``np.nan`` and
        ``stop_name`` is the unchanged input when the name has no trailing
        parenthesised part or its parentheses do not balance.
    """
    trimmed = stop_full_name.rstrip()
    # Without a closing parenthesis at the very end there is no suburb to extract.
    # The backwards scan below would otherwise stop on the first character it sees.
    if '(' not in stop_full_name or not trimmed.endswith(')'):
        return (stop_full_name, np.nan)
    depth = 0
    # Walk backwards from the final ')' to the '(' that balances it.
    for i in range(len(trimmed) - 1, -1, -1):
        c = trimmed[i]
        if c == ')':
            depth += 1
        elif c == '(':
            depth -= 1
            if depth == 0:
                stop_name = trimmed[:i].strip()
                # Drop exactly the matched outer pair of parentheses.
                stop_suburb = trimmed[i + 1:-1]
                return (stop_name, stop_suburb)
    # Unbalanced parentheses: report "no suburb" instead of implicitly returning None.
    return (stop_full_name, np.nan)

# Split every full name once into a (name, suburb) pair, then store the two halves.
name_suburb_pairs = SHP_DF_STOPS['STOP_FULL_NAME'].apply(get_suburb)
SHP_DF_STOPS['STOP_NAME'] = name_suburb_pairs.map(lambda pair: pair[0])
SHP_DF_STOPS['STOP_SUBURB'] = name_suburb_pairs.map(lambda pair: pair[1])


# Inspection of the data shows that there are some stops with no suburb. We will manually fill these in.
SHP_DF_STOPS[SHP_DF_STOPS['STOP_SUBURB'].isna()]

def custom_stop_suburb(x):
    """Return the suburb for one stop row, applying the manual override.

    ``x`` is a row-like mapping with ``STOP_ID``, ``STOP_NAME`` and
    ``STOP_SUBURB`` entries. Stop ``'35117'`` is forced to
    ``'Ballarat Central'`` (after asserting that its name is still
    ``'Ascot St/Sturt St'``); every other row keeps its current suburb.
    """
    if x['STOP_ID'] != '35117':
        return x['STOP_SUBURB']
    # Guard the hard-coded override against the stop id being reused for another stop.
    assert x['STOP_NAME'] == 'Ascot St/Sturt St', x['STOP_NAME']
    return 'Ballarat Central'

SHP_DF_STOPS['STOP_SUBURB'] = SHP_DF_STOPS.apply(custom_stop_suburb, axis=1)

# Every stop must have a suburb by now. Check it explicitly so that a leftover NaN is
# reported with the offending STOP_IDs instead of an AttributeError inside the lambdas below.
_no_suburb = SHP_DF_STOPS['STOP_SUBURB'].isna().to_numpy()
assert not _no_suburb.any(), f"stops still missing a suburb: {SHP_DF_STOPS.loc[_no_suburb, 'STOP_ID'].unique().tolist()}"

# A suburb may carry at most one balanced "(...)" qualifier.
assert SHP_DF_STOPS['STOP_SUBURB'].apply(lambda x: x.count('(') == x.count(')')).all()

SHP_DF_STOPS['STOP_SUBURB_PARENTHESES'] = SHP_DF_STOPS['STOP_SUBURB'].apply(lambda x: x.count('('))

assert (SHP_DF_STOPS['STOP_SUBURB_PARENTHESES'] <= 1).all()

# Split "Name (qualifier)" into the name and the text inside the parentheses (NaN when absent).
# The name is stripped so that it does not keep the space that precedes '('.
# NOTE: STOP_SUBURB_PARENTHESES / _NAME / _POSTCODE are not listed in the aggregation
# below, so they do not survive the groupby.
SHP_DF_STOPS['STOP_SUBURB_NAME'] = SHP_DF_STOPS['STOP_SUBURB'].apply(lambda x: x.split('(')[0].strip())
SHP_DF_STOPS['STOP_SUBURB_POSTCODE'] = SHP_DF_STOPS['STOP_SUBURB'].apply(lambda x: x.split('(')[1].removesuffix(')') if '(' in x else np.nan)


SHP_DF_STOPS['LATITUDE'] = SHP_DF_STOPS['LATITUDE'].astype(np.float64)
SHP_DF_STOPS['LONGITUDE'] = SHP_DF_STOPS['LONGITUDE'].astype(np.float64)


# Collapse to one row per STOP_ID, collecting the distinct values of every kept column.
SHP_DF_STOPS = (
    SHP_DF_STOPS
    .groupby('STOP_ID')
    .aggregate({'STOP_NAME': 'unique', 'STOP_SUBURB': 'unique', 'STOP_FULL_NAME': 'unique', 'LATITUDE': 'unique', 'LONGITUDE': 'unique', 'TICKETZONE': 'unique', 'route_gtfs_id': 'unique', 'geometry': 'unique'})
    .reset_index()
)
# 7s - 10s

# These columns must be single-valued per STOP_ID; unwrap the one-element arrays.
for col in ['STOP_NAME', 'STOP_SUBURB', 'STOP_FULL_NAME', 'LATITUDE', 'LONGITUDE', 'geometry']:
    assert SHP_DF_STOPS[col].apply(len).max() == 1, f'{col} has more than one distinct value for some STOP_ID'
    SHP_DF_STOPS[col] = SHP_DF_STOPS[col].apply(lambda x: x[0])

# A stop may legitimately have several ticket zones / routes: join the non-null ones.
SHP_DF_STOPS['TICKETZONE'] = SHP_DF_STOPS['TICKETZONE'].apply(lambda x: ','.join([str(i) for i in x if not pd.isna(i)]))
SHP_DF_STOPS['route_gtfs_id'] = SHP_DF_STOPS['route_gtfs_id'].apply(lambda x: ','.join([str(i) for i in x if not pd.isna(i)]))

# 'geometry' already holds plain coordinate tuples (converted right after the concat),
# so `.coords[0]` must not be applied a second time here.


In [285]:
# Stop 10697 as it appears in the shapefile-derived frame.
SHP_DF_STOPS.loc[SHP_DF_STOPS['STOP_ID'].eq('10697')]

Unnamed: 0,STOP_ID,STOP_NAME,STOP_SUBURB,STOP_FULL_NAME,LATITUDE,LONGITUDE,TICKETZONE,route_gtfs_id,geometry
608,10697,VicRoads/Hartnett Dr,Seaford,VicRoads/Hartnett Dr (Seaford),-38.117843,145.140596,2,"4-779,4-778","(145.1406018143186, -38.1178298666463)"


In [286]:
# The same stop id looked up in GA_DF_STOPS_FULL via its GTFS id column.
GA_DF_STOPS_FULL.loc[GA_DF_STOPS_FULL['stop_id_gtfs'].eq('10697')]

Unnamed: 0,stop_id_gtfs,stop_name_gtfs,stop_lat,stop_lon,mode_id_gtfs,point_id,operating_hours,mode_id_api,flexible_stop_opening_hours,station_type,...,stop_staffing_thu_pm_to,stop_staffing_tue_am_from,stop_staffing_tue_am_to,stop_staffing_tue_pm_from,stop_staffing_tue_pm_to,stop_staffing_wed_am_from,stop_staffing_wed_am_to,stop_staffing_wed_pm_from,stop_staffing_wed_pm_To,gtfs_stop_id
608,10697,VicRoads Customer Service Center/Hartnett Dr (...,-38.117843,145.140596,4,10697,N,1,,,...,,,,,,,,,,10697


In [287]:
# The same stop id looked up in GTFS_DF_STOPS.
GTFS_DF_STOPS.loc[GTFS_DF_STOPS['stop_id'].eq('10697')]

Unnamed: 0,stop_id,stop_name,stop_lat,stop_lon,mode_id
2600,10697,VicRoads Customer Service Center/Hartnett Dr (...,-38.117843,145.140596,4


In [323]:
# Rows of GA_DF_STOPS_FULL whose stop_id_api is missing.
GA_DF_STOPS_FULL.loc[GA_DF_STOPS_FULL['stop_id_api'].isnull()]

Unnamed: 0,stop_id_gtfs,stop_name_gtfs,stop_lat,stop_lon,mode_id_gtfs,point_id,operating_hours,mode_id_api,flexible_stop_opening_hours,station_type,...,stop_staffing_thu_pm_to,stop_staffing_tue_am_from,stop_staffing_tue_am_to,stop_staffing_tue_pm_from,stop_staffing_tue_pm_to,stop_staffing_wed_am_from,stop_staffing_wed_am_to,stop_staffing_wed_pm_from,stop_staffing_wed_pm_To,gtfs_stop_id
429,10481,St Bedes College/Naples Rd (Mentone),-37.990184,145.068511,4,,,,,,...,,,,,,,,,,
916,11016,Torbay St/Greensborough Hwy (Macleod),-37.718029,145.081354,4,,,,,,...,,,,,,,,,,
1470,11657,Rowellyn Park Primary School/Tattler St (Carru...,-38.103490,145.192132,4,,,,,,...,,,,,,,,,,
1737,11953,VicRoads Customer Service Center/Hartnett Dr (...,-38.117827,145.140699,4,,,,,,...,,,,,,,,,,
2008,12335,Oatlands Primary School/Kurrajong Rd (Narre Wa...,-38.006027,145.316499,4,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25207,6451,West Gate Fwy/Williamstown Rd (Yarraville),-37.825415,144.881106,4,,,,,,...,,,,,,,,,,
27609,9137,Organ Pipes National Park/Calder Fwy (Keilor N...,-37.671687,144.764630,4,,,,,,...,,,,,,,,,,
27610,9138,Calder Park/Calder Fwy (Calder Park),-37.666316,144.754180,4,,,,,,...,,,,,,,,,,
27611,9139,Duncans Lane/Calder Fwy (Diggers Rest),-37.655018,144.740647,4,,,,,,...,,,,,,,,,,


In [4]:
# Load the per-stop payloads; the context manager closes the file handle
# as soon as parsing is done instead of leaving it to the garbage collector.
with open(f'{DATA_DIR}/stops_info.json', 'r') as f:
    API_STOPS_INFO = json.load(f)

In [48]:
# Reload the raw per-stop payloads (the records are mutated in place below, so starting
# from a fresh load keeps this cell re-runnable). The context manager closes the file.
with open(f'{DATA_DIR}/stops_info.json', 'r') as f:
    API_STOPS_INFO = json.load(f)

# One record per (stop_id, route_type): the 'stop' object of each payload.
API_STOPS_INFO_LIST = []

for stop_id, stops_by_route_type in API_STOPS_INFO.items():
    for route_type, payload in stops_by_route_type.items():
        stop_info = payload['stop']
        # The keys of the JSON file must agree with the ids stored inside the record.
        assert str(stop_info['route_type']) == str(route_type), (stop_id, route_type)
        assert str(stop_info['stop_id']) == str(stop_id), (stop_id, route_type)
        API_STOPS_INFO_LIST.append(stop_info)

for stop_info in API_STOPS_INFO_LIST:
    # Flatten one level of nesting: {'a': {'b': 1}} becomes {'a_b': 1}.
    # Dicts nested deeper than one level are left in place as values.
    kv_is_dict = [(k, v) for k, v in stop_info.items() if isinstance(v, dict)]
    for k, v in kv_is_dict:
        for k2, v2 in v.items():
            new_key = f'{k}_{k2}'
            assert new_key not in stop_info, f'{new_key} already exists in {stop_info}'
            stop_info[new_key] = v2
        del stop_info[k]
    # Stringify plain ints; bool is an int subclass, so it is excluded explicitly.
    # Iterate over a snapshot so the dict is never modified while being iterated.
    for k, v in list(stop_info.items()):
        if isinstance(v, int) and not isinstance(v, bool):
            stop_info[k] = str(v)

API_DF_STOPS_INFO = pd.DataFrame(API_STOPS_INFO_LIST, dtype=str)

In [49]:
# Keys of the first flattened record that still hold a plain (non-bool) int.
[key for key, value in API_STOPS_INFO_LIST[0].items() if not isinstance(value, bool) and isinstance(value, int)]

[]

In [51]:
# Per-row flag: is stop_location_municipality_id missing?
API_DF_STOPS_INFO['stop_location_municipality_id'].isnull()

0        False
1        False
2        False
3        False
4        False
         ...  
20578     True
20579     True
20580     True
20581     True
20582     True
Name: stop_location_municipality_id, Length: 20583, dtype: bool