In [1]:
from hashlib import sha1
import hmac
import requests
import pandas as pd
import geopandas as gpd
import json
import os
import numpy as np
import time
import pyptvgtfs
# One HTTP session shared by every request issued through get_data().
SESSION = requests.Session()

# Local configuration; get_data() reads the 'PTV_TIMETABLE_DEV_ID' and
# 'PTV_TIMETABLE_API_KEY' entries from it.
# Read inside a `with` block so the file handle is closed as soon as parsing ends.
with open('../local-env.json') as env_file:
    ENV = json.load(env_file)

def get_ptv_api_url(
        endpoint : str,
        dev_id : str | int, 
        api_key : str | int,
    ):
    """
    Returns the URL to use PTV TimeTable API.

    Generates a signature from dev id (user id), API key, and endpoint.

    See the following for more information:
    - Home page: https://www.ptv.vic.gov.au/footer/data-and-reporting/datasets/ptv-timetable-api/
    - Swagger UI: https://timetableapi.ptv.vic.gov.au/swagger/ui/index
    - Swagger Docs JSON: https://timetableapi.ptv.vic.gov.au/swagger/docs/v3 (You can use this to find the endpoints you want to use.)
    """
    assert endpoint.startswith('/'), f'Endpoint must start with /, got {endpoint}'
    raw = f'{endpoint}{'&' if '?' in endpoint else '?'}devid={dev_id}'
    hashed = hmac.new(api_key.encode('utf-8'), raw.encode('utf-8'), sha1)  # Encode the raw string to bytes
    signature = hashed.hexdigest()
    return f'https://timetableapi.ptv.vic.gov.au{raw}&signature={signature}'


def get_data(endpoint : str, need_auth : bool = True, timeout : float | tuple[float, float] | None = 30):
    """
    Fetches an endpoint of the PTV Timetable API and returns the decoded JSON body.

    Args:
        endpoint: Request path starting with ``/`` (optionally with a query string).
        need_auth: When True the URL is signed with the developer id and API key
            stored in ``ENV``; when False the endpoint is requested unsigned.
        timeout: Passed to ``requests`` as the request timeout in seconds (or a
            ``(connect, read)`` tuple). ``None`` waits indefinitely, which is what
            happened before this parameter existed.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    if need_auth:
        url = get_ptv_api_url(endpoint, ENV['PTV_TIMETABLE_DEV_ID'], ENV['PTV_TIMETABLE_API_KEY'])
    else:
        url = f'https://timetableapi.ptv.vic.gov.au{endpoint}'
    # Without a timeout a stalled connection would block the notebook forever.
    response = SESSION.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


In [2]:
# Swagger description of the API; fetched unsigned.
API_DOCS = get_data('/swagger/docs/v3', need_auth=False)
# Endpoints whose path has no '{...}' placeholder.
STATIC_API_ENDPOINTS = [path for path in API_DOCS['paths'] if '{' not in path]

# API_ROUTES is iterated below as a sequence of per-route dicts, hence list[dict].
API_ROUTES : list[dict] = get_data('/v3/routes')['routes']
# NOTE(review): the remaining payloads keep their original `dict` annotation; some of
# them may also be lists of records — confirm against the Swagger docs.
API_ROUTE_TYPES : dict = get_data('/v3/route_types')['route_types']
API_DISRUPTIONS : dict = get_data('/v3/disruptions')['disruptions']
API_DISRUPTION_MODES : dict = get_data('/v3/disruptions/modes')['disruption_modes']
API_OUTLETS : dict = get_data('/v3/outlets')['outlets']

API_DF_ROUTE_TYPES = pd.DataFrame(API_ROUTE_TYPES)
API_DF_DISRUPTION_MODES = pd.DataFrame(API_DISRUPTION_MODES)
API_DF_OUTLETS = pd.DataFrame(API_OUTLETS)
# There are some faulty data in the outlets data. In particular, the latitude is > 0, which is not possible in Victoria.
# Positive latitudes are negated; all other values (including missing ones) are kept.
API_DF_OUTLETS['outlet_latitude'] = API_DF_OUTLETS['outlet_latitude'].mask(API_DF_OUTLETS['outlet_latitude'] > 0, -API_DF_OUTLETS['outlet_latitude'])

# Flatten each route: lift the entries of its nested 'route_service_status' dict
# to the top level of the route, refusing to overwrite an existing key.
for route in API_ROUTES:
    for k, v in route['route_service_status'].items():
        assert k not in route, f'Key {k} already exists in route'
        route[k] = v
    del route['route_service_status']

In [3]:
# Build the routes table from the flattened API routes and normalise its ids.
API_DF_ROUTES = pd.DataFrame(API_ROUTES)
assert API_DF_ROUTES['route_id'].is_unique, 'route_id is not unique'
assert API_DF_ROUTES['route_gtfs_id'].is_unique, 'route_gtfs_id is not unique'
# Ids are handled as strings from here on; a missing route_type stays missing.
API_DF_ROUTES['route_id'] = API_DF_ROUTES['route_id'].apply(str)
API_DF_ROUTES['route_type'] = API_DF_ROUTES['route_type'].apply(lambda x: str(int(x)) if not pd.isna(x) else x)
# API_DF_ROUTES['route_gtfs_id'] = API_DF_ROUTES.apply(lambda x: f'3-{x["route_number"]}' if x['route_type'] == '1' else x["route_gtfs_id"], axis=1)

# Keep the id exactly as the API reported it; 'route_gtfs_id' is rewritten below.
API_DF_ROUTES['route_api_gtfs_id'] = API_DF_ROUTES['route_gtfs_id']

# Add extra VLine gtfs ids: every route whose gtfs id starts with '1-' is duplicated
# with a '5-' prefix and route_type '5'.
API_DF_ROUTES_VLINE = API_DF_ROUTES[API_DF_ROUTES['route_gtfs_id'].str.startswith('1-', na=False)].copy(deep=True)
API_DF_ROUTES_VLINE['route_gtfs_id'] = API_DF_ROUTES_VLINE['route_gtfs_id'].apply(lambda x: f'5-{x.split("-")[1]}')
API_DF_ROUTES_VLINE['route_type'] = '5'
API_DF_ROUTES = pd.concat([API_DF_ROUTES, API_DF_ROUTES_VLINE])
API_DF_ROUTES.reset_index(drop=True, inplace=True)


# Split the gtfs id on '-' into its first part (id0) and second part (id1),
# then strip leading 0s from the second part.
API_DF_ROUTES['route_gtfs_idx'] = API_DF_ROUTES['route_gtfs_id'].apply(lambda x: x.split('-'))
API_DF_ROUTES['route_gtfs_id0'] = API_DF_ROUTES['route_gtfs_idx'].apply(lambda x: x[0])
API_DF_ROUTES['route_gtfs_id1'] = API_DF_ROUTES['route_gtfs_idx'].apply(lambda x: x[1])
API_DF_ROUTES['route_gtfs_id1'] = API_DF_ROUTES['route_gtfs_id1'].apply(lambda x: x.lstrip('0'))
# For id0 == '4' with a three-character route_number, the route_number replaces id1.
API_DF_ROUTES['route_gtfs_id1'] = API_DF_ROUTES.apply(lambda x: x['route_number'] if (str(x['route_gtfs_id0']) == '4' and len(x['route_number']) == 3) else x['route_gtfs_id1'], axis=1)
API_DF_ROUTES.drop(columns=['geopath', 'description', 'timestamp'], inplace=True)
# API_DF_ROUTES.drop(columns=['route_gtfs_idx', 'route_gtfs_id0', 'route_gtfs_id1'], inplace=True)

# Inspect values of route_gtfs_id0 (not the last expression of the cell, so nothing is
# displayed; it still raises if an id0 value is not an integer string).
API_DF_ROUTES['route_gtfs_id0'].sort_values(key=lambda x: x.apply(int)).unique() # array(['1', '2', '3', '4', '5', '6', '11'], dtype=object)

# Comment this since GTFS_DF_ROUTES and SHP_DF_ROUTES are not available, but this is the code to inspect all bus numbers with 4 characters
# # Inspect all 4: Metro Buses in API_DF_ROUTES where route_number is 4 characters long
# API_DF_ROUTES[(API_DF_ROUTES['route_gtfs_id0'] == '4') & (API_DF_ROUTES['route_number'].apply(lambda x: len(x) == 4))]
# # ['route_number'].unique()
# # Check those routes with those route_number in GTFS_DF_ROUTES and SHP_DF_ROUTES
# GTFS_DF_ROUTES[GTFS_DF_ROUTES['route_short_name'].notna() & (GTFS_DF_ROUTES['route_short_name'].str.contains('745'))]
# SHP_DF_ROUTES[SHP_DF_ROUTES['ROUTE_SHORT_NAME'].notna() & (SHP_DF_ROUTES['ROUTE_SHORT_NAME'].str.contains('745'))]

# Assert that all metro buses with route_number != route_gtfs_id1 have route_number of length != 3
assert len(API_DF_ROUTES[(API_DF_ROUTES['route_gtfs_id0'] == '4') & (API_DF_ROUTES['route_number'] != API_DF_ROUTES['route_gtfs_id1']) & (API_DF_ROUTES['route_number'].apply(lambda x: len(x) == 3))]) == 0

# Assert that all metro buses with route_number size != 3 or 4 have route_number containing 'combined'
assert (API_DF_ROUTES[(API_DF_ROUTES['route_gtfs_id0'] == '4') & (API_DF_ROUTES['route_number'].apply(lambda x: len(x) not in [3, 4]))]['route_number'].str.lower().str.contains('combined').all())


# Rebuild route_gtfs_id: '4-<route_number>' for id0 == '4' with a 3- or 4-character
# route_number, '<id0>-<id1>' for everything else.
# The inner quotes are double quotes: reusing the outer quote inside an f-string
# replacement field only parses on Python 3.12+.
API_DF_ROUTES['route_gtfs_id'] = API_DF_ROUTES.apply(lambda x: f'4-{x["route_number"]}' if (x['route_gtfs_id0'] == '4' and len(x['route_number']) in [3, 4]) else f'{x["route_gtfs_id0"]}-{x["route_gtfs_id1"]}', axis=1)

# # Inspect duplicated route_gtfs_id
# API_DF_ROUTES[API_DF_ROUTES['route_gtfs_id'].duplicated(keep=False)].sort_values('route_gtfs_id')

In [4]:
SHP_DIR = '../local/ptv-spatial-datasets'
# Every '.shp' file in SHP_DIR, keyed by its file name up to the first '.'.
# (The value is a dict of GeoDataFrames, not a single GeoDataFrame.)
SHP_GDFS : dict[str, gpd.GeoDataFrame] = { f.split('.')[0]: gpd.read_file(os.path.join(SHP_DIR, f)) for f in os.listdir(SHP_DIR) if f.endswith('.shp') }
# Each '.txt' file holds column renames for the GeoDataFrame named after it
# (file name minus '_column_names.txt', upper-cased).
for f in os.listdir(SHP_DIR):
    if f.endswith('.txt'):
        gdf_name = f.removesuffix('_column_names.txt').upper()
        with open(os.path.join(SHP_DIR, f), 'r') as file:
            # The first four lines are skipped; every remaining line must read
            # '<current column> = <new column>'.
            gdf_column_names = [line.strip() for line in file][4:]
        assert gdf_name in SHP_GDFS, f'{gdf_name} not in GDFS'
        for line in gdf_column_names:
            assert ' = ' in line, f'Invalid line: {line}'
        # Split each line once and keep its first two fields as (current, new).
        gdf_column_names = dict(line.split(' = ')[:2] for line in gdf_column_names)
        SHP_GDFS[gdf_name].rename(columns=gdf_column_names, inplace=True)

# Stack every dataset whose key contains 'ROUTE' into one routes table.
SHP_DF_ROUTES : pd.DataFrame = pd.concat([gdf for k, gdf in SHP_GDFS.items() if 'ROUTE' in k])
# ROUTE_ID is split on '-': id0/id1 are the first two parts, id3/id4 the last two,
# and id2 is the third part only when the id has more than four parts.
SHP_DF_ROUTES['route_idx'] = SHP_DF_ROUTES['ROUTE_ID'].apply(lambda x: x.split('-'))
SHP_DF_ROUTES['route_id0'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[0])
SHP_DF_ROUTES['route_id1'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[1])
SHP_DF_ROUTES['route_id2'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[2] if len(x) > 4 else np.nan)
SHP_DF_ROUTES['route_id3'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[-2])
SHP_DF_ROUTES['route_id4'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[-1])
SHP_DF_ROUTES['route_id01'] = SHP_DF_ROUTES['route_id0'] + '-' + SHP_DF_ROUTES['route_id1']

In [5]:
# Load the GTFS archive; the result is used as a table with 'mode_id',
# 'table_name' and 'df' columns (plus 'version_id', which is dropped).
GTFS = pyptvgtfs.process_gtfs_zip('../downloads/20240312_113156/gtfs.zip', '')
GTFS.drop(columns=['version_id'], inplace=True)

# Re-key the flat {(mode_id, table_name): df} mapping as {mode_id: {table_name: df}}.
GTFS_DFS = GTFS.set_index(['mode_id', 'table_name'])['df'].to_dict()
new_GTFS_DFS = {}
for (mode_key, table_key), table_df in GTFS_DFS.items():
    new_GTFS_DFS.setdefault(mode_key, {})[table_key] = table_df
GTFS_DFS : dict[str, dict[str, pd.DataFrame]] = new_GTFS_DFS

# Stamp every table with the mode it belongs to.
for mode_key, mode_tables in GTFS_DFS.items():
    for table_df in mode_tables.values():
        table_df['mode_id'] = mode_key

GTFS_DF_ROUTES = pd.concat([mode_tables['routes'] for mode_tables in GTFS_DFS.values()])
# NOTE(review): the original remark here was "1m - 2m" — presumably the run time of
# the steps above; confirm.

# route_id is split on '-': id0/id1 are the first two parts, id3/id4 the last two,
# and id2 is the third part only when the id has more than four parts.
GTFS_DF_ROUTES['route_idx'] = GTFS_DF_ROUTES['route_id'].apply(lambda route_id: route_id.split('-'))
route_id_part_getters = {
    'route_id0': lambda parts: parts[0],
    'route_id1': lambda parts: parts[1],
    'route_id2': lambda parts: parts[2] if len(parts) > 4 else np.nan,
    'route_id3': lambda parts: parts[-2],
    'route_id4': lambda parts: parts[-1],
}
for part_column, part_getter in route_id_part_getters.items():
    GTFS_DF_ROUTES[part_column] = GTFS_DF_ROUTES['route_idx'].apply(part_getter)
# Unlike the id parts above, the combined key is prefixed with mode_id, not route_id0.
GTFS_DF_ROUTES['route_id01'] = GTFS_DF_ROUTES['mode_id'] + '-' + GTFS_DF_ROUTES['route_id1']

In [6]:
# NOTE: the bare inspection expressions in this cell are not the cell's last
# expression, so they display nothing; they are kept for interactive use.

# Inspect: Check ids with multiple route numbers
GTFS_DF_ROUTES.groupby('route_id01').aggregate({'route_id2': 'unique', 'route_id3': 'unique', 'route_id4': 'unique', 'route_short_name': 'unique'}).sort_values('route_short_name', ascending=False, key=lambda x: x.apply(len))
# Inspect: Check ids with multiple route numbers
SHP_DF_ROUTES.groupby('route_id01').aggregate({'route_id2': 'unique', 'route_id3': 'unique', 'route_id4': 'unique', 'ROUTE_SHORT_NAME': 'unique'}).sort_values('ROUTE_SHORT_NAME', ascending=False, key=lambda x: x.apply(len))
# Inspect 695 and 695F
GTFS_DF_ROUTES[GTFS_DF_ROUTES['route_short_name'].notna() & GTFS_DF_ROUTES['route_short_name'].str.contains('695')]

# Within one route_id01, at most one of route_id2 / route_id4 may take several values.
assert GTFS_DF_ROUTES.groupby('route_id01').aggregate({'route_id2': 'unique', 'route_id3': 'unique', 'route_id4': 'unique'}).apply(lambda x: len(x['route_id2']) <= 1 or len(x['route_id4']) <= 1, axis=1).all()
assert SHP_DF_ROUTES.groupby('route_id01').aggregate({'route_id2': 'unique', 'route_id3': 'unique', 'route_id4': 'unique'}).apply(lambda x: len(x['route_id2']) <= 1 or len(x['route_id4']) <= 1, axis=1).all()

# Since ROUTE_LONG_NAME is all not null, when we merge dataframes, we can use ROUTE_LONG_NAME isna() to check if the other dataframe has the equivalent data in SHP_DF_ROUTES_MIN
assert SHP_DF_ROUTES['ROUTE_LONG_NAME'].notna().all()

# Inspect the route_id0 values of SHP_DF_ROUTES
SHP_DF_ROUTES['route_id0'].sort_values(key=lambda x: x.apply(int)).unique() # array(['3', '4', '5', '6', '7', '11'], dtype=object)

# Assert that all metro buses route_short_name are 3 or 4 characters long
assert SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id0'] == '4')]['ROUTE_SHORT_NAME'].apply(lambda x: len(x) in [3, 4]).all()
# Check that, for route_id0 == 4, every ROUTE_SHORT_NAME that differs from its route_id1
# maps to exactly one route_id1.
odd_bus_id1_names = SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id1'] != SHP_DF_ROUTES['ROUTE_SHORT_NAME']) & (SHP_DF_ROUTES['route_id0'] == '4')]['ROUTE_SHORT_NAME'].unique()
odd_bus_id1_counts = SHP_DF_ROUTES[(SHP_DF_ROUTES['ROUTE_SHORT_NAME'].apply(lambda x: x in odd_bus_id1_names)) & (SHP_DF_ROUTES['route_id0'] == '4')].groupby('ROUTE_SHORT_NAME')['route_id1'].nunique()
# Compared element-wise and reduced with .all(): asserting on `array == [1]` is only a
# valid truth value when the array has exactly one element, so a violated invariant
# (or an empty selection) would not have produced a clean assertion result.
assert (odd_bus_id1_counts == 1).all(), 'Some metro bus ROUTE_SHORT_NAME maps to more than one route_id1'

In [7]:
def get_gtfs_id(x):
    """
    Builds the GTFS-style route id for one shapefile route row.

    Args:
        x: Mapping-like row (e.g. a DataFrame row) providing 'route_id0',
            'route_id1' and, for route_id0 == '4', 'ROUTE_SHORT_NAME'.

    Returns:
        - '4-<ROUTE_SHORT_NAME>' when route_id0 is '4';
        - '7-B<number>' when route_id0 is '7', where <number> is route_id1 without
          its leading 'TB', left-padded with zeros to at least two characters;
        - '<route_id0>-<route_id1>' otherwise.

    Raises:
        AssertionError: If route_id0 is '7' and route_id1 does not contain 'TB'.
    """
    if x['route_id0'] == '4':
        return f'4-{x["ROUTE_SHORT_NAME"]}'
    elif x['route_id0'] == '7':
        assert 'TB' in x["route_id1"], f'7-TeleBus route_id1 {x["route_id1"]} does not contain TB'
        # removeprefix drops exactly one leading 'TB'. lstrip('TB') treats its argument
        # as a character set and would also eat further leading 'T'/'B' characters.
        route_number = x["route_id1"].removeprefix('TB')
        # Left-pad with zeros to a minimum width of two.
        route_number = route_number.zfill(2)
        return f'7-B{route_number}'
    else:
        return f'{x["route_id0"]}-{x["route_id1"]}'

def _mode_route_key(row, mode_column):
    """
    Returns '<mode>-<route_id1>' for a route row, where <mode> is row[mode_column].

    When the mode is '4' and the row has a non-missing route_id2, that value is
    appended directly (no separator).
    """
    mode = row[mode_column]
    key = f'{mode}-{row["route_id1"]}'
    if mode == '4' and pd.notna(row['route_id2']):
        return key + row['route_id2']
    return key + ''

# The shapefile key is built from route_id0, the GTFS key from mode_id.
SHP_DF_ROUTES['route_shp_id'] = SHP_DF_ROUTES.apply(_mode_route_key, axis=1, mode_column='route_id0')
SHP_DF_ROUTES['route_gtfs_id'] = SHP_DF_ROUTES.apply(get_gtfs_id, axis=1)
GTFS_DF_ROUTES['route_gtfs_id'] = GTFS_DF_ROUTES.apply(_mode_route_key, axis=1, mode_column='mode_id')

In [8]:
# Reduce both route tables to their identifying columns, one row per distinct combination.
shp_min_columns = ['route_gtfs_id', 'route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'ROUTE_LONG_NAME']
gtfs_min_columns = ['route_gtfs_id', 'route_short_name', 'route_long_name', 'mode_id']
SHP_DF_ROUTES_MIN = SHP_DF_ROUTES[shp_min_columns].drop_duplicates()
GTFS_DF_ROUTES_MIN = GTFS_DF_ROUTES[gtfs_min_columns].drop_duplicates()

# Outer join on route_gtfs_id, so routes present on only one side are kept.
GS_DF_ROUTES = GTFS_DF_ROUTES_MIN.merge(
    SHP_DF_ROUTES_MIN,
    on='route_gtfs_id',
    suffixes=('_gtfs', '_shp'),
    how='outer',
)
GS_DF_ROUTES.to_csv('../local/ptv-spatial-gtfs-routes.csv', index=False)

# A missing mode_id means the row has no GTFS match; a missing route_id0 means it
# has no shapefile match.
# Shapefile routes with route_id0 '3', '5' or '7' must all have a GTFS match ...
for shp_mode in ('3', '5', '7'):
    assert not (GS_DF_ROUTES['mode_id'].isna() & (GS_DF_ROUTES['route_id0'] == shp_mode)).any()
# ... and GTFS routes of mode '11' must all have a shapefile match.
assert not (GS_DF_ROUTES['route_id0'].isna() & (GS_DF_ROUTES['mode_id'] == '11')).any()

# Inspect 6: Regional Buses
GS_DF_ROUTES[
    (GS_DF_ROUTES['mode_id'].isna() & (GS_DF_ROUTES['route_id0'] == '6'))
    | (GS_DF_ROUTES['route_id0'].isna() & (GS_DF_ROUTES['mode_id'] == '6'))
]
# Inspect 4: Metro Buses
GS_DF_ROUTES[
    (GS_DF_ROUTES['mode_id'].isna() & (GS_DF_ROUTES['route_id0'] == '4'))
    | (GS_DF_ROUTES['route_id0'].isna() & (GS_DF_ROUTES['mode_id'] == '4'))
]

# Combined id: the gtfs id when present, otherwise the shapefile id.
GS_DF_ROUTES['route_gs_id'] = GS_DF_ROUTES.apply(
    lambda row: row['route_shp_id'] if pd.isna(row['route_gtfs_id']) else row['route_gtfs_id'],
    axis=1,
)

In [9]:
# Final merge: outer-join the GTFS+SHP routes with the API routes on route_gtfs_id.
api_merge_columns = [
    'route_gtfs_id',
    'route_api_gtfs_id',
    'route_id',
    'route_name',
    'route_number',
    'route_type',
    'route_gtfs_id0',
]
GSA_DF_ROUTES = GS_DF_ROUTES.merge(
    API_DF_ROUTES[api_merge_columns],
    left_on='route_gtfs_id',
    right_on='route_gtfs_id',
    suffixes=('-gs', '-api'),
    how='outer',
)

# Export the full table, then the rows lacking a GTFS/SHP id and the rows lacking an API id.
GSA_DF_ROUTES.to_csv('../local/gsa-routes.csv', index=False)
for presence_column, csv_path in (
    ('route_gs_id', '../local/gsa-missing-gs.csv'),
    ('route_api_gtfs_id', '../local/gsa-missing-api.csv'),
):
    GSA_DF_ROUTES[GSA_DF_ROUTES[presence_column].isna()].to_csv(csv_path, index=False)

In [10]:
# Check matchings between API's route_id and GTFS+SHP's route_gtfs_ids

# For every API route_id: the distinct route_gs_id values and how many there are.
gs_ids_by_api_route = GSA_DF_ROUTES.groupby('route_id')['route_gs_id']
df_api2gs_nunique = gs_ids_by_api_route.nunique().rename('route_gs_nunique').reset_index()
DF_API2GS = gs_ids_by_api_route.unique().rename('route_gs_id').reset_index().merge(df_api2gs_nunique, on='route_id')
DF_API2GS['route_gs_len'] = DF_API2GS['route_gs_id'].apply(len)
DF_API2GS['route_gs_nunique'] = DF_API2GS['route_gs_nunique'].apply(int)
# The two counts differ when the unique() array holds an entry that nunique() did not
# count (by pandas' defaults, a missing value).
DF_API2GS['gs_na'] = DF_API2GS['route_gs_len'].ne(DF_API2GS['route_gs_nunique'])
DF_API2GS = DF_API2GS.merge(API_DF_ROUTES, on='route_id')

# Id prefixes of API routes with no GTFS/SHP match at all ...
api_routes_without_gs = DF_API2GS['gs_na'] & (DF_API2GS['route_gs_nunique'] == 0)
DF_API2GS[api_routes_without_gs]['route_gtfs_id0'].unique() # array(['4', '1', '5'], dtype=object)
# ... and of API routes matched to two or more GTFS/SHP ids (displayed as the cell output).
DF_API2GS[DF_API2GS['route_gs_nunique'] >= 2]['route_gtfs_id0'].unique() # array(['1', '5'], dtype=object)

array(['1', '5'], dtype=object)