In [1]:
from hashlib import sha1
import hmac
import requests
import pandas as pd
import geopandas as gpd
import json
import os
import numpy as np
import time
import pyptvgtfs
# One shared HTTP session for every PTV API call made by this notebook.
SESSION = requests.Session()

# Local configuration (get_data reads the PTV developer id and API key from it).
# The context manager closes the file handle as soon as parsing is done
# instead of leaving it open until garbage collection.
with open('../local-env.json') as _env_file:
    ENV = json.load(_env_file)

def get_ptv_api_url(
        endpoint : str,
        dev_id : str | int, 
        api_key : str | int,
    ):
    """
    Returns the URL to use PTV TimeTable API.

    Generates a signature from dev id (user id), API key, and endpoint.

    See the following for more information:
    - Home page: https://www.ptv.vic.gov.au/footer/data-and-reporting/datasets/ptv-timetable-api/
    - Swagger UI: https://timetableapi.ptv.vic.gov.au/swagger/ui/index
    - Swagger Docs JSON: https://timetableapi.ptv.vic.gov.au/swagger/docs/v3 (You can use this to find the endpoints you want to use.)
    """
    assert endpoint.startswith('/'), f'Endpoint must start with /, got {endpoint}'
    raw = f'{endpoint}{'&' if '?' in endpoint else '?'}devid={dev_id}'
    hashed = hmac.new(api_key.encode('utf-8'), raw.encode('utf-8'), sha1)  # Encode the raw string to bytes
    signature = hashed.hexdigest()
    return f'https://timetableapi.ptv.vic.gov.au{raw}&signature={signature}'


def get_data(endpoint : str, need_auth : bool = True, timeout : float | None = None):
    """
    Returns the decoded JSON body fetched from the given API endpoint.

    Args:
        endpoint: Path (plus optional query string) below
            https://timetableapi.ptv.vic.gov.au, starting with '/'.
        need_auth: When True the URL is signed via get_ptv_api_url using the
            developer id and API key stored in ENV; when False the bare URL
            is requested.
        timeout: Seconds to wait for the server, forwarded to requests.
            The default None keeps the previous behaviour (wait
            indefinitely); pass a number to bound a stalled request.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
    """
    if need_auth:
        url = get_ptv_api_url(endpoint, ENV['PTV_TIMETABLE_DEV_ID'], ENV['PTV_TIMETABLE_API_KEY'])
    else:
        url = f'https://timetableapi.ptv.vic.gov.au{endpoint}'
    response = SESSION.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


In [2]:
# Swagger description of the API; this endpoint is fetched without a signature.
API_DOCS = get_data('/swagger/docs/v3', need_auth=False)
# Paths without a '{placeholder}' segment need no path parameters.
STATIC_API_ENDPOINTS = [path for path in API_DOCS['paths'] if '{' not in path]
# API_ROUTES is iterated below as a sequence of per-route dicts, so it is
# annotated as a list (it was previously annotated `dict`).
API_ROUTES : list[dict] = get_data('/v3/routes')['routes']
# NOTE(review): the `dict` annotations below are kept as found; confirm them against the actual payload shapes.
API_ROUTE_TYPES : dict = get_data('/v3/route_types')['route_types']
API_DISRUPTIONS : dict = get_data('/v3/disruptions')['disruptions']
API_DISRUPTION_MODES : dict = get_data('/v3/disruptions/modes')['disruption_modes']
API_OUTLETS : dict = get_data('/v3/outlets')['outlets']
API_DF_ROUTE_TYPES = pd.DataFrame(API_ROUTE_TYPES)
API_DF_DISRUPTION_MODES = pd.DataFrame(API_DISRUPTION_MODES)
API_DF_OUTLETS = pd.DataFrame(API_OUTLETS)
# Some outlet records are faulty: their latitude is > 0, which is not possible in Victoria.
# Flip the sign of those values.
API_DF_OUTLETS['outlet_latitude'] = API_DF_OUTLETS['outlet_latitude'].apply(lambda lat: -lat if lat > 0 else lat)

# Flatten each route's nested 'route_service_status' dict into the route
# itself, refusing to overwrite a key the route already has.
for route in API_ROUTES:
    for status_key, status_value in route['route_service_status'].items():
        assert status_key not in route, f'Key {status_key} already exists in route'
        route[status_key] = status_value
    del route['route_service_status']

API_DF_ROUTES = pd.DataFrame(API_ROUTES)
assert API_DF_ROUTES['route_id'].is_unique, 'route_id is not unique'
assert API_DF_ROUTES['route_gtfs_id'].is_unique, 'route_gtfs_id is not unique'
# Ids are handled as strings from here on; missing route types stay missing.
API_DF_ROUTES['route_id'] = API_DF_ROUTES['route_id'].apply(str)
API_DF_ROUTES['route_type'] = API_DF_ROUTES['route_type'].apply(lambda rt: str(int(rt)) if not pd.isna(rt) else rt)

# Only these four columns are kept; every other column (including
# 'route_gtfs_id') is no longer available on API_DF_ROUTES after this line.
API_DF_ROUTES = API_DF_ROUTES[['route_type', 'route_id', 'route_name', 'route_number']]

In [3]:
SHP_DIR = '../local/ptv-spatial-datasets'
# List the directory once so the shapefile pass and the column-name pass
# work from the same set of entries.
_shp_dir_files = os.listdir(SHP_DIR)
# One GeoDataFrame per '.shp' file, keyed by the file name up to its first '.'.
SHP_GDFS : dict[str, gpd.GeoDataFrame] = {
    f.split('.')[0]: gpd.read_file(os.path.join(SHP_DIR, f))
    for f in _shp_dir_files
    if f.endswith('.shp')
}
# Each '<NAME>_column_names.txt' file supplies 'OLD = NEW' column renames for
# the GeoDataFrame stored under the upper-cased <NAME>.
for f in _shp_dir_files:
    if not f.endswith('.txt'):
        continue
    gdf_name = f.removesuffix('_column_names.txt').upper()
    with open(os.path.join(SHP_DIR, f), 'r') as file:
        # The first four lines of the file are skipped.
        gdf_column_names = [line.strip() for line in file][4:]
    assert gdf_name in SHP_GDFS, f'{gdf_name} not in GDFS'
    for line in gdf_column_names:
        assert ' = ' in line, f'Invalid line: {line}'
    gdf_column_names = { line.split(' = ')[0]: line.split(' = ')[1] for line in gdf_column_names }
    SHP_GDFS[gdf_name].rename(columns=gdf_column_names, inplace=True)
# SHP_DF_ROUTES : pd.DataFrame = pd.concat([gdf for k, gdf in SHP_GDFS.items() if 'ROUTE' in k])
# SHP_DF_ROUTES['route_idx'] = SHP_DF_ROUTES['ROUTE_ID'].apply(lambda x: x.split('-'))
# SHP_DF_ROUTES['route_id0'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[0])
# SHP_DF_ROUTES['route_id1'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[1])
# SHP_DF_ROUTES['route_id2'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[2] if len(x) > 4 else np.nan)
# SHP_DF_ROUTES['route_id3'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[-2])
# SHP_DF_ROUTES['route_id4'] = SHP_DF_ROUTES['route_idx'].apply(lambda x: x[-1])
# SHP_DF_ROUTES['route_id01'] = SHP_DF_ROUTES['route_id0'] + '-' + SHP_DF_ROUTES['route_id1']

In [164]:
# Stack the metro train station layer and every layer whose name contains 'STOP'.
SHP_DF_STOPS : pd.DataFrame = pd.concat([
    layer
    for layer_name, layer in SHP_GDFS.items()
    if layer_name == 'PTV_METRO_TRAIN_STATION' or 'STOP' in layer_name
])
# Replace each geometry object with its first coordinate tuple.
SHP_DF_STOPS['geometry'] = SHP_DF_STOPS['geometry'].apply(lambda geom: geom.coords[0])
# Keep the unparsed name, then split 'Name (Suburb)' into its two parts.
SHP_DF_STOPS['STOP_FULL_NAME'] = SHP_DF_STOPS['STOP_NAME']
SHP_DF_STOPS['STOP_SUBURB'] = SHP_DF_STOPS['STOP_FULL_NAME'].apply(
    lambda full_name: full_name.split('(', 1)[1].lstrip('(').rstrip(')') if '(' in full_name else np.nan
)
SHP_DF_STOPS['STOP_NAME'] = SHP_DF_STOPS['STOP_FULL_NAME'].apply(
    lambda full_name: full_name.split('(', 1)[0].strip()
)

# Inspection of the data shows that there are some stops with no suburb. We will manually fill these in.
# (This expression is not the last statement of the cell, so Jupyter does not display it.)
SHP_DF_STOPS[SHP_DF_STOPS['STOP_SUBURB'].isna()]

def custom_stop_suburb(x):
    """
    Returns the suburb for a shapefile stop row, applying a manual override.

    `x` is indexed by 'STOP_ID', 'STOP_NAME' and 'STOP_SUBURB'. Stop '35117'
    is assigned 'Ballarat Central' (after asserting it has the expected
    name); every other row keeps its existing 'STOP_SUBURB'.
    """
    if x['STOP_ID'] != '35117':
        return x['STOP_SUBURB']
    # Guard against the id pointing at a different stop than the one inspected.
    assert x['STOP_NAME'] == 'Ascot St/Sturt St', x['STOP_NAME']
    return 'Ballarat Central'

# Fill in the manually curated suburb(s) via custom_stop_suburb defined earlier in this cell.
SHP_DF_STOPS['STOP_SUBURB'] = SHP_DF_STOPS.apply(custom_stop_suburb, axis=1)

# Collapse to one row per STOP_ID; each column becomes the array of distinct
# values observed for that stop.
SHP_DF_STOPS = SHP_DF_STOPS.groupby('STOP_ID').aggregate({'STOP_NAME': 'unique', 'STOP_SUBURB': 'unique', 'STOP_FULL_NAME': 'unique', 'LATITUDE': 'unique', 'LONGITUDE': 'unique', 'TICKETZONE': 'unique', 'ROUTES_USING_STOP': 'unique', 'geometry': 'unique'})
# 7s - 10s
SHP_DF_STOPS.reset_index(inplace=True)

# These columns must be single-valued per stop; verify that, then unwrap the
# one-element arrays into scalars.
for col in ['STOP_NAME', 'STOP_SUBURB', 'STOP_FULL_NAME', 'LATITUDE', 'LONGITUDE', 'geometry']:
    assert SHP_DF_STOPS[col].apply(len).max() == 1
    SHP_DF_STOPS[col] = SHP_DF_STOPS[col].apply(lambda values: values[0])

# A stop may have several ticket zones / routes: join the non-missing ones
# into a comma-separated string.
for col in ['TICKETZONE', 'ROUTES_USING_STOP']:
    SHP_DF_STOPS[col] = SHP_DF_STOPS[col].apply(lambda values: ','.join([str(v) for v in values if not pd.isna(v)]))

# The trailing, indented `SHP_DF_STOPS['geometry'] = ...coords[0]` line that
# used to end this cell has been removed: it was an unexpected indent at top
# level, and 'geometry' was already converted to a coordinate tuple before
# the groupby, so calling `.coords` on it again could not succeed.


In [4]:
GTFS = pyptvgtfs.process_gtfs_zip('../downloads/20240312_113156/gtfs.zip', '')
GTFS.drop(columns=['version_id'], inplace=True)
# Flat mapping {(mode_id, table_name): DataFrame} taken from the 'df' column.
GTFS_DFS = GTFS.set_index(['mode_id', 'table_name'])['df'].to_dict()
# Re-nest it as {mode_id: {table_name: DataFrame}}.
new_GTFS_DFS = {}
for (mode_key, table_key), table_df in GTFS_DFS.items():
    new_GTFS_DFS.setdefault(mode_key, {})[table_key] = table_df
GTFS_DFS : dict[str, dict[str, pd.DataFrame]] = new_GTFS_DFS
# Tag every table with the mode it came from so tables of different modes can be concatenated later.
for mode_key, tables in GTFS_DFS.items():
    for table_df in tables.values():
        table_df['mode_id'] = mode_key
# 45s - 1m
# 45s - 1m

In [81]:
# All modes' 'routes' tables stacked into one frame.
GTFS_DF_ROUTES = pd.concat([tables['routes'] for tables in GTFS_DFS.values()])
# 1m - 2m
# Break the '-'-separated route_id into its parts and expose selected positions as columns.
GTFS_DF_ROUTES['route_idx'] = GTFS_DF_ROUTES['route_id'].map(lambda rid: rid.split('-'))
GTFS_DF_ROUTES['route_id0'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[0])
GTFS_DF_ROUTES['route_id1'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[1])
# The third part is only taken when the id has more than four parts.
GTFS_DF_ROUTES['route_id2'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[2] if len(parts) > 4 else np.nan)
GTFS_DF_ROUTES['route_id3'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[-2])
GTFS_DF_ROUTES['route_id4'] = GTFS_DF_ROUTES['route_idx'].map(lambda parts: parts[-1])
GTFS_DF_ROUTES['route_id01'] = GTFS_DF_ROUTES['mode_id'] + '-' + GTFS_DF_ROUTES['route_id1']
# '<mode_id>-<route_id1>', with route_id2 appended only for mode '4' rows that have one.
GTFS_DF_ROUTES['route_gtfs_id'] = GTFS_DF_ROUTES.apply(
    lambda row: f'{row["mode_id"]}-{row["route_id1"]}' + (
        row['route_id2'] if row['mode_id'] == '4' and pd.notna(row['route_id2']) else ''
    ),
    axis=1,
)

In [None]:
GTFS_DF_STOPS = pd.concat([tables['stops'] for tables in GTFS_DFS.values()])
GTFS_DF_STOP_TIMES = pd.concat([tables['stop_times'] for tables in GTFS_DFS.values()])
# Attach route attributes to every trip; the route's own mode_id copy is redundant and dropped.
GTFS_DF_TRIPS = (
    pd.concat([tables['trips'] for tables in GTFS_DFS.values()])
    .merge(GTFS_DF_ROUTES, on='route_id', how='left', suffixes=('', '_route'))
    .drop(columns=['mode_id_route'])
)

# GTFS_DF_STOP_TIMES = GTFS_DF_STOP_TIMES.merge(GTFS_DF_STOPS, on='stop_id', suffixes=('', '_stop'))
# Join stop times to their trips, then keep only the distinct
# stop / sequence / mode / direction / route combinations.
GTFS_DF_STOP_TIMES = (
    GTFS_DF_STOP_TIMES
    .merge(GTFS_DF_TRIPS, on='trip_id', suffixes=('', '_trip'))  # 1m 30s
    [['stop_id', 'stop_sequence', 'mode_id', 'direction_id', 'route_gtfs_id', 'route_short_name', 'route_long_name']]
    .drop_duplicates()  # 20s
)
GTFS_DF_ROUTESTOPS = GTFS_DF_STOP_TIMES.reset_index(drop=True)

In [124]:
# Keep the unparsed name, then split 'Name (Suburb)' into name and suburb.
GTFS_DF_STOPS['stop_full_name'] = GTFS_DF_STOPS['stop_name']
GTFS_DF_STOPS['stop_suburb'] = GTFS_DF_STOPS['stop_full_name'].map(
    lambda full_name: full_name.split('(', 1)[1].removesuffix(')') if '(' in full_name else np.nan
)
GTFS_DF_STOPS['stop_name'] = GTFS_DF_STOPS['stop_full_name'].map(
    lambda full_name: full_name.split('(')[0].strip()
)

# Assert no stop_name contains both ',' and '('
_full_name_has_comma = GTFS_DF_STOPS['stop_full_name'].map(lambda full_name: ',' in full_name)
assert not (GTFS_DF_STOPS['stop_suburb'].notna() & _full_name_has_comma).any()

# Names of the form 'Name, Suburb': take the suburb from after the ', ' ...
GTFS_DF_STOPS['stop_suburb'] = GTFS_DF_STOPS.apply(
    lambda row: row['stop_name'].split(', ')[1] if ',' in row['stop_name'] else row['stop_suburb'],
    axis=1,
)
# ... and keep only the part before it as the name.
GTFS_DF_STOPS['stop_name'] = GTFS_DF_STOPS['stop_name'].map(
    lambda name: name.split(', ')[0] if ',' in name else name
)

# Inspect NaN stop_suburb
GTFS_DF_STOPS[GTFS_DF_STOPS['stop_suburb'].isna()]

# Custom stop_name and stop_suburb

def custom_stop_name(x):
    """
    Returns the stop name for a GTFS stop row, applying a manual override.

    Stop '5588' is renamed from 'Rosemary St' to 'Rosemary St/High St'
    (asserting the current name first, so a second application to an
    already-renamed row fails the assertion). Other rows keep 'stop_name'.
    """
    if x['stop_id'] != '5588':
        return x['stop_name']
    assert x['stop_name'] == 'Rosemary St'
    return 'Rosemary St/High St'

def custom_stop_suburb(x):
    """
    Returns the suburb for a GTFS stop row, applying manual overrides.

    `x` is indexed by 'stop_id', 'stop_name' and 'stop_suburb'. For each
    overridden stop id the row's name is asserted to be one of the expected
    names before the curated suburb is returned; all other rows keep their
    existing 'stop_suburb'.

    NOTE: this reuses the name of the shapefile helper defined in an earlier
    cell (which reads the upper-case 'STOP_*' keys) and replaces it once run.
    """
    # (stop_id, accepted stop names, suburb to assign)
    overrides = (
        ('5588', ('Rosemary St', 'Rosemary St/High St'), 'Templestowe Lower'),
        ('28185', ('Keysborough South Shopping Centre/Braeside-Dandenong Rd',), 'Keysborough'),
        ('35117', ('Ascot St/Sturt St',), 'Ballarat Central'),
    )
    for stop_id, accepted_names, suburb in overrides:
        if x['stop_id'] == stop_id:
            assert x['stop_name'] in accepted_names, x['stop_name']
            return suburb
    return x['stop_suburb']


# Apply the manual overrides row by row: names first, then suburbs
# (custom_stop_suburb accepts both the old and the new name of stop 5588).
GTFS_DF_STOPS['stop_name'] = GTFS_DF_STOPS.apply(custom_stop_name, axis=1)
GTFS_DF_STOPS['stop_suburb'] = GTFS_DF_STOPS.apply(custom_stop_suburb, axis=1)

In [175]:
# One row per stop_id; every other column becomes the array of distinct
# values observed for that stop.
GTFS_DF_STOPS = (
    GTFS_DF_STOPS
    .groupby('stop_id')
    .aggregate(dict.fromkeys(['stop_name', 'stop_suburb', 'stop_full_name', 'stop_lat', 'stop_lon'], 'unique'))  # 5s - 7s
    .reset_index()
)

In [178]:
# Number of distinct values per stop for every aggregated column.
# Inspect these to find stops with multiple names, suburbs, etc.
_distinct_counts = {
    col: GTFS_DF_STOPS[col].apply(len)
    for col in GTFS_DF_STOPS.columns
    if col != 'stop_id'
}

# Every stop has exactly one suburb ...
assert _distinct_counts['stop_suburb'].max() == 1

# ... and a stop with several names must at least have a single lat/lon pair.
assert (
    (_distinct_counts['stop_name'] == 1)
    | ((_distinct_counts['stop_lat'] == 1) & (_distinct_counts['stop_lon'] == 1))
).all()

# Unwrap the one-element suburb arrays into scalars; the other columns stay as arrays.
GTFS_DF_STOPS['stop_suburb'] = GTFS_DF_STOPS['stop_suburb'].apply(lambda values: values[0])

# Remove any leftover '<col>_len' helper columns.
GTFS_DF_STOPS.drop(columns=[col for col in GTFS_DF_STOPS.columns if col.endswith('_len')], inplace=True)

In [233]:
# Outer join of GTFS stops (lower-case columns) with shapefile stops
# (upper-case columns) on the stop id; the full-name columns are not needed afterwards.
GS_DF_STOPS = pd.merge(
    GTFS_DF_STOPS,
    SHP_DF_STOPS,
    left_on='stop_id',
    right_on='STOP_ID',
    suffixes=('_gtfs', '_shp'),
    how='outer',
).drop(columns=['stop_full_name', 'STOP_FULL_NAME'])


def _has_multiple_values(value):
    """True when `value` is a NumPy array holding more than one entry."""
    return isinstance(value, np.ndarray) and len(value) > 1


# Every stop with several GTFS names also exists in the shapefile data ...
assert GS_DF_STOPS.loc[GS_DF_STOPS['stop_name'].apply(_has_multiple_values), 'STOP_ID'].notna().all()

# ... and, for stops present in both sources, the shapefile name is one of those GTFS names.
_in_both_sources = GS_DF_STOPS['STOP_ID'].notna() & GS_DF_STOPS['stop_id'].notna()
assert GS_DF_STOPS[_in_both_sources].apply(
    lambda row: row['STOP_NAME'] in row['stop_name'] if len(row['stop_name']) > 1 else True,
    axis=1,
).all()

# Stops with several GTFS latitudes / longitudes have a shapefile coordinate to fall back on.
for _gtfs_col, _shp_col in (('stop_lat', 'LATITUDE'), ('stop_lon', 'LONGITUDE')):
    assert GS_DF_STOPS.loc[GS_DF_STOPS[_gtfs_col].apply(_has_multiple_values), _shp_col].notna().all()

In [239]:
# Spot-check a single stop: in the recorded output, stop 1109 has one name but
# two slightly different latitude/longitude values.
GTFS_DF_STOPS[GTFS_DF_STOPS['stop_id'] == '1109']

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_full_name,stop_lat,stop_lon
979,1109,[Inkerman St/Barkly St],St Kilda,[Inkerman St/Barkly St (St Kilda)],"[-37.8637295981912, -37.8637292085664]","[144.981937426538, 144.981914702951]"


In [242]:
# Rows that have both a shapefile latitude and GTFS latitude(s).
k = GS_DF_STOPS[GS_DF_STOPS['LATITUDE'].notna() & GS_DF_STOPS['stop_lat'].notna()]
# Of those, show stops with several GTFS latitudes where the shapefile latitude
# is exactly one of them. The recorded output is empty: no such stop was found.
k[k.apply(lambda x: x['LATITUDE'] in x['stop_lat'] if len(x['stop_lat']) > 1 else False, axis=1)]

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_lat,stop_lon,STOP_ID,STOP_NAME,STOP_SUBURB,LATITUDE,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,geometry


In [241]:
# Checks that, for stops with several GTFS longitudes, the shapefile longitude
# is exactly one of them.
# NOTE(review): the recorded output of this cell is an AssertionError, i.e. the
# check does not hold for the data it was last run on; a Restart & Run All
# stops here unless this is relaxed or removed.
assert GS_DF_STOPS[GS_DF_STOPS['LONGITUDE'].notna() & GS_DF_STOPS['stop_lon'].notna()].apply(lambda x: x['LONGITUDE'] in x['stop_lon'] if len(x['stop_lon']) > 1 else True, axis=1).all()

AssertionError: 

In [None]:

def _prefer_shp_value(row, gtfs_col, shp_col):
    """
    Collapses a GTFS cell that holds an array of candidates into one value.

    Non-array cells are returned unchanged. For arrays, the shapefile value
    is used when it is one of the candidates; otherwise the first candidate.
    """
    candidates = row[gtfs_col]
    if not isinstance(candidates, np.ndarray):
        return candidates
    if row[shp_col] in candidates:
        return row[shp_col]
    return candidates[0]


# Same resolution rule for the name and both coordinates.
for _gtfs_col, _shp_col in (
    ('stop_name', 'STOP_NAME'),
    ('stop_lat', 'LATITUDE'),
    ('stop_lon', 'LONGITUDE'),
):
    GS_DF_STOPS[_gtfs_col] = GS_DF_STOPS.apply(_prefer_shp_value, axis=1, args=(_gtfs_col, _shp_col))

In [230]:
# Display the merged GTFS + shapefile stop table (pandas truncates the rendering,
# as seen in the recorded output).
GS_DF_STOPS

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_lat,stop_lon,STOP_ID,STOP_NAME,STOP_SUBURB,LATITUDE,LONGITUDE,TICKETZONE,ROUTES_USING_STOP,geometry
0,1000,Dole Ave/Cheddar Rd,Reservoir,-37.700775,145.018951,1000,Dole Ave/Cheddar Rd,Reservoir,-37.700775,145.018951,2,556,"(145.01895679496207, -37.700761839394126)"
1,10001,Rex St/Taylors Rd,Kings Park,-37.726975,144.776152,10001,Rex St/Taylors Rd,Kings Park,-37.726975,144.776152,2,418,"(144.77615808013866, -37.726961811610934)"
2,10002,Yuille St/Centenary Ave,Melton,-37.676160,144.595789,10002,Yuille St/Centenary Ave,Melton,-37.676160,144.595789,2,458,"(144.59579487015532, -37.676146816695955)"
3,10009,Gum Rd/Main Rd West,Albanvale,-37.741497,144.775899,10009,Gum Rd/Main Rd West,Albanvale,-37.741497,144.775899,2,424,"(144.77590499996512, -37.741483851234776)"
4,1001,Lloyd Ave/Cheddar Rd,Reservoir,-37.699183,145.019685,1001,Lloyd Ave/Cheddar Rd,Reservoir,-37.699183,145.019685,2,556,"(145.01969083617018, -37.69916982951895)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28374,9991,Donald St/Wood St,Preston,-37.735366,145.022375,9991,Donald St/Wood St,Preston,-37.735366,145.022375,12,555,"(145.0223811139475, -37.73535279930797)"
28375,9992,Highview Rd/Wood St,Preston,-37.735117,145.019886,9992,Highview Rd/Wood St,Preston,-37.735117,145.019886,12,555,"(145.01989207591876, -37.73510385903128)"
28376,9993,Plenty Rd/Wood St,Preston,-37.734193,145.013681,9993,Plenty Rd/Wood St,Preston,-37.734460,145.014014,12,555,"(145.0140201207862, -37.73444685842192)"
28377,9994,Yellow Gum Rd/Copperfield Dr,Delahey,-37.713855,144.771894,9994,Yellow Gum Rd/Copperfield Dr,Delahey,-37.713855,144.771894,2,425,"(144.77189999987596, -37.7138417920479)"


In [None]:

# Unwraps the per-stop value arrays into scalars after checking each holds one value.
# NOTE(review): this reads the '<col>_len' helper columns, but the length-check
# cell earlier in the notebook drops every column ending in '_len' and already
# unwraps 'stop_suburb'. Run after that cell, the lookup below would raise
# KeyError. This looks like a leftover from an earlier iteration; confirm
# whether it is still needed.
for col in ['stop_name', 'stop_suburb', 'stop_full_name', 'stop_lat', 'stop_lon']:
    assert GTFS_DF_STOPS[f'{col}_len'].max() == 1
    GTFS_DF_STOPS[col] = GTFS_DF_STOPS[col].apply(lambda x: x[0])

In [128]:
# Inspect NaN stop_suburb
# (Not the last statement of the cell, so Jupyter does not display the result.)
GTFS_DF_STOPS[GTFS_DF_STOPS['stop_suburb'].isna()]

# Custom stop_name and stop_suburb

def custom_stop_name(x):
    """
    Returns the stop name for a GTFS stop row, applying a manual override.

    Stop '5588' is renamed from 'Rosemary St' to 'Rosemary St/High St'
    (asserting the current name first, so a second application to an
    already-renamed row fails the assertion). Other rows keep 'stop_name'.

    NOTE: a function with this name and the same logic is also defined in an
    earlier cell of this notebook.
    """
    if x['stop_id'] != '5588':
        return x['stop_name']
    assert x['stop_name'] == 'Rosemary St'
    return 'Rosemary St/High St'

def custom_stop_suburb(x):
    """
    Returns the suburb for a GTFS stop row, applying manual overrides.

    `x` is indexed by 'stop_id', 'stop_name' and 'stop_suburb'. For each
    overridden stop id the row's name is asserted to be one of the expected
    names before the curated suburb is returned; all other rows keep their
    existing 'stop_suburb'.

    NOTE: a function with this name is also defined in earlier cells of this
    notebook; whichever definition ran last is the one in effect.
    """
    # (stop_id, accepted stop names, suburb to assign)
    overrides = (
        ('5588', ('Rosemary St', 'Rosemary St/High St'), 'Templestowe Lower'),
        ('28185', ('Keysborough South Shopping Centre/Braeside-Dandenong Rd',), 'Keysborough'),
        ('35117', ('Ascot St/Sturt St',), 'Ballarat Central'),
    )
    for stop_id, accepted_names, suburb in overrides:
        if x['stop_id'] == stop_id:
            assert x['stop_name'] in accepted_names, x['stop_name']
            return suburb
    return x['stop_suburb']


# Apply the manual overrides row by row: names first, then suburbs
# (custom_stop_suburb accepts both the old and the new name of stop 5588).
GTFS_DF_STOPS['stop_name'] = GTFS_DF_STOPS.apply(custom_stop_name, axis=1)
GTFS_DF_STOPS['stop_suburb'] = GTFS_DF_STOPS.apply(custom_stop_suburb, axis=1)

In [127]:
# Distinct names of the stops that still have no suburb. The recorded output
# lists the three stops that the custom_stop_suburb overrides cover.
GTFS_DF_STOPS[GTFS_DF_STOPS['stop_suburb'].isna()]['stop_name'].unique()


array(['Keysborough South Shopping Centre/Braeside-Dandenong Rd',
       'Rosemary St', 'Ascot St/Sturt St'], dtype=object)

In [None]:
# Show the full rows of the stops whose suburb is still missing.
GTFS_DF_STOPS[GTFS_DF_STOPS['stop_suburb'].isna()]


In [7]:
# Load the route table from CSV; dtype=str makes every column (ids included)
# a string column, so nothing is parsed as a number.
GSA_DF_ROUTES = pd.read_csv('../local/gsa-routes.csv', dtype=str)

In [None]:

# For every route_id collect the distinct route_gs_id values and how many there are.
_gs_ids_by_route = GSA_DF_ROUTES.groupby('route_id')['route_gs_id']
DF_API2GS = _gs_ids_by_route.unique().rename('route_gs_id').reset_index()
df_api2gs_nunique = _gs_ids_by_route.nunique().rename('route_gs_nunique').reset_index()
DF_API2GS = DF_API2GS.merge(df_api2gs_nunique, on='route_id')
DF_API2GS['route_gs_len'] = DF_API2GS['route_gs_id'].map(len)
DF_API2GS['route_gs_nunique'] = DF_API2GS['route_gs_nunique'].map(int)
# unique() keeps a missing value as an entry while nunique() does not count it,
# so a mismatch marks routes that have at least one missing route_gs_id.
DF_API2GS['gs_na'] = DF_API2GS['route_gs_len'].ne(DF_API2GS['route_gs_nunique'])
# Attach the API route attributes.
DF_API2GS = DF_API2GS.merge(API_DF_ROUTES, on='route_id')

# NOTE(review): both lookups read a 'route_gtfs_id0' column. None of the frames
# merged into DF_API2GS in this cell is shown to provide it (API_DF_ROUTES is
# reduced to route_type / route_id / route_name / route_number where it is
# built), so these probably raise KeyError on a fresh run. Confirm the
# intended column.
# Routes whose only route_gs_id is missing (the first expression is not
# displayed because it is not the last statement of the cell):
DF_API2GS[DF_API2GS['gs_na'] & (DF_API2GS['route_gs_nunique'] == 0)]['route_gtfs_id0'].unique() # array(['4', '1', '5'], dtype=object)
# Routes that map to two or more distinct route_gs_id values:
DF_API2GS[DF_API2GS['route_gs_nunique'] >= 2]['route_gtfs_id0'].unique() # array(['1', '5'], dtype=object)

In [9]:
# One '/v3/directions/route/<route_id>' endpoint per known route.
api_directions_endpoints = API_DF_ROUTES.apply(lambda x: f'/v3/directions/route/{x["route_id"]}', axis=1).unique()

# Retry policy for HTTP errors. The loop used to retry forever, so a
# permanent failure (e.g. an invalid key or an unknown route) hung the
# notebook silently; now the last error is re-raised after a bounded number
# of attempts.
DIRECTIONS_MAX_ATTEMPTS = 20
DIRECTIONS_RETRY_DELAY_SECONDS = 30

# Maps route id (as a string) to that route's list of direction records.
api_all_directions = {}
for endpoint in api_directions_endpoints:
    # The route id is the fifth '/'-separated piece of the endpoint.
    route_id = str(int(endpoint.split('/')[4]))
    for attempt in range(1, DIRECTIONS_MAX_ATTEMPTS + 1):
        try:
            directions = get_data(endpoint)
            break
        except requests.exceptions.HTTPError:
            if attempt == DIRECTIONS_MAX_ATTEMPTS:
                raise
            time.sleep(DIRECTIONS_RETRY_DELAY_SECONDS)
    api_all_directions[route_id] = directions['directions']
# 2m - 3m
# 2m - 3m

In [10]:
# Every direction record must belong to the route it was fetched for.
assert all(
    str(direction['route_id']) == str(route_key)
    for route_key, route_directions in api_all_directions.items()
    for direction in route_directions
)

# Flatten {route: [direction, ...]} into a single list of direction records.
API_DIRECTIONS_LIST = [
    direction
    for route_directions in api_all_directions.values()
    for direction in route_directions
]

API_DF_DIRECTIONS = pd.DataFrame(API_DIRECTIONS_LIST)[['route_id', 'route_type', 'direction_id', 'direction_name']]
API_DF_DIRECTIONS.to_csv('../local/ptv-api/all_directions.csv', index=False)
# No (route_id, route_type) pair may have more than two directions.
assert API_DF_DIRECTIONS[['route_id', 'route_type']].value_counts().max() <= 2

# Reload from the file that was just written.
API_DF_DIRECTIONS = pd.read_csv('../local/ptv-api/all_directions.csv')

# (route_id, route_type, direction_id) triples with every field rendered as a string.
API_route_rtds = [
    tuple(str(field) for field in triple)
    for triple in API_DF_DIRECTIONS[['route_id', 'route_type', 'direction_id']].values
]

In [12]:
# Retry policy for HTTP errors. The loop used to retry forever, so a
# permanent failure hung the notebook silently; now the last error is
# re-raised after a bounded number of attempts.
STOPS_MAX_ATTEMPTS = 20
STOPS_RETRY_DELAY_SECONDS = 30

# Nested mapping {route_id: {direction_id: API response}} with string keys.
API_STOPS_dict = {}
for route_id, route_type, direction_id in API_route_rtds:
    endpoint = f'/v3/stops/route/{route_id}/route_type/{route_type}?direction_id={direction_id}&include_geopath=true'
    for attempt in range(1, STOPS_MAX_ATTEMPTS + 1):
        try:
            stops = get_data(endpoint)
            break
        except requests.exceptions.HTTPError:
            if attempt == STOPS_MAX_ATTEMPTS:
                raise
            time.sleep(STOPS_RETRY_DELAY_SECONDS)
    API_STOPS_dict.setdefault(str(route_id), {})[str(direction_id)] = stops
# 3m - 5m

# Persist the raw responses so later cells can work from the file.
with open('../local/ptv-api/all_stops_by_direction.json', 'w') as f:
    json.dump(API_STOPS_dict, f)

In [None]:
# Flatten the per-route / per-direction responses into two record lists.
# Records are shallow-copied before being extended: the previous version
# wrote the extra keys into the dicts held by API_STOPS_dict, so running this
# cell a second time tripped its own "Key ... already exists" assertion.
API_STOPS_STOPS = []
API_STOPS_GEOPATHS = []
for route_id, route_type, direction_id in API_route_rtds:
    response = API_STOPS_dict[route_id][direction_id]
    for raw_stop in response['stops']:
        ticket = raw_stop.get('stop_ticket')
        if ticket is None:
            # Stops without ticket information (key absent or null) are skipped.
            continue
        stop = dict(raw_stop)
        # Hoist the ticket fields to the top level as 'stop_<field>'.
        for field, value in ticket.items():
            k = f'stop_{field}'
            assert k not in stop, f'Key {k} already exists in stop'
            stop[k] = value
        # Record where the stop was found, without overriding values the API already supplied.
        stop.setdefault('route_id', route_id)
        stop.setdefault('route_type', route_type)
        stop.setdefault('direction_id', direction_id)
        API_STOPS_STOPS.append(stop)
    for raw_path in response['geopath']:
        path = dict(raw_path)
        path.setdefault('route_id', route_id)
        path.setdefault('route_type', route_type)
        path.setdefault('direction_id', direction_id)
        API_STOPS_GEOPATHS.append(path)

with open('../local/ptv-api/all_stops_stops.json', 'w') as f:
    json.dump(API_STOPS_STOPS, f)
with open('../local/ptv-api/all_stops_geopaths.json', 'w') as f:
    json.dump(API_STOPS_GEOPATHS, f)


In [41]:

# Reload the flattened records from disk; the context managers close the
# files once parsed (the previous `json.load(open(...))` left them open).
with open('../local/ptv-api/all_stops_stops.json') as f:
    API_STOPS_STOPS = json.load(f)
with open('../local/ptv-api/all_stops_geopaths.json') as f:
    API_STOPS_GEOPATHS = json.load(f)

API_DF_STOPS = pd.DataFrame(API_STOPS_STOPS)
API_DF_STOPS.drop(columns=['disruption_ids'], inplace=True)
# Render list-valued ticket zones as a ', '-separated string; other values pass through.
API_DF_STOPS['stop_ticket_zones'] = API_DF_STOPS['stop_ticket_zones'].apply(lambda zones: ', '.join(map(str, zones)) if isinstance(zones, list) else zones)
# The nested ticket dict is no longer needed: its fields exist as 'stop_*' columns.
API_DF_STOPS.drop(columns=['stop_ticket'], inplace=True)
API_DF_STOPS['stop_is_regional'] = API_DF_STOPS['stop_zone'].apply(lambda zone: 'Regional' in zone)
API_DF_STOPS['stop_zones'] = API_DF_STOPS['stop_ticket_zones']
API_DF_STOPS.drop(columns=['stop_ticket_zones', 'stop_zone'], inplace=True)
# Final column selection and order.
API_DF_STOPS = API_DF_STOPS[['stop_id', 'stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude', 'stop_sequence', 'route_id', 'direction_id',  'route_type',  'stop_landmark', 'stop_zones', 'stop_ticket_type', 'stop_is_free_fare_zone', 'stop_is_regional', 'stop_ticket_machine', 'stop_ticket_checks', 'stop_vline_reservation']]

# Write the table out and reload it from the CSV.
API_DF_STOPS.to_csv('../local/ptv-api/all_stops_stops.csv', index=False)
API_DF_STOPS = pd.read_csv('../local/ptv-api/all_stops_stops.csv')

In [47]:
# Distinct (id, name, suburb, latitude, longitude) combinations, ignoring the
# route / direction / ticketing columns.
API_DF_STOPS_MIN = API_DF_STOPS[['stop_id', 'stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude']].drop_duplicates().reset_index(drop=True)

In [54]:
# Largest number of distinct values any single stop_id has per attribute. In
# the recorded output only stop_name exceeds 1 (a stop can have two names).
API_DF_STOPS_MIN.groupby('stop_id').aggregate({'stop_name': 'nunique', 'stop_suburb': 'nunique', 'stop_latitude': 'nunique', 'stop_longitude': 'nunique'}).max()

stop_name         2
stop_suburb       1
stop_latitude     1
stop_longitude    1
dtype: int64

In [55]:
# One row per stop_id; each column holds the array of distinct values for that stop.
API_DF_STOPS_MIN_GROUP = API_DF_STOPS_MIN.groupby('stop_id').aggregate(
    dict.fromkeys(['stop_name', 'stop_suburb', 'stop_latitude', 'stop_longitude'], 'unique')
)
# 7s - 9s
# Suburb and coordinates must be single-valued per stop (names may differ).
for _single_valued_col in ('stop_suburb', 'stop_latitude', 'stop_longitude'):
    assert API_DF_STOPS_MIN_GROUP[_single_valued_col].map(len).eq(1).all()

In [62]:
# Stops that appear under more than one name. In the recorded output these are
# '<X> Station' / '<X> Railway Station' pairs.
API_DF_STOPS_MIN_GROUP[API_DF_STOPS_MIN_GROUP['stop_name'].apply(lambda x: len(x) > 1)]

Unnamed: 0_level_0,stop_name,stop_suburb,stop_latitude,stop_longitude
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1021,"[Berwick Station, Berwick Railway Station]",[Berwick],[-38.04041],[145.345718]
1028,"[Broadmeadows Station, Broadmeadows Railway St...",[Broadmeadows],[-37.6830521],[144.919617]
1036,"[Caulfield Station, Caulfield Railway Station]",[Caulfield East],[-37.8774567],[145.042526]
1040,"[Clayton Station, Clayton Railway Station]",[Clayton],[-37.9246826],[145.120529]
1044,"[Craigieburn Station, Craigieburn Railway Stat...",[Craigieburn],[-37.6019249],[144.943314]
1049,"[Dandenong Station, Dandenong Railway Station]",[Dandenong],[-37.9899673],[145.209732]
1064,"[Essendon Station, Essendon Railway Station]",[Essendon],[-37.75601],[144.9162]
1071,"[Flinders Street Station, Flinders Street Rail...",[Melbourne City],[-37.81831],[144.966965]
1072,"[Footscray Station, Footscray Railway Station]",[Footscray],[-37.8010864],[144.9032]
1144,"[North Melbourne Station, North Melbourne Rail...",[West Melbourne],[-37.807415],[144.942566]


In [70]:
# Show which route / direction records use the 'Railway Station' spelling for
# one of the stops with two names. In the recorded output they are all
# route_type 3 rows.
API_DF_STOPS[API_DF_STOPS['stop_name'] == 'Clayton Railway Station']

Unnamed: 0,stop_id,stop_name,stop_suburb,stop_latitude,stop_longitude,stop_sequence,route_id,direction_id,route_type,stop_landmark,stop_zones,stop_ticket_type,stop_is_free_fare_zone,stop_is_regional,stop_ticket_machine,stop_ticket_checks,stop_vline_reservation
60197,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,41,1721,0,3,,2,,False,True,False,False,False
60235,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,0,1721,23,3,,2,,False,True,False,False,False
61822,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,25,1823,0,3,,2,,False,True,False,False,False
61852,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,6,1823,11,3,,2,,False,True,False,False,False
61883,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,18,1824,0,3,,2,,False,True,False,False,False
61911,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,6,1824,39,3,,2,,False,True,False,False,False
62430,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,22,5838,0,3,,2,,False,True,False,False,False
62455,1040,Clayton Railway Station,Clayton,-37.924683,145.120529,0,5838,43,3,,2,,False,True,False,False,False
