In [1]:
import requests
import pandas as pd
import geopandas as gpd
import json
import os
import numpy as np
import time

In [2]:
SHP_DIR = '../local/ptv-spatial-datasets'

# One GeoDataFrame per '.shp' file in SHP_DIR, keyed by the file name up to its first '.'.
SHP_GDFS: dict[str, gpd.GeoDataFrame] = {
    f.split('.')[0]: gpd.read_file(os.path.join(SHP_DIR, f))
    for f in os.listdir(SHP_DIR)
    if f.endswith('.shp')
}

# Every '.txt' file is read as a column-name mapping for the shapefile whose key is
# the upper-cased file name without the '_column_names.txt' suffix.
for f in os.listdir(SHP_DIR):
    if f.endswith('.txt'):
        gdf_name = f.removesuffix('_column_names.txt').upper()
        with open(os.path.join(SHP_DIR, f), 'r') as file:
            # The first four lines are skipped (presumably a header -- confirm against the files).
            mapping_lines = [line.strip() for line in file][4:]
        assert gdf_name in SHP_GDFS, f'{gdf_name} not in GDFS'
        gdf_column_names = {}
        for line in mapping_lines:
            assert ' = ' in line, f'Invalid line: {line}'
            # Split only on the first ' = ' so a new name that itself contains
            # ' = ' is kept whole instead of being silently truncated.
            current_name, new_name = line.split(' = ', 1)
            gdf_column_names[current_name] = new_name
        # Left-hand side is the column name as loaded, right-hand side is the name to use.
        SHP_GDFS[gdf_name] = SHP_GDFS[gdf_name].rename(columns=gdf_column_names)

In [3]:
# Metro bus, tram and train stop layers have ROUTES_USING_STOP filled in on every row.
for stop_layer in ('PTV_METRO_BUS_STOP', 'PTV_METRO_TRAM_STOP', 'PTV_METRO_TRAIN_STATION'):
    assert SHP_GDFS[stop_layer]['ROUTES_USING_STOP'].notna().all()
# The regional coach stop layer has no ROUTES_USING_STOP column at all.
assert 'ROUTES_USING_STOP' not in SHP_GDFS['PTV_REGIONAL_COACH_STOP'].columns
# The SkyBus stop layer has the column, but it is null on every row.
assert SHP_GDFS['PTV_SKYBUS_STOP']['ROUTES_USING_STOP'].isna().all()
# The regional bus stop layer has it filled in on at least one row.
assert SHP_GDFS['PTV_REGIONAL_BUS_STOP']['ROUTES_USING_STOP'].notna().any()

In [4]:
# Collect every layer whose key contains 'ROUTE', tagging each frame (in place)
# with the key it came from, then stack them into one frame.
shp_gdf_routes = []
for k, gdf in SHP_GDFS.items():
    if 'ROUTE' not in k:
        continue
    gdf['SHP_FILE'] = k
    shp_gdf_routes.append(gdf)

SHP_DF_ROUTES: pd.DataFrame = pd.concat(shp_gdf_routes)

In [5]:
# Break ROUTE_ID into its '-'-separated parts and expose them as columns.
SHP_DF_ROUTES['route_idx'] = SHP_DF_ROUTES['ROUTE_ID'].map(lambda route_id: route_id.split('-'))
SHP_DF_ROUTES['route_id0'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[0])
SHP_DF_ROUTES['route_id1'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[1])
# The third part is only taken when the id has more than four parts; otherwise it is NaN.
SHP_DF_ROUTES['route_id2'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: np.nan if len(parts) <= 4 else parts[2])
# The last two parts are always addressed from the end of the list.
SHP_DF_ROUTES['route_id3'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[-2])
SHP_DF_ROUTES['route_id4'] = SHP_DF_ROUTES['route_idx'].map(lambda parts: parts[-1])
SHP_DF_ROUTES['route_id01'] = SHP_DF_ROUTES['route_id0'] + '-' + SHP_DF_ROUTES['route_id1']

# Within one route_id01, route_id2 and route_id4 never both take more than one value.
route_id_variants = SHP_DF_ROUTES.groupby('route_id01').aggregate(
    {part: 'unique' for part in ('route_id2', 'route_id3', 'route_id4')}
)
assert route_id_variants.apply(
    lambda row: not (len(row['route_id2']) > 1 and len(row['route_id4']) > 1),
    axis=1,
).all()

# ROUTE_LONG_NAME is never null here, so after a left merge against SHP_DF_ROUTES_MIN
# a null ROUTE_LONG_NAME means the other frame had no matching row.
assert SHP_DF_ROUTES['ROUTE_LONG_NAME'].notna().all()

# Inspect the distinct route_id0 values, ordered numerically
SHP_DF_ROUTES['route_id0'].sort_values(key=lambda ids: ids.map(int)).unique() # array(['3', '4', '5', '6', '7', '11'], dtype=object)

# Metro bus routes are the rows whose route_id0 is '4'.
is_metro_bus = SHP_DF_ROUTES['route_id0'] == '4'

# Assert that every metro bus ROUTE_SHORT_NAME is 3 or 4 characters long
assert SHP_DF_ROUTES[is_metro_bus]['ROUTE_SHORT_NAME'].apply(lambda x: len(x) in [3, 4]).all()

# Metro bus short names for which route_id1 differs from the short name itself
odd_bus_id1_names = SHP_DF_ROUTES[(SHP_DF_ROUTES['route_id1'] != SHP_DF_ROUTES['ROUTE_SHORT_NAME']) & is_metro_bus]['ROUTE_SHORT_NAME'].unique()

# Each of those short names must still map to exactly one route_id1.
# The per-name counts are reduced with .eq(1).all() so the assertion is a plain
# boolean: comparing the array of distinct counts to [1] raises ValueError when
# there is more than one distinct count and is ill-defined when the array is empty.
assert (
    SHP_DF_ROUTES[SHP_DF_ROUTES['ROUTE_SHORT_NAME'].isin(odd_bus_id1_names) & is_metro_bus]
    .groupby('ROUTE_SHORT_NAME')['route_id1']
    .nunique()
    .eq(1)
    .all()
)

def get_gtfs_id(x):
    """Build the GTFS-style route id for one route record.

    Parameters
    ----------
    x : mapping (a DataFrame row or dict)
        Must provide 'route_id0' and 'route_id1'. 'ROUTE_SHORT_NAME' is read
        only when route_id0 is '4'.

    Returns
    -------
    str
        '4-<ROUTE_SHORT_NAME>' when route_id0 is '4';
        '7-B<number>' when route_id0 is '7', where <number> is route_id1
        without its 'TB' prefix, left-padded with zeros to at least two characters;
        '<route_id0>-<route_id1>' otherwise.

    Raises
    ------
    AssertionError
        If route_id0 is '7' and route_id1 does not start with 'TB'.
    """
    if x['route_id0'] == '4':
        return f'4-{x["ROUTE_SHORT_NAME"]}'
    if x['route_id0'] == '7':
        route_id1 = x['route_id1']
        assert route_id1.startswith('TB'), f'7-TeleBus route_id1 {route_id1} does not start with TB'
        # removeprefix drops exactly the literal 'TB'. lstrip('TB') treats its
        # argument as a character set and would eat any run of leading 'T'/'B'.
        route_number = route_id1.removeprefix('TB')
        # Left-pad with zeros to at least two characters
        route_number = route_number.zfill(2)
        return f'7-B{route_number}'
    return f'{x["route_id0"]}-{x["route_id1"]}'

# route_shp_id is '<route_id0>-<route_id1>', with route_id2 appended directly
# (no separator) for route_id0 '4' rows that have a route_id2.
SHP_DF_ROUTES['route_shp_id'] = SHP_DF_ROUTES.apply(
    lambda row: f'{row["route_id0"]}-{row["route_id1"]}' + (
        '' if row['route_id0'] != '4' or pd.isna(row['route_id2']) else row['route_id2']
    ),
    axis=1,
)
SHP_DF_ROUTES['route_gtfs_id'] = SHP_DF_ROUTES.apply(get_gtfs_id, axis=1)

# Reduce to the identifying columns, then collapse to one row per route_gtfs_id
# with the distinct values of every other column gathered into an array.
min_columns = ['route_gtfs_id', 'route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'ROUTE_LONG_NAME', 'SHP_FILE']
SHP_DF_ROUTES_MIN = SHP_DF_ROUTES[min_columns].drop_duplicates()
SHP_DF_ROUTES_MIN = (
    SHP_DF_ROUTES_MIN
    .groupby('route_gtfs_id')
    .aggregate({column: 'unique' for column in min_columns[1:]})
    .reset_index()
)

# Every column except ROUTE_LONG_NAME must be single-valued per route_gtfs_id.
single_valued_columns = ['route_shp_id', 'route_id0', 'ROUTE_SHORT_NAME', 'SHP_FILE']
for column in single_valued_columns:
    assert SHP_DF_ROUTES_MIN[column].apply(lambda values: len(values) == 1).all()

# Inspect route_gtfs_ids that carry more than one ROUTE_LONG_NAME
SHP_DF_ROUTES_MIN[SHP_DF_ROUTES_MIN['ROUTE_LONG_NAME'].apply(lambda names: len(names) != 1)]

# Unwrap the single-element arrays back into scalars (ROUTE_LONG_NAME stays an array).
for column in single_valued_columns:
    SHP_DF_ROUTES_MIN[column] = SHP_DF_ROUTES_MIN[column].apply(lambda values: values[0])

# Which route_id0 values occur in which shapefile
SHP_DF_ROUTES_MIN[['SHP_FILE', 'route_id0']].drop_duplicates().sort_values('SHP_FILE')

Unnamed: 0,SHP_FILE,route_id0
33,PTV_METRO_BUS_ROUTE,4
763,PTV_METRO_BUS_ROUTE,7
9,PTV_METRO_TRAM_ROUTE,3
434,PTV_REGIONAL_BUS_ROUTE,6
386,PTV_REGIONAL_COACH_ROUTE,5
0,PTV_SKYBUS_ROUTE,11


In [52]:
# STOP_ID is unique within each of these stop layers.
for stop_layer in (
    'PTV_METRO_TRAM_STOP',
    'PTV_METRO_TRAIN_STATION',
    'PTV_REGIONAL_BUS_STOP',
    'PTV_REGIONAL_COACH_STOP',
    'PTV_SKYBUS_STOP',
):
    assert SHP_GDFS[stop_layer]['STOP_ID'].is_unique

# STOP_ID is not unique in PTV_METRO_BUS_STOP, but only because TeleBus rows are
# mixed in: every duplicated id has exactly two distinct ROUTES_USING_STOP values,
# exactly one of which mentions 'TeleBus'.
metro_bus_stops = SHP_GDFS['PTV_METRO_BUS_STOP']
has_duplicated_stop_id = metro_bus_stops['STOP_ID'].duplicated(keep=False)
shp_metro_bus_stop_duplicated_ids = (
    metro_bus_stops[has_duplicated_stop_id]
    .groupby('STOP_ID')['ROUTES_USING_STOP']
    .unique()
)
assert shp_metro_bus_stop_duplicated_ids.apply(lambda routes: len(routes) == 2).all()
assert shp_metro_bus_stop_duplicated_ids.apply(lambda routes: sum('TeleBus' in r for r in routes) == 1).all()

# Split PTV_METRO_BUS_STOP into PTV_METROBUS_STOP and PTV_TELEBUS_STOP
not_telebus = metro_bus_stops['ROUTES_USING_STOP'].apply(lambda routes: 'TeleBus' not in routes)
is_telebus = metro_bus_stops['ROUTES_USING_STOP'].apply(lambda routes: 'TeleBus' in routes)
SHP_GDFS['PTV_METROBUS_STOP'] = metro_bus_stops[not_telebus].reset_index(drop=True)
SHP_GDFS['PTV_TELEBUS_STOP'] = metro_bus_stops[is_telebus].reset_index(drop=True)

# After the split, STOP_ID is unique in both halves.
for stop_layer in ('PTV_METROBUS_STOP', 'PTV_TELEBUS_STOP'):
    assert SHP_GDFS[stop_layer]['STOP_ID'].is_unique

# Stop layers keyed by the route_id0 value used for the matching routes.
SHP_DFS_STOPS = {
    mode_id: SHP_GDFS[stop_layer]
    for mode_id, stop_layer in (
        ('2', 'PTV_METRO_TRAIN_STATION'),
        ('3', 'PTV_METRO_TRAM_STOP'),
        ('4', 'PTV_METROBUS_STOP'),
        ('5', 'PTV_REGIONAL_COACH_STOP'),
        ('6', 'PTV_REGIONAL_BUS_STOP'),
        ('7', 'PTV_TELEBUS_STOP'),
        ('11', 'PTV_SKYBUS_STOP'),
    )
}

# Where a layer lists its routes, split the comma-separated ROUTES_USING_STOP into a
# list (empty for nulls) and explode it to one (STOP_ID, ROUTE) row per route.
for mid in SHP_DFS_STOPS:
    stops = SHP_DFS_STOPS[mid]
    if 'ROUTES_USING_STOP' not in stops.columns:
        continue
    stops['ROUTE'] = stops['ROUTES_USING_STOP'].apply(lambda routes: [] if pd.isna(routes) else routes.split(','))
    SHP_DFS_STOPS[mid] = stops[['STOP_ID', 'ROUTE']].explode('ROUTE').reset_index(drop=True)

# Left-join each exploded layer to the routes of the same route_id0 by short name.
for k in SHP_DFS_STOPS:
    if 'ROUTE' not in SHP_DFS_STOPS[k].columns:
        continue
    routes_for_mode = SHP_DF_ROUTES_MIN[SHP_DF_ROUTES_MIN['route_id0'] == k]
    SHP_DFS_STOPS[k] = SHP_DFS_STOPS[k].merge(routes_for_mode, left_on='ROUTE', right_on='ROUTE_SHORT_NAME', how='left')

# Every ROUTE in layers '3', '4' and '7' found a matching route.
for mode_id in ('3', '4', '7'):
    assert SHP_DFS_STOPS[mode_id]['ROUTE_SHORT_NAME'].notna().all()
# Layer '5' never had a ROUTE column.
assert 'ROUTE' not in SHP_DFS_STOPS['5'].columns
# In layers '6' and '11' the only non-null ROUTE values without a match are empty strings.
for mode_id in ('6', '11'):
    mode_stops = SHP_DFS_STOPS[mode_id]
    unmatched = mode_stops[mode_stops['ROUTE'].notna() & mode_stops['ROUTE_SHORT_NAME'].isna()]
    assert (unmatched['ROUTE'] == '').all()