In [1]:
import pandas as pd
import os
import shapely.geometry as sg
import geopandas as gpd
import numpy as np
from shapely.geometry import LineString
from shapely.ops import unary_union
# Folder containing the GTFS data files read by get_df, relative to the working directory.
data_directory = os.path.join(os.pardir, 'data', 'ptv', '20240224')
# Short display label for each GTFS route_type code (codes 8-10 are not defined here).
ROUTE_TYPES = {
    0: 'Tram',
    1: 'Metro',
    2: 'Rail',
    3: 'Bus',
    4: 'Ferry',
    5: 'Cable tram',
    6: 'Gondola',
    7: 'Funicular',
    11: 'Trolleybus',
    12: 'Monorail',
}
# Long-form description for each GTFS route_type code; same keys as ROUTE_TYPES.
ROUTE_TYPES_LONG = {
    0: 'Tram, Streetcar, Light rail. Any light rail or street level system within a metropolitan area.',
    1: 'Subway, Metro. Any underground rail system within a metropolitan area.',
    2: 'Rail. Used for intercity or long-distance travel.',
    3: 'Bus. Used for short- and long-distance bus routes.',
    4: 'Ferry. Used for short- and long-distance boat service.',
    5: 'Cable tram. Used for street-level rail cars where the cable runs beneath the vehicle, e.g., cable car in San Francisco.',
    6: 'Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway). Cable transport where cabins, cars, gondolas or open chairs are suspended by means of one or more cables.',
    7: 'Funicular. Any rail system designed for steep inclines.',
    11: 'Trolleybus. Electric buses that draw power from overhead wires using poles.',
    12: 'Monorail. Railway in which the track consists of a single rail or a beam.',
}

# Branch ids are matched against the second dash-separated token of each data file name.
BRANCH_IDS_ALL = ['1', '2', '3', '4', '5', '6', '7', '8', '10', '11']
# Branches processed by the analysis below: everything except '7' and '8'.
# NOTE(review): the reason these two are excluded is not recorded here - confirm.
BRANCH_IDS = [branch for branch in BRANCH_IDS_ALL if branch not in ('7', '8')]
# Tables loaded per branch; matched against the third dash-separated token of the file name.
TABLE_NAMES = [
    'stop_times',
    'stops',
    'trips',
    'routes',
    'calendar',
    'calendar_dates',
    'agency',
    'shapes',
]
# GTFS file fields
# agency.txt
# agency_id, agency_name, agency_url, agency_timezone, agency_lang
# calendar.txt
# service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date
# calendar_dates.txt
# service_id, date, exception_type
# routes.txt
# route_id, agency_id, route_short_name, route_long_name,
# route_type, route_color, route_text_color
# trips.txt
# route_id, service_id, trip_id, shape_id, trip_headsign, direction_id
# stops.txt
# stop_id, stop_name, stop_lat, stop_lon
# stop_times.txt
# trip_id, arrival_time, departure_time, stop_id, stop_sequence, stop_headsign, pickup_type, drop_off_type, shape_dist_traveled
# shapes.txt
# shape_id, shape_pt_lat, shape_pt_lon, shape_pt_sequence, shape_dist_traveled
def get_df(branch_id, table_name, directory=None):
    """Load one table of one branch into a single DataFrame.

    A file belongs to the requested table when the second dash-separated
    token of its name equals ``str(branch_id)`` and the third equals
    ``table_name``. All matching files are read and concatenated row-wise.

    Parameters
    ----------
    branch_id : str or int
        Branch identifier; compared as a string.
    table_name : str
        Table name, e.g. ``'stops'``.
    directory : str, optional
        Folder to scan. Defaults to the module-level ``data_directory``.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated table (each file's own row index is kept), or
        ``None`` when no file matches.
    """
    if directory is None:
        directory = data_directory
    wanted_branch = str(branch_id)

    files = []
    # Sorted so the row order of the concatenated result does not depend on
    # the arbitrary order in which the OS lists the directory.
    for name in sorted(os.listdir(directory)):
        parts = name.split('-')
        # Names with fewer than three tokens (e.g. '.DS_Store', 'README')
        # cannot be data files; skip them instead of raising IndexError.
        if len(parts) < 3:
            continue
        if parts[1] == wanted_branch and parts[2] == table_name:
            files.append(os.path.join(directory, name))

    if not files:
        return None
    # Only the empty string is treated as missing, so literal values such as
    # 'NA' are kept as text rather than parsed as NaN.
    return pd.concat([pd.read_csv(f, keep_default_na=False, na_values=['']) for f in files])

# Nested lookup DF[branch_id][table_name]; an entry is None when get_df finds no matching file.
DF = {
    branch: {table: get_df(branch, table) for table in TABLE_NAMES}
    for branch in BRANCH_IDS_ALL
}
# 15s - 30s

# Assert every shape_id contains its trip's route_id as a substring.
# Later cells recover route_id by parsing shape_id, so this has to hold.
# Rows missing either value are ignored. A plain all() over row tuples replaces
# the row-wise DataFrame.apply: it is much faster on large trips tables and
# still yields a single boolean when a table has no rows.
for bid in BRANCH_IDS:
    assert all(
        route_id in shape_id
        for route_id, shape_id in DF[bid]['trips'][['route_id', 'shape_id']].dropna().itertuples(index=False)
    ), bid

# Attach the stops columns to every stop_times row (inner join on stop_id).
# The join is skipped when stop_times already carries every stops column, so
# re-running this cell does not merge a second time and create *_x / *_y
# duplicate columns.
for bid in BRANCH_IDS:
    already_joined = set(DF[bid]['stops'].columns) <= set(DF[bid]['stop_times'].columns)
    if not already_joined:
        DF[bid]['stop_times'] = pd.merge(DF[bid]['stop_times'], DF[bid]['stops'], on='stop_id')

# Every shape point with shape_pt_sequence == 1 must have shape_dist_traveled == 0.
# The check reduces to one boolean: comparing `.unique()` with `[0]` produced an
# array, and asserting on an array with several elements raises ValueError
# ("truth value is ambiguous") instead of a clean assertion failure.
for bid in BRANCH_IDS:
    first_point_dist = DF[bid]['shapes'].loc[
        DF[bid]['shapes']['shape_pt_sequence'] == 1, 'shape_dist_traveled'
    ]
    # As before, a table with no first points at all fails the check.
    assert len(first_point_dist) > 0 and first_point_dist.eq(0).all(), bid

# Proof: Each shape of a route is equivalent to a unique stop sequence pattern of a route
# Step 1: for every trip, collect its stop_ids in stop_sequence order and count them.
for bid in BRANCH_IDS:
    ordered_stop_times = DF[bid]['stop_times'].sort_values(by=['trip_id', 'stop_sequence'])
    stops_per_trip = ordered_stop_times.groupby('trip_id')['stop_id'].apply(list).reset_index()
    stops_per_trip['stops_count'] = stops_per_trip['stop_id'].apply(len)
    DF[bid]['trip_stops'] = stops_per_trip

# Step 2: give each trip a dash-joined stop_pattern string, then keep the first
# pattern seen for every shape_id.
for bid in BRANCH_IDS:
    trips_full = pd.merge(DF[bid]['trips'], DF[bid]['trip_stops'], on='trip_id')
    trips_full['stop_pattern'] = trips_full['stop_id'].apply(
        lambda stop_ids: '-'.join(str(stop_id) for stop_id in stop_ids)
    )
    DF[bid]['trips_full'] = trips_full
    DF[bid]['shape_stops'] = trips_full.groupby('shape_id')['stop_pattern'].first().reset_index()

# All branches' trips with their stop patterns, stacked into one frame.
DFTRIPSFULL: pd.DataFrame = pd.concat([DF[bid]['trips_full'] for bid in BRANCH_IDS])

# Each shape_id must correspond to exactly one stop pattern. The check reduces
# to one boolean: comparing `.unique()` with `[1]` produced an array, which
# raises ValueError inside `assert` as soon as any shape has a different count.
patterns_per_shape = DFTRIPSFULL.groupby('shape_id')['stop_pattern'].nunique()
assert len(patterns_per_shape) > 0 and patterns_per_shape.eq(1).all(), (
    patterns_per_shape[patterns_per_shape != 1].head().to_dict()
)

# shape_id -> stop_pattern lookup across all branches.
DFSHAPESTOPS = pd.concat([DF[bid]['shape_stops'] for bid in BRANCH_IDS])

# Total 1m 30s

# Turn each shape into an ordered array of (lon, lat) points, one row per shape_id.
for bid in BRANCH_IDS:
    shapes = DF[bid]['shapes']
    # Sorted in place so each shape's points end up in shape_pt_sequence order.
    shapes.sort_values(by=['shape_id', 'shape_pt_sequence'], inplace=True)
    shapes['points'] = list(zip(shapes['shape_pt_lon'], shapes['shape_pt_lat']))
    DF[bid]['lines'] = (
        shapes.groupby('shape_id')
        .apply(lambda x: x['points'].to_numpy())
        .rename('line')
        .reset_index()
    )
# 15s - 30s
    
# Derive descriptive columns by parsing shape_id (e.g. '5-V41-mjp-1.2.H').
for bid in BRANCH_IDS:
    lines = DF[bid]['lines']
    # Text after the last '.' / before the first '.'.
    lines['direction'] = lines['shape_id'].map(lambda shape_id: shape_id.split('.')[-1])
    lines['route_id'] = lines['shape_id'].map(lambda shape_id: shape_id.split('.')[0])
    # Dash-separated tokens of route_id minus the first and the last two, concatenated with no separator.
    lines['route_name'] = lines['route_id'].map(lambda route_id: ''.join(route_id.split('-')[1:-2]))
    # Leading token before the first '-'.
    lines['branch'] = lines['shape_id'].map(lambda shape_id: shape_id.split('-')[0])
    lines['class'] = bid

# Total 2m    
    
# One row per shape across all branches: parsed identifiers, the shape's stop
# pattern (string and list form), its point array and the two sizes. Rows are
# ordered so that, within a route/branch/direction, the variants with the most
# stops (then the most points) come first.
DFLINES: pd.DataFrame = (
    pd.concat([DF[bid]['lines'] for bid in BRANCH_IDS])
    .merge(DFSHAPESTOPS, on='shape_id')
    .assign(
        stops=lambda frame: frame['stop_pattern'].apply(lambda pattern: pattern.split('-')),
        points_count=lambda frame: frame['line'].apply(len),
        stops_count=lambda frame: frame['stops'].apply(len),
    )
    [[
        'route_name', 'direction', 'branch', 'class', 'route_id', 'shape_id',
        'stop_pattern', 'stops', 'stops_count', 'line', 'points_count',
    ]]
    .sort_values(
        by=['class', 'route_name', 'branch', 'direction', 'stops_count', 'points_count', 'stop_pattern'],
        ascending=[True, True, True, True, False, False, True],
    )
    .reset_index(drop=True)
)

# Total: 1m 30s - 2m

In [2]:
# One row per distinct (route_name, class, direction, stop_pattern); the first DFLINES row of each group is kept.
DFSTOPPATTERNS = (
    DFLINES[['route_name', 'class', 'direction', 'stop_pattern', 'stops', 'stops_count']]
    .drop_duplicates(subset=['route_name', 'class', 'direction', 'stop_pattern'])
)

In [3]:
# For every (route_name, class, direction): the distinct stops appearing in any
# of its stop patterns ('stopset') and how many there are ('stopsetsize').
DFROUTESTOPS = (
    DFSTOPPATTERNS[['route_name', 'class', 'direction', 'stops']]
    .explode('stops')
    .reset_index(drop=True)
    .groupby(['route_name', 'class', 'direction'])['stops']
    .unique()
    .rename('stopset')
    .reset_index()
)
DFROUTESTOPS['stopsetsize'] = DFROUTESTOPS['stopset'].apply(len)
# 3s

In [4]:
# Order the patterns so that, within each class/route/direction, the longest come first; then renumber the rows.
DFSTOPPATTERNS = (
    DFSTOPPATTERNS
    .sort_values(
        by=['class', 'route_name', 'direction', 'stops_count', 'stop_pattern'],
        ascending=[True, True, True, False, True],
    )
    .reset_index(drop=True)
)


In [11]:
def find_smallest_number_of_lists(lists):
    """Select a small subset of ``lists`` that together contain every element.

    This is the greedy set-cover heuristic: repeatedly pick the list that
    covers the most still-uncovered elements. The result always covers the
    union of all inputs, but despite the function's name it is not
    guaranteed to be the minimum possible number of lists.

    Parameters
    ----------
    lists : iterable of iterables of hashable items
        Candidate lists. Any iterable is accepted, including a one-shot
        generator, because the input is materialised once up front.

    Returns
    -------
    list
        The chosen input lists (the original objects, not copies), in the
        order they were picked. When several lists cover equally many
        uncovered elements, the one that comes first in ``lists`` wins.
        Empty input gives an empty result.
    """
    # Materialise once: the candidates are scanned on every round, which
    # would silently yield nothing on the second pass over a generator.
    candidates = list(lists)
    # Build each candidate's set a single time instead of once per round.
    candidate_sets = [set(candidate) for candidate in candidates]

    uncovered = set().union(*candidate_sets)
    selected_lists = []

    while uncovered:
        best_index = None
        best_gain = 0
        for index, members in enumerate(candidate_sets):
            gain = len(members & uncovered)
            # Strict '>' keeps the earliest candidate on ties.
            if gain > best_gain:
                best_index, best_gain = index, gain

        # `uncovered` is a subset of the union of all candidates, so some
        # candidate always has gain >= 1 and best_index is never None here.
        uncovered -= candidate_sets[best_index]
        selected_lists.append(candidates[best_index])

    return selected_lists

In [34]:
# Per (route_name, class, direction): all of its stop patterns (longest first),
# the subset picked by find_smallest_number_of_lists, the sizes of both, their
# difference, and the picked patterns re-joined into dash-separated strings.
DFROUTEPATTERNSMULTI = (
    DFSTOPPATTERNS.groupby(['route_name', 'class', 'direction'])['stops']
    .apply(list)
    .rename('stop_patterns')
    .reset_index()
    .assign(
        stop_patterns=lambda frame: frame['stop_patterns'].apply(
            lambda patterns: sorted(patterns, key=len, reverse=True)
        ),
        spcover=lambda frame: frame['stop_patterns'].apply(find_smallest_number_of_lists),
        stop_patterns_len=lambda frame: frame['stop_patterns'].apply(len),
        spcover_len=lambda frame: frame['spcover'].apply(len),
        diff=lambda frame: frame['stop_patterns_len'] - frame['spcover_len'],
        spcoverid=lambda frame: frame['spcover'].apply(
            lambda cover: ['-'.join(pattern) for pattern in cover]
        ),
    )
    .sort_values(by=['spcover_len', 'diff'], ascending=[False, True])
    .reset_index(drop=True)
)

In [45]:
# Long form of the cover: one row per selected pattern, with the pattern string in 'stop_pattern'.
DFROUTEPATTERNCOVER = (
    DFROUTEPATTERNSMULTI[['route_name', 'class', 'direction', 'spcoverid']]
    .explode('spcoverid')
    .reset_index(drop=True)
    .rename(columns={'spcoverid': 'stop_pattern'})
)


In [46]:
# Inspect the exploded cover table.
# NOTE(review): the saved output below still shows the column as 'spcoverid',
# although the previous cell renames it to 'stop_pattern' - re-run to refresh.
DFROUTEPATTERNCOVER

Unnamed: 0,route_name,class,direction,spcoverid
0,V48,5,H,20836-22248-22250-22252-20333-20352-20313-2029...
1,V48,5,H,21466-45071-21464-21706-37766-6285-16091-16096...
2,V48,5,H,3427-22364-22365-21466
3,V48,5,H,20836-22252-20333-20352-20313-20299-20324-2030...
4,V48,5,H,21706-37766-6285-16091-16096-21463-21462-16287...
...,...,...,...,...
1779,SYM,2,R,15351-15353-20000-20001-20002-20003-20004-2002...
1780,V23,1,H,20043-22240-22241-19982-47648-47647-20323-2032...
1781,901,4,H,47997-42278-39610-43905-42251-42289-22817-4225...
1782,BEL,2,R,19844-19845-19846-19867-19868-19869-19870-1987...


In [51]:
# Keep only the stop patterns selected for the cover (right join onto the cover
# table), ordered like DFSTOPPATTERNS.
DFSTOPPATTERNSMIN = (
    DFSTOPPATTERNS
    .merge(DFROUTEPATTERNCOVER, how='right', on=['route_name', 'class', 'direction', 'stop_pattern'])
    .sort_values(
        by=['class', 'route_name', 'direction', 'stops_count', 'stop_pattern'],
        ascending=[True, True, True, False, True],
    )
    .reset_index(drop=True)
)

In [75]:
# Do shapes that share one stop pattern also share one geometry size?
# Collect the distinct points_count values per (route, branch, direction, pattern);
# pc_len > 1 flags patterns drawn with more than one point count.
df1 = (
    DFLINES.groupby(['route_name', 'branch', 'direction', 'stop_pattern'])['points_count']
    .unique()
    .reset_index()
    .assign(pc_len=lambda frame: frame['points_count'].apply(len))
)
df1.sort_values(by='pc_len', ascending=False)

Unnamed: 0,route_name,branch,direction,stop_pattern,points_count,pc_len
3755,V41,5,H,15126-15124-15145-20332-17191,"[402, 251]",2
0,1,3,H,18663-18662-18661-18660-18659-18658-18573-1848...,[43],1
2761,G60,6,H,45541-45327-30490-30492-41739-48446-45417-3106...,[658],1
2748,G50,6,H,45539-45327-30671-30673-47749-30675-30676-3067...,[766],1
2749,G50,6,R,48889-30941-30942-30943-30944-30945-30946-3094...,[773],1
...,...,...,...,...,...,...
1384,703,20,R,21010-22299-22300-11467-11465-23266-15860-1586...,[316],1
1385,703,20,R,21010-22299-22300-11467-11465-23266-15860-1586...,[505],1
1386,704,20,H,19800-15683-15682-15681-15680-22968-22436-2243...,[205],1
1387,704,20,R,27833-27831-16540-16541-16542-10479-8187-51598...,[209],1


In [77]:
# Drill into route V41, the only route in the saved output above with pc_len == 2.
df1.loc[df1['route_name'].eq('V41')]

Unnamed: 0,route_name,branch,direction,stop_pattern,points_count,pc_len
3754,V41,5,H,15126-15124-15145-17156-48658-20332-17191,[402],1
3755,V41,5,H,15126-15124-15145-20332-17191,"[402, 251]",2
3756,V41,5,H,15126-15145-20332-17191,[8],1
3757,V41,5,H,17452-48741-17480-17486-17532-17566-17569-1757...,[2358],1
3758,V41,5,H,17566-17568-17569-17570-17573-17574-17576-1757...,[732],1
3759,V41,5,H,17566-17569-17570-17573-17574-17576-17577-1758...,[588],1
3760,V41,5,H,17569-17570-17573-17574-17576-17577-17580-4800...,[401],1
3761,V41,5,R,15126-17579-17578-17576-17575-17572-17571-1756...,[595],1
3762,V41,5,R,15126-17579-17578-17576-17575-17572-17571-1756...,[2367],1
3763,V41,5,R,15126-48007-17579-17578-17576-17575-17572-1757...,[408],1


In [78]:
# Branch 5 trips whose route_id contains 'V41'.
DF['5']['trips'].loc[lambda trips: trips['route_id'].str.contains('V41')]

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id
5387,5-V41-mjp-1,T0,1.T0.5-V41-mjp-1.2.H,5-V41-mjp-1.2.H,Melbourne,0
5388,5-V41-mjp-1,T0,10.T0.5-V41-mjp-1.9.R,5-V41-mjp-1.9.R,Griffith,1
5389,5-V41-mjp-1,T0,11.T0.5-V41-mjp-1.10.R,5-V41-mjp-1.10.R,Griffith,1
5390,5-V41-mjp-1,T0,12.T0.5-V41-mjp-1.11.R,5-V41-mjp-1.11.R,Griffith,1
5391,5-V41-mjp-1,T0,13.T0.5-V41-mjp-1.12.R,5-V41-mjp-1.12.R,Griffith,1
...,...,...,...,...,...,...
5639,5-V41-mjp-9,T0_5,5.T0.5-V41-mjp-9.5.H,5-V41-mjp-9.5.H,Melbourne,0
5640,5-V41-mjp-9,T0_5,6.T0.5-V41-mjp-9.6.H,5-V41-mjp-9.6.H,Melbourne,0
5641,5-V41-mjp-9,T0_5,7.T0.5-V41-mjp-9.7.H,5-V41-mjp-9.7.H,Melbourne,0
5642,5-V41-mjp-9,T0_5,8.T0.5-V41-mjp-9.8.R,5-V41-mjp-9.8.R,Griffith,1


In [88]:
# Per branch-2 trip: the recorded stop_sequence values in order, the gap-free
# numbering 1..n of the same length, and the largest recorded value.
df2 = (
    DF['2']['stop_times']
    .sort_values(by=['trip_id', 'stop_sequence'])
    .groupby('trip_id')['stop_sequence']
    .apply(list)
    .reset_index()
    .assign(
        stop_sequence_2=lambda frame: frame['stop_sequence'].apply(
            lambda sequence: list(range(1, len(sequence) + 1))
        ),
        stop_sequence_max=lambda frame: frame['stop_sequence'].apply(max),
    )
)

In [108]:
# Branch-2 stop_times reduced to trip_id, the five dot-separated components of
# trip_id, stop_sequence and stop_id, ordered by trip and position.
df3 = DF['2']['stop_times'][['trip_id']].copy()
# Splitting on '.' must yield exactly five parts for every trip_id.
df3[['torder', 'service_id', 'route_id', 'sorder', 'direction_id']] = df3['trip_id'].str.split('.', expand=True)
df3['stop_sequence'] = DF['2']['stop_times']['stop_sequence']
df3['stop_id'] = DF['2']['stop_times']['stop_id']
df3 = df3.sort_values(by=['trip_id', 'stop_sequence'])

In [109]:
# Preview df3: one row per stop time, with the components parsed from trip_id.
df3

Unnamed: 0,trip_id,torder,service_id,route_id,sorder,direction_id,stop_sequence,stop_id
0,1.T0.2-ALM-mjp-2.1.H,1,T0,2-ALM-mjp-2,1,H,1,19853
4138,1.T0.2-ALM-mjp-2.1.H,1,T0,2-ALM-mjp-2,1,H,2,19852
5684,1.T0.2-ALM-mjp-2.1.H,1,T0,2-ALM-mjp-2,1,H,3,19851
7218,1.T0.2-ALM-mjp-2.1.H,1,T0,2-ALM-mjp-2,1,H,4,19850
8752,1.T0.2-ALM-mjp-2.1.H,1,T0,2-ALM-mjp-2,1,H,5,19849
...,...,...,...,...,...,...,...,...
40096,99.UT.2-WBE-mjp-12.12.R,99,UT,2-WBE-mjp-12,12,R,17,19854
413435,99.UT.2-WMN-mjp-12.3.R,99,UT,2-WMN-mjp-12,3,R,1,19991
411262,99.UT.2-WMN-mjp-12.3.R,99,UT,2-WMN-mjp-12,3,R,2,19992
409089,99.UT.2-WMN-mjp-12.3.R,99,UT,2-WMN-mjp-12,3,R,3,19993


In [105]:
# NOTE(review): the saved output below lists columns (trip_id_order,
# shape_id_order) that the cell building df3 does not create, and this cell's
# execution count (105) precedes that cell's (108). The output looks stale - re-run.
df3

Unnamed: 0,trip_id,route_id,stop_sequence,stop_id,trip_id_order,service_id,shape_id_order,direction_id,torder,sorder
0,1.T0.2-ALM-mjp-2.1.H,2-ALM-mjp-2,1,19853,0,T0,3,H,1,1
4138,1.T0.2-ALM-mjp-2.1.H,2-ALM-mjp-2,2,19852,0,T0,3,H,1,1
5684,1.T0.2-ALM-mjp-2.1.H,2-ALM-mjp-2,3,19851,0,T0,3,H,1,1
7218,1.T0.2-ALM-mjp-2.1.H,2-ALM-mjp-2,4,19850,0,T0,3,H,1,1
8752,1.T0.2-ALM-mjp-2.1.H,2-ALM-mjp-2,5,19849,0,T0,3,H,1,1
...,...,...,...,...,...,...,...,...,...,...
40096,99.UT.2-WBE-mjp-12.12.R,2-WBE-mjp-12,17,19854,0,UT,3,R,99,12
413435,99.UT.2-WMN-mjp-12.3.R,2-WMN-mjp-12,1,19991,0,UT,3,R,99,3
411262,99.UT.2-WMN-mjp-12.3.R,2-WMN-mjp-12,2,19992,0,UT,3,R,99,3
409089,99.UT.2-WMN-mjp-12.3.R,2-WMN-mjp-12,3,19993,0,UT,3,R,99,3


In [99]:
# For each (route_id, stop_id) pair, list the distinct stop_sequence values at which that stop occurs.
df3.groupby(['route_id', 'stop_id'])['stop_sequence'].unique()

route_id     stop_id
2-ALM-mjp-2  19841             [16, 3]
             19842      [17, 15, 2, 4]
             19843      [18, 14, 1, 5]
             19847      [7, 1, 14, 18]
             19848      [6, 2, 13, 17]
                             ...      
2-ain-mjp-5  19842                 [3]
             19843                 [4]
             19854                 [5]
             19973                 [1]
             22180                 [6]
Name: stop_sequence, Length: 2310, dtype: object

In [61]:
# Cover patterns for class '2' only.
DFSTOPPATTERNSMIN.loc[DFSTOPPATTERNSMIN['class'].eq('2')]

Unnamed: 0,route_name,class,direction,stop_pattern,stops,stops_count
42,ALM,2,H,19843-19842-19841-22180-19854-19908-19907-1990...,"[19843, 19842, 19841, 22180, 19854, 19908, 199...",18
43,ALM,2,R,19847-19848-19849-19850-19851-19852-19853-1990...,"[19847, 19848, 19849, 19850, 19851, 19852, 198...",18
44,B31,2,H,19841-19842-19843-19854-22180-19973-20041-2004...,"[19841, 19842, 19843, 19854, 22180, 19973, 200...",21
45,B31,2,R,40221-40220-44817-20030-20031-20032-20033-2003...,"[40221, 40220, 44817, 20030, 20031, 20032, 200...",21
46,BEL,2,H,19843-19842-19841-22180-19854-19908-19906-1990...,"[19843, 19842, 19841, 22180, 19854, 19908, 199...",30
47,BEL,2,H,19854-19908-19907-19906-19905-19904-19903-1985...,"[19854, 19908, 19907, 19906, 19905, 19904, 199...",27
48,BEL,2,R,19844-19845-19846-19867-19868-19869-19870-1987...,"[19844, 19845, 19846, 19867, 19868, 19869, 198...",31
49,CRB,2,H,19843-19842-19841-22180-19854-19908-19959-1994...,"[19843, 19842, 19841, 22180, 19854, 19908, 199...",24
50,CRB,2,R,19886-19887-45793-19888-19889-19890-19891-1991...,"[19886, 19887, 45793, 19888, 19889, 19890, 198...",24
51,FKN,2,H,22180-19854-19908-19959-19947-19946-19945-1994...,"[22180, 19854, 19908, 19959, 19947, 19946, 199...",29
