In [26]:
import pandas as pd
import os
import shapely.geometry as sg
import geopandas as gpd
import numpy as np
# Folder scanned by get_df for the GTFS table files (relative to the notebook's
# working directory). Presumably the PTV feed dated 2024-02-24 — confirm.
data_directory = os.path.join('..', 'data', 'ptv', '20240224')
# Short display label for each integer route_type code found in routes.txt.
ROUTE_TYPES = {
    0 : 'Tram',
    1 : 'Metro',
    2 : 'Rail',
    3 : 'Bus',
    4 : 'Ferry',
    5 : 'Cable tram',
    6 : 'Gondola',
    7 : 'Funicular',
    11 : 'Trolleybus',
    12 : 'Monorail',
}
# Long description for each integer route_type code (same keys as ROUTE_TYPES).
ROUTE_TYPES_LONG = {
    0 : 'Tram, Streetcar, Light rail. Any light rail or street level system within a metropolitan area.',
    1 : 'Subway, Metro. Any underground rail system within a metropolitan area.',
    2 : 'Rail. Used for intercity or long-distance travel.',
    3 : 'Bus. Used for short- and long-distance bus routes.',
    4 : 'Ferry. Used for short- and long-distance boat service.',
    5 : 'Cable tram. Used for street-level rail cars where the cable runs beneath the vehicle, e.g., cable car in San Francisco.',
    6 : 'Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway). Cable transport where cabins, cars, gondolas or open chairs are suspended by means of one or more cables.',
    7 : 'Funicular. Any rail system designed for steep inclines.',
    11 : 'Trolleybus. Electric buses that draw power from overhead wires using poles.',
    12 : 'Monorail. Railway in which the track consists of a single rail or a beam.',
}

# Branch ids as they appear in the second '-'-separated part of the data file names.
# NOTE(review): BRANCH_IDS leaves out '7' and '8'; the reason is not stated here — confirm.
BRANCH_IDS_ALL = ['1', '2', '3', '4', '5', '6', '7', '8', '10', '11']
BRANCH_IDS = ['1', '2', '3', '4', '5', '6', '10', '11']
# GTFS tables loaded for every branch.
TABLE_NAMES = ['stop_times', 'stops', 'trips', 'routes', 'calendar', 'calendar_dates', 'agency', 'shapes']
# GTFS File Fields
# agency.txt 
# agency_id, agency_name, agency_url, agency_timezone, agency_lang
# calendar.txt 
# service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date
# calendar_dates.txt 
# service_id, date, exception_type
# routes.txt 
# route_id, agency_id, route_short_name, route_long_name,
# route_type, route_color,route_text_color
# trips.txt 
# route_id, service_id, trip_id, shape_id, trip_headsign, direction_id
# stops.txt 
# stop_id, stop_name, stop_lat, stop_lon
# stop_times.txt 
# trip_id, arrival_time, departure_time, stop_id, stop_sequence, stop_headsign, pickup_type, drop_off_type, shape_dist_traveled
# shapes.txt 
# shape_id, shape_pt_lat, shape_pt_lon, shape_pt_sequence, shape_dist_traveled 
def get_df(branch_id, table_name, directory=None):
    """Load one GTFS table of one branch as a single DataFrame.

    A file is selected when its name, split on '-', has ``str(branch_id)`` as
    its second part and ``table_name`` as its third part. Every selected file
    is read with ``pd.read_csv`` and the results are concatenated.

    Parameters
    ----------
    branch_id : str or int
        Branch identifier; compared as a string.
    table_name : str
        Table name, e.g. ``'stops'``.
    directory : str, optional
        Folder to scan. Defaults to the module-level ``data_directory``.

    Returns
    -------
    pandas.DataFrame or None
        The concatenated table, or ``None`` when no file matches.
    """
    root = data_directory if directory is None else directory
    wanted_branch = str(branch_id)
    matches = []
    # Sorted so the concatenation order is reproducible; os.listdir order is arbitrary.
    for file_name in sorted(os.listdir(root)):
        parts = file_name.split('-')
        # Unrelated files (e.g. hidden or README files) have fewer than three
        # parts; skip them instead of failing with IndexError.
        if len(parts) < 3:
            continue
        if parts[1] == wanted_branch and parts[2] == table_name:
            matches.append(os.path.join(root, file_name))
    if not matches:
        return None
    # Only the empty string is treated as missing, so literal values such as
    # 'NA' survive as text.
    return pd.concat([pd.read_csv(path, keep_default_na=False, na_values=['']) for path in matches])

# DF[branch_id][table_name] -> DataFrame for that table, or None when get_df
# finds no matching file.
DF = {branch_id: {table_name: get_df(branch_id, table_name) for table_name in TABLE_NAMES} for branch_id in BRANCH_IDS_ALL}
# 15s - 30s  (presumably the observed run time of the load above — confirm)

# Sanity check: for every trip that has both ids, the route_id text must
# appear inside the shape_id text.
for bid in BRANCH_IDS:
    trips_with_shape = DF[bid]['trips'].dropna(subset=['route_id', 'shape_id'])
    id_pairs = zip(trips_with_shape['route_id'], trips_with_shape['shape_id'])
    assert all(route_id in shape_id for route_id, shape_id in id_pairs)

# Attach the stop attributes (name, coordinates) to every stop_times row.
# The join is skipped when stop_times already carries 'stop_name', so running
# this cell again does not produce suffixed duplicate columns.
for bid in BRANCH_IDS:
    if 'stop_name' not in DF[bid]['stop_times'].columns:
        DF[bid]['stop_times'] = pd.merge(DF[bid]['stop_times'], DF[bid]['stops'], on='stop_id')

In [27]:
# Break each route_id on '-' into: first part -> branch, last part -> order,
# second-to-last -> range, and everything in between glued together -> name.
for bid in BRANCH_IDS_ALL:
    routes_tbl = DF[bid]['routes']
    id_parts = routes_tbl['route_id'].str.split('-')
    routes_tbl['route_idx'] = id_parts
    routes_tbl['branch'] = id_parts.map(lambda parts: parts[0])
    routes_tbl['route_name'] = id_parts.map(lambda parts: ''.join(parts[1:-2]))
    routes_tbl['route_range'] = id_parts.map(lambda parts: parts[-2])
    routes_tbl['route_id_order'] = id_parts.map(lambda parts: parts[-1])

In [10]:
# For branch '4': the distinct id prefixes ('branch' column) under which each route_name appears.
df4_branches = DF['4']['routes'].groupby('route_name')['branch'].unique()

In [12]:
# Route names that occur under more than one id prefix.
df4_branches[df4_branches.map(len).gt(1)]

route_name
200        [14, 17, 33]
207        [14, 17, 33]
234            [17, 35]
235            [17, 35]
236            [17, 35]
             ...       
903    [14, 17, 18, 35]
905            [14, 17]
906            [14, 17]
907            [14, 17]
908            [14, 17]
Name: branch, Length: 94, dtype: object

In [14]:
# Example: all branch '4' rows whose parsed route_name is '903'.
DF['4']['routes'][DF['4']['routes']['route_name'] == '903']

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color,route_idx,branch,route_name,route_range,route_id_order
48,14-903-aus-1,,903,Mordialloc - Altona,3,FF8200,FFFFFF,"[14, 903, aus, 1]",14,903,aus,1
87,17-903-aus-1,,903,Mordialloc - Altona,3,FF8200,FFFFFF,"[17, 903, aus, 1]",17,903,aus,1
94,18-903-aus-1,,903,Mordialloc - Altona,3,FF8200,FFFFFF,"[18, 903, aus, 1]",18,903,aus,1
315,35-903-aus-1,,903,Mordialloc - Altona,3,FF8200,FFFFFF,"[35, 903, aus, 1]",35,903,aus,1


In [32]:
# Report every branch whose trips table has at least one missing shape_id.
for bid in BRANCH_IDS:
    shape_missing = DF[bid]['trips']['shape_id'].isna()
    if shape_missing.any():
        print(bid, '> trips > shape_id contains NA')

4 shape_id contains null


In [37]:
# Distinct route_ids of branch '4' that have at least one trip without a shape_id.
routeId_shapeIdNA = DF['4']['trips'][DF['4']['trips']['shape_id'].isna()]['route_id'].unique()

In [41]:
# All shape_id values seen for one such route (index 1 is an arbitrary pick
# from routeId_shapeIdNA).
DF['4']['trips'].groupby('route_id')['shape_id'].unique()[routeId_shapeIdNA[1]]

array([nan], dtype=object)

In [47]:
# Branch '4' routes whose short name contains the substring '301'.
DF['4']['routes'][DF['4']['routes']['route_short_name'].str.contains('301')]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color,route_idx,branch,route_name,route_range,route_id_order
329,39-301-A-aus-1,,301SB,,3,FF8200,FFFFFF,"[39, 301, A, aus, 1]",39,301A,aus,1
330,39-301-aus-1,,301,Reservoir - La Trobe University,3,FF8200,FFFFFF,"[39, 301, aus, 1]",39,301,aus,1


In [48]:
# Branch '4' trips whose route_id contains '39-301-'.
DF['4']['trips'][DF['4']['trips']['route_id'].str.contains('39-301-')]

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id
94812,39-301-A-aus-1,MF3-39-301-A-aus,39-301-A-1-MF3-516,,Reservoir Station,1
94813,39-301-A-aus-1,MF6-39-301-A-aus,39-301-A-1-MF6-516,,Reservoir Station,1
94814,39-301-aus-1,MF3-39-301-aus,39-301--1-MF3-443,39-301-aus-1.2.H,Reservoir,0
94815,39-301-aus-1,MF3-39-301-aus,39-301--1-MF3-444,39-301-aus-1.2.H,Reservoir,0
94816,39-301-aus-1,MF3-39-301-aus,39-301--1-MF3-445,39-301-aus-1.2.H,Reservoir,0
...,...,...,...,...,...,...
95101,39-301-aus-1,MF6-39-301-aus,39-301--1-MF6-585,39-301-aus-1.3.R,La Trobe University,1
95102,39-301-aus-1,MF6-39-301-aus,39-301--1-MF6-586,39-301-aus-1.3.R,La Trobe University,1
95103,39-301-aus-1,MF6-39-301-aus,39-301--1-MF6-587,39-301-aus-1.3.R,La Trobe University,1
95104,39-301-aus-1,MF6-39-301-aus,39-301--1-MF6-588,39-301-aus-1.3.R,La Trobe University,1


In [50]:
# Stop times of one specific trip, looked up by trip_id.
DF['4']['stop_times'][DF['4']['stop_times']['trip_id'] == '39-301--1-MF6-588']

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
357415,39-301--1-MF6-588,19:00:00,19:00:00,48036,1,,0,0,0.0
357416,39-301--1-MF6-588,19:13:00,19:13:00,18864,2,,0,0,4576.73


In [52]:
# Attach the stop attributes to every stop_times row. The same join also
# appears in the loading cell at the top of the notebook, so it is only
# performed when 'stop_name' is not already present; joining twice would
# produce suffixed duplicate columns.
for bid in BRANCH_IDS:
    if 'stop_name' not in DF[bid]['stop_times'].columns:
        DF[bid]['stop_times'] = pd.merge(DF[bid]['stop_times'], DF[bid]['stops'], on='stop_id')

In [None]:
# One row per trip of the chosen branch, with route and trip attributes plus
# the array of stop_ids the trip visits.
bid = '2'
dft = (
    DF[bid]['stop_times']
    # Order by stop_sequence explicitly: the row order of stop_times is not
    # guaranteed to follow the travel order.
    .sort_values(['trip_id', 'stop_sequence'])
    .groupby('trip_id')['stop_id']
    .apply(lambda x: x.to_numpy())
    .to_frame(name='stops_sequence')
    .reset_index()
)
dft = pd.merge(DF[bid]['trips'], dft, on='trip_id')
dft = pd.merge(DF[bid]['routes'], dft, on='route_id')
dft['stops_count'] = dft['stops_sequence'].map(len)

In [None]:
# Show the routes table of branch '1'.
DF['1']['routes']

In [None]:
# For each branch: distinct (short name, headsign, direction) combinations,
# then the number of different occurrence counts among them.
# NOTE(review): the rows were just de-duplicated, so every count looks like it
# must be 1 — confirm this is the intended check.
for bid in BRANCH_IDS:
    dfrt = DF[bid]['routes'].merge(DF[bid]['trips'], on='route_id')
    dfrd = dfrt.loc[:, ['route_short_name', 'trip_headsign', 'direction_id']].drop_duplicates()
    headsign_counts = dfrd.groupby(['route_short_name', 'direction_id']).value_counts()
    print(headsign_counts.nunique())
    


In [None]:
# Print branches where route_short_name is filled for some routes but not all.
for bid in BRANCH_IDS:
    short_name_missing = DF[bid]['routes']['route_short_name'].isna()
    if short_name_missing.any() and not short_name_missing.all():
        print(bid)

In [None]:
# shape_id -> (n, 2) array of [lon, lat] points across all branches.
# Points are ordered by shape_pt_sequence explicitly, because the row order of
# shapes.txt is not guaranteed to follow the path.
dfshapes = (
    pd.concat([DF[bid]['shapes'] for bid in BRANCH_IDS])
    .sort_values(['shape_id', 'shape_pt_sequence'])
    .groupby('shape_id')[['shape_pt_lon', 'shape_pt_lat']]
    .apply(lambda x: x.to_numpy())
)

In [None]:
# Show the per-shape coordinate arrays.
dfshapes

In [None]:
# Per-branch summaries of the routes table, each keyed by branch id:
#   k_short_long_names - distinct counts of short names sharing one long name
#   k_range / k_route_order / k_branches - distinct parsed route_id components
#   k_type - distinct route type labels
k_short_long_names = {}
k_route_order = {}
k_range = {}
k_type = {}
k_branches = {}
for bid in BRANCH_IDS_ALL:
    routes_tbl = DF[bid]['routes']
    named_routes = routes_tbl.dropna(subset=['route_short_name', 'route_long_name'])
    k_short_long_names[bid] = named_routes.groupby('route_long_name')['route_short_name'].nunique().unique()
    k_range[bid] = routes_tbl['route_range'].unique()
    # Codes missing from ROUTE_TYPES are reported as the raw code instead of
    # aborting the whole summary with a KeyError.
    k_type[bid] = routes_tbl['route_type'].map(lambda code: ROUTE_TYPES.get(code, code)).unique()
    k_route_order[bid] = routes_tbl['route_id_order'].unique()
    k_branches[bid] = routes_tbl['branch'].unique()

In [None]:
# Distinct route_id prefixes found in each branch.
k_branches

In [None]:
# Sorted distinct route short names of branch '8'.
DF['8']['routes']['route_short_name'].sort_values().unique()

In [None]:
# Sorted distinct route short names of branch '4'.
DF['4']['routes']['route_short_name'].sort_values().unique()

In [None]:
# Branch '2': distinct long names recorded for each short name (rows missing either name are ignored).
DF['2']['routes'].dropna(subset=['route_short_name', 'route_long_name']).groupby('route_short_name')['route_long_name'].unique()

In [None]:
bid = '10'
# Routes whose short name is present but differs from the name parsed out of
# route_id. Only a cell's last expression is rendered automatically, so this
# intermediate result is shown explicitly instead of being discarded.
short_name_mismatch = (DF[bid]['routes']['route_short_name'].astype(str) != DF[bid]['routes']['route_name']) & (DF[bid]['routes']['route_short_name'].notna())
display(DF[bid]['routes'][short_name_mismatch])
# Full routes table for comparison.
DF[bid]['routes']

In [None]:
# Distinct values of the third '-'-separated component of branch '4' route ids.
DF['4']['routes']['route_id'].map(lambda route_id: route_id.split('-')[2]).unique()

In [None]:
# True when no route of branch '11' has a short name.
DF['11']['routes']['route_short_name'].isna().all()

In [None]:
# Distinct values of the second '-'-separated component of branch '4' route ids.
DF['4']['routes']['route_id'].map(lambda route_id: route_id.split('-')[1]).unique()

In [None]:
# Distinct (short name, headsign, direction) combinations for branch '5'.
bid = '5'
dfrt = DF[bid]['routes'].merge(DF[bid]['trips'], on='route_id')
dfrd = dfrt.loc[:, ['route_short_name', 'trip_headsign', 'direction_id']].drop_duplicates()
dfrd

In [None]:
# Occurrence count of each remaining row value within every (short name, direction) group.
dfrd.groupby(['route_short_name', 'direction_id']).value_counts()

In [None]:
# Number of different occurrence counts in the table above.
dfrd.groupby(['route_short_name', 'direction_id']).value_counts().nunique()

In [None]:
# Distinct service_ids used by each route in dft (built in an earlier cell).
dft.groupby('route_id')['service_id'].unique()

In [None]:
# Select branch '3'. NOTE(review): the following cells index DF['3'] directly
# rather than reading bid.
bid = '3'

In [None]:
# --- Representative trip per (route, headsign, direction) for branch '3' ---
# Stop sequence of every trip. Rows are ordered by stop_sequence explicitly,
# because the row order of stop_times is not guaranteed to follow travel order.
df_trams = (
    DF['3']['stop_times']
    .sort_values(['trip_id', 'stop_sequence'])
    .groupby('trip_id')['stop_id']
    .apply(lambda x: x.to_numpy())
    .to_frame(name='stops_sequence')
    .reset_index()
)
df_trams = pd.merge(DF['3']['trips'], df_trams, on='trip_id')
df_trams = pd.merge(DF['3']['routes'], df_trams, on='route_id')
df_trams['stops_count'] = df_trams['stops_sequence'].map(len)
# Keep, for each (route, headsign, direction), the trip with the most stops.
df_trams = df_trams.sort_values(by=['route_short_name', 'stops_count'], ascending=False).drop_duplicates(subset=['route_short_name', 'trip_headsign', 'direction_id'], keep='first')

# Every stop served by ANY trip of the same (route, headsign, direction).
df_tram_stops = pd.merge(DF['3']['stop_times'], DF['3']['trips'], on='trip_id')
df_tram_stops = pd.merge(df_tram_stops, DF['3']['routes'], on='route_id')
df_tram_stops  = df_tram_stops.groupby(['route_short_name', 'trip_headsign', 'direction_id'])['stop_id'].apply(lambda x: x.unique()).to_frame(name='stops').reset_index()

df_trams = df_trams.merge(df_tram_stops, on=['route_short_name', 'trip_headsign', 'direction_id'])

# Helper columns for comparing the longest trip against the union of stops.
df_trams['stops_count_2'] = df_trams['stops'].map(len)
# Despite the names, these hold sorted lists so they can be compared element-wise.
df_trams['stops_set_1'] = df_trams['stops_sequence'].apply(lambda x: sorted(x))
df_trams['stops_set_2'] = df_trams['stops'].apply(lambda x: sorted(x))

# Number of distinct stops on each side.
df_trams['stops_set_1_size'] = df_trams['stops_set_1'].apply(lambda x: len(set(x)))
df_trams['stops_set_2_size'] = df_trams['stops_set_2'].apply(lambda x: len(set(x)))

In [None]:
# Stop sequence of one group (position 2, an arbitrary pick) whose longest trip
# does not cover exactly the same stops as the union over all its trips.
df_trams[df_trams['stops_set_1'] != df_trams['stops_set_2']].iloc[2]['stops_sequence']

In [None]:
# Every branch '3' trip with the attributes of its route (trips without a matching route are kept).
df_routes_full = DF['3']['trips'].merge(DF['3']['routes'], on='route_id', how='left')

In [None]:
# Expand to one row per (trip, stop) by attaching the stop times.
# Note: this reassigns df_routes_full, so the cell is meant to run once per kernel.
df_routes_full = df_routes_full.merge(DF['3']['stop_times'], on='trip_id', how='left')

In [None]:
# Stops served by some trip of the group but absent from its longest trip.
df_trams['anomalies'] = [
    set(group_stops) - set(longest_trip_stops)
    for group_stops, longest_trip_stops in zip(df_trams['stops'], df_trams['stops_sequence'])
]

In [None]:
# Groups that have at least one such stop, reduced to their key and the stop set.
has_anomaly = df_trams['anomalies'].map(len).gt(0)
df_anomalies = df_trams.loc[has_anomaly, ['route_short_name', 'trip_headsign', 'direction_id', 'anomalies']]

In [None]:
# Find the (trip, stop) rows that serve an anomalous stop, ordered along each
# trip, then collapse them to one row per trip holding the anomalous stop ids.
df_at = (
    df_anomalies
    .explode('anomalies')
    .merge(
        df_routes_full,
        left_on=['route_short_name', 'trip_headsign', 'direction_id', 'anomalies'],
        right_on=['route_short_name', 'trip_headsign', 'direction_id', 'stop_id'],
    )
    .sort_values(by=['trip_id', 'stop_sequence'], ascending=True)
)
df_at = (
    df_at
    .groupby('trip_id')['stop_id']
    .apply(lambda stop_ids: stop_ids.to_numpy())
    .to_frame(name='stops_sequence')
    .reset_index()
)

In [None]:
# The preceding cell already collapses df_at to one row per trip, after which
# the 'stop_id' column no longer exists and grouping again raises KeyError.
# Only collapse when df_at is still in its per-stop form.
if 'stop_id' in df_at.columns:
    df_at = df_at.groupby('trip_id')['stop_id'].apply(lambda x: x.to_numpy()).to_frame(name='stops_sequence').reset_index()

In [None]:
# Add the trip attributes (trip columns first, as in a trips-left join).
df_at = DF['3']['trips'].merge(df_at, on='trip_id')

In [None]:
# Length of each trip's collected stop array.
df_at['stops_count'] = df_at['stops_sequence'].map(len)

In [None]:
# Add the route attributes (route columns first).
df_at = DF['3']['routes'].merge(df_at, on='route_id')

In [None]:
# Per (route, headsign, direction): keep the trip covering the most anomalous
# stops, then bring back the group's full anomaly set for comparison.
df_at = (
    df_at
    .sort_values(by=['route_short_name', 'stops_count'], ascending=False)
    .drop_duplicates(subset=['route_short_name', 'trip_headsign', 'direction_id'], keep='first')
    .merge(df_anomalies, on=['route_short_name', 'trip_headsign', 'direction_id'], how='left')
)

In [None]:
# Groups where even the chosen trip leaves some anomalous stop uncovered.
still_uncovered = df_at.apply(lambda row: set(row['anomalies']) - set(row['stops_sequence']), axis=1)
df_at[still_uncovered.apply(lambda leftover: len(leftover) > 0)]

In [None]:
# Reduce to the group key plus the id of the chosen trip.
df_at = df_at.loc[:, ['route_short_name', 'trip_headsign', 'direction_id', 'trip_id']]

In [None]:
# Fetch the complete stop times of each chosen trip.
df_at = pd.merge(df_at, DF['3']['stop_times'], on='trip_id')

In [None]:
# Full stop sequence of each chosen trip. Rows are ordered by stop_sequence
# explicitly: the merge above keeps the stop_times row order, which is not
# guaranteed to follow travel order.
df_at = (
    df_at
    .sort_values(['trip_id', 'stop_sequence'])
    .groupby('trip_id')['stop_id']
    .apply(lambda x: x.to_numpy())
    .to_frame(name='stops_sequence')
    .reset_index()
)

In [None]:
# Re-attach trip attributes, then route attributes (left table's columns first in both joins).
df_at = DF['3']['trips'].merge(df_at, on='trip_id')
df_at = DF['3']['routes'].merge(df_at, on='route_id')

In [None]:
# Stack the anomaly-covering trips on top of the longest trips.
# Note: appends to df_at, so running this cell twice adds df_trams twice.
df_at = pd.concat([df_at, df_trams])

In [None]:
# Remove the helper columns inherited from df_trams. Reassigning (rather than
# dropping in place) with errors='ignore' lets the cell be re-run without a
# KeyError once the columns are gone.
df_at = df_at.drop(
    columns=['anomalies', 'stops_count', 'stops', 'stops_count_2', 'stops_set_1', 'stops_set_2', 'stops_set_1_size', 'stops_set_2_size'],
    errors='ignore',
)

In [None]:
# Recompute the stop count for the combined table.
df_at['stops_count'] = df_at['stops_sequence'].map(len)

In [None]:
# Order descending by route, headsign, direction and stop count, then write the
# result to trips.csv in the working directory (without the index).
df_at = df_at.sort_values(
    by=['route_short_name', 'trip_headsign', 'direction_id', 'stops_count'],
    ascending=False,
)
df_at.to_csv('trips.csv', index=False)

In [None]:
# Build a shapely Point (lon, lat) for every shape vertex of branch '3'.
DF['3']['shapes']['point'] = [
    sg.Point(lon, lat)
    for lon, lat in zip(DF['3']['shapes']['shape_pt_lon'], DF['3']['shapes']['shape_pt_lat'])
]

In [None]:
# One LineString per shape_id. Vertices are ordered by shape_pt_sequence
# explicitly, because the row order of shapes.txt is not guaranteed to follow
# the path and a LineString depends on vertex order.
df_lines = (
    DF['3']['shapes']
    .sort_values(['shape_id', 'shape_pt_sequence'])
    .groupby('shape_id')['point']
    .apply(lambda x: sg.LineString(x.to_numpy()))
    .to_frame(name='line')
    .reset_index()
)

In [None]:
# Build a shapely Point (lon, lat) for every stop of branch '3'.
DF['3']['stops']['stop_pt'] = [
    sg.Point(lon, lat)
    for lon, lat in zip(DF['3']['stops']['stop_lon'], DF['3']['stops']['stop_lat'])
]

In [None]:
# Distinct (shape, stop) pairs of branch '3', each with the shape's line and
# the stop's point.
df_geo = pd.merge(DF['3']['stop_times'], DF['3']['trips'][['trip_id', 'shape_id']], on='trip_id')
# De-duplicate BEFORE attaching geometry: joining the lines onto every
# stop_times row only to discard the column again was redundant work, and the
# inner join below already drops pairs whose shape has no line.
df_geo = df_geo[['shape_id', 'stop_id']].drop_duplicates()
df_geo = pd.merge(df_geo, df_lines, on='shape_id')
df_geo = pd.merge(df_geo, DF['3']['stops'][['stop_id', 'stop_pt']], on='stop_id')

In [None]:
# Pairs for which the shape's line reports that it contains the stop point.
# NOTE(review): contains() is an exact geometric predicate, so stops that are
# merely close to the line presumably do not match — confirm that a distance
# threshold is not what is wanted here.
stop_on_line = df_geo.apply(lambda row: row['line'].contains(row['stop_pt']), axis=1)
df_geo[stop_on_line]

In [None]:
# Routes joined with their trips for branch '4'.
bid = '4'
dfrt = DF[bid]['routes'].merge(DF[bid]['trips'], on='route_id')
# Disabled check, kept for reference:
# assert dfrt.apply(lambda x: str(x['route_short_name']) in x['route_id'], axis=1).all()


In [None]:
# Check id nesting: route_id appears inside shape_id, and shape_id inside trip_id.
# Trips without a shape_id are excluded first (as in the route_id/shape_id
# assertion in the loading cell): a NaN shape_id makes the `in` test raise
# TypeError in the second check and compares against the text 'nan' in the first.
dfrt_shaped = dfrt.dropna(subset=['shape_id'])
assert dfrt_shaped.apply(lambda x: str(x['route_id']) in str(x['shape_id']), axis=1).all()
assert dfrt_shaped.apply(lambda x: x['shape_id'] in x['trip_id'], axis=1).all()


In [None]:
# Branch '5': split every shape_id on '.' into numbered component columns and
# tabulate how often each (component 1, component 2) combination occurs.
bid = '5'
dfrt = DF[bid]['routes'].merge(DF[bid]['trips'], on='route_id')
dfrt.dropna(subset=['shape_id'], inplace=True)
dfrt['shape_id_x'] = dfrt['shape_id'].map(lambda shape_id: shape_id.split('.'))
# All shape ids must split into the same number of components.
shape_id_x = dfrt['shape_id_x'].map(len).unique()
assert len(shape_id_x) == 1
shape_id_count = shape_id_x[0]
for i in range(shape_id_count):
    dfrt[f'shape_id_{i}'] = dfrt['shape_id_x'].map(lambda components: components[i])
component_pairs = dfrt[['shape_id_1', 'shape_id_2']]
component_pairs.sort_values(by=['shape_id_1', 'shape_id_2'], ascending=True).value_counts()


In [None]:
# Branch '11': how many routes have 1, 2, ... distinct direction_id values.
bid = '11'
dfrt = DF[bid]['routes'].merge(DF[bid]['trips'], on='route_id')
directions_per_route = dfrt.groupby('route_id')['direction_id'].unique().map(len)
directions_per_route.value_counts()

In [None]:
# Frequency of each (shape_id_1, shape_id_2) combination.
# NOTE(review): the preceding cell rebuilds dfrt from routes and trips without
# adding shape_id_N columns, so on a top-to-bottom run this presumably raises
# KeyError unless the raw tables carry such columns — confirm.
dfrt[['shape_id_1', 'shape_id_2']].value_counts()

In [None]:
# Branch '11' routes whose short name (as text) does not occur in the route_id.
bid = '11'
short_name_absent = DF[bid]['routes'].apply(lambda row: str(row['route_short_name']) not in row['route_id'], axis=1)
DF[bid]['routes'][short_name_absent]

In [None]:
# Frequency of each (direction_id, trip_id_4) combination.
# NOTE(review): no visible cell creates a 'trip_id_4' column — confirm where it
# is supposed to come from.
dfrt[['direction_id', 'trip_id_4']].value_counts()

In [None]:
# Distinct direction_id values in branch '3'.
DF['3']['trips']['direction_id'].unique()

In [None]:
# direction_id column of branch '5' trips.
DF['5']['trips']['direction_id']

In [None]:
# Check that every trip_id contains its shape_id as a substring.
# NOTE(review): a missing (NaN) shape_id would make the `in` test raise
# TypeError — confirm dfrt has no NA shape_id at this point.
assert dfrt.apply(lambda x: x['shape_id'] in x['trip_id'], axis=1).all()

In [None]:
# Distinct values of the shape_id_1 column.
# NOTE(review): depends on dfrt still carrying the shape_id_N columns — confirm.
dfrt['shape_id_1'].unique()

In [None]:
# Every shape_id must split into the same number of components. The count of
# distinct lengths is compared with 1 explicitly; asserting the bare count
# passes for any non-empty table and so checks nothing.
assert dfrt['shape_id_x'].apply(len).nunique() == 1

In [None]:
# Distinct lengths of the values in the trip_id_x column.
# NOTE(review): no visible cell creates 'trip_id_x' — confirm where it is
# supposed to come from.
dfrt['trip_id_x'].apply(len).unique()

In [None]:
# Take element [1] of each shape_id_1 / trip_id_1 value.
# NOTE(review): this overwrites any existing shape_id_2 column, and no visible
# cell creates 'trip_id_1' — confirm the intended inputs.
dfrt['shape_id_2'] = dfrt['shape_id_1'].apply(lambda x: x[1])
dfrt['trip_id_2'] = dfrt['trip_id_1'].apply(lambda x: x[1])


In [None]:
# Branch '3': distinct shape_ids used by each (short name, headsign, direction).
(
    DF['3']['routes']
    .merge(DF['3']['trips'], on='route_id')
    .groupby(['route_short_name', 'trip_headsign', 'direction_id'])['shape_id']
    .unique()
)