In [1]:
import pandas as pd
import os
import shapely.geometry as sg
import geopandas as gpd
import numpy as np
data_directory = os.path.join('..', 'data', 'ptv', '20240224')
ROUTE_TYPES = {
    0 : 'Tram',
    1 : 'Metro',
    2 : 'Rail',
    3 : 'Bus',
    4 : 'Ferry',
    5 : 'Cable tram',
    6 : 'Gondola',
    7 : 'Funicular',
    11 : 'Trolleybus',
    12 : 'Monorail',
}
ROUTE_TYPES_LONG = {
    0 : 'Tram, Streetcar, Light rail. Any light rail or street level system within a metropolitan area.',
    1 : 'Subway, Metro. Any underground rail system within a metropolitan area.',
    2 : 'Rail. Used for intercity or long-distance travel.',
    3 : 'Bus. Used for short- and long-distance bus routes.',
    4 : 'Ferry. Used for short- and long-distance boat service.',
    5 : 'Cable tram. Used for street-level rail cars where the cable runs beneath the vehicle, e.g., cable car in San Francisco.',
    6 : 'Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway). Cable transport where cabins, cars, gondolas or open chairs are suspended by means of one or more cables.',
    7 : 'Funicular. Any rail system designed for steep inclines.',
    11 : 'Trolleybus. Electric buses that draw power from overhead wires using poles.',
    12 : 'Monorail. Railway in which the track consists of a single rail or a beam.',
}

BRANCH_IDS_ALL = ['1', '2', '3', '4', '5', '6', '7', '8', '10', '11']
BRANCH_IDS = ['1', '2', '3', '4', '5', '6', '10', '11']
TABLE_NAMES = ['stop_times', 'stops', 'trips', 'routes', 'calendar', 'calendar_dates', 'agency', 'shapes']
# GTFS File Fields
# agency.txt 
# agency_id, agency_name, agency_url, agency_timezone, agency_lang
# calendar.txt 
# service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date
# calendar_dates.txt 
# service_id ,date, exception_type
# routes.txt 
# route_id, agency_id, route_short_name, route_long_name,
# route_type, route_color,route_text_color
# trips.txt 
# route_id, service_id, trip_id, shape_id, trip_headsign, direction_id
# stops.txt 
# stop_id, stop_name, stop_lat, stop_lon
# stop_times.txt 
# trip_id, arrival_time, departure_time, stop_id, stop_sequence, stop_headsign, pickup_type, drop_off_type, shape_dist_traveled
# shapes.txt 
# shape_id, shape_pt_lat, shape_pt_lon, shape_pt_sequence, shape_dist_traveled 
def get_df(branch_id, table_name):
    files = [os.path.join(data_directory, f) for f in os.listdir(data_directory) if f.split('-')[1] == str(branch_id) and f.split('-')[2] == table_name]
    if len(files) == 0:
        return None
    return pd.concat([pd.read_csv(f, keep_default_na=False, na_values=['']) for f in files])

DF = {branch_id: {table_name: get_df(branch_id, table_name) for table_name in TABLE_NAMES} for branch_id in BRANCH_IDS_ALL}
# 15s - 30s

# Assert all shape_id contains route_id
for bid in BRANCH_IDS:
    assert DF[bid]['trips'].dropna(subset=['route_id', 'shape_id']).apply(lambda x: x['route_id'] in x['shape_id'], axis=1).all()

for bid in BRANCH_IDS:
    DF[bid]['stop_times'] = pd.merge(DF[bid]['stop_times'], DF[bid]['stops'], on='stop_id')

# 1m 30s

In [2]:
for bid in BRANCH_IDS_ALL:
    DF[bid]['routes']['route_idx'] = DF[bid]['routes']['route_id'].str.split('-')
    DF[bid]['routes']['branch'] = DF[bid]['routes']['route_idx'].apply(lambda x: x[0])
    DF[bid]['routes']['route_name'] = DF[bid]['routes']['route_idx'].apply(lambda x: ''.join(x[1:-2]))
    DF[bid]['routes']['route_range'] = DF[bid]['routes']['route_idx'].apply(lambda x: x[-2])
    DF[bid]['routes']['route_id_order'] = DF[bid]['routes']['route_idx'].apply(lambda x: x[-1])

In [3]:
DF['4']['routes'][DF['4']['routes']['route_name'] != DF['4']['routes']['route_short_name']]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color,route_idx,branch,route_name,route_range,route_id_order
329,39-301-A-aus-1,,301SB,,3,FF8200,FFFFFF,"[39, 301, A, aus, 1]",39,301A,aus,1


In [6]:
DF['4']['routes']['route_id_order'].unique()

array(['1'], dtype=object)

In [8]:
dftemp = DF['4']['routes'].groupby('route_name')['branch'].unique()
dftemp[dftemp.apply(lambda x: len(x) > 1)]

route_name
200        [14, 17, 33]
207        [14, 17, 33]
234            [17, 35]
235            [17, 35]
236            [17, 35]
             ...       
903    [14, 17, 18, 35]
905            [14, 17]
906            [14, 17]
907            [14, 17]
908            [14, 17]
Name: branch, Length: 94, dtype: object

In [13]:
dftemp2 = DF['4']['routes'].groupby('route_short_name')['route_short_name'].value_counts()

In [26]:
dftemp3 = DF['4']['routes'].groupby('route_short_name').aggregate({'branch': 'nunique', 'route_id': 'nunique'})
assert (dftemp3['branch'] == dftemp3['route_id']).all()

True

In [46]:
df_idlist = pd.DataFrame()
df_idlist['route_idx'] = DF['4']['trips']['route_id'].str.split('-')
df_idlist['service_idx'] = DF['4']['trips']['service_id'].str.split('-')
df_idlist['trip_idx'] = DF['4']['trips']['trip_id'].str.split('-')

In [44]:
assert df_idlist['trip_idx'].apply(len).unique() == [6]

In [57]:
df_idlist['route_branch'] = df_idlist['route_idx'].apply(lambda x: x[0])
df_idlist['route_range'] = df_idlist['route_idx'].apply(lambda x: x[-2])
df_idlist['route_order'] = df_idlist['route_idx'].apply(lambda x: x[-1])
df_idlist['route_name'] = df_idlist['route_idx'].apply(lambda x: ''.join(x[1:-2]))
df_idlist['service_sname'] = df_idlist['service_idx'].apply(lambda x: x[0])
df_idlist['service_branch'] = df_idlist['service_idx'].apply(lambda x: x[1])
df_idlist['service_range'] = df_idlist['service_idx'].apply(lambda x: x[-1])
df_idlist['service_name'] = df_idlist['service_idx'].apply(lambda x: ''.join(x[2:-1]))
df_idlist['trip_sname'] = df_idlist['trip_idx'].apply(lambda x: x[4])
df_idlist['trip_branch'] = df_idlist['trip_idx'].apply(lambda x: x[0])
df_idlist['trip_name'] = df_idlist['trip_idx'].apply(lambda x: ''.join(x[1:3]))
df_idlist['trip_order'] = df_idlist['trip_idx'].apply(lambda x: x[3])

In [58]:
assert (df_idlist['route_branch'] == df_idlist['service_branch']).all()
assert (df_idlist['route_branch'] == df_idlist['trip_branch']).all()
assert (df_idlist['trip_branch'] == df_idlist['service_branch']).all()
assert (df_idlist['route_range'] == df_idlist['service_range']).all()
assert (df_idlist['route_order'] == df_idlist['trip_order']).all()
assert (df_idlist['service_sname'] == df_idlist['trip_sname']).all()
assert (df_idlist['route_name'] == df_idlist['trip_name']).all()
assert (df_idlist['service_name'] == df_idlist['trip_name']).all()
assert (df_idlist['route_name'] == df_idlist['service_name']).all()


In [138]:
DF['4']['calendar']['service_idx'] = DF['4']['calendar']['service_id'].str.split('-')
DF['4']['calendar']['service_name'] = DF['4']['calendar']['service_idx'].apply(lambda x: x[0])
DF['4']['calendar']['branch'] = DF['4']['calendar']['service_idx'].apply(lambda x: x[1])
DF['4']['calendar']['pattern'] = DF['4']['calendar'][['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']].apply(lambda x: ''.join(x.astype(str)), axis=1)
DF['4']['calendar']['startend'] = DF['4']['calendar']['start_date'].astype(str) + '-' + DF['4']['calendar']['end_date'].astype(str)
DF['4']['calendar']['patternfull'] = DF['4']['calendar']['pattern'] + '-' + DF['4']['calendar']['startend']
# Remove digits from service_name
DF['4']['calendar']['service_str'] = DF['4']['calendar']['service_name'].apply(lambda input_string: ''.join([char for char in input_string if char.isalpha()]))
# Get only digits from service_name
DF['4']['calendar']['service_int'] = DF['4']['calendar']['service_name'].apply(lambda input_string: ''.join([char for char in input_string if char.isdigit()]))
DF['4']['calendar']['route_name'] = DF['4']['calendar']['service_idx'].apply(lambda x: ''.join(x[2:-1]))
DF['4']['calendar']['stb'] = DF['4']['calendar']['service_str'] + '-' + DF['4']['calendar']['branch']
DF['4']['calendar']['snb'] = DF['4']['calendar']['service_name'] + '-' + DF['4']['calendar']['branch']

In [187]:
df4_calendar = DF['4']['calendar'].melt(id_vars=['service_id', 'service_name', 'service_str', 'service_int', 'branch', 'pattern', 'startend', 'patternfull', 'service_idx', 'route_name', 'stb', 'snb'], value_vars=['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'], var_name='day', value_name='active')
# Drop all inactive services
df4_calendar = df4_calendar[df4_calendar['active'] == 1]

In [200]:
DF['4']['calendar']['pattern'].unique()

array(['0000100', '1101000', '0001000', '1000000', '0010000', '1111000',
       '0010100', '1111100', '0100000', '1110000', '0001100', '1101100',
       '1011000', '1010000', '1000010', '0000010', '0000001', '0000101'],
      dtype=object)

In [207]:
DF['4']['calendar_dates']['pattern'] = DF['4']['calendar_dates']['date'].astype(str) + '.' + DF['4']['calendar_dates']['exception_type'].astype(str)
df4_calendar_dates = DF['4']['calendar_dates'].groupby('service_id')['pattern'].apply(lambda x: '-'.join(x)).reset_index()

In [208]:
df4_calendar_dates

Unnamed: 0,service_id,pattern
0,MF1-12-831-aus,20240403.2-20240410.2-20240417.2-20240424.2-20...
1,MF1-12-834-aus,20240403.2-20240410.2-20240417.2-20240424.2-20...
2,MF1-12-835-aus,20240403.2-20240410.2-20240417.2-20240424.2-20...
3,MF1-12-836-aus,20240403.2-20240410.2-20240417.2-20240424.2-20...
4,MF1-12-837-aus,20240403.2-20240410.2-20240417.2-20240424.2-20...
...,...,...
3075,Sun9-32-603-aus,20240310.2-20240317.2-20240324.2-20240331.2-20...
3076,Sun9-32-604-aus,20240310.2-20240317.2-20240324.2-20240331.2-20...
3077,Sun9-32-922-aus,20240310.2-20240317.2-20240324.2-20240331.2-20...
3078,Sun9-32-923-aus,20240310.2-20240317.2-20240324.2-20240331.2-20...


In [214]:
df4_calendar.groupby('branch')['route_name'].unique().to_dict()

{'12': array(['831', '834', '835', '836', '837', '838', '839', '840', '841',
        '842', '846', '847', '888', '889', '899', '925', '926', '927',
        '928', '929'], dtype=object),
 '13': array(['462', '463', '460', '469', '476'], dtype=object),
 '14': array(['200', '207', '270', '271', '273', '279', '280', '281', '282',
        '284', '285', '293', '295', '302', '303', '304', '305', '309',
        '318', '364', '370', '380', '903', '905', '906', '907', '908',
        '901'], dtype=object),
 '15': array(['503', '506'], dtype=object),
 '16': array(['501', '511', '528', '529', '530', '531', '532', '533', '537',
        '544'], dtype=object),
 '17': array(['200', '202', '207', '234', '235', '236', '237', '246', '250',
        '251', '270', '279', '284', '285', '293', '302', '304', '305',
        '309', '350', '510', '512', '903', '905', '906', '907', '908'],
       dtype=object),
 '18': array(['901', '902', '903'], dtype=object),
 '19': array(['663', '664', '670', '671', '672', '675'

In [180]:
df4_routenames = DF['4']['calendar'].groupby('route_name')['branch'].apply(lambda x: sorted(x.unique())).reset_index()
df4_routenames['branch_count'] = df4_routenames['branch'].apply(lambda x: len(x))
df4_routenames['branch_pattern'] = df4_routenames['branch'].apply(lambda x: ''.join(x))

In [215]:
DF['4']['calendar'].to_csv('manual.csv', index=False)

In [186]:
df4_routenames.sort_values('branch_count', ascending=False)['branch_pattern'].unique()

array(['19212229', '14171835', '192122', '192129', '141833', '204149',
       '141733', '2641', '3853', '1922', '5382', '3942', '1921', '1417',
       '1833', '1733', '1735', '2129', '28', '20', '49', '32', '21', '39',
       '44', '40', '82', '51', '12', '27', '23', '41', '53', '38', '42',
       '35', '17', '14', '48', '16', '43', '13', '24', '25', '15'],
      dtype=object)

In [222]:
assert DF['4']['calendar'].groupby('branch')['startend'].nunique().unique() == [1]

In [235]:
df4rsl = DF['4']['routes'].groupby(['route_short_name'])['route_long_name'].unique()

In [239]:
df4rsl = df4rsl.reset_index()

In [240]:
df4rsl['name_count'] = df4rsl['route_long_name'].apply(lambda x: len(x))

In [247]:
DF['4']['calendar'][DF['4']['calendar']['route_name'] == '800']

Unnamed: 0,service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,...,service_name,pattern,branch,service_str,service_int,route_name,stb,snb,startend,patternfull
266,MF1-26-800-aus,1,1,1,1,1,0,0,20240223,20240526,...,MF1,1111100,26,MF,1,800,MF-26,MF1-26,20240223-20240526,1111100-20240223-20240526
371,MF1-41-800-aus,1,1,1,1,1,0,0,20240223,20240526,...,MF1,1111100,41,MF,1,800,MF-41,MF1-41,20240223-20240526,1111100-20240223-20240526
1780,Sat1-41-800-aus,0,0,0,0,0,1,0,20240223,20240526,...,Sat1,10,41,Sat,1,800,Sat-41,Sat1-41,20240223-20240526,0000010-20240223-20240526


In [245]:
DF['4']['routes'][DF['4']['routes']['route_short_name'] == '675']

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color,route_idx,branch,route_name,route_range,route_id_order
100,19-675-aus-1,,675,Chirnside Park SC - Mooroolbark,3,FF8200,FFFFFF,"[19, 675, aus, 1]",19,675,aus,1
142,21-675-aus-1,,675,Mooroolbark - Chirnside Park SC,3,FF8200,FFFFFF,"[21, 675, aus, 1]",21,675,aus,1
266,29-675-aus-1,,675,Chirnside Park SC - Mooroolbark,3,FF8200,FFFFFF,"[29, 675, aus, 1]",29,675,aus,1


In [250]:
DF4TRIPS_STOP_COUNT = DF['4']['stop_times'].groupby('trip_id')['stop_id'].count().rename('stop_count').reset_index()

In [251]:
DF['4']['trips'] = pd.merge(DF['4']['trips'], DF4TRIPS_STOP_COUNT, on='trip_id')

In [265]:
def get_dates(row):
    start_date = pd.to_datetime(str(row['start_date']), format='%Y%m%d')
    end_date = pd.to_datetime(str(row['end_date']), format='%Y%m%d')
    
    # Days of the week when service is available
    days_of_week = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    service_days = [day for day in days_of_week if row[day] == 1]
    
    dates = []
    current_date = start_date
    while current_date <= end_date:
        if current_date.strftime('%A').lower() in service_days:
            dates.append(current_date.strftime('%Y-%m-%d'))
        current_date += pd.Timedelta(days=1)
    
    return sorted(dates)

In [279]:
# Get all dates given start date, end date, and monday, tuesday, wednesday, thursday, friday, saturday, sunday
DF['4']['calendar']['date'] = DF['4']['calendar'].apply(get_dates, axis=1)

In [259]:
DF['4']['calendar_dates']['exception_type'].unique()

array([2], dtype=int64)

In [105]:
dftemp = DF['4']['calendar'][['service_name', 'branch', 'pattern', 'service_str', 'service_int']]

In [262]:
DF['4']['calendar'].groupby('service_name')['pattern'].unique()

service_name
MF1      [0010000, 0000100, 1111000, 1000000, 0010100, ...
MF10                           [0000100, 1101000, 0001000]
MF11                                    [1000000, 0000100]
MF12                                    [0100000, 1110000]
MF13                                    [0010000, 1111000]
MF14                                    [0001000, 0000100]
MF15                                             [0000100]
MF2      [1101100, 1111000, 0000100, 0100000, 1101000, ...
MF3      [0010000, 0000100, 1110000, 0100000, 1111000, ...
MF4                   [0001000, 0000100, 1111100, 1111000]
MF5      [1111100, 0000100, 0001100, 0001000, 1011000, ...
MF6      [1000000, 1111100, 1110000, 1010000, 0100000, ...
MF7                   [0000100, 0100000, 0001100, 1111100]
MF8                   [1111000, 0010000, 1110000, 0000100]
MF9                   [0001000, 0010100, 0000100, 1111000]
Sat1                                    [1000010, 0000010]
Sat2                                    [10

In [284]:
DF['4']['calendar_dates']['date'] = DF['4']['calendar_dates']['date'].astype(str)

In [299]:
df4dates = DF['4']['calendar'][['service_id', 'date']].explode('date').reset_index(drop=True)
# Convert date to string of format YYYYMMDD
df4dates['date'] = df4dates['date'].astype(str).apply(lambda x: x.replace('-', ''))
df4dates = pd.merge(df4dates, DF['4']['calendar_dates'][['service_id', 'date', 'exception_type']], on=['service_id', 'date'], how='left')
df4dates = df4dates[df4dates['exception_type'] != 2]
df4dates = df4dates.groupby('service_id')['date'].unique()
df4dates = df4dates.apply(lambda x: sorted(x))
df4dates = df4dates.apply(lambda x: '-'.join(x))
df4dates = df4dates.reset_index()

In [304]:
df4calendar = DF['4']['calendar'][['service_id', 'service_name', 'pattern', 'branch', 'service_str', 'service_int', 'route_name', 'stb', 'snb', 'startend', 'patternfull']]

In [305]:
df4dates = pd.merge(df4dates, df4calendar, on='service_id')

In [306]:
df4dates

Unnamed: 0,service_id,date,service_name,pattern,branch,service_str,service_int,route_name,stb,snb,startend,patternfull
0,MF1-12-831-aus,20240228-20240306-20240313-20240320-20240327,MF1,0010000,12,MF,1,831,MF-12,MF1-12,20240223-20240526,0010000-20240223-20240526
1,MF1-12-834-aus,20240228-20240306-20240313-20240320-20240327,MF1,0010000,12,MF,1,834,MF-12,MF1-12,20240223-20240526,0010000-20240223-20240526
2,MF1-12-835-aus,20240228-20240306-20240313-20240320-20240327,MF1,0010000,12,MF,1,835,MF-12,MF1-12,20240223-20240526,0010000-20240223-20240526
3,MF1-12-836-aus,20240228-20240306-20240313-20240320-20240327,MF1,0010000,12,MF,1,836,MF-12,MF1-12,20240223-20240526,0010000-20240223-20240526
4,MF1-12-837-aus,20240228-20240306-20240313-20240320-20240327,MF1,0010000,12,MF,1,837,MF-12,MF1-12,20240223-20240526,0010000-20240223-20240526
...,...,...,...,...,...,...,...,...,...,...,...,...
3075,Sun9-32-603-aus,20240225-20240303,Sun9,0000001,32,Sun,9,603,Sun-32,Sun9-32,20240223-20240526,0000001-20240223-20240526
3076,Sun9-32-604-aus,20240225-20240303,Sun9,0000001,32,Sun,9,604,Sun-32,Sun9-32,20240223-20240526,0000001-20240223-20240526
3077,Sun9-32-922-aus,20240225-20240303,Sun9,0000001,32,Sun,9,922,Sun-32,Sun9-32,20240223-20240526,0000001-20240223-20240526
3078,Sun9-32-923-aus,20240225-20240303,Sun9,0000001,32,Sun,9,923,Sun-32,Sun9-32,20240223-20240526,0000001-20240223-20240526


In [298]:
df4dates

Unnamed: 0,index,service_id,date
0,0,MF1-12-831-aus,20240228-20240306-20240313-20240320-20240327
1,1,MF1-12-834-aus,20240228-20240306-20240313-20240320-20240327
2,2,MF1-12-835-aus,20240228-20240306-20240313-20240320-20240327
3,3,MF1-12-836-aus,20240228-20240306-20240313-20240320-20240327
4,4,MF1-12-837-aus,20240228-20240306-20240313-20240320-20240327
...,...,...,...
3075,3075,Sun9-32-603-aus,20240225-20240303
3076,3076,Sun9-32-604-aus,20240225-20240303
3077,3077,Sun9-32-922-aus,20240225-20240303
3078,3078,Sun9-32-923-aus,20240225-20240303


In [281]:
DF['4']['calendar_dates']

Unnamed: 0,service_id,date,exception_type,pattern
0,MF10-14-200-aus,20240308,2,20240308.2
1,MF10-14-200-aus,20240315,2,20240315.2
2,MF10-14-200-aus,20240322,2,20240322.2
3,MF10-14-200-aus,20240329,2,20240329.2
4,MF10-14-200-aus,20240405,2,20240405.2
...,...,...,...,...
48891,Sun9-33-902-aus,20240428,2,20240428.2
48892,Sun9-33-902-aus,20240505,2,20240505.2
48893,Sun9-33-902-aus,20240512,2,20240512.2
48894,Sun9-33-902-aus,20240519,2,20240519.2


In [117]:
dftemp['stb'] = dftemp['service_str'] + '-' + dftemp['branch']
dftemp['snb'] = dftemp['service_name'] + '-' + dftemp['branch']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dftemp['stb'] = dftemp['service_str'] + '-' + dftemp['branch']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dftemp['snb'] = dftemp['service_name'] + '-' + dftemp['branch']


In [120]:
dftemp.groupby('snb')['pattern'].unique()

snb
MF1-12                           [0010000-20240223-20240526]
MF1-13     [0000100-20240223-20240526, 1111000-20240223-2...
MF1-14     [1000000-20240223-20240526, 0000100-20240223-2...
MF1-15     [0010100-20240223-20240526, 1111100-20240223-2...
MF1-16                           [1111100-20240223-20240526]
                                 ...                        
Sun6-25                          [0000101-20240223-20240526]
Sun6-39                          [0000001-20240223-20240526]
Sun6-44                          [1000000-20240226-20240526]
Sun9-32                          [0000001-20240223-20240526]
Sun9-33                          [0000001-20240223-20240526]
Name: pattern, Length: 290, dtype: object

In [85]:
dftemp[dftemp.apply(lambda x: len(x['pattern']) > 1, axis=1)]

Unnamed: 0,service_name,branch,pattern
1,MF1,13,"[0000100-20240223-20240526, 1111000-20240223-2..."
2,MF1,14,"[1000000-20240223-20240526, 0000100-20240223-2..."
3,MF1,15,"[0010100-20240223-20240526, 1111100-20240223-2..."
10,MF1,25,"[1111100-20240223-20240526, 1111000-20240223-2..."
13,MF1,28,"[1111100-20240223-20240526, 1111000-20240223-2..."
18,MF1,41,"[1111100-20240223-20240526, 0000100-20240223-2..."
19,MF1,42,"[1111100-20240223-20240526, 1111000-20240223-2..."
53,MF2,13,"[1111000-20240223-20240526, 0000100-20240223-2..."
54,MF2,14,"[0100000-20240223-20240526, 0000100-20240223-2..."
61,MF2,22,"[1110000-20240223-20240526, 0001100-20240223-2..."


In [68]:
df4_calendar[['start_date', 'end_date']].drop_duplicates()

Unnamed: 0,start_date,end_date
0,20240223,20240526
461,20240226,20240526


In [40]:
DF['4']['trips']['route_idx']

0         [12, 831, aus, 1]
1         [12, 831, aus, 1]
2         [12, 831, aus, 1]
3         [12, 831, aus, 1]
4         [12, 831, aus, 1]
                ...        
124313    [82, 498, aus, 1]
124314    [82, 498, aus, 1]
124315    [82, 498, aus, 1]
124316    [82, 498, aus, 1]
124317    [82, 498, aus, 1]
Name: route_idx, Length: 124318, dtype: object

In [38]:
DF['4']['trips'][['service_id', 'trip_id']]

Unnamed: 0,service_id,trip_id
0,MF1-12-831-aus,12-831--1-MF1-104
1,MF1-12-831-aus,12-831--1-MF1-106
2,MF1-12-831-aus,12-831--1-MF1-108
3,MF1-12-831-aus,12-831--1-MF1-110
4,MF1-12-831-aus,12-831--1-MF1-112
...,...,...
124313,Sun4-82-498-aus,82-498--1-Sun4-32374
124314,Sun4-82-498-aus,82-498--1-Sun4-32474
124315,Sun4-82-498-aus,82-498--1-Sun4-32774
124316,Sun4-82-498-aus,82-498--1-Sun4-32874


In [5]:
DF['4']['routes'][DF['4']['routes']['route_name'].str.contains('301')]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color,route_idx,branch,route_name,route_range,route_id_order
329,39-301-A-aus-1,,301SB,,3,FF8200,FFFFFF,"[39, 301, A, aus, 1]",39,301A,aus,1
330,39-301-aus-1,,301,Reservoir - La Trobe University,3,FF8200,FFFFFF,"[39, 301, aus, 1]",39,301,aus,1


In [32]:
for bid in BRANCH_IDS:
    if DF[bid]['trips']['shape_id'].isna().any():
        print(bid, '> trips > shape_id contains NA')

4 shape_id contains null


In [37]:
routeId_shapeIdNA = DF['4']['trips'][DF['4']['trips']['shape_id'].isna()]['route_id'].unique()

In [41]:
DF['4']['trips'].groupby('route_id')['shape_id'].unique()[routeId_shapeIdNA[1]]

array([nan], dtype=object)

In [53]:
DF['4']['trips'][DF['4']['trips']['shape_id'].isna()]

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id
50411,19-754-aus-1,MF3-19-754-aus,19-754--1-MF3-4,,Glen Waverley,0
50412,19-754-aus-1,MF5-19-754-aus,19-754--1-MF5-4,,Glen Waverley,0
50413,19-754-aus-1,MF6-19-754-aus,19-754--1-MF6-14,,Glen Waverley,0
94812,39-301-A-aus-1,MF3-39-301-A-aus,39-301-A-1-MF3-516,,Reservoir Station,1
94813,39-301-A-aus-1,MF6-39-301-A-aus,39-301-A-1-MF6-516,,Reservoir Station,1


In [47]:
DF['4']['routes'][DF['4']['routes']['route_short_name'].str.contains('301')]

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color,route_idx,branch,route_name,route_range,route_id_order
329,39-301-A-aus-1,,301SB,,3,FF8200,FFFFFF,"[39, 301, A, aus, 1]",39,301A,aus,1
330,39-301-aus-1,,301,Reservoir - La Trobe University,3,FF8200,FFFFFF,"[39, 301, aus, 1]",39,301,aus,1


In [48]:
DF['4']['trips'][DF['4']['trips']['route_id'].str.contains('39-301-')]

Unnamed: 0,route_id,service_id,trip_id,shape_id,trip_headsign,direction_id
94812,39-301-A-aus-1,MF3-39-301-A-aus,39-301-A-1-MF3-516,,Reservoir Station,1
94813,39-301-A-aus-1,MF6-39-301-A-aus,39-301-A-1-MF6-516,,Reservoir Station,1
94814,39-301-aus-1,MF3-39-301-aus,39-301--1-MF3-443,39-301-aus-1.2.H,Reservoir,0
94815,39-301-aus-1,MF3-39-301-aus,39-301--1-MF3-444,39-301-aus-1.2.H,Reservoir,0
94816,39-301-aus-1,MF3-39-301-aus,39-301--1-MF3-445,39-301-aus-1.2.H,Reservoir,0
...,...,...,...,...,...,...
95101,39-301-aus-1,MF6-39-301-aus,39-301--1-MF6-585,39-301-aus-1.3.R,La Trobe University,1
95102,39-301-aus-1,MF6-39-301-aus,39-301--1-MF6-586,39-301-aus-1.3.R,La Trobe University,1
95103,39-301-aus-1,MF6-39-301-aus,39-301--1-MF6-587,39-301-aus-1.3.R,La Trobe University,1
95104,39-301-aus-1,MF6-39-301-aus,39-301--1-MF6-588,39-301-aus-1.3.R,La Trobe University,1


In [56]:
DF['4']['stop_times'][DF['4']['stop_times']['trip_id'] == '19-754--1-MF3-4']

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,stop_name,stop_lat,stop_lon
5197169,19-754--1-MF3-4,06:59:00,06:59:00,21015,2,,0,0,0.0,Glen Waverley Station/Railway Pde (Glen Waverley),-37.879473,145.162979
5200859,19-754--1-MF3-4,06:39:00,06:39:00,21316,1,,0,0,0.0,Stud Park SC (Rowville),-37.919465,145.237567


In [52]:
for bid in BRANCH_IDS:
    DF[bid]['stop_times'] = pd.merge(DF[bid]['stop_times'], DF[bid]['stops'], on='stop_id')

In [None]:
bid = '2'
dft = DF[bid]['stop_times'].groupby('trip_id')['stop_id'].apply(lambda x: x.to_numpy()).to_frame(name='stops_sequence').reset_index()
dft = pd.merge(DF[bid]['trips'], dft, on='trip_id')
dft = pd.merge(DF[bid]['routes'], dft, on='route_id')
dft['stops_count'] = dft['stops_sequence'].apply(lambda x: len(x))

In [None]:
DF['1']['routes']

In [None]:
for bid in BRANCH_IDS:
    dfrt = pd.merge(DF[bid]['routes'], DF[bid]['trips'], on='route_id')
    dfrd = dfrt[['route_short_name', 'trip_headsign' ,'direction_id']].drop_duplicates()
    print(dfrd.groupby(['route_short_name', 'direction_id']).value_counts().nunique())
    


In [None]:
for bid in BRANCH_IDS:
    if not (DF[bid]['routes']['route_short_name'].notna().all() or DF[bid]['routes']['route_short_name'].isna().all()):
        print(bid)

In [None]:
dfshapes = pd.concat([DF[bid]['shapes'] for bid in BRANCH_IDS]).groupby('shape_id')[['shape_pt_lon', 'shape_pt_lat']].apply(lambda x: x.to_numpy())

In [None]:
dfshapes

In [None]:
bid = '4'
k_short_long_names = {}
k_route_order = {}
k_range = {}
k_type = {}
k_branches = {}
for bid in BRANCH_IDS_ALL:
    k_short_long_names[bid] = DF[bid]['routes'].dropna(subset=['route_short_name', 'route_long_name']).groupby('route_long_name')['route_short_name'].nunique().unique()
    k_range[bid] = DF[bid]['routes']['route_range'].unique()
    k_type[bid] = DF[bid]['routes']['route_type'].apply(lambda x: ROUTE_TYPES[x]).unique()
    k_route_order[bid] = DF[bid]['routes']['route_id_order'].unique()
    k_branches[bid] = DF[bid]['routes']['branch'].unique()

In [None]:
k_branches

In [None]:
DF['8']['routes']['route_short_name'].sort_values().unique()

In [None]:
DF['4']['routes']['route_short_name'].sort_values().unique()

In [None]:
DF['2']['routes'].dropna(subset=['route_short_name', 'route_long_name']).groupby('route_short_name')['route_long_name'].unique()

In [None]:
bid = '10'
DF[bid]['routes'][(DF[bid]['routes']['route_short_name'].astype(str) != DF[bid]['routes']['route_name']) & (DF[bid]['routes']['route_short_name'].notna())]
DF[bid]['routes']

In [None]:
DF['4']['routes']['route_id'].transform(lambda x: x.split('-')[2]).unique()

In [None]:
DF['11']['routes']['route_short_name'].isna().all()

In [None]:
DF['4']['routes']['route_id'].transform(lambda x: x.split('-')[1]).unique()

In [None]:
bid = '5'
dfrt = pd.merge(DF[bid]['routes'], DF[bid]['trips'], on='route_id')
dfrd = dfrt[['route_short_name', 'trip_headsign' ,'direction_id']].drop_duplicates()
dfrd

In [None]:
dfrd.groupby(['route_short_name', 'direction_id']).value_counts()

In [None]:
dfrd.groupby(['route_short_name', 'direction_id']).value_counts().nunique()

In [None]:
dft.groupby('route_id')['service_id'].unique()

In [None]:
bid = '3'

In [None]:
df_trams = DF['3']['stop_times'].groupby('trip_id')['stop_id'].apply(lambda x: x.to_numpy()).to_frame(name='stops_sequence').reset_index()
df_trams = pd.merge(DF['3']['trips'], df_trams, on='trip_id')
df_trams = pd.merge(DF['3']['routes'], df_trams, on='route_id')
df_trams['stops_count'] = df_trams['stops_sequence'].apply(lambda x: len(x))
df_trams = df_trams.sort_values(by=['route_short_name', 'stops_count'], ascending=False).drop_duplicates(subset=['route_short_name', 'trip_headsign', 'direction_id'], keep='first')

df_tram_stops = pd.merge(DF['3']['stop_times'], DF['3']['trips'], on='trip_id')
df_tram_stops = pd.merge(df_tram_stops, DF['3']['routes'], on='route_id')
df_tram_stops  = df_tram_stops.groupby(['route_short_name', 'trip_headsign', 'direction_id'])['stop_id'].apply(lambda x: x.unique()).to_frame(name='stops').reset_index()

df_trams = df_trams.merge(df_tram_stops, on=['route_short_name', 'trip_headsign', 'direction_id'])

df_trams['stops_count_2'] = df_trams['stops'].apply(lambda x: len(x))
df_trams['stops_set_1'] = df_trams['stops_sequence'].apply(lambda x: sorted(x))
df_trams['stops_set_2'] = df_trams['stops'].apply(lambda x: sorted(x))

df_trams['stops_set_1_size'] = df_trams['stops_set_1'].apply(lambda x: len(set(x)))
df_trams['stops_set_2_size'] = df_trams['stops_set_2'].apply(lambda x: len(set(x)))

In [None]:
df_trams[df_trams['stops_set_1'] != df_trams['stops_set_2']].iloc[2]['stops_sequence']

In [None]:
df_routes_full = pd.merge(DF['3']['trips'], DF['3']['routes'], on='route_id', how='left')

In [None]:
df_routes_full = pd.merge(df_routes_full, DF['3']['stop_times'], on='trip_id', how='left')

In [None]:
df_trams['anomalies'] = df_trams.apply(lambda x: set(x['stops']) - set(x['stops_sequence']), axis=1)

In [None]:
df_anomalies = df_trams[df_trams['anomalies'].apply(lambda x: len(x) > 0)][['route_short_name', 'trip_headsign', 'direction_id', 'anomalies']]

In [None]:
df_at = df_anomalies.explode('anomalies').merge(df_routes_full, left_on=['route_short_name', 'trip_headsign', 'direction_id', 'anomalies'], right_on=['route_short_name', 'trip_headsign', 'direction_id', 'stop_id']).sort_values(by=['trip_id', 'stop_sequence'], ascending=True)
df_at = df_at.groupby('trip_id')['stop_id'].apply(lambda x: x.to_numpy()).to_frame(name='stops_sequence').reset_index()

In [None]:
df_at = df_at.groupby('trip_id')['stop_id'].apply(lambda x: x.to_numpy()).to_frame(name='stops_sequence').reset_index()

In [None]:
df_at = pd.merge(DF['3']['trips'], df_at, on='trip_id')

In [None]:
df_at['stops_count'] = df_at['stops_sequence'].apply(lambda x: len(x))

In [None]:
df_at = pd.merge(DF['3']['routes'], df_at, on='route_id')

In [None]:
df_at = df_at.sort_values(by=['route_short_name', 'stops_count'], ascending=False).drop_duplicates(subset=['route_short_name', 'trip_headsign', 'direction_id'], keep='first').merge(df_anomalies, on=['route_short_name', 'trip_headsign', 'direction_id'], how='left')

In [None]:
df_at[df_at.apply(lambda x: set(x['anomalies']) - set(x['stops_sequence']), axis=1).apply(lambda x: len(x) > 0)]

In [None]:
df_at = df_at[['route_short_name', 'trip_headsign', 'direction_id', 'trip_id']]

In [None]:
df_at = df_at.merge(DF['3']['stop_times'], on='trip_id')

In [None]:
df_at = df_at.groupby('trip_id')['stop_id'].apply(lambda x: x.to_numpy()).to_frame(name='stops_sequence').reset_index()

In [None]:
df_at = pd.merge(DF['3']['trips'], df_at, on='trip_id')
df_at = pd.merge(DF['3']['routes'], df_at, on='route_id')

In [None]:
df_at = pd.concat([df_at, df_trams])

In [None]:
df_at.drop(columns=['anomalies', 'stops_count', 'stops', 'stops_count_2', 'stops_set_1', 'stops_set_2', 'stops_set_1_size', 'stops_set_2_size'], inplace=True)

In [None]:
df_at['stops_count'] = df_at['stops_sequence'].apply(lambda x: len(x))

In [None]:
df_at.sort_values(by=['route_short_name', 'trip_headsign', 'direction_id', 'stops_count'], ascending=False, inplace=True)
df_at.to_csv('trips.csv', index=False)

In [None]:
DF['3']['shapes']['point'] = DF['3']['shapes'].apply(lambda x: sg.Point(x['shape_pt_lon'], x['shape_pt_lat']), axis=1)

In [None]:
df_lines = DF['3']['shapes'].groupby('shape_id')['point'].apply(lambda x: sg.LineString(x.to_numpy())).to_frame(name='line').reset_index()

In [None]:
DF['3']['stops']['stop_pt'] = DF['3']['stops'].apply(lambda x: sg.Point(x['stop_lon'], x['stop_lat']), axis=1)

In [None]:
df_geo = pd.merge(DF['3']['stop_times'], DF['3']['trips'][['trip_id', 'shape_id']], on='trip_id')
df_geo = pd.merge(df_geo, df_lines, on='shape_id')
df_geo = df_geo[['shape_id', 'stop_id']].drop_duplicates()
df_geo = pd.merge(df_geo, df_lines, on='shape_id')
df_geo = pd.merge(df_geo, DF['3']['stops'][['stop_id', 'stop_pt']], on='stop_id')

In [None]:
df_geo[df_geo.apply(lambda x: x['line'].contains(x['stop_pt']), axis=1)]

In [None]:
bid = '4'
dfrt = pd.merge(DF[bid]['routes'], DF[bid]['trips'], on='route_id')
# assert dfrt.apply(lambda x: str(x['route_short_name']) in x['route_id'], axis=1).all()


In [None]:
assert dfrt.apply(lambda x: str(x['route_id']) in str(x['shape_id']), axis=1).all()
assert dfrt.apply(lambda x: x['shape_id'] in x['trip_id'], axis=1).all()


In [None]:
bid = '5'
dfrt = pd.merge(DF[bid]['routes'], DF[bid]['trips'], on='route_id')
dfrt.dropna(subset=['shape_id'], inplace=True)
dfrt['shape_id_x'] = dfrt['shape_id'].apply(lambda x: x.split('.'))
shape_id_x = dfrt['shape_id_x'].apply(len).unique()
assert len(shape_id_x) == 1
shape_id_count = shape_id_x[0]
for i in range(shape_id_count):
    dfrt[f'shape_id_{i}'] = dfrt['shape_id_x'].apply(lambda x: x[i])
dfrt[['shape_id_1', 'shape_id_2']].sort_values(by=['shape_id_1', 'shape_id_2'], ascending=True).value_counts()


In [None]:
bid = '11'
dfrt = pd.merge(DF[bid]['routes'], DF[bid]['trips'], on='route_id')
dfrt.groupby('route_id')['direction_id'].unique().apply(len).value_counts()

In [None]:
dfrt[['shape_id_1', 'shape_id_2']].value_counts()

In [None]:
bid = '11'
DF[bid]['routes'][DF[bid]['routes'].apply(lambda x: str(x['route_short_name']) not in x['route_id'], axis=1)]

In [None]:
dfrt[['direction_id', 'trip_id_4']].value_counts()

In [None]:
DF['3']['trips']['direction_id'].unique()

In [None]:
DF['5']['trips']['direction_id']

In [None]:
assert dfrt.apply(lambda x: x['shape_id'] in x['trip_id'], axis=1).all()

In [None]:
dfrt['shape_id_1'].unique()

In [None]:
assert dfrt['shape_id_x'].apply(len).nunique()

In [None]:
dfrt['trip_id_x'].apply(len).unique()

In [None]:
dfrt['shape_id_2'] = dfrt['shape_id_1'].apply(lambda x: x[1])
dfrt['trip_id_2'] = dfrt['trip_id_1'].apply(lambda x: x[1])


In [None]:
pd.merge(DF['3']['routes'], DF['3']['trips'], on='route_id').groupby(['route_short_name', 'trip_headsign', 'direction_id'])['shape_id'].unique()