In [1]:
import pandas as pd
import os
import shapely.geometry as sg
import geopandas as gpd
import numpy as np
from shapely.geometry import LineString
from shapely.ops import unary_union
# Relative path of the folder whose files get_df() reads.
data_directory = os.path.join('..', 'data', 'ptv', '20240224')

# Short display label for each integer route-type code (cf. the GTFS
# routes.txt `route_type` field listed further down).
ROUTE_TYPES = {
    0: 'Tram',
    1: 'Metro',
    2: 'Rail',
    3: 'Bus',
    4: 'Ferry',
    5: 'Cable tram',
    6: 'Gondola',
    7: 'Funicular',
    11: 'Trolleybus',
    12: 'Monorail',
}
# Long-form description for each route-type code; same keys as ROUTE_TYPES.
ROUTE_TYPES_LONG = {
    0: 'Tram, Streetcar, Light rail. Any light rail or street level system within a metropolitan area.',
    1: 'Subway, Metro. Any underground rail system within a metropolitan area.',
    2: 'Rail. Used for intercity or long-distance travel.',
    3: 'Bus. Used for short- and long-distance bus routes.',
    4: 'Ferry. Used for short- and long-distance boat service.',
    5: 'Cable tram. Used for street-level rail cars where the cable runs beneath the vehicle, e.g., cable car in San Francisco.',
    6: 'Aerial lift, suspended cable car (e.g., gondola lift, aerial tramway). Cable transport where cabins, cars, gondolas or open chairs are suspended by means of one or more cables.',
    7: 'Funicular. Any rail system designed for steep inclines.',
    11: 'Trolleybus. Electric buses that draw power from overhead wires using poles.',
    12: 'Monorail. Railway in which the track consists of a single rail or a beam.',
}

# Every branch id whose tables are loaded into DF.
BRANCH_IDS_ALL = ['1', '2', '3', '4', '5', '6', '7', '8', '10', '11']
# Branches iterated by the processing cells: all of the above except '7' and '8'.
BRANCH_IDS = [branch for branch in BRANCH_IDS_ALL if branch not in ('7', '8')]
# Table names looked up for each branch.
TABLE_NAMES = [
    'stop_times',
    'stops',
    'trips',
    'routes',
    'calendar',
    'calendar_dates',
    'agency',
    'shapes',
]
# GTFS File Fields
# agency.txt 
# agency_id, agency_name, agency_url, agency_timezone, agency_lang
# calendar.txt 
# service_id, monday, tuesday, wednesday, thursday, friday, saturday, sunday, start_date, end_date
# calendar_dates.txt 
# service_id, date, exception_type
# routes.txt 
# route_id, agency_id, route_short_name, route_long_name,
# route_type, route_color, route_text_color
# trips.txt 
# route_id, service_id, trip_id, shape_id, trip_headsign, direction_id
# stops.txt 
# stop_id, stop_name, stop_lat, stop_lon
# stop_times.txt 
# trip_id, arrival_time, departure_time, stop_id, stop_sequence, stop_headsign, pickup_type, drop_off_type, shape_dist_traveled
# shapes.txt 
# shape_id, shape_pt_lat, shape_pt_lon, shape_pt_sequence, shape_dist_traveled 
def get_df(branch_id, table_name):
    """Load one table of one branch from ``data_directory``.

    A directory entry is selected when its name, split on ``'-'``, has
    ``str(branch_id)`` as its second token and ``table_name`` as its third.
    Entries with fewer than three tokens (stray files such as ``.DS_Store``)
    are ignored instead of raising ``IndexError``.

    Parameters
    ----------
    branch_id : str or int
        Branch identifier; compared as a string.
    table_name : str
        Table name to match, e.g. ``'stops'``.

    Returns
    -------
    pandas.DataFrame or None
        Row-wise concatenation of every matching file, read in sorted
        file-name order with each file's own index kept, or ``None`` when
        no file matches.
    """
    branch = str(branch_id)
    matches = []
    # Sorted so the concatenation order does not depend on the arbitrary
    # order in which os.listdir returns entries.
    for file_name in sorted(os.listdir(data_directory)):
        tokens = file_name.split('-')
        if len(tokens) < 3:
            continue
        if tokens[1] == branch and tokens[2] == table_name:
            matches.append(os.path.join(data_directory, file_name))
    if not matches:
        return None
    # Only the empty string is parsed as missing, so literal text such as
    # 'NA' survives as a value.
    return pd.concat([pd.read_csv(path, keep_default_na=False, na_values=['']) for path in matches])

def _load_branch(branch_id):
    """Return {table_name: get_df(branch_id, table_name)} for every name in TABLE_NAMES."""
    return {table_name: get_df(branch_id, table_name) for table_name in TABLE_NAMES}


# DF[branch_id][table_name] is the loaded DataFrame, or None when get_df
# found no matching file.
DF = {branch_id: _load_branch(branch_id) for branch_id in BRANCH_IDS_ALL}
# 15s - 30s

# Assert that every trip's shape_id contains its route_id as a substring.
# Rows missing either value are ignored.
for bid in BRANCH_IDS:
    branch_trips = DF[bid]['trips'].dropna(subset=['route_id', 'shape_id'])
    # A plain scan over the two columns is much cheaper than a row-wise
    # DataFrame.apply(axis=1). It is also well-defined for an empty frame,
    # where apply(...).all() yields a per-column Series whose truth value
    # raises ValueError.
    assert all(
        route_id in shape_id
        for route_id, shape_id in zip(branch_trips['route_id'], branch_trips['shape_id'])
    ), bid

def _attach_stop_columns(stop_times, stops):
    """Inner-join onto stop_times the stops columns it does not already have.

    Joining only the missing columns makes the cell safe to re-run: a second
    pass finds nothing new and returns ``stop_times`` unchanged, rather than
    merging again and producing ``*_x`` / ``*_y`` duplicates.
    """
    missing = [col for col in stops.columns if col not in stop_times.columns]
    if not missing:
        return stop_times
    return pd.merge(stop_times, stops[['stop_id'] + missing], on='stop_id')


# Enrich each branch's stop_times with the attributes of the referenced stop.
for bid in BRANCH_IDS:
    DF[bid]['stop_times'] = _attach_stop_columns(DF[bid]['stop_times'], DF[bid]['stops'])

# Assert that the first point of every shape (shape_pt_sequence == 1) has
# shape_dist_traveled equal to 0.
for bid in BRANCH_IDS:
    branch_shapes = DF[bid]['shapes']
    first_point_dist = branch_shapes.loc[branch_shapes['shape_pt_sequence'] == 1, 'shape_dist_traveled']
    # Comparing element-wise and reducing with .all() makes a violation
    # surface as an AssertionError naming the branch. Comparing a
    # multi-valued unique() array against [0] raised ValueError (ambiguous
    # truth value) instead. An empty selection still fails the check.
    assert (not first_point_dist.empty) and (first_point_dist == 0).all(), bid

# 1m 30s

In [5]:
# Build one LineString per shape_id from its points in shape_pt_sequence order.
for bid in BRANCH_IDS:
    # Rebind a sorted copy rather than sorting in place, so the cell does not
    # mutate the frame loaded earlier.
    branch_shapes = DF[bid]['shapes'].sort_values(by=['shape_id', 'shape_pt_sequence'])
    # (lon, lat) tuples, i.e. x before y.
    branch_shapes['points'] = list(zip(branch_shapes['shape_pt_lon'], branch_shapes['shape_pt_lat']))
    DF[bid]['shapes'] = branch_shapes
    # Selecting 'points' before apply avoids the pandas deprecation for
    # DataFrameGroupBy.apply operating on the grouping columns; the result
    # (columns shape_id, geometry) is the same.
    DF[bid]['gdf'] = (
        branch_shapes.groupby('shape_id')['points']
        .apply(lambda pts: sg.LineString(pts.to_numpy()))
        .rename('geometry')
        .reset_index()
    )

In [67]:
# For each branch, collect every shape's 'points' values into one array per
# shape_id. Requires the 'points' column to exist on the shapes table.
for bid in BRANCH_IDS:
    points_by_shape = DF[bid]['shapes'].groupby('shape_id')['points']
    DF[bid]['points'] = points_by_shape.apply(lambda pts: pts.to_numpy()).reset_index()

In [72]:
# Largest number of points held by any single shape across all branches.
maxLines = 0
for bid in BRANCH_IDS:
    maxLine = DF[bid]['points']['points'].map(len).max()
    if maxLine > maxLines:
        maxLines = maxLine

In [73]:
# Display the largest per-shape point count computed in the previous cell.
maxLines

8391

In [59]:
# Derive route-level columns on the shapes table from shape_id.
# For example, '2-ALM-mjp-2.1.H' gives route_name 'ALM', direction 'H' and
# route_id '2-ALM-mjp-2'.
# route_name, direction and branch used to exist only in commented-out code,
# yet the recorded shapes output shows them and the df2pd groupby needs
# route_name and direction. Creating them here lets the notebook run from a
# fresh kernel.
for bid in BRANCH_IDS:
    branch_shapes = DF[bid]['shapes']
    shape_ids = branch_shapes['shape_id']
    # '-'-separated tokens between the first token and the last two, joined.
    branch_shapes['route_name'] = shape_ids.str.split('-').str[1:-2].str.join('')
    # Text after the final '.'.
    branch_shapes['direction'] = shape_ids.str.split('.').str[-1]
    branch_shapes['branch'] = bid
    # Text before the first '.'.
    branch_shapes['route_id'] = shape_ids.str.split('.', expand=True)[0]
    

In [7]:
# Wrap each branch's shape_id/geometry frame in a GeoDataFrame, using the
# 'geometry' column as the active geometry.
for bid in BRANCH_IDS:
    branch_tables = DF[bid]
    branch_tables['gdf'] = gpd.GeoDataFrame(branch_tables['gdf'], geometry='geometry')

In [8]:
# Label every geometry row with the route name and direction parsed from its
# shape_id, plus the branch it belongs to.
for bid in BRANCH_IDS:
    branch_gdf = DF[bid]['gdf']
    shape_ids = branch_gdf['shape_id']
    # '-'-separated tokens between the first token and the last two, joined.
    branch_gdf['route_name'] = shape_ids.map(lambda sid: ''.join(sid.split('-')[1:-2]))
    # Text after the final '.'.
    branch_gdf['direction'] = shape_ids.map(lambda sid: sid.split('.')[-1])
    branch_gdf['branch'] = bid
    

In [17]:
# For branch '2', union all shape geometries that share a route name and
# direction into a single geometry per (route_name, direction) pair.
geometry_by_route = DF['2']['gdf'].groupby(['route_name', 'direction'])['geometry']
df2unaryunion = geometry_by_route.apply(lambda geoms: geoms.unary_union).reset_index()

In [60]:
# Inspect the shapes table of branch '2', including the derived columns.
DF['2']['shapes']

Unnamed: 0,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,points,route_name,direction,branch,route_id
878,2-ALM-mjp-2.1.H,-37.826576,145.058697,1,0.00,"(145.058696941084, -37.8265761521519)",ALM,H,2,2-ALM-mjp-2
879,2-ALM-mjp-2.1.H,-37.826493,145.061858,2,277.76,"(145.061857621326, -37.8264930451824)",ALM,H,2,2-ALM-mjp-2
880,2-ALM-mjp-2.1.H,-37.826347,145.063373,3,411.80,"(145.063372528623, -37.8263467469695)",ALM,H,2,2-ALM-mjp-2
881,2-ALM-mjp-2.1.H,-37.826282,145.064931,4,548.85,"(145.064930739406, -37.8262822478073)",ALM,H,2,2-ALM-mjp-2
882,2-ALM-mjp-2.1.H,-37.826264,145.065488,5,597.82,"(145.065487919579, -37.8262643681409)",ALM,H,2,2-ALM-mjp-2
...,...,...,...,...,...,...,...,...,...,...
873,2-ain-mjp-5.2.R,-37.820984,144.954088,72,5819.98,"(144.954087557086, -37.8209838709669)",ain,R,2,2-ain-mjp-5
874,2-ain-mjp-5.2.R,-37.820366,144.953298,73,5917.57,"(144.953298021051, -37.8203663626677)",ain,R,2,2-ain-mjp-5
875,2-ain-mjp-5.2.R,-37.820146,144.953088,74,5948.19,"(144.953088249621, -37.8201464343392)",ain,R,2,2-ain-mjp-5
876,2-ain-mjp-5.2.R,-37.819472,144.952652,75,6032.39,"(144.952652472148, -37.8194719891413)",ain,R,2,2-ain-mjp-5


In [61]:
# Distinct shape_dist_traveled values seen at each coordinate, per route name
# and direction, for branch '2'. The shapes table must already carry the
# route_name and direction columns.
distance_group_keys = ['route_name', 'direction', 'shape_pt_lat', 'shape_pt_lon']
df2pd = DF['2']['shapes'].groupby(distance_group_keys)['shape_dist_traveled'].unique()

In [63]:
# Number of rows (shape points) per shape_id in branch '2', largest first.
DF['2']['shapes']['shape_id'].value_counts()

shape_id
2-HBG-mjp-4.30.R     463
2-HBG-mjp-8.20.R     463
2-HBG-mjp-8.19.R     463
2-HBG-mjp-6.20.R     463
2-HBG-mjp-5.28.R     463
                    ... 
2-FKN-mjp-11.15.R     17
2-FKN-mjp-9.15.R      17
2-MER-mjp-3.1.H       17
2-MER-mjp-3.16.R      17
2-FKN-mjp-9.1.H       17
Name: count, Length: 1966, dtype: int64

In [54]:
# Display the grouped distances.
# NOTE(review): the output recorded below is indexed by shape_id, whereas the
# cell that builds df2pd groups by route_name and direction. The output looks
# stale; re-run to confirm.
df2pd

shape_id         shape_pt_lat  shape_pt_lon
2-ALM-mjp-2.1.H  -37.868320    145.079656      [5923.31]
                 -37.865562    145.080159      [5613.42]
                 -37.864367    145.080395      [5478.97]
                 -37.863775    145.080558      [5411.59]
                 -37.863542    145.080655      [5384.35]
                                                 ...    
2-ain-mjp-5.2.R  -37.808025    144.970256      [2854.52]
                 -37.808002    144.969916      [2824.52]
                 -37.807995    144.969564       [2793.6]
                 -37.807808    144.942695        [41.91]
                 -37.807489    144.942443          [0.0]
Name: shape_dist_traveled, Length: 419486, dtype: object

In [31]:


# Toy example: union two LineStrings that share some vertices to see how
# unary_union splits them into segments.
line_coords1 = [(1, 2), (2, 4), (4, 8), (8, 16), (16, 25), (25, 100), (100, 200), (200, 400)]
line_coords2 = [(25, 25), (1, 2), (8, 16), (25, 100)]

# Turn each coordinate list into a geometry.
line1, line2 = LineString(line_coords1), LineString(line_coords2)

# Merge the two lines into a single geometry.
union_result = unary_union([line1, line2])


In [37]:
# Print the text form of the unioned geometry.
print(union_result)

MULTILINESTRING ((1 2, 2 4), (2 4, 4 8), (4 8, 8 16), (8 16, 16 25, 25 100), (25 100, 100 200, 200 400), (25 25, 1 2), (8 16, 25 100))
