In [1]:
import geopandas as gpd
import pandas as pd
import sys
import os
import gtfs_kit as gtk
import numpy as np
sys.path.insert(0, r'../../../')
from quetzal.io.gtfs_reader import importer
from quetzal.io.gtfs_reader.frequencies import hhmmss_to_seconds_since_midnight 
from quetzal.model import stepmodel


import warnings
warnings.filterwarnings("ignore")

In [23]:
time_range = ['6:00:00', '8:59:00'] # PPAM
#dates = ['20191015'] # the dates must be within the feed start and end dates


In [24]:
# GTFS are read recursively between each ancestor
files=[]
gtfs_folder = 'gtfs/paris/'
if os.path.exists(gtfs_folder):
    for filename in filter(lambda x: x[-4:] == '.zip', os.listdir(gtfs_folder)):
        files.append(gtfs_folder+filename)

#files = files[0:1]
files

['gtfs/paris/ratp.zip']

In [25]:
files = ['https://storage.googleapis.com/storage/v1/b/mdb-latest/o/ca-quebec-exo-gtfs-748.zip?alt=media',
        'https://storage.googleapis.com/storage/v1/b/mdb-latest/o/ca-quebec-societe-de-transport-de-laval-gtfs-749.zip?alt=media']

In [26]:
feeds=[]
for file in files:
    print('Importing {f}.zip'.format(f=file))
    feeds.append(importer.GtfsImporter(path=file, dist_units='m'))


Importing https://storage.googleapis.com/storage/v1/b/mdb-latest/o/ca-quebec-exo-gtfs-748.zip?alt=media.zip
Importing https://storage.googleapis.com/storage/v1/b/mdb-latest/o/ca-quebec-societe-de-transport-de-laval-gtfs-749.zip?alt=media.zip


In [27]:
for i in range(len(feeds)):
    print(i)
    if 'agency_id' not in feeds[i].routes:
        print(f'add agency_id to routes in {files[i]}')
        feeds[i].routes['agency_id'] = feeds[i].agency['agency_id'].values[0]

    
    if 'pickup_type' not in feeds[i].stop_times:
        print(f'picjup_type missing in stop_times. set to 0 in {files[i]}')
        feeds[i].stop_times['pickup_type'] = 0
    
    if 'drop_off_type' not in feeds[i].stop_times:
        print(f'drop_odd_type missing in stop_times. set to 0 in {files[i]}')
        feeds[i].stop_times['drop_off_type'] = 0
        
    if 'parent_station' not in feeds[i].stops:
        print(f'parent_station missing in stops. set to NaN in {files[i]}')
        feeds[i].stops['parent_station'] = np.nan
    feeds[i].stop_times['pickup_type'].fillna(0, inplace=True)
    feeds[i].stop_times['drop_off_type'].fillna(0, inplace=True)
    
    
    '''
    if 'shape_dist_traveled' not in feeds[i].stop_times.columns:
        feeds[i] = gtk.append_dist_to_stop_times(feeds[i])
    feeds[i].stop_times.loc[(feeds[i].stop_times['stop_sequence'] == 1), 'shape_dist_traveled'] = feeds[i].stop_times[feeds[i].stop_times['stop_sequence'] == 1]['shape_dist_traveled'].fillna(0.0)

    if feeds[i].stop_times['shape_dist_traveled'].max() < 100:
        print(f'convert to meters : {files[i]}')
        feeds[i].dist_units = 'km'
        feeds[i] = gtk.convert_dist(feeds[i], new_dist_units='m')
    '''
   

    assert all(~feeds[i].routes['agency_id'].isna())
    
    feeds[i].stop_times['arrival_time'] = feeds[i].stop_times['departure_time']

0
parent_station missing in stops. set to NaN in https://storage.googleapis.com/storage/v1/b/mdb-latest/o/ca-quebec-exo-gtfs-748.zip?alt=media
1
parent_station missing in stops. set to NaN in https://storage.googleapis.com/storage/v1/b/mdb-latest/o/ca-quebec-societe-de-transport-de-laval-gtfs-749.zip?alt=media


In [28]:
available_dates =[]
for feed in feeds:
    available_dates.append([feed.calendar['start_date'].unique().min(),feed.calendar['end_date'].unique().max()])

'20231005'

In [39]:
feeds_t = []

for i, feed in enumerate(feeds):
    feed_t = feed.restrict(dates=[available_dates[i][0]], time_range=time_range)
    if len(feed_t.trips) > 0:
        feeds_t.append(feed_t)


In [40]:
for i in range(len(feeds_t)):
    if 'shape_dist_traveled' not in feeds_t[i].stop_times.columns:
        feeds_t[i] = gtk.append_dist_to_stop_times(feeds_t[i])
    feeds_t[i].stop_times.loc[(feeds_t[i].stop_times['stop_sequence'] == 1), 'shape_dist_traveled'] = feeds_t[i].stop_times[feeds_t[i].stop_times['stop_sequence'] == 1]['shape_dist_traveled'].fillna(0.0)

    if feeds_t[i].stop_times['shape_dist_traveled'].max() < 100:
        print(f'convert to meters')
        feeds_t[i].dist_units = 'km'
        feeds_t[i] = gtk.convert_dist(feeds_t[i], new_dist_units='m')


convert to meters


In [41]:
%%time
feeds_frequencies = []

for i in range(len(feeds_t)):
    print(i)
    feed_s = feeds_t[i].copy()
    feed_s.group_services()

    feed_s.build_stop_clusters(distance_threshold=50)
    feed_s.build_patterns(on='cluster_id')

    feed_frequencies = feed_s.convert_to_frequencies(time_range=time_range)
    shapes = feed_frequencies.shapes is not None
    feed_frequencies.build_links_and_nodes(log=False, 
                                           shape_dist_traveled=True, 
                                           from_shape=shapes, 
                                           stick_nodes_on_links=shapes,
                                           keep_origin_columns=['departure_time','pickup_type'],
                                           keep_destination_columns=['arrival_time','drop_off_type'])
    feeds_frequencies.append(feed_frequencies)

0


100%|███████████████████████████████████████████| 13/13 [00:01<00:00,  7.26it/s]


1


100%|███████████████████████████████████████████| 72/72 [00:16<00:00,  4.50it/s]


CPU times: user 19.9 s, sys: 41.8 ms, total: 19.9 s
Wall time: 19.9 s


In [42]:
mapping = {0:'tram', 1:'subway', 2:'rail', 3:'bus',4:'ferry',5:'cable_car',6:'gondola',7:'funicular', 700:'bus', 1501:'taxi'}
retire = ['taxi']
for feed_frequencies in feeds_frequencies:
    feed_frequencies.links['route_type'] = feed_frequencies.links['route_type'].apply(
        lambda t: mapping.get(t, np.nan)
    )
    
    assert not any(feed_frequencies.links['route_type'].isna())
    feed_frequencies.links = feed_frequencies.links[~feed_frequencies.links['route_type'].isin(retire)]

for feed_frequencies in feeds_frequencies:
    feed_frequencies.links.loc[feed_frequencies.links['time'] == 0,'time'] = 1.0

# create Model

In [43]:
columns=['trip_id','route_id','agency_id','direction_id','a','b', 'shape_dist_traveled',
                                    'link_sequence','time','headway','pickup_type', 'drop_off_type',
                                    'route_short_name','route_type','route_color','geometry']

In [57]:
sm = stepmodel.StepModel(epsg=4326, coordinates_unit='meter')

links_concat = []; nodes_concat = []
for feed_frequencies in feeds_frequencies:
    links_concat.append(feed_frequencies.links)
    nodes_concat.append(feed_frequencies.nodes)

sm.links = pd.concat(links_concat)

for col in columns:
    if col not in sm.links.columns:
        sm.links[col] = np.nan
        
sm.links = sm.links[columns]
sm.nodes = pd.concat(nodes_concat)[['stop_id','stop_name','stop_code','geometry']]

sm.nodes = sm.nodes.reset_index(drop=True).sort_index()
sm.links = sm.links.reset_index(drop=True).sort_index()


sm.nodes.loc[sm.nodes['stop_code'].isna(),'stop_code'] = sm.nodes.loc[sm.nodes['stop_code'].isna(),'stop_id'] 
sm.nodes.drop_duplicates(subset=['stop_id'], inplace=True)

sm.links['trip_id'] = sm.links['agency_id'] +'_' +sm.links['trip_id']
sm.links['route_id'] = sm.links['agency_id'] +'_' +sm.links['route_id']

sm.links = sm.links.sort_values(['route_type','trip_id']).reset_index(drop=True)

dnodes = ('node_' +sm.nodes.reset_index().set_index('stop_id')['index'].astype(str)).to_dict()
sm.nodes.index = 'node_' +sm.nodes.index.astype(str)

sm.links.index = 'link_' +sm.links.index.astype(str)

sm.links['a'] = sm.links['a'].apply(lambda a: dnodes.get(a))
sm.links['b'] = sm.links['b'].apply(lambda a: dnodes.get(a))

sm.links.drop_duplicates(subset=['trip_id','link_sequence'], inplace=True)

# Tag route with only one trip
time_slot = np.diff([hhmmss_to_seconds_since_midnight(time) for time in time_range])[0]
sm.links.loc[(time_slot/sm.links['headway']) < 2.0, 'headway'] = np.nan

sm.links = sm.links.to_crs(4326)
sm.nodes = sm.nodes.to_crs(4326)

In [47]:
BUCKET = 'quenedi-osm'
uuid = 'test'
sm.links.to_file(f's3://{BUCKET}/{uuid}/links.geojson', driver='GeoJSON')


In [108]:
df = pd.read_csv('sources.csv')

In [112]:
df[df['location.subdivision_name']=='Québec']

Unnamed: 0,mdb_source_id,data_type,entity_type,location.country_code,location.subdivision_name,location.municipality,provider,name,note,static_reference,...,urls.api_key_parameter_name,urls.latest,urls.license,location.bounding_box.minimum_latitude,location.bounding_box.maximum_latitude,location.bounding_box.minimum_longitude,location.bounding_box.maximum_longitude,location.bounding_box.extracted_on,status,features
738,739,gtfs,,CA,Québec,La Pêche,Régie intermunicipale de transport des Collines,,,,...,,https://storage.googleapis.com/storage/v1/b/md...,,45.407263,45.919647,-77.076473,-75.617139,2022-03-17T13:15:38+00:00,,
739,740,gtfs,,CA,Québec,Gatineau,Société de transport de l'Outaouais,,,,...,,https://storage.googleapis.com/storage/v1/b/md...,,45.381261,45.592886,-75.890201,-75.390856,2022-03-17T13:15:42+00:00,,
740,741,gtfs,,CA,Québec,Varennes,Exo Sorel-Varennes,,,,...,,https://storage.googleapis.com/storage/v1/b/md...,,45.524297,46.046225,-73.540802,-73.084444,2022-03-17T13:15:44+00:00,,
741,742,gtfs,,CA,Québec,,Exo Sud-ouest,,,,...,,https://storage.googleapis.com/storage/v1/b/md...,,45.251963,45.498100,-74.132096,-73.564442,2022-03-17T13:15:45+00:00,,
742,743,gtfs,,CA,Québec,,Exo La Presqu'île,,,,...,,https://storage.googleapis.com/storage/v1/b/md...,,45.350844,45.513436,-74.315165,-73.682942,2022-03-17T13:15:47+00:00,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1368,1369,gtfs-rt,tu,CA,Québec,,Exo Le Richelain,,,746.0,...,token,,https://exo.quebec/fr/a-propos/donnees-ouverte...,45.351194,45.548627,-73.565996,-73.387583,2022-03-17T13:15:53+00:00,,
1369,1370,gtfs-rt,sa,CA,Québec,,Exo Le Richelain,,,746.0,...,token,,https://exo.quebec/fr/a-propos/donnees-ouverte...,45.351194,45.548627,-73.565996,-73.387583,2022-03-17T13:15:53+00:00,,
1370,1371,gtfs-rt,vp,CA,Québec,,Exo Le Richelain,,,746.0,...,token,,https://exo.quebec/fr/a-propos/donnees-ouverte...,45.351194,45.548627,-73.565996,-73.387583,2022-03-17T13:15:53+00:00,,
1470,1471,gtfs-rt,vp,CA,Québec,La Pêche,Transcollines,,,739.0,...,,,https://github.com/transcollines/GTFS-RT/blob/...,45.407263,45.919647,-77.076473,-75.617139,2022-03-17T13:15:38+00:00,,


In [120]:
df[df['location.bounding_box.minimum_latitude'].isnull()].iloc[0]['urls.latest']

'https://storage.googleapis.com/storage/v1/b/mdb-latest/o/ar-buenos-aires-subterraneos-de-buenos-aires-subte-gtfs-6.zip?alt=media'