In [1]:
import sys
import json

# GTFS sources to import: file names inside the scenario inputs folder, or full http(s) URLs.
# files = ['https://storage.googleapis.com/storage/v1/b/mdb-latest/o/ca-british-columbia-translink-vancouver-gtfs-1222.zip?alt=media']
files = ['stl.zip']

# Default run parameters (morning window).
# Afternoon alternative: start_time='12:00:00', end_time='15:00:00', period='pm'.
params = dict(
	exec_id=0,
	files=files,
	start_time='6:00:00',
	end_time='9:00:00',
	day='tuesday',
	dates=[],
	period='am',
)

default = {'scenario': 'test', 'training_folder': '../..', 'params': params}  # Default execution parameters

# Inside a Jupyter kernel the defaults are used as-is; otherwise the first
# command-line argument is a JSON object whose top-level keys override them.
if 'ipykernel' in sys.argv[0]:
	manual, argv = True, default
else:
	manual, argv = False, dict(default, **json.loads(sys.argv[1]))

In [2]:
import geopandas as gpd
import pandas as pd
import gtfs_kit as gtk
import numpy as np
from quetzal.io.gtfs_reader import importer
from quetzal.model import stepmodel
import json
import os

In [3]:
# Add path to quetzal
# NOTE(review): the quetzal imports live in the previous cell, so on a fresh
# kernel this path entry is added after they have already run and cannot
# help them resolve -- confirm whether this line should move above them.
sys.path.insert(0, '../../../../quetzal/')
import os
import numba as nb

# True when the AWS_EXECUTION_ENV environment variable is set to a non-empty value.
on_lambda = bool(os.environ.get('AWS_EXECUTION_ENV'))
# Default number of workers: numba's configured thread count.
num_cores = nb.config.NUMBA_NUM_THREADS

In [4]:
# Resolve the scenario name and the input/output folders for this run.
scenario = argv['scenario']
training_folder = argv['training_folder']
if on_lambda:
	# inputs/ and outputs/ sit directly under the training folder
	input_folder = os.path.join(training_folder, 'inputs/')
	output_folder = os.path.join(training_folder, 'outputs/')
else:
	# if local. add the path to the scenario scenarios/<scenario>/
	input_folder = f'../scenarios/{scenario}/inputs/'
	output_folder = f'../scenarios/{scenario}/outputs/'
	# local runs use a fixed number of workers instead of numba's thread count
	num_cores = 4

# exist_ok avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(output_folder, exist_ok=True)


In [5]:
# Display the parameters used for this run.
argv['params']

{'exec_id': 0,
 'files': ['stl.zip'],
 'start_time': '6:00:00',
 'end_time': '9:00:00',
 'day': 'tuesday',
 'dates': [],
 'period': 'am'}

In [None]:
# Unpack the run parameters, falling back to a default for the optional keys.
exec_id = argv['params'].get('exec_id', 0)
files = argv['params'].get('files', [])
# Copy the list: `dates` is appended to further down when it is empty. Without
# a copy that append would also mutate argv['params'] (and the default
# `params`), and re-running this cell would no longer reset `dates`.
dates = list(argv['params'].get('dates', []))
start_time = argv['params'].get('start_time')
end_time = argv['params'].get('end_time')
day = argv['params'].get('day')

# NOTE(review): `period` is read here but not referenced again in the cells
# visible in this notebook -- confirm whether it should drive the output names.
period = argv['params'].get('period', 'am')


In [7]:
# Day name -> pandas `dayofweek` number (monday=0 ... sunday=6).
DAY_DICT = {
	name: number
	for number, name in enumerate(
		['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
	)
}

In [8]:
# `day` is optional (it defaults to None when only explicit `dates` are given),
# so only look it up when present. The lookup ignores case and surrounding
# whitespace; an unknown day name still raises KeyError.
selected_day = DAY_DICT[day.strip().lower()] if day else None
# [start, end] of the time window, as 'H:MM:SS' strings.
time_range = [start_time, end_time]

In [9]:
# URLs are used as-is; anything else is a file name inside the inputs folder.
paths = [
	file if file.startswith('http') else os.path.join(input_folder, file)
	for file in files
]


# import

In [10]:
# Load every GTFS source, one importer per path, with dist_units set to 'm'.
feeds = []
for path in paths:
	print('Importing {f}'.format(f=path))
	feed = importer.GtfsImporter(path=path, dist_units='m')
	feeds.append(feed)

Importing ../scenarios/test/inputs/stl.zip


# 1) filling missing values

In [11]:
# 1) filling missing values
# Add, in place on each feed, the optional GTFS columns the following cells rely on.
for feed, file_name in zip(feeds, files):
	print('cleaning ', file_name)
	if 'agency_id' not in feed.agency:
		print(f'set agency_id to agency_name in {file_name}')
		feed.agency['agency_id'] = feed.agency['agency_name']

	if 'agency_id' not in feed.routes:
		print(f'add agency_id to routes in {file_name}')
		# every route is attributed to the first agency listed in the feed
		feed.routes['agency_id'] = feed.agency['agency_id'].values[0]

	for column in ('pickup_type', 'drop_off_type'):
		if column not in feed.stop_times:
			print(f'{column} missing in stop_times. set to 0 in {file_name}')
			feed.stop_times[column] = 0

	for column in ('parent_station', 'stop_code'):
		if column not in feed.stops:
			print(f'{column} missing in stops. set to NaN in {file_name}')
			feed.stops[column] = np.nan

	# Assign the filled column back instead of calling fillna(inplace=True) on
	# a column selection: the chained in-place form is not guaranteed to reach
	# the parent frame (it is a no-op under pandas Copy-on-Write).
	for column in ('pickup_type', 'drop_off_type'):
		feed.stop_times[column] = feed.stop_times[column].fillna(0)

	# arrival_time is overwritten with departure_time for every stop_time
	feed.stop_times['arrival_time'] = feed.stop_times['departure_time']

cleaning  stl.zip
parent_station missing in stops. set to NaN in stl.zip


# get dates if day is used

In [12]:
# if dates is not provided as inputs. (we have a day)
# get it from first dates of each GTFS
# One date string ('YYYYMMDD') is appended to `dates` per feed, in feed order.
if len(dates) == 0:
	for feed in feeds:
		try:
			min_date = feed.calendar['start_date'].unique().min()
			max_date = feed.calendar['end_date'].unique().max()
		except Exception:
			# calendar is missing, empty or unusable: use calendar_dates instead.
			# (Exception rather than a bare except, so KeyboardInterrupt and
			# SystemExit are no longer swallowed.)
			service_dates = feed.calendar_dates['date'].unique()
			min_date = service_dates.min()
			max_date = service_dates.max()

		# every calendar day covered by the feed
		days = pd.date_range(min_date, max_date, freq='D').to_series()
		# keep the days falling on the requested weekday
		matching_days = days[days.dt.dayofweek == selected_day]
		if len(matching_days) > 0:
			# .iloc[0]: the Series is indexed by timestamps, so a plain [0] is a
			# deprecated positional lookup that would become a label lookup.
			# format  ex: ['20231011'] and append
			dates.append(matching_days.iloc[0].strftime('%Y%m%d'))
		else:
			print('date not available. use', min_date)
			dates.append(min_date)

# 2) restrict feeds to the selected date

In [144]:
print('restrict feed')
# Restrict each feed to its own date (dates[i]) and to the time window.
# Feeds left without any trip are dropped.
feeds_t = []
for i, feed in enumerate(feeds):
	feed_t = feed.restrict(dates=[dates[i]], time_range=time_range)
	if len(feed_t.trips) == 0:
		continue
	feeds_t.append(feed_t)

restrict feed


# 3) add shape_dist_traveled to shapes and stop_times

In [145]:
print('add shape_dist_traveled to shapes')
for feed in feeds_t:
	if feed.shapes is None:
		print('no shapes in gtfs')
		continue
	# (re)compute the distances when the column is absent or only partly filled
	if 'shape_dist_traveled' not in feed.shapes.columns or feed.shapes['shape_dist_traveled'].isnull().any():
		feed.append_dist_to_shapes()

print('add shape_dist_traveled to stop_times')
for i, feed in enumerate(feeds_t):
	if feed.shapes is None:
		print('no shapes in gtfs cannot add to stop_times')
		continue
	if 'shape_dist_traveled' not in feed.stop_times.columns:
		feed.append_dist_to_stop_times_fast()
	else:
		nan_sequence = feed.stop_times[feed.stop_times['shape_dist_traveled'].isnull()]['stop_sequence'].unique()
		# if the column is there and every NaN is at stop_sequence 1, just fill with 0.
		if all(seq == 1 for seq in nan_sequence):
			feed.stop_times['shape_dist_traveled'] = feed.stop_times['shape_dist_traveled'].fillna(0)
		else:
			feed.append_dist_to_stop_times_fast()

	# Heuristic: a maximum distance below 100 is taken to mean the feed is in
	# kilometres rather than metres.
	if feed.stop_times['shape_dist_traveled'].max() < 100:
		print('convert to meters')
		feed.dist_units = 'km'
		# Store the converted feed back into the list: rebinding the loop
		# variable alone would discard it and leave the unconverted feed
		# (now tagged 'km') in feeds_t.
		# NOTE(review): assumes gtk.convert_dist returns a feed of the same
		# class as its input -- confirm against the gtfs_kit version in use.
		feeds_t[i] = gtk.convert_dist(feed, new_dist_units='m')

add shape_dist_traveled to shapes
add shape_dist_traveled to stop_times


# 4) build links and nodes.

In [146]:
# Turn each restricted feed into a frequency-based feed and build its links and nodes.
feeds_frequencies = []
for i, feed_t in enumerate(feeds_t):
	# NOTE(review): feeds_t only holds the feeds that kept at least one trip,
	# so files[i] can name the wrong file once an earlier feed was dropped.
	print('Building links and nodes ', files[i])
	feed_s = feed_t.copy()
	feed_s.group_services()

	# stop clusters use a distance threshold of 50; patterns are built per cluster
	feed_s.build_stop_clusters(distance_threshold=50)
	feed_s.build_patterns(on='cluster_id')

	feed_frequencies = feed_s.convert_to_frequencies(time_range=time_range)
	# the shape-based options are only switched on when the feed has shapes
	has_shapes = feed_frequencies.shapes is not None
	build_options = dict(
		log=False,
		shape_dist_traveled=has_shapes,
		from_shape=has_shapes,
		stick_nodes_on_links=has_shapes,
		keep_origin_columns=['departure_time', 'pickup_type'],
		keep_destination_columns=['arrival_time', 'drop_off_type'],
		num_cores=num_cores,
	)
	feed_frequencies.build_links_and_nodes(**build_options)
	feeds_frequencies.append(feed_frequencies)

Building links and nodes  stl.zip


# 5) rename route_type.

In [147]:
# route_type.json is read from the current working directory; its keys are
# converted to int before being used to look up each link's route_type.
with open('route_type.json') as file:
	raw_mapping = json.load(file)
mapping = {int(code): label for code, label in raw_mapping.items()}

# route types removed from the links
retire = ['taxi']
for feed_frequencies in feeds_frequencies:
	# codes without an entry in the mapping become NaN (and are kept)
	labelled = feed_frequencies.links['route_type'].apply(lambda t: mapping.get(t, np.nan))
	feed_frequencies.links['route_type'] = labelled
	# assert not any(feed_frequencies.links['route_type'].isna())
	keep = ~feed_frequencies.links['route_type'].isin(retire)
	feed_frequencies.links = feed_frequencies.links[keep]


In [148]:
# Replace zero link times with 1.0 (speed is later computed as length / time).
for feed_frequencies in feeds_frequencies:
	zero_time = feed_frequencies.links['time'] == 0
	feed_frequencies.links.loc[zero_time, 'time'] = 1.0

# finish model

In [149]:
sm = stepmodel.StepModel(epsg=4326, coordinates_unit='meter')

# Reproject the links and nodes of every feed to EPSG:4326 before merging them.
links_concat = [feed_frequencies.links.to_crs(4326) for feed_frequencies in feeds_frequencies]
nodes_concat = [feed_frequencies.nodes.to_crs(4326) for feed_frequencies in feeds_frequencies]

# nothing to export. export empty geojson
if not links_concat:
	links = gpd.GeoDataFrame(columns=['feature'], geometry='feature', crs=4326)
	nodes = gpd.GeoDataFrame(columns=['feature'], geometry='feature', crs=4326)
	links.to_file(os.path.join(output_folder, 'links.geojson'), driver='GeoJSON')
	nodes.to_file(os.path.join(output_folder, 'nodes.geojson'), driver='GeoJSON')
	# undefined name: evaluating it raises NameError, which stops the run here
	end_of_notebook

In [None]:
# Assemble the final model: merge the links and nodes of every feed, normalise
# their columns and identifiers, compute length and speed, then write
# links.geojson and nodes.geojson to the output folder.

# Columns kept on the exported links, in this order.
columns = [
	'trip_id',
	'route_id',
	'agency_id',
	'direction_id',
	'a',
	'b',
	'shape_dist_traveled',
	'link_sequence',
	'time',
	'headway',
	'pickup_type',
	'drop_off_type',
	'route_short_name',
	'route_type',
	'route_color',
	'geometry',
]


sm.links = pd.concat(links_concat)
# Columns absent from the merged links are added as NaN so the selection below cannot fail.
for col in columns:
	if col not in sm.links.columns:
		sm.links[col] = np.nan

sm.links = sm.links[columns]
sm.nodes = pd.concat(nodes_concat)[['stop_id', 'stop_name', 'stop_code', 'geometry']]

# Replace the per-feed indexes with one continuous integer index.
sm.nodes = sm.nodes.reset_index(drop=True).sort_index()
sm.links = sm.links.reset_index(drop=True).sort_index()

# Nodes without a stop_code fall back to their stop_id.
sm.nodes.loc[sm.nodes['stop_code'].isna(), 'stop_code'] = sm.nodes.loc[sm.nodes['stop_code'].isna(), 'stop_id']
# NOTE(review): duplicates are dropped on stop_id alone, across all feeds, and
# stop_id is not prefixed with the agency (unlike trip_id/route_id below) --
# confirm that two feeds can never share a stop_id for different stops.
sm.nodes.drop_duplicates(subset=['stop_id'], inplace=True)

# Prefix trip and route ids with the agency id.
sm.links['trip_id'] = sm.links['agency_id'].astype(str) + '_' + sm.links['trip_id'].astype(str)
sm.links['route_id'] = sm.links['agency_id'].astype(str) + '_' + sm.links['route_id'].astype(str)

sm.links = sm.links.sort_values(['route_type', 'trip_id']).reset_index(drop=True)

# stop_id -> 'node_<n>' lookup, built from the node index before it is renamed;
# it must stay consistent with the index assignment on the next line.
dnodes = ('node_' + sm.nodes.reset_index().set_index('stop_id')['index'].astype(str)).to_dict()
sm.nodes.index = 'node_' + sm.nodes.index.astype(str)

sm.links.index = 'link_' + sm.links.index.astype(str)

# Rewrite the link endpoints from stop_id to node index; unknown stop_ids become None.
sm.links['a'] = sm.links['a'].apply(lambda a: dnodes.get(a))
sm.links['b'] = sm.links['b'].apply(lambda a: dnodes.get(a))

sm.links.drop_duplicates(subset=['trip_id', 'link_sequence'], inplace=True)

# Disabled: tag routes with only one trip in the time window by clearing their headway.
# Tag route with only one trip
# time_slot = np.diff([hhmmss_to_seconds_since_midnight(time) for time in time_range])[0]
# sm.links.loc[(time_slot/sm.links['headway']) < 2.0, 'headway'] = np.nan

sm.links = sm.links.to_crs(4326)
sm.nodes = sm.nodes.to_crs(4326)

# add speed, add length.
# The EPSG code is looked up from the first node's (y, x) coordinates and used
# only to measure link lengths; the stored geometries stay in EPSG:4326.
epsg = importer.get_epsg(sm.nodes.iloc[0]['geometry'].y, sm.nodes.iloc[0]['geometry'].x)
sm.links['length'] = sm.links.to_crs(epsg).length
# presumably metres / seconds converted to km/h by the 3.6 factor -- TODO confirm the units
sm.links['speed'] = (sm.links['length'] / sm.links['time']) * 3.6
# TODO: check quetzal_transit to see which values are required.

print('Saving')
sm.links.to_file(os.path.join(output_folder, 'links.geojson'), driver='GeoJSON')
sm.nodes.to_file(os.path.join(output_folder, 'nodes.geojson'), driver='GeoJSON')


Saving on S3


In [13]:
# Undefined name, presumably on purpose: evaluating it raises NameError, which
# stops a "Run All" here so the exploratory cells below only run by hand.
end_of_notebook

NameError: name 'end_of_notebook' is not defined

In [153]:
# Load two link exports for comparison.
# NOTE(review): the export cell above writes links.geojson; no visible cell
# writes links_am.geojson / links_pm.geojson -- presumably the outputs of an
# 'am' and a 'pm' run renamed by hand; confirm.
am = gpd.read_file(os.path.join(output_folder, 'links_am.geojson'))
pm = gpd.read_file(os.path.join(output_folder, 'links_pm.geojson'))

In [157]:
# Number of rows read from links_am.geojson.
len(am)

4797

In [159]:
# Number of rows read from links_pm.geojson.
len(pm)

4388

In [169]:
# Number of rows per trip_id in the AM file, counted on the `index` column.
trips_am = am.groupby('trip_id')[['index']].agg(len)

In [170]:
# Number of rows per trip_id in the PM file, counted on the `index` column.
trips_pm = pm.groupby('trip_id')[['index']].agg(len)

In [None]:
# Outer join of the per-trip counts on trip_id: `index_x` holds the AM count and
# `index_y` the PM count, NaN where the trip is absent from that file.
test = trips_am.merge(trips_pm, left_index=True, right_index=True, how='outer')

In [None]:
# Trips whose counts differ between the two files; NaN never compares equal,
# so trips present in only one file are listed too.
test[test['index_x'] != test['index_y']]

Unnamed: 0_level_0,index_x,index_y
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
STL_JUIN19151S_1,41.0,
STL_JUIN19151S_2,30.0,
STL_JUIN1920E_1,17.0,
STL_JUIN19252O_0,58.0,
STL_JUIN1927N_1,38.0,
STL_JUIN1933N_1,28.0,
STL_JUIN19360N_0,,32.0
STL_JUIN1936E_0,37.0,
STL_JUIN19402O_0,57.0,
STL_JUIN1940O_1,5.0,


In [183]:
# Display the per-trip row counts of the AM file.
trips_am

Unnamed: 0_level_0,index
trip_id,Unnamed: 1_level_1
STL_JUIN1912E_0,3
STL_JUIN1912O_0,9
STL_JUIN19144E_0,41
STL_JUIN19144O_0,42
STL_JUIN19151N_0,53
...,...
STL_JUIN19902S_0,23
STL_JUIN19903N_0,55
STL_JUIN19903S_0,56
STL_JUIN19925S_0,44
