In [95]:
import dask.dataframe as dd
import pandas as pd
import json
import os

# Download and extract GTFS data from IDFM

In [96]:
# GTFS data (used for timetable)
STATIC_GTFS_URL="https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip"
STATIC_GTFS_FILE_PATH="raw_data/gtfs.zip"
STATIC_GTFS_PATH="raw_data/gtfs"

!mkdir -p $STATIC_GTFS_PATH
!wget $STATIC_GTFS_URL -O $STATIC_GTFS_FILE_PATH
!unzip -o $STATIC_GTFS_FILE_PATH -d $STATIC_GTFS_PATH

--2023-12-15 21:57:28--  https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip
Resolving eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)... 52.18.186.238
Connecting to eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)|52.18.186.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 89473601 (85M) [application/zip]
Saving to: ‘raw_data/gtfs.zip’


2023-12-15 21:57:31 (33,9 MB/s) - ‘raw_data/gtfs.zip’ saved [89473601/89473601]

Archive:  raw_data/gtfs.zip
  inflating: raw_data/gtfs/transfers.txt  
  inflating: raw_data/gtfs/agency.txt  
  inflating: raw_data/gtfs/calendar_dates.txt  
  inflating: raw_data/gtfs/calendar.txt  
  inflating: raw_data/gtfs/stops.txt  
  inflating: raw_data/gtfs/trips.txt  
  inflating: raw_data/gtfs/routes.txt  
  inflating: raw_data/gtfs/stop_extensions.txt  
  inflating: raw_data/gtfs/stop_times.txt  
  inflating: raw_data/gtfs/pathways.txt  


# Loading and parsing GTFS data

In [111]:
calendar_path = os.path.join(STATIC_GTFS_PATH, 'calendar.txt')
calendar = pd.read_csv(calendar_path, dtype={'service_id': 'string',
                                             'monday': 'bool',
                                             'tuesday': 'bool',
                                             'wednesday': 'bool',
                                             'thursday': 'bool',
                                             'friday': 'bool',
                                             'saturday': 'bool',
                                             'sunday': 'bool',
                                             'start_date': 'int',
                                             'end_date': 'int'
                                             })
calendar = calendar.set_index('service_id')
calendar.head()

Unnamed: 0_level_0,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
service_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IDFM:TN:1,False,False,False,False,False,True,False,20231216,20231216
IDFM:TN:10,False,True,True,True,True,True,True,20231217,20231231
IDFM:TN:11,False,False,False,False,False,True,True,20231217,20231231
IDFM:TN:100,True,True,True,True,True,False,False,20240108,20240112
IDFM:TN:211,True,True,False,True,True,False,False,20231218,20240112


In [112]:
stops_path = os.path.join(STATIC_GTFS_PATH, 'stops.txt')
stops = pd.read_csv(stops_path, dtype={'stop_id': 'string',
                                       'stop_code': 'string',
                                       'stop_name': 'string',
                                       'stop_desc': 'string',
                                       'stop_lon': 'float',
                                       'stop_lat': 'float',
                                       'zone_id': 'string',
                                       'stop_url': 'string',
                                       'location_type': 'string',
                                       'parent_station': 'string',
                                       'stop_timezone': 'string',
                                       'level_id': 'string',
                                       'wheelchair_boarding': 'int',
                                       'platform_code': 'string', })
stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lon,stop_lat,zone_id,stop_url,location_type,parent_station,stop_timezone,level_id,wheelchair_boarding,platform_code
0,IDFM:monomodalStopPlace:47052,,Courcelle-sur-Yvette,,2.097722,48.701314,5,,0,IDFM:62951,,,1,
1,IDFM:monomodalStopPlace:43246,,La Ferté-sous-Jouarre,,3.124705,48.950782,5,,0,IDFM:68918,,,0,
2,IDFM:monomodalStopPlace:47187,,Verneuil-l'Étang,,2.825084,48.644389,5,,0,IDFM:62471,,,0,
3,IDFM:monomodalStopPlace:43228,,Les Baconnets,,2.288108,48.739846,4,,0,IDFM:69622,,,1,
4,IDFM:monomodalStopPlace:47955,,Saint-Ouen-l'Aumône Quartier de l'Église,,2.103469,49.039094,5,,0,IDFM:66731,,,0,


In [113]:
trips_path = os.path.join(STATIC_GTFS_PATH, 'trips.txt')
trips = dd.read_csv(trips_path, dtype={'route_id': 'string',
                                       'service_id': 'string',
                                       'trip_id': 'string',
                                       'trip_headsign': 'string',
                                       'trip_short_name': 'string',
                                       'direction_id': 'string',
                                       'wheelchair_accessible': 'int',
                                       'bikes_allowed': 'int'})

trips['route_short_id'] = trips['route_id'].apply(lambda x: x.split(':')[-1],
                                                  meta=('route_id', 'string'))
trips.head()                                                  

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,route_short_id
0,IDFM:C01737,IDFM:TN:0006ccf7-fe69-3b7b-9b31-56a2c10b5b54,IDFM:TN:SNCF:00cdb671-9318-4654-82d6-0866849b124e,TOLI,121056,1,,,1,0,C01737
1,IDFM:C01737,IDFM:TN:0006ccf7-fe69-3b7b-9b31-56a2c10b5b54,IDFM:TN:SNCF:ceee40b6-b882-4c49-9b24-cff88762e073,TOLI,121050,1,,,1,0,C01737
2,IDFM:C01737,IDFM:TN:0006ccf7-fe69-3b7b-9b31-56a2c10b5b54,IDFM:TN:SNCF:2144ff39-35ce-4e49-8e8d-580f8b49d41b,TOLI,121060,1,,,1,0,C01737
3,IDFM:C01737,IDFM:TN:0006ccf7-fe69-3b7b-9b31-56a2c10b5b54,IDFM:TN:SNCF:5d7b824c-8657-43a1-a581-4e57ba146d4e,TOLI,121044,1,,,1,0,C01737
4,IDFM:C01729,IDFM:TN:0009737c-6586-3940-ae3f-f0b0da09a336,IDFM:TN:SNCF:17a5c477-1dd4-4d62-a2a0-7543fce8f213,TAVA,118405,1,,,1,0,C01729


In [114]:
stop_times_path = os.path.join(STATIC_GTFS_PATH, 'stop_times.txt')
stop_times = dd.read_csv(stop_times_path, dtype={'trip_id': 'string',
                                                 'arrival_time': 'string',
                                                 'departure_time': 'string',
                                                 'stop_id': 'string',
                                                 'stop_sequence': 'int',
                                                 'pickup_type': 'int',
                                                 'drop_off_type': 'int',
                                                 'local_zone_id': 'string',
                                                 'stop_headsign': 'string',
                                                 'timepoint': 'string'})
stop_times.head()                                                 

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,local_zone_id,stop_headsign,timepoint
0,IDFM:TN:SNCF:00cdb671-9318-4654-82d6-0866849b124e,20:20:00,20:20:00,IDFM:monomodalStopPlace:411440,1,0,1,,,1
1,IDFM:TN:SNCF:00cdb671-9318-4654-82d6-0866849b124e,20:26:30,20:27:30,IDFM:monomodalStopPlace:411329,2,0,0,,,1
2,IDFM:TN:SNCF:00cdb671-9318-4654-82d6-0866849b124e,20:30:20,20:31:20,IDFM:monomodalStopPlace:411326,3,0,0,,,1
3,IDFM:TN:SNCF:00cdb671-9318-4654-82d6-0866849b124e,20:35:40,20:36:40,IDFM:monomodalStopPlace:411332,4,0,0,,,1
4,IDFM:TN:SNCF:00cdb671-9318-4654-82d6-0866849b124e,20:39:40,20:40:40,IDFM:monomodalStopPlace:47533,5,0,0,,,1


# Get time table for relevant lines (metros, tramways, trains)
## Determine relevant lines

In [115]:
# Get all lines from GTFS
all_lines = set(trips['route_short_id'].values.compute())

# Get list of lines with shortest_paths computed
computed_lines_path = os.path.join('data', 'shortest_paths')
computed_lines = {x.split('.')[0] for x in os.listdir(computed_lines_path)}

# Get relevant lines
lines = computed_lines.intersection(all_lines)
print(lines)



{'C01740', 'C01679', 'C01730', 'C01386', 'C01391', 'C01741', 'C01378', 'C01377', 'C01383', 'C01376', 'C01387', 'C01381', 'C01999', 'C01373', 'C02528', 'C01375', 'C01743', 'C01382', 'C01731', 'C01684', 'C01389', 'C01379', 'C01738', 'C01739', 'C01729', 'C01728', 'C01374', 'C01742', 'C01774', 'C01390', 'C01372', 'C01736', 'C01795', 'C01371', 'C01384', 'C01727', 'C01380', 'C01794', 'C02344', 'C01843', 'C02317', 'C01737'}


## Enrich trips with calendar data
Store dataframe in memory for faster join after.

In [116]:
trips = trips[trips['route_short_id'].isin(lines)]
trips = trips.set_index('service_id')
trips = trips.join(calendar, how='inner')
trips = trips.reset_index().set_index('trip_id')
trips.head()

Unnamed: 0_level_0,service_id,route_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,route_short_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
IDFM:KT9:101150-C02317-100-783,IDFM:10068,IDFM:C02317,Porte de Choisy,,0,,,1,0,C02317,False,True,True,True,True,True,False,20231215,20240113
IDFM:KT9:101150-C02317-1003-800,IDFM:100116,IDFM:C02317,Orly - Gaston Viens,,1,,,1,0,C02317,False,False,False,False,False,True,False,20231216,20240113
IDFM:KT9:101150-C02317-1012-800,IDFM:100116,IDFM:C02317,Porte de Choisy,,0,,,1,0,C02317,False,False,False,False,False,True,False,20231216,20240113
IDFM:KT9:101150-C02317-1023-800,IDFM:100116,IDFM:C02317,Porte de Choisy,,0,,,1,0,C02317,False,False,False,False,False,True,False,20231216,20240113
IDFM:KT9:101150-C02317-1026-800,IDFM:100116,IDFM:C02317,Orly - Gaston Viens,,1,,,1,0,C02317,False,False,False,False,False,True,False,20231216,20240113


In [117]:
# Store dataframe in memory
trips = trips.compute()

In [118]:
trips.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 96654 entries, IDFM:KT9:101150-C02317-100-783 to IDFM:TN:SNCF:ffffb14c-aa9f-438c-915e-1259e1b53e85
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   service_id             96654 non-null  string 
 1   route_id               96654 non-null  string 
 2   trip_headsign          96654 non-null  string 
 3   trip_short_name        28838 non-null  string 
 4   direction_id           96654 non-null  string 
 5   block_id               0 non-null      float64
 6   shape_id               0 non-null      float64
 7   wheelchair_accessible  96654 non-null  int64  
 8   bikes_allowed          96654 non-null  int64  
 9   route_short_id         96654 non-null  object 
 10  monday                 96654 non-null  bool   
 11  tuesday                96654 non-null  bool   
 12  wednesday              96654 non-null  bool   
 13  thursday               96654 non-null 

## Enrich time table with trip data
Join stop_times dataframe with trips dataframe.

In [119]:
trips_id = set(trips.index.values)
stop_times = stop_times[stop_times['trip_id'].isin(trips_id)]
stop_times = stop_times.compute()

In [120]:
stop_times = stop_times.set_index('trip_id').join(trips, how='inner',
                                                  lsuffix='stop_times_',
                                                  rsuffix='trips_')
stop_times = stop_times.reset_index()
stop_times.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2047890 entries, 0 to 2047889
Data columns (total 29 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trip_id                string 
 1   arrival_time           string 
 2   departure_time         string 
 3   stop_id                string 
 4   stop_sequence          int64  
 5   pickup_type            int64  
 6   drop_off_type          int64  
 7   local_zone_id          string 
 8   stop_headsign          string 
 9   timepoint              string 
 10  service_id             string 
 11  route_id               string 
 12  trip_headsign          string 
 13  trip_short_name        string 
 14  direction_id           string 
 15  block_id               float64
 16  shape_id               float64
 17  wheelchair_accessible  int64  
 18  bikes_allowed          int64  
 19  route_short_id         object 
 20  monday                 bool   
 21  tuesday                bool   
 22  wednesday         

## Saving data for each line
Parquet is faster to load/write than csv.

In [121]:
for line in lines:
    print(f'Saving line {line}')
    l_stop_times = stop_times[stop_times['route_short_id'] == line]
    save_directory = os.path.join('data', 'timetable')
    if not os.path.exists(save_directory):
        os.mkdir(save_directory)
    l_stop_times.to_parquet(os.path.join(save_directory, line))

Saving line C01740
Saving line C01679
Saving line C01730
Saving line C01386
Saving line C01391
Saving line C01741
Saving line C01378
Saving line C01377
Saving line C01383
Saving line C01376
Saving line C01387
Saving line C01381
Saving line C01999
Saving line C01373
Saving line C02528
Saving line C01375
Saving line C01743
Saving line C01382
Saving line C01731
Saving line C01684
Saving line C01389
Saving line C01379
Saving line C01738
Saving line C01739
Saving line C01729
Saving line C01728
Saving line C01374
Saving line C01742
Saving line C01774
Saving line C01390
Saving line C01372
Saving line C01736
Saving line C01795
Saving line C01371
Saving line C01384
Saving line C01727
Saving line C01380
Saving line C01794
Saving line C02344
Saving line C01843
Saving line C02317
Saving line C01737
