In [1]:
import pandas as pd
import os

# Download and extract GTFS data from IDFM

In [2]:
# Static GTFS feed from IDFM (source of the timetable data)
STATIC_GTFS_URL = "https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip"
# Local path of the downloaded zip archive
STATIC_GTFS_FILE_PATH = "raw_data/gtfs.zip"
# Local directory the archive is extracted into
STATIC_GTFS_PATH = "raw_data/gtfs"

In [3]:
!mkdir -p $STATIC_GTFS_PATH
!wget $STATIC_GTFS_URL -O $STATIC_GTFS_FILE_PATH
!unzip -o $STATIC_GTFS_FILE_PATH -d $STATIC_GTFS_PATH

--2024-07-21 10:15:59--  https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip
Resolving eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)... 52.18.186.238
Connecting to eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)|52.18.186.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 86061887 (82M) [application/zip]
Saving to: ‘raw_data/gtfs.zip’


2024-07-21 10:16:02 (34,1 MB/s) - ‘raw_data/gtfs.zip’ saved [86061887/86061887]

Archive:  raw_data/gtfs.zip
  inflating: raw_data/gtfs/transfers.txt  
  inflating: raw_data/gtfs/ticketing_deep_links.txt  
  inflating: raw_data/gtfs/agency.txt  
  inflating: raw_data/gtfs/calendar_dates.txt  
  inflating: raw_data/gtfs/calendar.txt  
  inflating: raw_data/gtfs/stops.txt  
  inflating: raw_data/gtfs/trips.txt  
  inflating: raw_data/gtfs/routes.txt  
  inflating: raw_data/gtfs/stop_extensions.txt  
  inflating: raw_data/gtfs/stop_times.txt  
  inflating: raw_data/gtfs/pathways.txt  


# Loading and parsing GTFS data

In [37]:
# calendar.txt: one row per service_id, with one boolean flag per weekday and
# the start/end dates of the service read as integers.
calendar_path = os.path.join(STATIC_GTFS_PATH, 'calendar.txt')

calendar_dtypes = {'service_id': 'category', 'start_date': 'int', 'end_date': 'int'}
calendar_dtypes.update({
    day: 'bool'
    for day in ('monday', 'tuesday', 'wednesday', 'thursday',
                'friday', 'saturday', 'sunday')
})

# Index by service_id so the table can later be joined on that key
calendar = pd.read_csv(calendar_path, dtype=calendar_dtypes).set_index('service_id')
calendar.head()

Unnamed: 0_level_0,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
service_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IDFM:1,True,True,True,True,True,False,False,20240719,20240816
IDFM:438,True,True,True,False,False,False,False,20240720,20240731
IDFM:552,False,False,True,False,False,False,False,20240724,20240731
IDFM:621,True,True,False,False,False,False,False,20240722,20240730
IDFM:792,True,True,False,True,True,False,False,20240801,20240816


In [38]:
# stops.txt: read with explicit dtypes, index by stop_id and keep only the
# columns that are not listed in stops_unused_columns.
stops_path = os.path.join(STATIC_GTFS_PATH, 'stops.txt')

stops_dtypes = {
    'stop_id': 'string',
    'stop_code': 'string',
    'stop_name': 'string',
    'stop_desc': 'string',
    'stop_lon': 'float',
    'stop_lat': 'float',
    'zone_id': 'category',
    'stop_url': 'string',
    'location_type': 'category',
    'parent_station': 'string',
    'stop_timezone': 'category',
    'level_id': 'string',
    'wheelchair_boarding': 'category',
    'platform_code': 'category',
}
stops_unused_columns = ['stop_code', 'stop_desc', 'stop_url', 'location_type',
                        'parent_station', 'stop_timezone', 'level_id']

stops = (
    pd.read_csv(stops_path, dtype=stops_dtypes)
    .set_index('stop_id')
    .drop(columns=stops_unused_columns)
)
stops.head()

Unnamed: 0_level_0,stop_name,stop_lon,stop_lat,zone_id,wheelchair_boarding,platform_code
stop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDFM:monomodalStopPlace:46689,Viroflay Rive Droite,2.167655,48.805473,3,1,
IDFM:monomodalStopPlace:46647,Achères Ville,2.077727,48.970644,5,1,
IDFM:monomodalStopPlace:474069,Lozère,2.212718,48.705912,4,1,
IDFM:monomodalStopPlace:474082,Noisy-le-Grand - Mont d'Est,2.550027,48.840885,4,1,
IDFM:monomodalStopPlace:45102,Châtelet les Halles,2.346977,48.861745,1,1,


In [39]:
# trips.txt: read with explicit dtypes, drop block_id/shape_id and derive
# route_short_id, the last ':'-separated component of route_id
# (e.g. 'IDFM:C01527' -> 'C01527').
trips_path = os.path.join(STATIC_GTFS_PATH, 'trips.txt')

trips_dtypes = {
    'trip_id': 'string',
    'route_id': 'category',
    'service_id': 'category',
    'trip_headsign': 'string',
    'trip_short_name': 'string',
    'direction_id': 'string',
    'wheelchair_accessible': 'category',
    'bikes_allowed': 'category',
}

trips = (
    pd.read_csv(trips_path, dtype=trips_dtypes)
    .drop(columns=['block_id', 'shape_id'])
    .assign(route_short_id=lambda df: df['route_id'].str.split(':').str[-1])
)
trips.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,wheelchair_accessible,bikes_allowed,route_short_id
0,IDFM:C01527,IDFM:1,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,Saint-Exupéry,,0,1,0,C01527
1,IDFM:C01527,IDFM:1,IDFM:FSQY:145555-C01527-529-C01527-44713-7967922,Saint-Exupéry,,0,1,0,C01527
2,IDFM:C00031,IDFM:1,IDFM:N4_MOBILITES:152581-C00031-18976623,Gare d'Ozoir,,1,2,0,C00031
3,IDFM:C02630,IDFM:1,IDFM:Transdev_Nord_Seine_Saint-Denis:147254-C0...,Gare d'Aulnay-sous-Bois,,1,0,0,C02630
4,IDFM:C02630,IDFM:1,IDFM:Transdev_Nord_Seine_Saint-Denis:147254-C0...,Gare d'Aulnay-sous-Bois,,1,0,0,C02630


In [40]:
# stop_times.txt: read with explicit dtypes (arrival/departure times are kept
# as strings), then discard the columns listed in stop_times_unused_columns.
stop_times_path = os.path.join(STATIC_GTFS_PATH, 'stop_times.txt')

stop_times_dtypes = {
    'trip_id': 'string',
    'arrival_time': 'string',
    'departure_time': 'string',
    'stop_id': 'string',
    'stop_sequence': 'int',
    'pickup_type': 'category',
    'drop_off_type': 'category',
    'local_zone_id': 'category',
    'stop_headsign': 'string',
    'timepoint': 'string',
}
stop_times_unused_columns = ['timepoint', 'drop_off_type', 'pickup_type',
                             'stop_headsign', 'local_zone_id']

stop_times = (
    pd.read_csv(stop_times_path, dtype=stop_times_dtypes)
    .drop(columns=stop_times_unused_columns)
)
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,local_zone_id
0,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:52:00,07:52:00,IDFM:21058,0,
1,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:53:00,07:53:00,IDFM:4418,1,
2,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:54:00,07:54:00,IDFM:4419,2,
3,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:56:00,07:56:00,IDFM:20732,3,
4,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:57:00,07:57:00,IDFM:20733,4,


# Get time table for relevant lines (metros, tramways, trains)
## Determine relevant lines

In [41]:
# Every distinct line identifier (route_short_id) found in the GTFS trips
all_lines = set(trips['route_short_id'])

# Optional filter (currently disabled): restrict the export to the lines
# that already have a file under data/shortest_paths.
# computed_lines_path = os.path.join('data', 'shortest_paths')
# computed_lines = {x.split('.')[0] for x in os.listdir(computed_lines_path)}
# lines = computed_lines.intersection(all_lines)
# print(lines)

# No filtering for now: process every line
lines = all_lines

## Enrich trips with calendar data

In [42]:
# Attach the calendar columns to each trip through its service_id, then index
# the result by trip_id. The inner join keeps only the trips whose service_id
# is present in calendar.
trips = (
    trips.set_index('service_id')
    .join(calendar, how='inner')
    .reset_index()
    .set_index('trip_id')
)
trips.head()

Unnamed: 0_level_0,service_id,route_id,trip_headsign,trip_short_name,direction_id,wheelchair_accessible,bikes_allowed,route_short_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,IDFM:1,IDFM:C01527,Saint-Exupéry,,0,1,0,C01527,True,True,True,True,True,False,False,20240719,20240816
IDFM:FSQY:145555-C01527-529-C01527-44713-7967922,IDFM:1,IDFM:C01527,Saint-Exupéry,,0,1,0,C01527,True,True,True,True,True,False,False,20240719,20240816
IDFM:N4_MOBILITES:152581-C00031-18976623,IDFM:1,IDFM:C00031,Gare d'Ozoir,,1,2,0,C00031,True,True,True,True,True,False,False,20240719,20240816
IDFM:Transdev_Nord_Seine_Saint-Denis:147254-C02630-18625218,IDFM:1,IDFM:C02630,Gare d'Aulnay-sous-Bois,,1,0,0,C02630,True,True,True,True,True,False,False,20240719,20240816
IDFM:Transdev_Nord_Seine_Saint-Denis:147254-C02630-18625219,IDFM:1,IDFM:C02630,Gare d'Aulnay-sous-Bois,,1,0,0,C02630,True,True,True,True,True,False,False,20240719,20240816


In [43]:
# Column dtypes and deep (actual) memory usage of the enriched trips table
trips.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 456022 entries, IDFM:FSQY:145555-C01527-529-C01527-44713-7967928 to IDFM:TN:SNCF:65b6e232-70d5-46bb-8c88-300718df9e22
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   service_id             456022 non-null  object  
 1   route_id               456022 non-null  category
 2   trip_headsign          456022 non-null  string  
 3   trip_short_name        40831 non-null   string  
 4   direction_id           456022 non-null  string  
 5   wheelchair_accessible  456022 non-null  category
 6   bikes_allowed          456022 non-null  category
 7   route_short_id         456022 non-null  object  
 8   monday                 456022 non-null  bool    
 9   tuesday                456022 non-null  bool    
 10  wednesday              456022 non-null  bool    
 11  thursday               456022 non-null  bool    
 12  friday                 456022 non-null  boo

## Enrich time table with stops data
Join stop_times dataframe with stops dataframe.

In [44]:
# Add the stop attributes to every stop time by joining on stop_id; the inner
# join keeps only the rows whose stop_id exists in stops. The suffixes are
# applied by pandas only to column names present on both sides.
stop_times = (
    stop_times.set_index('stop_id')
    .join(stops, how='inner', lsuffix='stop_times_', rsuffix='stop_')
    .reset_index()
)
stop_times.head()

Unnamed: 0,stop_id,trip_id,arrival_time,departure_time,stop_sequence,local_zone_id,stop_name,stop_lon,stop_lat,zone_id,wheelchair_boarding,platform_code
0,IDFM:21058,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:52:00,07:52:00,0,,Trappes Gare,2.005466,48.774304,5,1,
1,IDFM:4418,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:53:00,07:53:00,1,,Observatoire,2.010532,48.772682,5,1,
2,IDFM:4419,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:54:00,07:54:00,2,,Le Buisson de la Couldre,2.014157,48.77166,5,1,
3,IDFM:20732,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:56:00,07:56:00,3,,Carnac,2.017859,48.769451,5,1,
4,IDFM:20733,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:57:00,07:57:00,4,,Jean Goujon,2.017815,48.766023,5,1,


## Enrich time table with trip data
Join stop_times dataframe with trips dataframe.

In [45]:
# Add the trip (and calendar) attributes to every stop time by joining on
# trip_id; the inner join keeps only the rows whose trip_id exists in trips.
# The suffixes are applied by pandas only to column names present on both sides.
stop_times = (
    stop_times.set_index('trip_id')
    .join(trips, how='inner', lsuffix='stop_times_', rsuffix='trips_')
    .reset_index()
)
stop_times.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9869181 entries, 0 to 9869180
Data columns (total 29 columns):
 #   Column                 Dtype   
---  ------                 -----   
 0   trip_id                string  
 1   stop_id                string  
 2   arrival_time           string  
 3   departure_time         string  
 4   stop_sequence          int64   
 5   local_zone_id          category
 6   stop_name              string  
 7   stop_lon               float64 
 8   stop_lat               float64 
 9   zone_id                category
 10  wheelchair_boarding    category
 11  platform_code          category
 12  service_id             object  
 13  route_id               category
 14  trip_headsign          string  
 15  trip_short_name        string  
 16  direction_id           string  
 17  wheelchair_accessible  category
 18  bikes_allowed          category
 19  route_short_id         object  
 20  monday                 bool    
 21  tuesday                bool    

# Export the timetable of each line

In [48]:
# Convert route_short_id to a category dtype: the per-line comparisons below
# are much faster on categories (about a x50 gain according to the original note).
stop_times['route_short_id'] = stop_times['route_short_id'].astype("category")

# Create the output directory once, before the loop. makedirs also creates the
# missing 'data' parent and does not fail when the directory already exists.
save_directory = os.path.join('data', 'timetable')
os.makedirs(save_directory, exist_ok=True)

for line in lines:
    print(f'Saving line {line}')

    # Filter on the column instead of re-indexing the global stop_times frame:
    # route_short_id stays available as a column for the other cells, and this
    # cell can be re-run. Only the per-line slice is indexed by route_short_id,
    # so the exported files keep route_short_id as their index.
    l_stop_times = (
        stop_times[stop_times['route_short_id'] == line]
        .set_index('route_short_id')
    )

    # Parquet is more optimized than csv for disk usage and loading time
    l_stop_times.to_parquet(os.path.join(save_directory, f'{line}.parquet'))

Saving line C01244
Saving line C01837
Saving line C01520
Saving line C00092
Saving line C01642
Saving line C01044
Saving line C02085
Saving line C00922
Saving line C00651
Saving line C01852
Saving line C01542
Saving line C02633
Saving line C00941
Saving line C01263
Saving line C02673
Saving line C01188
Saving line C00763
Saving line C02152
Saving line C01241
Saving line C01185
Saving line C01613
Saving line C00336
Saving line C00267
Saving line C02258
Saving line C01505
Saving line C00793
Saving line C00528
Saving line C00165
Saving line C00774
Saving line C00663
Saving line C00666
Saving line C01629
Saving line C01152
Saving line C02651
Saving line C00452
Saving line C00198
Saving line C01755
Saving line C01160
Saving line C02709
Saving line C01397
Saving line C01288
Saving line C01083
Saving line C00050
Saving line C02249
Saving line C00580
Saving line C01442
Saving line C01043
Saving line C00159
Saving line C00747
Saving line C00609
Saving line C00696
Saving line C01311
Saving line 

In [47]:
# Spot check: display the stop times of one trip of line C01371
is_checked_line = stop_times['route_short_id'] == 'C01371'
is_checked_trip = stop_times['trip_id'] == 'IDFM:RATP:146197-C01371-COU_RATP_5083931_1078872_11'
stop_times[is_checked_line & is_checked_trip]

Unnamed: 0,trip_id,stop_id,arrival_time,departure_time,stop_sequence,local_zone_id,stop_name,stop_lon,stop_lat,zone_id,...,route_short_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
14669,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:22101,06:11:00,06:11:00,0,,La Défense (Grande Arche),2.237988,48.891818,2,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14670,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:463170,06:12:00,06:12:00,1,,Esplanade de la Défense,2.249931,48.88835,2,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14671,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:463044,06:14:00,06:14:00,2,,Pont de Neuilly,2.258523,48.885499,2,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14672,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:22100,06:15:00,06:15:00,3,,Les Sablons,2.271905,48.881291,2,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14673,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:463257,06:17:00,06:17:00,4,,Porte Maillot,2.28229,48.877806,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14674,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:463121,06:18:00,06:18:00,5,,Argentine,2.289435,48.875667,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14675,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:22086,06:20:00,06:20:00,6,,Charles de Gaulle - Etoile,2.295117,48.873929,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14676,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:22084,06:21:00,06:21:00,7,,George V,2.30076,48.872038,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14677,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:22082,06:22:00,06:22:00,8,,Franklin D. Roosevelt,2.30936,48.869165,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14678,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,IDFM:22090,06:24:00,06:24:00,9,,Champs-Élysées - Clemenceau,2.314397,48.867584,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
