In [1]:
import pandas as pd
import os

# Download and extract GTFS data from IDFM

In [2]:
# GTFS data (used for timetable)
STATIC_GTFS_URL="https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip"
STATIC_GTFS_FILE_PATH="raw_data/gtfs.zip"
STATIC_GTFS_PATH="raw_data/gtfs"

!mkdir -p $STATIC_GTFS_PATH
!wget $STATIC_GTFS_URL -O $STATIC_GTFS_FILE_PATH
!unzip -o $STATIC_GTFS_FILE_PATH -d $STATIC_GTFS_PATH

--2024-07-20 17:13:54--  https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip
Resolving eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)... 52.18.186.238
Connecting to eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)|52.18.186.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 87330559 (83M) [application/zip]
Saving to: ‘raw_data/gtfs.zip’

raw_data/gtfs.zip     1%[                    ]   1,47M   954KB/s               ^C
Archive:  raw_data/gtfs.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of raw_data/gtfs.zip or
        raw_data/gtfs.zip.zip, and cannot find raw_data/gtfs.zip.ZIP, period.


# Loading and parsing GTFS data

In [34]:
# Load calendar.txt: one row per service_id with a boolean flag per weekday
# and an integer start/end date, then index the table by service_id.
weekday_columns = ['monday', 'tuesday', 'wednesday', 'thursday',
                   'friday', 'saturday', 'sunday']
calendar_dtypes = {
    'service_id': 'string',
    **{day: 'bool' for day in weekday_columns},
    'start_date': 'int',
    'end_date': 'int',
}

calendar_path = os.path.join(STATIC_GTFS_PATH, 'calendar.txt')
calendar = pd.read_csv(calendar_path, dtype=calendar_dtypes).set_index('service_id')
calendar.head()

Unnamed: 0_level_0,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
service_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IDFM:1,True,True,True,True,True,False,False,20240719,20240816
IDFM:438,True,True,True,False,False,False,False,20240720,20240731
IDFM:552,False,False,True,False,False,False,False,20240724,20240731
IDFM:621,True,True,False,False,False,False,False,20240722,20240730
IDFM:792,True,True,False,True,True,False,False,20240801,20240816


In [35]:
# Load stops.txt. The listed columns are read as text, except the coordinates
# (float) and the wheelchair_boarding code (int).
stops_text_columns = ['stop_id', 'stop_code', 'stop_name', 'stop_desc',
                      'zone_id', 'stop_url', 'location_type', 'parent_station',
                      'stop_timezone', 'level_id', 'platform_code']
stops_dtypes = dict.fromkeys(stops_text_columns, 'string')
stops_dtypes.update(stop_lon='float', stop_lat='float', wheelchair_boarding='int')

stops_path = os.path.join(STATIC_GTFS_PATH, 'stops.txt')
stops = pd.read_csv(stops_path, dtype=stops_dtypes)
stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lon,stop_lat,zone_id,stop_url,location_type,parent_station,stop_timezone,level_id,wheelchair_boarding,platform_code
0,IDFM:monomodalStopPlace:46689,,Viroflay Rive Droite,,2.167655,48.805473,3,,0,IDFM:422420,,,1,
1,IDFM:monomodalStopPlace:46647,,Achères Ville,,2.077727,48.970644,5,,0,IDFM:73604,,,1,
2,IDFM:monomodalStopPlace:474069,,Lozère,,2.212718,48.705912,4,,0,IDFM:63029,,,1,
3,IDFM:monomodalStopPlace:474082,,Noisy-le-Grand - Mont d'Est,,2.550027,48.840885,4,,0,IDFM:412697,,,1,
4,IDFM:monomodalStopPlace:45102,,Châtelet les Halles,,2.346977,48.861745,1,,0,IDFM:474151,,,1,


In [36]:
# Load trips.txt with explicit dtypes: identifiers and labels as text,
# accessibility codes as integers.
trips_dtypes = {
    **dict.fromkeys(['route_id', 'service_id', 'trip_id', 'trip_headsign',
                     'trip_short_name', 'direction_id'], 'string'),
    'wheelchair_accessible': 'int',
    'bikes_allowed': 'int',
}
trips_path = os.path.join(STATIC_GTFS_PATH, 'trips.txt')
trips = pd.read_csv(trips_path, dtype=trips_dtypes)

# Short line id = last ':'-separated token of route_id
# (e.g. 'IDFM:C01527' -> 'C01527').
trips = trips.assign(route_short_id=trips['route_id'].str.split(':').str[-1])
trips.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,route_short_id
0,IDFM:C01527,IDFM:1,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,Saint-Exupéry,,0,,,1,0,C01527
1,IDFM:C01527,IDFM:1,IDFM:FSQY:145555-C01527-529-C01527-44713-7967922,Saint-Exupéry,,0,,,1,0,C01527
2,IDFM:C00031,IDFM:1,IDFM:N4_MOBILITES:152581-C00031-18976623,Gare d'Ozoir,,1,,,2,0,C00031
3,IDFM:C02630,IDFM:1,IDFM:Transdev_Nord_Seine_Saint-Denis:147254-C0...,Gare d'Aulnay-sous-Bois,,1,,,0,0,C02630
4,IDFM:C02630,IDFM:1,IDFM:Transdev_Nord_Seine_Saint-Denis:147254-C0...,Gare d'Aulnay-sous-Bois,,1,,,0,0,C02630


In [37]:
# Load stop_times.txt. Times are kept as text; only the sequence number and
# the pickup/drop-off codes are read as integers.
stop_times_dtypes = dict.fromkeys(
    ['trip_id', 'arrival_time', 'departure_time', 'stop_id',
     'local_zone_id', 'stop_headsign', 'timepoint'],
    'string',
)
stop_times_dtypes.update(stop_sequence='int', pickup_type='int', drop_off_type='int')

stop_times_path = os.path.join(STATIC_GTFS_PATH, 'stop_times.txt')
stop_times = pd.read_csv(stop_times_path, dtype=stop_times_dtypes)
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,local_zone_id,stop_headsign,timepoint
0,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:52:00,07:52:00,IDFM:21058,0,0,1,,,1
1,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:53:00,07:53:00,IDFM:4418,1,0,0,,,1
2,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:54:00,07:54:00,IDFM:4419,2,0,0,,,1
3,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:56:00,07:56:00,IDFM:20732,3,0,0,,,1
4,IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,07:57:00,07:57:00,IDFM:20733,4,0,0,,,1


# Get time table for relevant lines (metros, tramways, trains)
## Determine relevant lines

In [38]:
# Collect every distinct short line id found in the GTFS trips table
all_lines = set(trips['route_short_id'].unique())

# Optional restriction (currently disabled): keep only the lines that have a
# file under data/shortest_paths.
# computed_lines_path = os.path.join('data', 'shortest_paths')
# computed_lines = {x.split('.')[0] for x in os.listdir(computed_lines_path)}
# lines = computed_lines.intersection(all_lines)
# print(lines)

# No filtering for now: export all lines
lines = all_lines

## Enrich trips with calendar data
Keep the enriched dataframe in memory so that the later join is faster.

In [39]:
# Attach the calendar columns to each trip by matching on service_id (inner
# join: trips whose service_id is absent from calendar are dropped), then
# re-index the result by trip_id.
trips = (
    trips
    .set_index('service_id')
    .join(calendar, how='inner')
    .reset_index()
    .set_index('trip_id')
)
trips.head()

Unnamed: 0_level_0,service_id,route_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,route_short_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
IDFM:FSQY:145555-C01527-529-C01527-44713-7967928,IDFM:1,IDFM:C01527,Saint-Exupéry,,0,,,1,0,C01527,True,True,True,True,True,False,False,20240719,20240816
IDFM:FSQY:145555-C01527-529-C01527-44713-7967922,IDFM:1,IDFM:C01527,Saint-Exupéry,,0,,,1,0,C01527,True,True,True,True,True,False,False,20240719,20240816
IDFM:N4_MOBILITES:152581-C00031-18976623,IDFM:1,IDFM:C00031,Gare d'Ozoir,,1,,,2,0,C00031,True,True,True,True,True,False,False,20240719,20240816
IDFM:Transdev_Nord_Seine_Saint-Denis:147254-C02630-18625218,IDFM:1,IDFM:C02630,Gare d'Aulnay-sous-Bois,,1,,,0,0,C02630,True,True,True,True,True,False,False,20240719,20240816
IDFM:Transdev_Nord_Seine_Saint-Denis:147254-C02630-18625219,IDFM:1,IDFM:C02630,Gare d'Aulnay-sous-Bois,,1,,,0,0,C02630,True,True,True,True,True,False,False,20240719,20240816


In [40]:
# Store dataframe in memory
# NOTE(review): `.compute()` looks like a leftover from a lazy-dataframe
# (presumably Dask) version of this notebook; `trips` is built with pandas
# here, so this step is disabled — confirm before deleting the cell.
# trips = trips.compute()

In [41]:
# Summarise the enriched trips table: column dtypes, non-null counts and the
# memory footprint (memory_usage="deep" requests a deep memory introspection).
trips.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 456078 entries, IDFM:FSQY:145555-C01527-529-C01527-44713-7967928 to IDFM:TN:SNCF:92b17187-0598-4d05-b469-4b187f79f42b
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   service_id             456078 non-null  string 
 1   route_id               456078 non-null  string 
 2   trip_headsign          456078 non-null  string 
 3   trip_short_name        40887 non-null   string 
 4   direction_id           456078 non-null  string 
 5   block_id               0 non-null       float64
 6   shape_id               0 non-null       float64
 7   wheelchair_accessible  456078 non-null  int64  
 8   bikes_allowed          456078 non-null  int64  
 9   route_short_id         456078 non-null  object 
 10  monday                 456078 non-null  bool   
 11  tuesday                456078 non-null  bool   
 12  wednesday              456078 non-null  bool   
 13  thur

## Enrich time table with trip data
Join stop_times dataframe with trips dataframe.

In [42]:
# Disabled pre-filter: keep only the stop_times rows whose trip_id is present
# in the trips index.
# NOTE(review): `.compute()` presumably comes from an earlier Dask-based
# version — confirm before deleting the cell.
# trips_id = set(trips.index.values)
# stop_times = stop_times[stop_times['trip_id'].isin(trips_id)]
# stop_times = stop_times.compute()

In [43]:
# Add the trip and calendar columns to every stop time by matching on trip_id
# (inner join), then restore trip_id as a regular column.
stop_times = (
    stop_times
    .set_index('trip_id')
    .join(trips, how='inner', lsuffix='stop_times_', rsuffix='trips_')
    .reset_index()
)
stop_times.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9870167 entries, 0 to 9870166
Data columns (total 29 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trip_id                string 
 1   arrival_time           string 
 2   departure_time         string 
 3   stop_id                string 
 4   stop_sequence          int64  
 5   pickup_type            int64  
 6   drop_off_type          int64  
 7   local_zone_id          string 
 8   stop_headsign          string 
 9   timepoint              string 
 10  service_id             string 
 11  route_id               string 
 12  trip_headsign          string 
 13  trip_short_name        string 
 14  direction_id           string 
 15  block_id               float64
 16  shape_id               float64
 17  wheelchair_accessible  int64  
 18  bikes_allowed          int64  
 19  route_short_id         object 
 20  monday                 bool   
 21  tuesday                bool   
 22  wednesday         

# Export each line's timetable

In [45]:
# Write one parquet file per line under data/timetable, named after the short
# line id and containing that line's rows of the enriched stop_times table.

# Convert route_short_id to category type as further operation are much faster (x50!)
stop_times['route_short_id'] = stop_times['route_short_id'].astype("category")

# Create the output directory once, including the parent 'data' directory,
# and tolerate it already existing.
save_directory = os.path.join('data', 'timetable')
os.makedirs(save_directory, exist_ok=True)

for line in lines:
    print(f'Saving line {line}')

    # Select on the line id column. stop_times carries a plain integer index
    # after reset_index(), so comparing the index with a line id matches no
    # row and would export an empty table for every line.
    l_stop_times = stop_times[stop_times['route_short_id'] == line]

    l_stop_times.to_parquet(os.path.join(save_directory, line))
    # l_stop_times.to_csv(os.path.join(save_directory, line))

Saving line C01544
Saving line C01195
Saving line C00697
Saving line C01080
Saving line C01479
Saving line C00150
Saving line C01295
Saving line C00087
Saving line C02615
Saving line C00556
Saving line C01205
Saving line C01104
Saving line C01183
Saving line C00233
Saving line C01538
Saving line C00609
Saving line C00385
Saving line C01246
Saving line C01273
Saving line C01756
Saving line C00187
Saving line C00553
Saving line C02057
Saving line C01591
Saving line C00231
Saving line C01731
Saving line C02040
Saving line C01212
Saving line C01391
Saving line C02589
Saving line C01587
Saving line C00093
Saving line C02131
Saving line C01795
Saving line C00688
Saving line C01675
Saving line C01257
Saving line C01595
Saving line C01136
Saving line C02107
Saving line C01145
Saving line C00521
Saving line C02469
Saving line C00211
Saving line C01821
Saving line C01527
Saving line C02372
Saving line C00416
Saving line C01190
Saving line C00139
Saving line C02587
Saving line C01227
Saving line 

In [47]:
# Spot check: display the stop times of a single trip on line C01371.
on_line = stop_times['route_short_id'] == 'C01371'
on_trip = stop_times['trip_id'] == 'IDFM:RATP:146197-C01371-COU_RATP_5083931_1078872_11'
stop_times[on_line & on_trip]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,local_zone_id,stop_headsign,timepoint,...,route_short_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
14669,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:11:00,06:11:00,IDFM:22101,0,0,1,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14670,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:12:00,06:12:00,IDFM:463170,1,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14671,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:14:00,06:14:00,IDFM:463044,2,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14672,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:15:00,06:15:00,IDFM:22100,3,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14673,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:17:00,06:17:00,IDFM:463257,4,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14674,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:18:00,06:18:00,IDFM:463121,5,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14675,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:20:00,06:20:00,IDFM:22086,6,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14676,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:21:00,06:21:00,IDFM:22084,7,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14677,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:22:00,06:22:00,IDFM:22082,8,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
14678,IDFM:RATP:146197-C01371-COU_RATP_5083931_10788...,06:24:00,06:24:00,IDFM:22090,9,0,0,,,1,...,C01371,True,True,True,True,True,False,False,20240719,20240816
