In [27]:
import dask.dataframe as dd
import pandas as pd
import json
import os

# Download and extract GTFS data from IDFM

In [28]:
# GTFS data (used for timetable)
STATIC_GTFS_URL="https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip"
STATIC_GTFS_FILE_PATH="raw_data/gtfs.zip"
STATIC_GTFS_PATH="raw_data/gtfs"

!mkdir -p $STATIC_GTFS_PATH
!wget $STATIC_GTFS_URL -O $STATIC_GTFS_FILE_PATH
!unzip -o $STATIC_GTFS_FILE_PATH -d $STATIC_GTFS_PATH

--2024-03-13 20:15:31--  https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip
Resolving eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)... 52.18.186.238
Connecting to eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)|52.18.186.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 79522672 (76M) [application/zip]
Saving to: ‘raw_data/gtfs.zip’


2024-03-13 20:15:34 (33,6 MB/s) - ‘raw_data/gtfs.zip’ saved [79522672/79522672]

Archive:  raw_data/gtfs.zip
  inflating: raw_data/gtfs/transfers.txt  
  inflating: raw_data/gtfs/agency.txt  
  inflating: raw_data/gtfs/calendar_dates.txt  
  inflating: raw_data/gtfs/calendar.txt  
  inflating: raw_data/gtfs/stops.txt  
  inflating: raw_data/gtfs/trips.txt  
  inflating: raw_data/gtfs/routes.txt  
  inflating: raw_data/gtfs/stop_extensions.txt  
  inflating: raw_data/gtfs/stop_times.txt  
  inflating: raw_data/gtfs/pathways.txt  


# Loading and parsing GTFS data

In [29]:
calendar_path = os.path.join(STATIC_GTFS_PATH, 'calendar.txt')
calendar = pd.read_csv(calendar_path, dtype={'service_id': 'string',
                                             'monday': 'bool',
                                             'tuesday': 'bool',
                                             'wednesday': 'bool',
                                             'thursday': 'bool',
                                             'friday': 'bool',
                                             'saturday': 'bool',
                                             'sunday': 'bool',
                                             'start_date': 'int',
                                             'end_date': 'int'
                                             })
calendar = calendar.set_index('service_id')
calendar.head()

Unnamed: 0_level_0,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
service_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IDFM:1,False,True,True,True,True,False,False,20240313,20240405
IDFM:10,False,True,False,True,True,False,False,20240314,20240405
IDFM:894,False,False,False,True,False,False,True,20240314,20240411
IDFM:959,True,True,True,False,True,True,False,20240313,20240410
IDFM:1000,True,True,True,True,True,False,False,20240313,20240411


In [30]:
stops_path = os.path.join(STATIC_GTFS_PATH, 'stops.txt')
stops = pd.read_csv(stops_path, dtype={'stop_id': 'string',
                                       'stop_code': 'string',
                                       'stop_name': 'string',
                                       'stop_desc': 'string',
                                       'stop_lon': 'float',
                                       'stop_lat': 'float',
                                       'zone_id': 'string',
                                       'stop_url': 'string',
                                       'location_type': 'string',
                                       'parent_station': 'string',
                                       'stop_timezone': 'string',
                                       'level_id': 'string',
                                       'wheelchair_boarding': 'int',
                                       'platform_code': 'string', })
stops.head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lon,stop_lat,zone_id,stop_url,location_type,parent_station,stop_timezone,level_id,wheelchair_boarding,platform_code
0,IDFM:2921,,Gare de Breuillet Village,,2.171832,48.564763,4,,0,IDFM:59940,Europe/Paris,,1,
1,IDFM:478605,,Charbonneau,,2.285846,48.55871,5,,0,IDFM:478604,Europe/Paris,,2,
2,IDFM:7559,,Temple,,2.245499,48.948594,4,,0,IDFM:65073,Europe/Paris,,0,
3,IDFM:6445,,Rue Albert Bertin,,3.417994,48.784692,5,,0,IDFM:67789,Europe/Paris,,2,
4,IDFM:31666,,Les Chaises,,1.692453,48.650324,5,,0,IDFM:60759,Europe/Paris,,2,


In [31]:
trips_path = os.path.join(STATIC_GTFS_PATH, 'trips.txt')
trips = dd.read_csv(trips_path, dtype={'route_id': 'string',
                                       'service_id': 'string',
                                       'trip_id': 'string',
                                       'trip_headsign': 'string',
                                       'trip_short_name': 'string',
                                       'direction_id': 'string',
                                       'wheelchair_accessible': 'int',
                                       'bikes_allowed': 'int'})

trips['route_short_id'] = trips['route_id'].apply(lambda x: x.split(':')[-1],
                                                  meta=('route_id', 'string'))
trips.head()                                                  

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,route_short_id
0,IDFM:C00085,IDFM:1,IDFM:TRANSDEV_COTEAUX_DE_LA_MARNE:133170-C0008...,Gare de La Varenne Chennevières,,1,,,1,0,C00085
1,IDFM:C01310,IDFM:1,IDFM:RATP:125959-C01310-COU_RATP_5089931_31491...,Le Vert de Maison RER,,0,,,0,0,C01310
2,IDFM:C01172,IDFM:1,IDFM:RATP:119669-C01172-COU_RATP_5086980_31081...,Avenue Ronsard,,1,,,1,0,C01172
3,IDFM:C01172,IDFM:1,IDFM:RATP:119669-C01172-COU_RATP_5086980_31081...,Avenue Ronsard,,1,,,1,0,C01172
4,IDFM:C01172,IDFM:1,IDFM:RATP:119669-C01172-COU_RATP_5086980_31081...,Avenue Ronsard,,1,,,1,0,C01172


In [32]:
stop_times_path = os.path.join(STATIC_GTFS_PATH, 'stop_times.txt')
stop_times = dd.read_csv(stop_times_path, dtype={'trip_id': 'string',
                                                 'arrival_time': 'string',
                                                 'departure_time': 'string',
                                                 'stop_id': 'string',
                                                 'stop_sequence': 'int',
                                                 'pickup_type': 'int',
                                                 'drop_off_type': 'int',
                                                 'local_zone_id': 'string',
                                                 'stop_headsign': 'string',
                                                 'timepoint': 'string'})
stop_times.head()                                                 

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,local_zone_id,stop_headsign,timepoint
0,IDFM:TRANSDEV_COTEAUX_DE_LA_MARNE:133170-C0008...,09:43:00,09:43:00,IDFM:427801,0,0,1,,,1
1,IDFM:TRANSDEV_COTEAUX_DE_LA_MARNE:133170-C0008...,09:44:00,09:44:00,IDFM:421128,1,0,0,,,1
2,IDFM:TRANSDEV_COTEAUX_DE_LA_MARNE:133170-C0008...,09:45:00,09:45:00,IDFM:421180,2,0,0,,,1
3,IDFM:TRANSDEV_COTEAUX_DE_LA_MARNE:133170-C0008...,09:46:00,09:46:00,IDFM:419110,3,0,0,,,1
4,IDFM:TRANSDEV_COTEAUX_DE_LA_MARNE:133170-C0008...,09:47:00,09:47:00,IDFM:421158,4,0,0,,,1


# Get time table for relevant lines (metros, tramways, trains)
## Determine relevant lines

In [33]:
# Get all lines from GTFS
all_lines = set(trips['route_short_id'].values.compute())

# Get list of lines with shortest_paths computed
computed_lines_path = os.path.join('data', 'shortest_paths')
computed_lines = {x.split('.')[0] for x in os.listdir(computed_lines_path)}

# Get relevant lines
lines = computed_lines.intersection(all_lines)
print(lines)



{'C01389', 'C01377', 'C01795', 'C01727', 'C01382', 'C01376', 'C01378', 'C01741', 'C01729', 'C01381', 'C01999', 'C02317', 'C01375', 'C01373', 'C01379', 'C01387', 'C01679', 'C01391', 'C01739', 'C01371', 'C01742', 'C01737', 'C02528', 'C01794', 'C02529', 'C01774', 'C01390', 'C01374', 'C01684', 'C01372', 'C01728', 'C01740', 'C02344', 'C01738', 'C01383', 'C01736', 'C01731', 'C01380', 'C01843', 'C01386', 'C01743', 'C01384', 'C01730'}


## Enrich trips with calendar data
Store dataframe in memory for faster join after.

In [34]:
trips = trips[trips['route_short_id'].isin(lines)]
trips = trips.set_index('service_id')
trips = trips.join(calendar, how='inner')
trips = trips.reset_index().set_index('trip_id')
trips.head()

Unnamed: 0_level_0,service_id,route_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,route_short_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
IDFM:KT9:132296-C02317-100-783,IDFM:107205,IDFM:C02317,Porte de Choisy,,0,,,1,0,C02317,True,True,True,True,True,True,False,20240313,20240411
IDFM:KT9:132296-C02317-1003-800,IDFM:100060,IDFM:C02317,Orly - Gaston Viens,,1,,,1,0,C02317,False,False,False,False,False,True,False,20240316,20240406
IDFM:KT9:132296-C02317-1012-800,IDFM:100060,IDFM:C02317,Porte de Choisy,,0,,,1,0,C02317,False,False,False,False,False,True,False,20240316,20240406
IDFM:KT9:132296-C02317-1023-800,IDFM:100060,IDFM:C02317,Porte de Choisy,,0,,,1,0,C02317,False,False,False,False,False,True,False,20240316,20240406
IDFM:KT9:132296-C02317-1026-800,IDFM:100060,IDFM:C02317,Orly - Gaston Viens,,1,,,1,0,C02317,False,False,False,False,False,True,False,20240316,20240406


In [35]:
# Store dataframe in memory
trips = trips.compute()

In [36]:
trips.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
Index: 99101 entries, IDFM:KT9:132296-C02317-100-783 to IDFM:TN:SNCF:ffff6e91-be82-4456-b593-95a625e9d7dd
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   service_id             99101 non-null  string 
 1   route_id               99101 non-null  string 
 2   trip_headsign          99101 non-null  string 
 3   trip_short_name        46713 non-null  string 
 4   direction_id           99101 non-null  string 
 5   block_id               0 non-null      float64
 6   shape_id               0 non-null      float64
 7   wheelchair_accessible  99101 non-null  int64  
 8   bikes_allowed          99101 non-null  int64  
 9   route_short_id         99101 non-null  string 
 10  monday                 99101 non-null  bool   
 11  tuesday                99101 non-null  bool   
 12  wednesday              99101 non-null  bool   
 13  thursday               99101 non-null 

## Enrich time table with trip data
Join stop_times dataframe with trips dataframe.

In [37]:
trips_id = set(trips.index.values)
stop_times = stop_times[stop_times['trip_id'].isin(trips_id)]
stop_times = stop_times.compute()

In [38]:
stop_times = stop_times.set_index('trip_id').join(trips, how='inner',
                                                  lsuffix='stop_times_',
                                                  rsuffix='trips_')
stop_times = stop_times.reset_index()
stop_times.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1996593 entries, 0 to 1996592
Data columns (total 29 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trip_id                string 
 1   arrival_time           string 
 2   departure_time         string 
 3   stop_id                string 
 4   stop_sequence          int64  
 5   pickup_type            int64  
 6   drop_off_type          int64  
 7   local_zone_id          string 
 8   stop_headsign          string 
 9   timepoint              string 
 10  service_id             string 
 11  route_id               string 
 12  trip_headsign          string 
 13  trip_short_name        string 
 14  direction_id           string 
 15  block_id               float64
 16  shape_id               float64
 17  wheelchair_accessible  int64  
 18  bikes_allowed          int64  
 19  route_short_id         string 
 20  monday                 bool   
 21  tuesday                bool   
 22  wednesday         

## Saving data for each line
Parquet is faster to load/write than csv.

In [39]:
for line in lines:
    print(f'Saving line {line}')
    l_stop_times = stop_times[stop_times['route_short_id'] == line]
    save_directory = os.path.join('data', 'timetable')
    if not os.path.exists(save_directory):
        os.mkdir(save_directory)
    l_stop_times.to_parquet(os.path.join(save_directory, line))

Saving line C01389
Saving line C01377
Saving line C01795
Saving line C01727
Saving line C01382
Saving line C01376
Saving line C01378
Saving line C01741
Saving line C01729
Saving line C01381
Saving line C01999
Saving line C02317
Saving line C01375
Saving line C01373
Saving line C01379
Saving line C01387
Saving line C01679
Saving line C01391
Saving line C01739
Saving line C01371
Saving line C01742
Saving line C01737
Saving line C02528
Saving line C01794
Saving line C02529
Saving line C01774
Saving line C01390
Saving line C01374
Saving line C01684
Saving line C01372
Saving line C01728
Saving line C01740
Saving line C02344
Saving line C01738
Saving line C01383
Saving line C01736
Saving line C01731
Saving line C01380
Saving line C01843
Saving line C01386
Saving line C01743
Saving line C01384
Saving line C01730
