In [1]:
import polars as pl
import os

# Download and extract GTFS data from IDFM

In [2]:
# GTFS data (used for timetable)
STATIC_GTFS_URL="https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip"
STATIC_GTFS_FILE_PATH="raw_data/gtfs.zip"
STATIC_GTFS_PATH="raw_data/gtfs"

In [3]:
!mkdir -p $STATIC_GTFS_PATH
!wget $STATIC_GTFS_URL -O $STATIC_GTFS_FILE_PATH
!unzip -o $STATIC_GTFS_FILE_PATH -d $STATIC_GTFS_PATH

--2024-08-06 08:33:58--  https://eu.ftp.opendatasoft.com/stif/GTFS/IDFM-gtfs.zip
Resolving eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)... 52.18.186.238
Connecting to eu.ftp.opendatasoft.com (eu.ftp.opendatasoft.com)|52.18.186.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 99296989 (95M) [application/zip]
Saving to: ‘raw_data/gtfs.zip’


2024-08-06 08:34:01 (34,3 MB/s) - ‘raw_data/gtfs.zip’ saved [99296989/99296989]

Archive:  raw_data/gtfs.zip
  inflating: raw_data/gtfs/transfers.txt  
  inflating: raw_data/gtfs/ticketing_deep_links.txt  
  inflating: raw_data/gtfs/agency.txt  
  inflating: raw_data/gtfs/calendar_dates.txt  
  inflating: raw_data/gtfs/calendar.txt  
  inflating: raw_data/gtfs/stops.txt  
  inflating: raw_data/gtfs/trips.txt  
  inflating: raw_data/gtfs/routes.txt  
  inflating: raw_data/gtfs/stop_extensions.txt  
  inflating: raw_data/gtfs/stop_times.txt  
  inflating: raw_data/gtfs/pathways.txt  


# Loading and parsing GTFS data

In [11]:
calendar_path = os.path.join(STATIC_GTFS_PATH, 'calendar.txt')
calendar_dtype = {'service_id': pl.Categorical,
                  'monday': pl.Int16,
                  'tuesday': pl.Int16,
                  'wednesday': pl.Int16,
                  'thursday': pl.Int16,
                  'friday': pl.Int16,
                  'saturday': pl.Int16,
                  'sunday': pl.Int16,
                  'start_date': pl.String,
                  'end_date': pl.String
                  }
calendar = pl.scan_csv(calendar_path, schema_overrides=calendar_dtype).select(list(calendar_dtype.keys()))
calendar = (
    calendar
    .with_columns(pl.col('monday').cast(pl.Boolean))
    .with_columns(pl.col('tuesday').cast(pl.Boolean))
    .with_columns(pl.col('tuesday').cast(pl.Boolean))
    .with_columns(pl.col('tuesday').cast(pl.Boolean))
    .with_columns(pl.col('wednesday').cast(pl.Boolean))
    .with_columns(pl.col('thursday').cast(pl.Boolean))
    .with_columns(pl.col('friday').cast(pl.Boolean))
    .with_columns(pl.col('saturday').cast(pl.Boolean))
    .with_columns(pl.col('sunday').cast(pl.Boolean))
    .with_columns(pl.col("start_date").cast(pl.String).str.to_date("%Y%m%d"))
    .with_columns(pl.col("end_date").cast(pl.String).str.to_date("%Y%m%d"))
)
calendar.head().collect()

service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
cat,bool,bool,bool,bool,bool,bool,bool,date,date
"""IDFM:1""",True,True,False,False,False,False,False,2024-09-02,2024-09-03
"""IDFM:1000""",True,True,True,True,True,True,True,2024-08-05,2024-09-03
"""IDFM:3200""",True,False,False,False,False,False,True,2024-08-05,2024-09-01
"""IDFM:3566""",True,False,False,False,False,False,False,2024-08-05,2024-08-26
"""IDFM:4880""",True,True,False,True,True,False,False,2024-08-05,2024-08-30


In [12]:
stops_path = os.path.join(STATIC_GTFS_PATH, 'stops.txt')
stops_dtype = {'stop_id': pl.String,
               'stop_code': pl.String,
               'stop_name': pl.String,
               'stop_desc': pl.String,
               'stop_lon': pl.Float64,
               'stop_lat': pl.Float64,
               'zone_id': pl.Categorical,
               'stop_url': pl.String,
               'location_type': pl.Categorical,
               'parent_station': pl.String,
               'stop_timezone': pl.Categorical,
               'level_id': pl.String,
               'wheelchair_boarding': pl.Categorical,
               'platform_code': pl.Categorical}
stops = pl.scan_csv(stops_path, schema_overrides=stops_dtype).select(list(stops_dtype.keys()))
stops = stops.drop(['stop_code', 'stop_desc', 'stop_url', 'location_type',
                   'parent_station', 'stop_timezone', 'level_id'])
stops.head().collect()

stop_id,stop_name,stop_lon,stop_lat,zone_id,wheelchair_boarding,platform_code
str,str,f64,f64,cat,cat,cat
"""IDFM:monomodalStopPlace:46689""","""Viroflay Rive Droite""",2.167655,48.805473,"""3""","""1""",
"""IDFM:monomodalStopPlace:46647""","""Achères Ville""",2.077727,48.970644,"""5""","""1""",
"""IDFM:monomodalStopPlace:474069""","""Lozère""",2.212718,48.705912,"""4""","""1""",
"""IDFM:monomodalStopPlace:474082""","""Noisy-le-Grand - Mont d'Est""",2.550027,48.840885,"""4""","""1""",
"""IDFM:monomodalStopPlace:45102""","""Châtelet les Halles""",2.346977,48.861745,"""1""","""1""",


In [19]:
trips_path = os.path.join(STATIC_GTFS_PATH, 'trips.txt')
trips_dtype = {'trip_id': pl.String,
               'route_id': pl.Categorical,
               'service_id': pl.Categorical,
               'trip_headsign': pl.String,
               'trip_short_name': pl.String,
               'direction_id': pl.String,
               'wheelchair_accessible': pl.Categorical,
               'bikes_allowed': pl.Categorical}
trips = pl.scan_csv(trips_path, schema_overrides=trips_dtype).select(list(trips_dtype.keys()))
trips.head().collect()

trip_id,route_id,service_id,trip_headsign,trip_short_name,direction_id,wheelchair_accessible,bikes_allowed
str,cat,cat,str,str,str,cat,cat
"""IDFM:RATP:144143-C01254-COU_RA…","""IDFM:C01254""","""IDFM:1""","""Gare d'Argenteuil""",,"""0""","""1""","""0"""
"""IDFM:FSO:154723-C02085-826-260""","""IDFM:C02085""","""IDFM:1""","""Neuville RER""",,"""0""","""1""","""0"""
"""IDFM:N4_MOBILITES:152581-C0003…","""IDFM:C00030""","""IDFM:1""","""Gare de Roissy""",,"""0""","""2""","""0"""
"""IDFM:Transdev_Nord_Seine_Saint…","""IDFM:C01587""","""IDFM:1""","""Gare de Sevran Beaudottes""",,"""0""","""2""","""0"""
"""IDFM:Transdev_Nord_Seine_Saint…","""IDFM:C01587""","""IDFM:1""","""Gare de Sevran Beaudottes""",,"""0""","""2""","""0"""


In [20]:
stop_times_path = os.path.join(STATIC_GTFS_PATH, 'stop_times.txt')
stop_times_dtype = {'trip_id': pl.String,
                    'arrival_time': pl.String,
                    'departure_time': pl.String,
                    'stop_id': pl.String,
                    'stop_sequence': pl.Int16
                    }
stop_times = pl.scan_csv(stop_times_path, schema_overrides=stop_times_dtype).select(list(stop_times_dtype.keys()))
stop_times.head().collect()

# TODO: Parse times

trip_id,arrival_time,departure_time,stop_id,stop_sequence
str,str,str,str,i16
"""IDFM:RATP:144143-C01254-COU_RA…","""11:10:00""","""11:10:00""","""IDFM:24930""",0
"""IDFM:RATP:144143-C01254-COU_RA…","""11:11:00""","""11:11:00""","""IDFM:23974""",1
"""IDFM:RATP:144143-C01254-COU_RA…","""11:13:00""","""11:13:00""","""IDFM:41489""",2
"""IDFM:RATP:144143-C01254-COU_RA…","""11:14:00""","""11:14:00""","""IDFM:23815""",3
"""IDFM:RATP:144143-C01254-COU_RA…","""11:17:00""","""11:17:00""","""IDFM:23972""",4


In [26]:
routes_path = os.path.join(STATIC_GTFS_PATH, 'routes.txt')
routes_dtype = {'route_id': pl.Categorical,
                'agency_id': pl.Categorical,
                'route_short_name': pl.String,
                'route_long_name': pl.String,
                'route_type': pl.Categorical,
                'route_color': pl.String,
                'route_text_color': pl.String}
routes = pl.scan_csv(routes_path, schema_overrides=routes_dtype).select(list(routes_dtype.keys()))

routes = routes.with_columns(
    pl
    .when(pl.col('route_type') == '0').then(pl.lit('TRAM'))
    .when(pl.col('route_type') == '1').then(pl.lit('METRO'))
    .when(pl.col('route_type') == '2').then(pl.lit('TRAIN'))
    .when(pl.col('route_type') == '3').then(pl.lit('BUS'))

    .alias('route_type2')
    .cast(pl.Categorical)
)

routes.head().collect()

route_id,agency_id,route_short_name,route_long_name,route_type,route_color,route_text_color,route_type2
cat,cat,str,str,cat,str,str,cat
"""IDFM:C01624""","""IDFM:1080""","""409""","""409""","""3""","""00812C""","""FFFFFF""","""BUS"""
"""IDFM:C02313""","""IDFM:1081""","""01""","""01""","""3""","""CFE2F3""","""FFFFFF""","""BUS"""
"""IDFM:C00029""","""IDFM:81""","""502""","""502""","""3""","""FBE324""","""000000""","""BUS"""
"""IDFM:C00632""","""IDFM:64""","""2234""","""2234""","""3""","""640082""","""FFFFFF""","""BUS"""
"""IDFM:C00164""","""IDFM:1081""","""3""","""3""","""3""","""FAE91A""","""000000""","""BUS"""


In [31]:
timetable = (
    stop_times
    .join(trips, on='trip_id')
    .join(calendar, on='service_id')
    .join(stops, on='stop_id')
)
timetable.collect()

trip_id,arrival_time,departure_time,stop_id,stop_sequence,route_id,service_id,trip_headsign,trip_short_name,direction_id,wheelchair_accessible,bikes_allowed,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date,stop_name,stop_lon,stop_lat,zone_id,wheelchair_boarding,platform_code
str,str,str,str,i16,cat,cat,str,str,str,cat,cat,bool,bool,bool,bool,bool,bool,bool,date,date,str,f64,f64,cat,cat,cat
"""IDFM:RATP:144143-C01254-COU_RA…","""11:10:00""","""11:10:00""","""IDFM:24930""",0,"""IDFM:C01254""","""IDFM:1""","""Gare d'Argenteuil""",,"""0""","""1""","""0""",true,true,false,false,false,false,false,2024-09-02,2024-09-03,"""Gare de Sartrouville - RER""",2.157601,48.93817,"""4""","""1""",
"""IDFM:RATP:144143-C01254-COU_RA…","""11:11:00""","""11:11:00""","""IDFM:23974""",1,"""IDFM:C01254""","""IDFM:1""","""Gare d'Argenteuil""",,"""0""","""1""","""0""",true,true,false,false,false,false,false,2024-09-02,2024-09-03,"""Poste""",2.160429,48.940609,"""4""","""1""",
"""IDFM:RATP:144143-C01254-COU_RA…","""11:13:00""","""11:13:00""","""IDFM:41489""",2,"""IDFM:C01254""","""IDFM:1""","""Gare d'Argenteuil""",,"""0""","""1""","""0""",true,true,false,false,false,false,false,2024-09-02,2024-09-03,"""Stalingrad""",2.164896,48.941234,"""4""","""1""",
"""IDFM:RATP:144143-C01254-COU_RA…","""11:14:00""","""11:14:00""","""IDFM:23815""",3,"""IDFM:C01254""","""IDFM:1""","""Gare d'Argenteuil""",,"""0""","""1""","""0""",true,true,false,false,false,false,false,2024-09-02,2024-09-03,"""Jean Mermoz""",2.166549,48.940355,"""4""","""1""",
"""IDFM:RATP:144143-C01254-COU_RA…","""11:17:00""","""11:17:00""","""IDFM:23972""",4,"""IDFM:C01254""","""IDFM:1""","""Gare d'Argenteuil""",,"""0""","""1""","""0""",true,true,false,false,false,false,false,2024-09-02,2024-09-03,"""Voltaire""",2.169674,48.938731,"""4""","""1""",
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""IDFM:TN:SNCF:be6ef317-6a72-45e…","""06:26:00""","""06:26:00""","""IDFM:monomodalStopPlace:462394""",16,"""IDFM:C02372""","""IDFM:TN:8926001""","""Gare du Nord""","""847900""","""1""","""0""","""0""",false,true,false,false,false,false,false,2024-08-06,2024-08-06,"""Gare du Nord""",2.356693,48.882307,"""1""","""1""",
"""IDFM:TN:SNCF:6e655c39-ec06-419…","""05:20:00""","""05:20:00""","""IDFM:monomodalStopPlace:411440""",15,"""IDFM:C02372""","""IDFM:TN:8926001""","""Gare du Nord""","""848502""","""1""","""0""","""0""",false,true,false,false,false,false,false,2024-08-06,2024-08-06,"""Creil""",2.468929,49.264186,,"""0""",
"""IDFM:TN:SNCF:6e655c39-ec06-419…","""05:26:00""","""05:27:30""","""IDFM:monomodalStopPlace:411428""",16,"""IDFM:C02372""","""IDFM:TN:8926001""","""Gare du Nord""","""848502""","""1""","""0""","""0""",false,true,false,false,false,false,false,2024-08-06,2024-08-06,"""Chantilly - Gouvieux""",2.459516,49.18703,,"""0""",
"""IDFM:TN:SNCF:6e655c39-ec06-419…","""05:31:30""","""05:33:00""","""IDFM:472016""",17,"""IDFM:C02372""","""IDFM:TN:8926001""","""Gare du Nord""","""848502""","""1""","""0""","""0""",false,true,false,false,false,false,false,2024-08-06,2024-08-06,"""Orry-la-Ville - Coye""",2.49001,49.138694,,"""0""",


# Export all lines time table

In [16]:
stop_times.to_parquet(os.path.join(save_directory, f'all.parquet'))