In [1]:
import io
import zipfile

import pandas as pd
import requests

def access_static_gtfs(url: str, timeout: float = 30.0) -> bytes:
    """Download a static GTFS archive and return its raw bytes.

    Args:
        url: Address of the zip archive to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The undecoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content

def extract_static_gtfs(zip: bytes, table: str = "stops.txt", **read_csv_kwargs) -> pd.DataFrame:
    """Load one table from an in-memory zip archive into a DataFrame.

    Args:
        zip: Raw bytes of the zip archive. The name shadows the built-in
            ``zip`` inside this function; it is kept so existing keyword
            callers continue to work.
        table: Name of the archive member to read, e.g. ``"stops.txt"``.
        **read_csv_kwargs: Extra keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (for example ``dtype=str`` to keep
            identifier columns as text).

    Returns:
        The archive member parsed as CSV.

    Raises:
        zipfile.BadZipFile: If ``zip`` is not a valid zip archive.
        KeyError: If ``table`` is not a member of the archive.
    """
    with zipfile.ZipFile(io.BytesIO(zip)) as zf:
        # Parse straight from the archive member instead of first copying it
        # into a second in-memory buffer.
        with zf.open(table) as member:
            return pd.read_csv(member, **read_csv_kwargs)

# Fetch the archive once; later cells read individual tables out of these bytes.
url = "https://rrgtfsfeeds.s3.amazonaws.com/gtfs_subway.zip"
zip_bytes = access_static_gtfs(url=url)

In [3]:
import io

# List the archive members so the table names requested below can be checked.
with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
    print(zf.namelist())

# Load each table of interest into its own DataFrame, in the order named on
# the left-hand side.
stop_times, shapes, stops, transfers, trips = [
    extract_static_gtfs(zip_bytes, table=table_name)
    for table_name in (
        "stop_times.txt",
        "shapes.txt",
        "stops.txt",
        "transfers.txt",
        "trips.txt",
    )
]

['agency.txt', 'calendar_dates.txt', 'calendar.txt', 'routes.txt', 'shapes.txt', 'stop_times.txt', 'stops.txt', 'transfers.txt', 'trips.txt']


In [7]:
# Preview the first ten rows of the trips table.
trips.head(10)

Unnamed: 0,route_id,trip_id,service_id,trip_headsign,direction_id,shape_id
0,1,AFA25GEN-1038-Sunday-00_000600_1..S03R,Sunday,South Ferry,1,1..S03R
1,1,AFA25GEN-1038-Sunday-00_002600_1..S03R,Sunday,South Ferry,1,1..S03R
2,1,AFA25GEN-1038-Sunday-00_004600_1..S03R,Sunday,South Ferry,1,1..S03R
3,1,AFA25GEN-1038-Sunday-00_006600_1..S03R,Sunday,South Ferry,1,1..S03R
4,1,AFA25GEN-1038-Sunday-00_007200_1..N03R,Sunday,Van Cortlandt Park-242 St,0,1..N03R
5,1,AFA25GEN-1038-Sunday-00_008600_1..S03R,Sunday,South Ferry,1,1..S03R
6,1,AFA25GEN-1038-Sunday-00_009200_1..N03R,Sunday,Van Cortlandt Park-242 St,0,1..N03R
7,1,AFA25GEN-1038-Sunday-00_010600_1..S03R,Sunday,South Ferry,1,1..S03R
8,1,AFA25GEN-1038-Sunday-00_011200_1..N03R,Sunday,Van Cortlandt Park-242 St,0,1..N03R
9,1,AFA25GEN-1038-Sunday-00_012600_1..S03R,Sunday,South Ferry,1,1..S03R


In [124]:
# Add an `id` column holding the part of trip_id after its first underscore.
(
    stop_times
    .assign(
        # expand=False makes str.extract return a Series; the default
        # (expand=True) returns a one-column DataFrame, which is not what a
        # single-column assignment should receive.
        id=lambda df: df["trip_id"].str.extract(r'_(.*?)$', expand=False)
    )
)

Unnamed: 0,trip_id,stop_id,arrival_time,departure_time,stop_sequence,id
0,AFA25GEN-1038-Sunday-00_000600_1..S03R,101S,00:06:00,00:06:00,1,000600_1..S03R
1,AFA25GEN-1038-Sunday-00_000600_1..S03R,103S,00:07:30,00:07:30,2,000600_1..S03R
2,AFA25GEN-1038-Sunday-00_000600_1..S03R,104S,00:09:00,00:09:00,3,000600_1..S03R
3,AFA25GEN-1038-Sunday-00_000600_1..S03R,106S,00:10:30,00:10:30,4,000600_1..S03R
4,AFA25GEN-1038-Sunday-00_000600_1..S03R,107S,00:12:00,00:12:00,5,000600_1..S03R
...,...,...,...,...,...,...
562592,SIR-FA2017-SI017-Weekday-08_147100_SI..N03R,S27N,25:03:00,25:03:00,17,147100_SI..N03R
562593,SIR-FA2017-SI017-Weekday-08_147100_SI..N03R,S28N,25:06:00,25:06:00,18,147100_SI..N03R
562594,SIR-FA2017-SI017-Weekday-08_147100_SI..N03R,S29N,25:08:00,25:08:00,19,147100_SI..N03R
562595,SIR-FA2017-SI017-Weekday-08_147100_SI..N03R,S30N,25:10:00,25:10:00,20,147100_SI..N03R


In [100]:
# Display the shapes table.
shapes

Unnamed: 0,shape_id,shape_pt_sequence,shape_pt_lat,shape_pt_lon
0,1..N03R,0,40.702068,-74.013664
1,1..N03R,1,40.703199,-74.014792
2,1..N03R,2,40.703226,-74.014820
3,1..N03R,3,40.703253,-74.014846
4,1..N03R,4,40.703280,-74.014870
...,...,...,...,...
149829,SI..S07R,685,40.513696,-74.250493
149830,SI..S07R,686,40.513579,-74.250706
149831,SI..S07R,687,40.513458,-74.250917
149832,SI..S07R,688,40.513334,-74.251124


In [102]:
# Attach stops with location_type == 1 to shape points that share exactly the
# same latitude/longitude, then keep only the shape points that found a stop.
# NOTE(review): this is an exact float equality join; it presumably works
# because both tables come from the same feed — confirm.
(
    shapes
    .merge(
        stops.loc[lambda df: df["location_type"] == 1],
        how="left",
        left_on=["shape_pt_lat", "shape_pt_lon"],
        right_on=["stop_lat", "stop_lon"],
    )
    .loc[lambda merged: merged["stop_id"].notna()]
)

Unnamed: 0,shape_id,shape_pt_sequence,shape_pt_lat,shape_pt_lon,stop_id,stop_name,stop_lat,stop_lon,location_type,parent_station
0,1..N03R,0,40.702068,-74.013664,142,South Ferry,40.702068,-74.013664,1.0,
34,1..N03R,34,40.707513,-74.013783,139,Rector St,40.707513,-74.013783,1.0,
40,1..N03R,40,40.711835,-74.012188,138,WTC Cortlandt,40.711835,-74.012188,1.0,
77,1..N03R,77,40.715478,-74.009266,137,Chambers St,40.715478,-74.009266,1.0,
94,1..N03R,94,40.719318,-74.006886,136,Franklin St,40.719318,-74.006886,1.0,
...,...,...,...,...,...,...,...,...,...,...
149797,SI..S07R,526,40.533674,-74.191794,S16,Huguenot,40.533674,-74.191794,1.0,
149813,SI..S07R,542,40.525507,-74.200064,S15,Prince's Bay,40.525507,-74.200064,1.0,
149860,SI..S07R,589,40.522410,-74.217847,S14,Pleasant Plains,40.522410,-74.217847,1.0,
149892,SI..S07R,621,40.519631,-74.229141,S13,Richmond Valley,40.519631,-74.229141,1.0,


In [69]:
# Keep the stop_times rows whose trip_id contains "_6".
stop_times[stop_times["trip_id"].str.contains("_6")]

Unnamed: 0,trip_id,stop_id,arrival_time,departure_time,stop_sequence
133396,AFA25GEN-6031-Sunday-00_000700_6..S01R,601S,00:07:00,00:07:00,1
133397,AFA25GEN-6031-Sunday-00_000700_6..S01R,602S,00:09:30,00:09:30,2
133398,AFA25GEN-6031-Sunday-00_000700_6..S01R,603S,00:11:00,00:11:00,3
133399,AFA25GEN-6031-Sunday-00_000700_6..S01R,604S,00:12:30,00:12:30,4
133400,AFA25GEN-6031-Sunday-00_000700_6..S01R,606S,00:13:30,00:13:30,5
...,...,...,...,...,...
177042,AFA25GEN-6091-Weekday-00_150300_6..N01R,606N,25:59:30,25:59:30,34
177043,AFA25GEN-6091-Weekday-00_150300_6..N01R,604N,26:00:30,26:00:30,35
177044,AFA25GEN-6091-Weekday-00_150300_6..N01R,603N,26:02:00,26:02:00,36
177045,AFA25GEN-6091-Weekday-00_150300_6..N01R,602N,26:03:30,26:03:30,37


In [None]:
from nyct_gtfs import NYCTFeed

# Column name -> type label for the per-trip table. The key order must match
# the tuple order produced by extract_trips_data, because the keys are used
# as the DataFrame column names for those tuples.
# NOTE(review): the values look like SQL type names, presumably for a
# downstream table schema — confirm; this notebook only uses the keys.
TRIP_COLUMNS = dict(
    updated_at="timestamp",
    trip_id="varchar",
    start_date="date",
    route_id="varchar",
    direction="varchar",
    location="varchar",
    location_status="varchar",
    headsign_text="varchar",
    departure_time="timestamp",
    underway="boolean",
    train_assigned="boolean",
    last_position_update="timestamp",
    current_stop_sequence_index="integer",
    num_stops_left="integer",
    has_delay_alert="boolean",
)

# Column name -> type label for the stop-time-update table. The key order must
# match the tuple order produced by extract_stop_time_update_data, because the
# keys are used as the DataFrame column names for those tuples.
UPDATE_COLUMNS = dict(
    updated_at="timestamp",
    trip_id="varchar",
    stop_id="varchar",
    arrival="timestamp",
    departure="timestamp",
)

def extract_trips_data(trips, updated_at) -> list[tuple]:
    """Flatten trip objects into one tuple per trip.

    Args:
        trips: Iterable of objects exposing the attributes read below.
        updated_at: Value stored as the first field of every tuple.

    Returns:
        A list with one tuple per trip. Field order follows the keys of
        TRIP_COLUMNS; the second-to-last field is the number of entries in
        the trip's ``stop_time_updates``.
    """
    rows = []
    for trip in trips:
        rows.append(
            (
                updated_at,
                trip.trip_id,
                trip.start_date,
                trip.route_id,
                trip.direction,
                trip.location,
                trip.location_status,
                trip.headsign_text,
                trip.departure_time,
                trip.underway,
                trip.train_assigned,
                trip.last_position_update,
                trip.current_stop_sequence_index,
                len(trip.stop_time_updates),
                trip.has_delay_alert,
            )
        )
    return rows

def extract_stop_time_update_data(update, trip_id, updated_at) -> tuple:
    """Flatten one stop-time update into a tuple.

    Args:
        update: Object exposing ``stop_id``, ``arrival`` and ``departure``.
        trip_id: Identifier of the trip the update belongs to.
        updated_at: Value stored as the first field of the tuple.

    Returns:
        ``(updated_at, trip_id, stop_id, arrival, departure)``, the same
        order as the keys of UPDATE_COLUMNS.
    """
    stop_fields = (update.stop_id, update.arrival, update.departure)
    return (updated_at, trip_id, *stop_fields)

In [None]:
# Accumulators are reset here so re-running the cell does not duplicate rows.
all_trips = []
trip_updates = []

for source in ["1", "A", "B", "G", "J", "L", "N", "SIR"]:
    feed = NYCTFeed(source)
    trips_list = feed.trips
    all_trips.extend(extract_trips_data(trips_list, feed.last_generated))
    for trip in trips_list:
        n_updates = len(trip.stop_time_updates)
        if n_updates == 0:
            continue
        # Keep only the first and last update of each trip; when there is a
        # single update it is recorded once rather than twice.
        positions = [0] if n_updates == 1 else [0, -1]
        trip_updates.extend(
            [
                extract_stop_time_update_data(trip.stop_time_updates[i], trip.trip_id, feed.last_generated)
                for i in positions
            ]
        )

# Column names come from the key order of the two column-spec dicts.
raw_trips = pd.DataFrame(data=all_trips, columns=list(TRIP_COLUMNS))
raw_stop_time_updates = pd.DataFrame(data=trip_updates, columns=list(UPDATE_COLUMNS))

In [None]:
# Display the per-trip table built from the realtime feeds.
raw_trips

In [None]:
# Display the first/last stop-time updates collected for each trip.
raw_stop_time_updates