In [1]:
from constants import *
import requests
from google.transit import gtfs_realtime_pb2
from google.protobuf.json_format import MessageToDict
import pandas as pd
from datetime import datetime
import time
import glob
import os

In [2]:
# Build the stop_id -> stop_name lookup from the static GTFS stops file
stop_id_to_name = pd.read_csv('Data/google_transit/stops.txt')
stop_lookup_dict = dict(
    zip(stop_id_to_name['stop_id'], stop_id_to_name['stop_name'])
)

In [3]:
# Empty table that accumulates parsed stop-time rows across polls
_db_columns = [
    'trip_id', 'line', 'stop_id', 'stop_name',
    'arrival_time', 'departure_time', 'trip_date', 'update_time',
]
db = pd.DataFrame(columns=_db_columns)

In [7]:
def poll_trip_updates(feed_key='1', timeout=30):
    """Download one GTFS-realtime feed and return its trip-update entities.

    Parameters
    ----------
    feed_key : str, optional
        Key into ``SUBWAY_FEEDS`` selecting the feed URL. Defaults to ``'1'``,
        the feed this notebook originally polled.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up, so a stalled
        connection cannot hang a long-running polling loop.

    Returns
    -------
    tuple
        ``(trip_updates, data_update_time)`` where ``trip_updates`` is a list
        of entity dicts that contain a ``'tripUpdate'`` key and
        ``data_update_time`` is the local ``datetime`` taken just before the
        request was sent.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # SUBWAY_FEEDS and API_KEY are presumably provided by `from constants import *`.
    url = SUBWAY_FEEDS[feed_key]
    data_update_time = datetime.now()
    res = requests.get(url, headers={"x-api-key": API_KEY}, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page
    # as a protobuf message.
    res.raise_for_status()

    feed = gtfs_realtime_pb2.FeedMessage()
    feed.ParseFromString(res.content)

    # The 'entity' key can be missing from the converted dict (e.g. an empty
    # feed), so fall back to an empty list rather than iterating over None.
    entities = MessageToDict(feed).get('entity', [])
    trip_updates = [entity for entity in entities if 'tripUpdate' in entity]

    return trip_updates, data_update_time

In [5]:
def _epoch_to_datetime(stop_event):
    """Return the event's ``'time'`` field as a local ``datetime``, or ``None``.

    ``None`` is returned when the event dict is missing/empty or has no
    ``'time'`` entry (for example when only a delay is present).
    """
    if not stop_event:
        return None
    epoch_seconds = stop_event.get('time')
    if epoch_seconds is None:
        return None
    # int() because the value may arrive as a string in the converted dict.
    return datetime.fromtimestamp(int(epoch_seconds))


def parse_trip_update(trip_updates, update_time, stop_lookup=None):
    """Flatten trip-update entities into one record per stop-time update.

    Parameters
    ----------
    trip_updates : list of dict
        Entity dicts, each holding a ``'tripUpdate'`` mapping.
    update_time : datetime
        Timestamp stored unchanged in every record's ``'update_time'``.
    stop_lookup : dict, optional
        Mapping of stop id to stop name. When omitted, the notebook-level
        ``stop_lookup_dict`` is used.

    Returns
    -------
    list of dict
        Records with keys ``trip_id``, ``trip_date``, ``line``, ``stop_id``,
        ``stop_name``, ``update_time``, ``arrival_time`` and
        ``departure_time``. ``stop_name`` is ``None`` for unknown stop ids;
        the two time fields are ``None`` when the feed gives no time.
    """
    if stop_lookup is None:
        stop_lookup = stop_lookup_dict

    message_data = []
    for message in trip_updates:
        trip_update = message.get('tripUpdate') or {}
        trip = trip_update.get('trip') or {}
        trip_date = trip.get('startDate')
        trip_id = trip.get('tripId')
        line = trip.get('routeId')

        # A trip update without any stop-time updates contributes no rows
        # instead of raising on a missing 'stopTimeUpdate' key.
        for stop_time_update in trip_update.get('stopTimeUpdate') or []:
            stop_id = stop_time_update.get('stopId')
            message_data.append({
                'trip_id': trip_id,
                'trip_date': trip_date,
                'line': line,
                'stop_id': stop_id,
                'stop_name': stop_lookup.get(stop_id),
                'update_time': update_time,
                'arrival_time': _epoch_to_datetime(stop_time_update.get('arrival')),
                'departure_time': _epoch_to_datetime(stop_time_update.get('departure')),
            })

    return message_data

In [8]:
# Take one snapshot of the feed and append its parsed rows to `db`
raw_trip_updates, data_update_time = poll_trip_updates()
message_data = parse_trip_update(raw_trip_updates, data_update_time)
snapshot_rows = pd.DataFrame(data=message_data)
db = pd.concat([db, snapshot_rows], ignore_index=True)
display(db)

Unnamed: 0,trip_id,line,stop_id,stop_name,arrival_time,departure_time,trip_date,update_time
0,144300_6..N04X001,6,415N,149 St-Grand Concourse,2023-06-07 00:34:37,NaT,20230607,2023-06-07 03:32:08.449037
1,145700_6..N04X001,6,415N,149 St-Grand Concourse,2023-06-07 00:56:23,NaT,20230607,2023-06-07 03:32:08.449037
2,147200_6..N04X001,6,415N,149 St-Grand Concourse,2023-06-07 01:09:52,NaT,20230607,2023-06-07 03:32:08.449037
3,148700_6..N04X001,6,415N,149 St-Grand Concourse,2023-06-07 01:34:42,NaT,20230607,2023-06-07 03:32:08.449037
4,018500_6..S04X102,6,640S,Brooklyn Bridge-City Hall,2023-06-07 03:33:43,NaT,20230607,2023-06-07 03:32:08.449037
...,...,...,...,...,...,...,...,...
1625,117050_7..N,7,707N,Junction Blvd,2023-06-06 19:59:10,2023-06-06 19:59:30,20230606,2023-06-07 03:32:08.449037
1626,117050_7..N,7,706N,103 St-Corona Plaza,2023-06-06 20:00:40,2023-06-06 20:01:00,20230606,2023-06-07 03:32:08.449037
1627,117050_7..N,7,705N,111 St,2023-06-06 20:02:10,2023-06-06 20:02:30,20230606,2023-06-07 03:32:08.449037
1628,117050_7..N,7,702N,Mets-Willets Point,2023-06-06 20:04:40,2023-06-06 20:05:00,20230606,2023-06-07 03:32:08.449037


In [12]:
# Distinct route ids present in the collected snapshot
db.loc[:, 'line'].unique()

array(['6', '1', '2', '3', '4', '5', '7', '7X'], dtype=object)

In [11]:
# Arrivals at Fulton St that are still in the future.
# The timestamp is bound to `now` rather than `time` so the imported `time`
# module is not overwritten (the polling loop relies on `time.sleep`).
now = datetime.now()
display(db.query("stop_name=='Fulton St' and arrival_time>@now"))

Unnamed: 0,trip_id,line,stop_id,stop_name,arrival_time,departure_time,trip_date,update_time
361,013900_2..S08R,2,229S,Fulton St,2023-06-07 03:39:04,2023-06-07 03:39:04,20230607,2023-06-07 03:32:08.449037
422,015900_2..S08R,2,229S,Fulton St,2023-06-07 03:59:57,2023-06-07 03:59:57,20230607,2023-06-07 03:32:08.449037
509,017900_2..S08R,2,229S,Fulton St,2023-06-07 04:17:59,2023-06-07 04:17:59,20230607,2023-06-07 03:32:08.449037
532,019350_2..N08R,2,229N,Fulton St,2023-06-07 03:40:46,2023-06-07 03:40:46,20230607,2023-06-07 03:32:08.449037
611,019900_2..S08R,2,229S,Fulton St,2023-06-07 04:32:22,2023-06-07 04:32:22,20230607,2023-06-07 03:32:08.449037
646,021350_2..N08R,2,229N,Fulton St,2023-06-07 04:03:30,2023-06-07 04:03:30,20230607,2023-06-07 03:32:08.449037
733,021900_2..S08R,2,229S,Fulton St,2023-06-07 04:50:00,2023-06-07 04:50:00,20230607,2023-06-07 03:32:08.449037
768,023350_2..N08R,2,229N,Fulton St,2023-06-07 04:23:30,2023-06-07 04:23:30,20230607,2023-06-07 03:32:08.449037
855,023400_2..S08R,2,229S,Fulton St,2023-06-07 05:05:00,2023-06-07 05:05:00,20230607,2023-06-07 03:32:08.449037
1007,017500_4..S13X038,4,418S,Fulton St,2023-06-07 03:46:57,2023-06-07 03:46:57,20230607,2023-06-07 03:32:08.449037


In [6]:
def print_trip_updates(trip_updates, stop_lookup=None):
    """Print a human-readable listing of trip updates.

    For every entity the trip date and id are printed, followed by each
    stop's name and whichever of its arrival/departure times are available,
    then a separator line.

    Parameters
    ----------
    trip_updates : list of dict
        Entity dicts, each holding a ``'tripUpdate'`` mapping.
    stop_lookup : dict, optional
        Mapping of stop id to stop name. When omitted, the notebook-level
        ``stop_lookup_dict`` is used.
    """
    if stop_lookup is None:
        stop_lookup = stop_lookup_dict

    for message in trip_updates:
        trip_update = message.get('tripUpdate') or {}
        trip = trip_update.get('trip') or {}
        trip_date = trip.get('startDate')
        trip_id = trip.get('tripId')

        print('Trip date: {}, Trip Id: {}\n'.format(trip_date, trip_id))

        # Trips without stop-time updates print only the header and separator
        # instead of raising on a missing 'stopTimeUpdate' key.
        for stop_time_update in trip_update.get('stopTimeUpdate') or []:
            stop_id = stop_time_update.get('stopId')
            stop_name = stop_lookup.get(stop_id)
            print('Stop Location: {} '.format(stop_name))

            for label, field in (('Arrival', 'arrival'), ('Departure', 'departure')):
                # Skip events that are absent or carry no 'time' value.
                event_time = (stop_time_update.get(field) or {}).get('time')
                if event_time is not None:
                    readable = datetime.fromtimestamp(int(event_time))
                    print('{} time: {}'.format(label, readable))

        print("---------------------\n")

In [7]:
def poll_and_update():
    """Poll the feed once and write the parsed rows to a per-poll CSV file.

    The file is written to
    ``Data/train_timing_data/<date>/train_data_<time>.csv`` where ``<date>``
    and ``<time>`` come from the poll timestamp. The per-date directory is
    created when it does not exist yet, so the first poll of a new day does
    not fail.
    """
    raw_trip_updates, data_update_time = poll_trip_updates()
    message_data = parse_trip_update(raw_trip_updates, data_update_time)

    # Local names avoid `date`/`time`, which would shadow module names.
    poll_date = data_update_time.date()
    poll_time = data_update_time.time()

    output_dir = 'Data/train_timing_data/{}'.format(poll_date)
    os.makedirs(output_dir, exist_ok=True)

    # NOTE(review): the time's string form contains ':' characters, which are
    # not valid in Windows file names; kept as-is so existing files still match.
    output_path = '{}/train_data_{}.csv'.format(output_dir, poll_time)
    pd.DataFrame(data=message_data).to_csv(output_path, index=False)

In [111]:
# Collect snapshots at a fixed cadence; the counts are named so they are easy to tune.
POLL_COUNT = 120
POLL_INTERVAL_SECONDS = 60

# `time` must still be the stdlib module here; do not rebind it in other cells.
for poll_number in range(POLL_COUNT):
    poll_and_update()
    # No need to wait once the final snapshot has been written.
    if poll_number < POLL_COUNT - 1:
        time.sleep(POLL_INTERVAL_SECONDS)

In [8]:
#Gather up data
path = r'Data/train_timing_data/2023-03-30'
all_files = glob.glob(os.path.join(path, "*.csv"))
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [9]:
# CSV round-tripping leaves timestamps as text; parse the two columns queried below
df = df.assign(
    update_time=pd.to_datetime(df['update_time']),
    arrival_time=pd.to_datetime(df['arrival_time']),
)

In [142]:
# Every recorded prediction for one trip at one stop, oldest poll first
is_target_trip = df['trip_id'] == '082550_1..N03R'
is_target_stop = df['stop_name'] == '72 St'
df[is_target_trip & is_target_stop].sort_values('update_time')

Unnamed: 0,trip_id,trip_date,line,stop_id,stop_name,update_time,arrival_time,departure_time
313635,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:13:24.270506,2023-03-30 14:08:00,2023-03-30 14:08:00
5888,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:14:24.494670,2023-03-30 14:08:00,2023-03-30 14:08:00
447505,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:15:24.815882,2023-03-30 14:08:00,2023-03-30 14:08:00
432980,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:16:25.046732,2023-03-30 14:08:00,2023-03-30 14:08:00
180094,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:17:25.349553,2023-03-30 14:08:00,2023-03-30 14:08:00
348417,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:18:25.649604,2023-03-30 14:08:00,2023-03-30 14:08:00
487434,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:19:25.910463,2023-03-30 14:08:00,2023-03-30 14:08:00
219484,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:20:26.214493,2023-03-30 14:08:00,2023-03-30 14:08:00
397905,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:21:26.436638,2023-03-30 14:08:00,2023-03-30 14:08:00
263811,082550_1..N03R,20230330,1,123N,72 St,2023-03-30 13:22:26.755814,2023-03-30 14:08:00,2023-03-30 14:08:00


In [144]:
# Latest past arrival and latest poll time per line at Fulton St.
# The timestamp is bound to `now` rather than `time` so the imported `time`
# module is not overwritten (re-running the polling loop needs `time.sleep`).
now = datetime.now()
(
    df.query("stop_name=='Fulton St' and arrival_time<@now")
    .groupby('line')
    .agg({'arrival_time': 'max', 'update_time': 'max'})
)

Unnamed: 0_level_0,arrival_time,update_time
line,Unnamed: 1_level_1,Unnamed: 2_level_1
2,2023-03-30 15:48:00,2023-03-30 14:14:41.294504
3,2023-03-30 15:22:30,2023-03-30 14:14:41.294504
4,2023-03-30 15:33:30,2023-03-30 14:14:41.294504
5,2023-03-30 15:39:30,2023-03-30 14:14:41.294504


In [151]:
import functools
# Scratch check: concatenate a list of lists by folding with `+`
functools.reduce(
    lambda merged, chunk: merged + chunk,
    [[1,2,3], [3,4,5]],
    [],
)

[1, 2, 3, 3, 4, 5]

In [99]:
#db.query("trip_id=='096100_1..S03R' and stop_name=='Chambers St'")