In [490]:
import os.path
from os import listdir
from os.path import isfile, join
from google.transit import gtfs_realtime_pb2 as gtfs_rt
from protobuf_to_dict import protobuf_to_dict
from datetime import datetime
import numpy as np
import pandas as pd

# Widen displayed columns so long values (e.g. stop names, trip ids) are not
# truncated. The full option key is used: abbreviated keys rely on pandas'
# pattern matching and can become ambiguous if similar options are added.
pd.set_option('display.max_colwidth', 200)

### Read rolled-up hourly files

In [491]:
%%time

# NOTE: unpickling can execute arbitrary code; only load pickle files you
# created yourself.
df = pd.read_pickle('hourly/20190601_07.pkl')

# Flatten the nested column names. regex=False makes both replacements
# literal: as a regular expression '.' matches any character, and the default
# value of `regex` differs between pandas versions.
df.columns = df.columns.str.replace('trip_update.trip.', '', regex=False)
df.columns = df.columns.str.replace('.', '_', regex=False)

# Interpret the raw values as epoch seconds in UTC, then convert to
# timezone-aware New York local time.
for time_col in ('arrival_time', 'departure_time'):
    df[time_col] = (
        pd.to_datetime(df[time_col], unit='s')
        .dt.tz_localize('UTC')
        .dt.tz_convert('America/New_York')
    )

# Join with static GTFS data to attach a human-readable stop name.
# This is an inner join, so rows whose stop_id is absent from stops.txt are
# dropped. validate='many_to_one' raises if stops.txt ever lists a stop_id
# more than once, which would otherwise silently duplicate rows here.
stop_df = pd.read_csv('google_transit/stops.txt')
df = df.merge(
    stop_df[['stop_id', 'stop_name']],
    on='stop_id',
    validate='many_to_one',
)

df.head()

CPU times: user 51.3 s, sys: 3.29 s, total: 54.6 s
Wall time: 3min 2s


Unnamed: 0,arrival_time,departure_time,schedule_relationship,stop_id,id,route_id,start_date,start_time,trip_id,stop_name
0,2019-06-01 07:54:25-04:00,2019-06-01 07:54:40-04:00,0.0,L28S,1,L,20190601,,043550_L..S,E 105 St
1,2019-06-01 08:16:18-04:00,2019-06-01 08:16:33-04:00,0.0,L28S,7,L,20190601,,045550_L..S,E 105 St
2,2019-06-01 08:12:17-04:00,2019-06-01 08:12:32-04:00,0.0,L28S,11,L,20190601,,046350_L..S,E 105 St
3,2019-06-01 08:40:45-04:00,2019-06-01 08:41:00-04:00,0.0,L28S,15,L,20190601,,047550_L..S,E 105 St
4,2019-06-01 08:30:45-04:00,2019-06-01 08:31:00-04:00,0.0,L28S,19,L,20190601,,048350_L..S,E 105 St


In [439]:
# Distinct stop names that appear on route A, in order of first appearance.
df.loc[df['route_id'] == 'A', 'stop_name'].unique()

array(['Beach 67 St', 'Beach 60 St', 'Beach 44 St', 'Beach 36 St',
       'Beach 25 St', 'Far Rockaway - Mott Av', 'Dyckman St',
       'Inwood - 207 St', '125 St', '145 St', '168 St', '175 St',
       '181 St', '190 St', 'Howard Beach - JFK Airport', 'Broad Channel',
       'Grant Av', '80 St', '88 St', 'Rockaway Blvd', '104 St', '111 St',
       'Ozone Park - Lefferts Blvd', 'Hoyt - Schermerhorn Sts',
       'Jay St - MetroTech', 'High St', 'Fulton St', 'Chambers St',
       'Canal St', 'W 4 St - Wash Sq', '14 St', '34 St - Penn Station',
       '42 St - Port Authority Bus Terminal', '59 St - Columbus Circle',
       'Euclid Av', 'Aqueduct - N Conduit Av', 'Nostrand Av', 'Utica Av',
       'Broadway Jct', 'Aqueduct Racetrack'], dtype=object)

In [481]:
%%time

# Peek at the first rows for any stop named 'Canal St' (all routes).
is_canal_st = df['stop_name'] == 'Canal St'
df.loc[is_canal_st].head()

CPU times: user 248 ms, sys: 4.69 ms, total: 252 ms
Wall time: 267 ms


Unnamed: 0,arrival_time,departure_time,schedule_relationship,stop_id,id,route_id,start_date,start_time,trip_id,stop_name
123113,2019-06-01 07:33:25-04:00,2019-06-01 07:33:25-04:00,,M20S,000005J,J,20190601,07:12:03,043205_J..S,Canal St
123114,2019-06-01 07:44:25-04:00,2019-06-01 07:44:25-04:00,,M20S,000009J,J,20190601,07:22:07,044211_J..S,Canal St
123115,2019-06-01 07:56:30-04:00,2019-06-01 07:56:30-04:00,,M20S,000015J,J,20190601,07:33:30,045350_J..S,Canal St
123116,2019-06-01 08:07:30-04:00,2019-06-01 08:07:30-04:00,,M20S,000025J,J,20190601,07:44:30,046450_J..S,Canal St
123117,2019-06-01 08:17:30-04:00,2019-06-01 08:17:30-04:00,,M20S,000027J,J,20190601,07:54:30,047450_J..S,Canal St


In [485]:
# TODO: look at the original feed to see why this trip has repeated rows
# (possibly successive predictions from consecutive feed snapshots -- to be
# confirmed).
# The relevant file name should be around 07:12 New York time. The data is on
# a -04:00 offset (EDT), so that is about 11:12 UTC.
df2 = df.loc[df['trip_id'] == '043205_J..S']

In [489]:
# Remove exact duplicate rows, then order by trip and entity id for reading.
(
    df2
    .drop_duplicates()
    .sort_values(by=['trip_id', 'id'])
)

Unnamed: 0,arrival_time,departure_time,schedule_relationship,stop_id,id,route_id,start_date,start_time,trip_id,stop_name
124359,2019-06-01 07:34:49-04:00,2019-06-01 07:34:49-04:00,,M20S,000001J,J,20190601,07:12:03,043205_J..S,Canal St
124486,2019-06-01 07:34:59-04:00,2019-06-01 07:34:59-04:00,,M20S,000001J,J,20190601,07:12:03,043205_J..S,Canal St
126141,2019-06-01 07:36:14-04:00,2019-06-01 07:36:14-04:00,,M21S,000001J,J,20190601,07:12:03,043205_J..S,Chambers St
126223,2019-06-01 07:35:55-04:00,2019-06-01 07:35:55-04:00,,M21S,000001J,J,20190601,07:12:03,043205_J..S,Chambers St
126352,2019-06-01 07:36:07-04:00,2019-06-01 07:36:07-04:00,,M21S,000001J,J,20190601,07:12:03,043205_J..S,Chambers St
126993,2019-06-01 07:37:49-04:00,2019-06-01 07:37:49-04:00,,M22S,000001J,J,20190601,07:12:03,043205_J..S,Fulton St
127011,2019-06-01 07:37:59-04:00,2019-06-01 07:37:59-04:00,,M22S,000001J,J,20190601,07:12:03,043205_J..S,Fulton St
127742,2019-06-01 07:37:04-04:00,2019-06-01 07:37:04-04:00,,M22S,000001J,J,20190601,07:12:03,043205_J..S,Fulton St
128030,2019-06-01 07:37:19-04:00,2019-06-01 07:37:19-04:00,,M22S,000001J,J,20190601,07:12:03,043205_J..S,Fulton St
128114,2019-06-01 07:37:25-04:00,2019-06-01 07:37:25-04:00,,M22S,000001J,J,20190601,07:12:03,043205_J..S,Fulton St


In [462]:
# TODO: compute the mean and median travel time from '42 St - Port Authority Bus Terminal' to 'Canal St'.

#df[df.stop_name == 'Canal St']

In [368]:
def _event_time(event):
    """Return the ``time`` field of an arrival/departure dict, or None if absent."""
    return event.get('time') if isinstance(event, dict) else None


def parse_data(realtime_data):
    """Flatten realtime feed entities into one row per stop_time_update.

    Parameters
    ----------
    realtime_data
        Anything accepted by ``pd.DataFrame.from_dict`` that yields a
        ``trip_update`` column of dicts. Presumably the ``entity`` list of a
        GTFS-realtime feed converted with ``protobuf_to_dict`` -- confirm
        against the caller.

    Returns
    -------
    pandas.DataFrame
        The columns of each ``trip`` dict, followed by the columns of each
        ``stop_time_update`` dict. ``arrival`` and ``departure`` are always
        present and hold naive datetimes parsed from epoch seconds (NaT when
        the update carries no such time). The index repeats the label of the
        source entity once per stop_time_update. Entities without a
        ``trip_update`` or without stop_time_updates produce no rows; a feed
        with no rows yields an empty frame rather than raising.
    """
    entities = pd.DataFrame.from_dict(realtime_data)

    entity_index = []
    trip_rows = []
    stop_rows = []
    if 'trip_update' in entities.columns:
        for label, trip_update in entities['trip_update'].items():
            # Entities without a trip update show up as NaN here.
            if not isinstance(trip_update, dict):
                continue
            trip = trip_update.get('trip')
            if not isinstance(trip, dict):
                trip = {}
            for stop_update in trip_update.get('stop_time_update') or ():
                stop_row = dict(stop_update)
                # Keep only the timestamp of each event; an update may omit
                # arrival or departure, or carry extra keys next to 'time'.
                for event in ('arrival', 'departure'):
                    stop_row[event] = _event_time(stop_update.get(event))
                entity_index.append(label)
                trip_rows.append(trip)
                stop_rows.append(stop_row)

    if not stop_rows:
        return pd.DataFrame({
            'arrival': pd.Series(dtype='datetime64[ns]'),
            'departure': pd.Series(dtype='datetime64[ns]'),
        })

    # Build both halves once from plain records instead of repeatedly calling
    # .apply(pd.Series), which is slow and adds a stray column for NaN rows.
    trip_frame = pd.DataFrame(trip_rows)
    stop_frame = pd.DataFrame(stop_rows)
    for event in ('arrival', 'departure'):
        stop_frame[event] = pd.to_datetime(stop_frame[event], unit='s')

    # Concatenate on the shared positional index, then restore entity labels.
    flat = pd.concat([trip_frame, stop_frame], axis=1)
    flat.index = entity_index
    return flat

In [269]:
# NOTE: this cell expects `realtime_data` to already exist in the kernel, and
# its steps mirror parse_data (defined in another cell) plus a vehicle filter.
df = pd.DataFrame.from_dict(realtime_data)

# Keep only entities with no vehicle payload, then discard that column.
without_vehicle = df['vehicle'].isnull()
df = df[without_vehicle].drop(columns=['vehicle'])

# Spread each trip_update dict across columns.
df = df['trip_update'].apply(pd.Series)

# Spread each trip dict across columns, keeping stop_time_update alongside.
trip_columns = df['trip'].apply(pd.Series)
df = pd.concat([trip_columns, df['stop_time_update']], axis=1)

# Keep only rows that actually carry stop_time_update data.
df = df[df['stop_time_update'].notnull()]

# One row per stop_time_update element; each row keeps the index label of
# the entity it came from, so the join below re-attaches the trip columns.
idx = df.index.repeat(df['stop_time_update'].str.len())
df1 = pd.DataFrame({'stop_time_update': np.concatenate(df['stop_time_update'].values)})
df1.index = idx
df = df1.join(df.drop(columns='stop_time_update'), how='left')

# Spread each stop_time_update dict across columns (arrival, departure, stop_id).
df1 = df['stop_time_update'].apply(pd.Series)
df = pd.concat([df.drop(columns=['stop_time_update']), df1], axis=1)

# Unwrap the arrival/departure dicts and parse the epoch seconds as datetimes.
for column in ('arrival', 'departure'):
    df[column] = df[column].apply(pd.Series)
    df[column] = pd.to_datetime(df[column], unit='s')

df.head()

Unnamed: 0,trip_id,start_time,start_date,route_id,arrival,departure,stop_id
0,035551_A..S,05:55:31,20190601,A,2019-06-01 11:30:09,2019-06-01 11:30:09,A40S
0,035551_A..S,05:55:31,20190601,A,2019-06-01 11:32:39,2019-06-01 11:32:39,A41S
0,035551_A..S,05:55:31,20190601,A,2019-06-01 11:34:39,2019-06-01 11:34:39,A42S
0,035551_A..S,05:55:31,20190601,A,2019-06-01 11:39:39,2019-06-01 11:39:39,A46S
0,035551_A..S,05:55:31,20190601,A,2019-06-01 11:42:09,2019-06-01 11:42:09,A48S
