## Roll up GTFS data into Hourly files

In [3]:
%load_ext autoreload
%autoreload 2

from typing import List, Dict, Tuple
from google.transit import gtfs_realtime_pb2 as gtfs_rt
from protobuf_to_dict import protobuf_to_dict
import pandas as pd
import numpy as np
import sys, os
from os import listdir
from os.path import isfile, join
import re
from datetime import datetime

from pandas.io.json import json_normalize
from itertools import groupby

sys.path.insert(0, os.path.realpath('/content/jupyter/mta-accessibility/notebooks/routing'))
import GTFS_Utils as gu

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
#gu.get_file_names('201906/20190601', 'ace', '20190601')[:3]
#gu.load_gtfs(['201906/20190601/gtfs_ace_20190601_072836.gtfs'])

typing.Tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]


In [74]:

def parse_updates(realtime_data: List[Dict]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Returns trip_updates, vehicle_updates
    """    
    vdf = df = json_normalize(realtime_data)    
    
    # Parse trip updates
    df = df[~df['trip_update.stop_time_update'].isnull()]
    idx = df.index.repeat(df['trip_update.stop_time_update'].str.len())
    df1 = pd.DataFrame({'trip_update.stop_time_update': np.concatenate(df['trip_update.stop_time_update'].values)})
    df1.index = idx
    df = df1.join(df.drop('trip_update.stop_time_update', axis=1), how='left')
    df = df.reset_index(drop=True)
    d2 = json_normalize(df['trip_update.stop_time_update'])
    df = pd.concat([d2, df.drop('trip_update.stop_time_update', axis=1)], axis=1)
    df = df.drop(list(df.filter(regex='vehicle')), axis=1) 
        
    # Parse vehicle updates
    vdf = vdf[~vdf['vehicle.trip.trip_id'].isnull()]
    vdf = vdf.drop(list(vdf.filter(regex='trip_update')), axis=1)
    return df, vdf

In [4]:
%%time

file_names = gu.get_file_names('/content/jupyter/20190601', 'ace', '20190601')
feed = gtfs_rt.FeedMessage()


realtime_data = gu.load_gtfs(file_names[:1000])
realtime_data = [d for d in realtime_data if 'vehicle' in d]

vdf = json_normalize(realtime_data)    
vdf.columns = vdf.columns.str.replace('vehicle\.', '')
# vdf.columns = vdf.columns.str.replace('trip\.', '')

vdf.head()




  feed.ParseFromString(raw_str)


CPU times: user 1min 6s, sys: 2.12 s, total: 1min 8s
Wall time: 1min 8s


Unnamed: 0,id,current_status,current_stop_sequence,stop_id,timestamp,trip.route_id,trip.start_date,trip.start_time,trip.trip_id
0,000002A,,29.0,H04,1559389000.0,A,20190601,05:55:31,035551_A..S
1,000004A,,28.0,H03,1559389000.0,A,20190601,06:03:01,036301_A..S
2,000006A,,28.0,A24,,A,20190601,06:11:30,037150_A..N
3,000008A,,27.0,A05,1559389000.0,A,20190601,06:22:00,038200_A..N
4,000010A,,20.0,A48,,A,20190601,06:29:35,038958_A..S


In [10]:
#vdf.groupby('trip.trip_id')['stop_id'].count()
vdf

trip.trip_id
000000_FS..S    43
000050_E..N     43
000200_A..N     40
000300_H..S     38
000950_A..S     33
000950_FS..N    33
001250_E..N     31
001350_E..S     30
002000_A..N     27
002000_FS..S    27
002000_H..N     27
002200_A..N     25
002450_E..N     22
002600_H..S     22
002950_A..S     20
002950_FS..N    20
003350_E..S     18
003750_E..N     15
004000_A..N     13
004000_FS..S    13
004150_H..N     10
004200_A..N     10
004950_A..S      8
004950_E..N      8
004950_FS..N     8
005000_H..S      8
005350_E..S      3
005650_A..S      2
005650_H..N      2
010198_H..N     19
                ..
140550_E..S     29
140600_E..N     73
140900_FS..N    22
140950_FS..N    48
141000_A..S     54
141100_E..S     54
141146_H..N     19
141150_H..N     50
141200_A..N     68
141350_A..S     17
141650_E..N     65
141650_E..S     16
141800_A..N     65
141900_H..S     56
142000_FS..S    53
142200_A..S     52
142300_E..S     50
142400_E..S     10
142500_A..S     10
142700_A..N     46
142785_A..N      9

In [21]:
%%time
import warnings
import tables

# TODO add better logging of corrupted GTFS files.
# TODO fix this sketch suppressed warning.
warnings.simplefilter('ignore', tables.NaturalNameWarning)

# Take GTFS files and group them by hour. Write them back as pickled dataframes.
def create_hourly_files(data_dir: str, output_dir: str):
    file_name_pat = re.compile('gtfs_([a-zA-Z0-9]+)_([0-9]+)_([0-9]+)\.gtfs')
        
    # All files in given data_dir
    all_files = [f for f in listdir(data_dir) if isfile(join(data_dir, f))]
        
    # Sort key on the hour part of the timestamp
    def sort_key(x):
        file_name_pat = re.compile('.*gtfs_([a-zA-Z0-9]+)_([0-9]+)_([0-9]+)\.gtfs')
        regex_group = file_name_pat.match(x)
        time = regex_group.group(3)
        return time[:2]

    date = data_dir.partition('/')[2]
    all_files = list(map(lambda x: '{}/{}'.format(data_dir, x), all_files))
    all_files.sort(key=sort_key)
    for hour,g in groupby(all_files, key=sort_key):    
        output_name = '{}_{}.pkl'.format(date, hour)
        output_path = output_dir + '/' + output_name

        start_time = datetime.now()
        realtime_data = gu.load_gtfs(list(g))
        df = parse_trip_updates(realtime_data)   
        df.to_pickle(output_path)
        end_time = datetime.now()        
        print('Processing file {} took {}'.format(output_name, (end_time - start_time)))


CPU times: user 61.5 ms, sys: 24.2 ms, total: 85.7 ms
Wall time: 1.25 s


In [45]:
%%time

parent_dir = '201906'

dirs = [f for f in listdir(parent_dir) if not isfile(data_dir)]
dirs.remove('zipped')
dirs.remove('20190601')

for data_dir in dirs:
    start_time = datetime.now()
    create_hourly_files(parent_dir + '/' + data_dir, 'hourly')
    end_time = datetime.now()        
    print('----Processing dir {} took {}'.format(data_dir, (end_time - start_time)))


Processing file 20190629_03.pkl took 0:00:00.065217
Processing file 20190629_04.pkl took 0:01:44.459757
Processing file 20190629_05.pkl took 0:02:58.726642
Processing file 20190629_06.pkl took 0:03:37.850944
Processing file 20190629_07.pkl took 0:04:04.695024
Processing file 20190629_08.pkl took 0:04:08.131597
Processing file 20190629_09.pkl took 0:04:01.155779
Processing file 20190629_10.pkl took 0:03:43.858287
Processing file 20190629_11.pkl took 0:03:34.815877
Processing file 20190629_12.pkl took 0:03:33.339774
Processing file 20190629_13.pkl took 0:03:54.627952
Error with file 201906/20190629/gtfs_ace_20190629_140835.gtfs
Processing file 20190629_14.pkl took 0:03:42.289809
Processing file 20190629_15.pkl took 0:03:40.690832
Error with file 201906/20190629/gtfs_nqrw_20190629_161144.gtfs
Processing file 20190629_16.pkl took 0:03:35.248919
Processing file 20190629_17.pkl took 0:03:29.295154


  from ipykernel import kernelapp as app


Processing file 20190629_18.pkl took 0:03:11.339049
Error with file 201906/20190629/gtfs_ace_20190629_193812.gtfs
Processing file 20190629_19.pkl took 0:02:52.909652
Processing file 20190629_20.pkl took 0:02:41.842424
Processing file 20190629_21.pkl took 0:02:32.457324
Processing file 20190629_22.pkl took 0:02:25.944983
Processing file 20190629_23.pkl took 0:02:05.238049
----Processing dir 20190629 took 1:05:40.150022
Processing file 20190602_03.pkl took 0:00:00.066349
Processing file 20190602_04.pkl took 0:01:37.059183
Processing file 20190602_05.pkl took 0:02:26.101203
Error with file 201906/20190602/gtfs_7_20190602_062811.gtfs
Processing file 20190602_06.pkl took 0:02:38.304160
Processing file 20190602_07.pkl took 0:03:02.649863
Processing file 20190602_08.pkl took 0:03:11.004930
Processing file 20190602_09.pkl took 0:03:20.661430
Processing file 20190602_10.pkl took 0:03:11.521656
Error with file 201906/20190602/gtfs_nqrw_20190602_115534.gtfs
Processing file 20190602_11.pkl took 0:

KeyError: 'trip_update.stop_time_update'