## Roll up GTFS data into Hourly files

In [26]:
%load_ext autoreload
%autoreload 2

from typing import List, Dict, Tuple
from google.transit import gtfs_realtime_pb2 as gtfs_rt
from protobuf_to_dict import protobuf_to_dict
import pandas as pd
import numpy as np
import sys, os
from os import listdir
from os.path import isfile, join
import re
from datetime import datetime

from pandas.io.json import json_normalize
from itertools import groupby

sys.path.insert(0, os.path.realpath('/content/jupyter/mta-accessibility/notebooks/routing'))
import GTFS_Utils as gu
from gcs_utils import gcs_util

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
import logging

class Plog:
    """Thin wrapper around the ``gtfs_parser`` logger.

    Every instantiation (re)configures the logger with two handlers:

    * a file handler writing DEBUG and above to ``parser<timestamp>.log``
      in the current working directory, and
    * a console handler that only shows ERROR and above.

    Handlers installed by a previous instantiation are detached and closed
    first, so re-running the notebook cell neither duplicates log lines nor
    leaks open log files.
    """

    def __init__(self):
        log = logging.getLogger('gtfs_parser')
        log.setLevel(logging.DEBUG)

        # %H (24-hour clock) keeps file names unique across AM/PM; the
        # 12-hour %I would give a 03:00 run and a 15:00 run the same name.
        filename = "parser{}.log".format(datetime.now().strftime("%Y%m%d-%H%M%S"))
        # create file handler which logs even debug messages
        fh = logging.FileHandler(filename)
        fh.setLevel(logging.DEBUG)

        # create console handler with a higher log level
        ch = logging.StreamHandler()
        ch.setLevel(logging.ERROR)

        # create formatter and add it to the handlers
        formatter = logging.Formatter('%(asctime)s-%(levelname)s: %(message)s')
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)

        # Detach AND close whatever an earlier Plog() attached; merely clearing
        # the list would leave the previous log file open.
        for stale in list(log.handlers):
            log.removeHandler(stale)
            stale.close()

        # add the handlers to the logger
        log.addHandler(fh)
        log.addHandler(ch)

        self.log_instance = log

    def error(self, *args, **kwargs):
        """Log at ERROR level (written to the file and shown on the console)."""
        self.log_instance.error(*args, **kwargs)

    def info(self, *args, **kwargs):
        """Log at INFO level (written to the file only)."""
        self.log_instance.info(*args, **kwargs)

    def warn(self, *args, **kwargs):
        """Log at WARNING level (written to the file only)."""
        self.log_instance.warning(*args, **kwargs)
        
# Logger shared by the processing cells below.
log = Plog()

# Storage helper from the project-local gcs_utils module; create_hourly_files
# uploads its results through it.
gcs = gcs_util()
# Show what list_dirs() reports (cell output below) as a quick sanity check.
gcs.list_dirs()

['/',
 '201906/',
 'hourly_gtfs/',
 'hourly_vehicle_update/',
 'merged_hourly_gtfs/',
 'test/',
 'tripify/']

In [34]:

def parse_vehicle_updates(realtime_data: List[Dict]) -> pd.DataFrame:
    """Flatten realtime feed entities and keep only the vehicle updates.

    Args:
        realtime_data: feed entities as (nested) dicts; in this notebook they
            come from ``gu.load_gtfs``.

    Returns:
        One row per entity that has a ``vehicle.trip.trip_id``, with every
        column whose name contains ``trip_update`` removed. When the input is
        empty or carries no vehicle entities at all, an empty frame is
        returned instead of raising ``KeyError``.
    """
    df = json_normalize(realtime_data)
    if 'vehicle.trip.trip_id' in df.columns:
        df = df[df['vehicle.trip.trip_id'].notnull()]
    else:
        # No entity had a vehicle block, so the flattened column does not
        # exist; keep the remaining schema but no rows.
        df = df.iloc[0:0]
    trip_update_cols = [col for col in df.columns if 'trip_update' in str(col)]
    return df.drop(trip_update_cols, axis=1)

def _explode_stop_time_updates(raw: pd.DataFrame) -> pd.DataFrame:
    """Return one row per stop-time update, with the nested update flattened into columns."""
    stu_col = 'trip_update.stop_time_update'

    if stu_col not in raw.columns:
        # Feed without any trip updates: nothing to explode.
        trips = raw.iloc[0:0]
        return trips.drop([c for c in trips.columns if 'vehicle' in str(c)], axis=1)

    trips = raw[~raw[stu_col].isnull()]
    if trips.empty:
        # np.concatenate() refuses an empty sequence, so bail out early.
        trips = trips.drop(stu_col, axis=1)
        return trips.drop([c for c in trips.columns if 'vehicle' in str(c)], axis=1)

    # Repeat each entity's index once per stop-time update it carries, so the
    # flattened updates can be joined back to their parent entity.
    idx = trips.index.repeat(trips[stu_col].str.len())
    exploded = pd.DataFrame({stu_col: np.concatenate(trips[stu_col].values)})
    exploded.index = idx
    joined = exploded.join(trips.drop(stu_col, axis=1), how='left')
    joined = joined.reset_index(drop=True)

    # Turn each stop-time-update dict into its own set of columns.
    update_cols = json_normalize(joined[stu_col])
    result = pd.concat([update_cols, joined.drop(stu_col, axis=1)], axis=1)
    return result.drop(list(result.filter(regex='vehicle')), axis=1)


def _select_vehicle_updates(raw: pd.DataFrame) -> pd.DataFrame:
    """Return the entities that have a vehicle trip id, without trip-update columns."""
    if 'vehicle.trip.trip_id' in raw.columns:
        vehicles = raw[raw['vehicle.trip.trip_id'].notnull()]
    else:
        vehicles = raw.iloc[0:0]
    return vehicles.drop([c for c in vehicles.columns if 'trip_update' in str(c)], axis=1)


def parse_updates(realtime_data: List[Dict]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Returns trip_updates, vehicle_updates

    ``trip_updates`` has one row per stop-time update (the nested update is
    flattened into columns, followed by the parent entity's non-vehicle
    columns). ``vehicle_updates`` has one row per entity with a
    ``vehicle.trip.trip_id`` and no ``trip_update`` columns.

    Feeds that are empty, or that lack trip updates or vehicle entities,
    yield empty frames for the missing part instead of raising.
    """
    raw = json_normalize(realtime_data)
    return _explode_stop_time_updates(raw), _select_vehicle_updates(raw)

In [35]:
%%time
import warnings
import tables

warnings.simplefilter('ignore', tables.NaturalNameWarning)

# Take GTFS files and group them by hour. Write them back as pickled dataframes.
def create_hourly_files(data_dir: str, output_dir: str, stops_path: str = 'stops.txt') -> None:
    """Roll one directory of GTFS snapshots up into one vehicle-update frame per hour.

    Files are expected to be named ``gtfs_<feed>_<date>_<hhmmss>.gtfs``; the
    first two digits of the time part select the hourly bucket. For every hour
    the snapshots are loaded with ``gu.load_gtfs``, reduced to de-duplicated
    vehicle updates, joined with the stop names and handed to
    ``gcs.upload_dataframe`` under ``<output_dir>/<date>_<hour>.pkl``, where
    ``<date>`` is the last path component of ``data_dir``.

    A failure while processing one hour is logged and does not stop the
    remaining hours.

    Args:
        data_dir: directory holding the ``.gtfs`` snapshot files of one day.
        output_dir: destination prefix passed on to ``gcs.upload_dataframe``.
        stops_path: CSV with at least ``stop_id`` and ``stop_name`` columns.
    """
    # Raw strings: '\.' in a plain literal is an invalid escape sequence.
    file_name_pat = re.compile(r'.*gtfs_([a-zA-Z0-9]+)_([0-9]+)_([0-9]+)\.gtfs')
    # Filter out all L train files. Schema is different than the rest.
    l_train_pat = re.compile(r'.*gtfs_L_([0-9]+)_([0-9]+)\.gtfs')

    # Collect (hour, path) pairs. Names are visited in sorted order so the
    # file order inside each hourly bucket is deterministic.
    hourly_files = []
    for name in sorted(listdir(data_dir)):
        path = join(data_dir, name)
        if not isfile(path) or l_train_pat.match(name):
            continue
        matched = file_name_pat.match(name)
        if matched is None:
            # Stray files must not abort the whole day.
            log.warn('Skipping file with unexpected name {}'.format(path))
            continue
        hourly_files.append((matched.group(3)[:2], path))
    # Stable sort: groupby() below needs the hours to be contiguous.
    hourly_files.sort(key=lambda item: item[0])

    # Tolerate a trailing slash, which would otherwise give an empty date.
    date = data_dir.rstrip('/').split('/')[-1]

    # Stops data
    stop_df = pd.read_csv(stops_path)[['stop_id', 'stop_name']]

    for hour, group in groupby(hourly_files, key=lambda item: item[0]):
        output_name = '{}_{}.pkl'.format(date, hour)
        output_path = output_dir + '/' + output_name
        start_time = datetime.now()
        files = [path for _, path in group]
        try:
            realtime_data = gu.load_gtfs(files)
            df = parse_vehicle_updates(realtime_data).drop_duplicates()
            # Strip the 'vehicle.' / 'trip.' prefixes left by the flattening.
            # regex=True is explicit because pandas 2.0 changed the default
            # to a literal match.
            df.columns = df.columns.str.replace(r"vehicle\.", '', regex=True)
            df.columns = df.columns.str.replace(r"trip\.", '', regex=True)
            df = df.merge(stop_df, on='stop_id')
            df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
            gcs.upload_dataframe(df, output_path)
            end_time = datetime.now()
            log.info('Processing file {} took {}'.format(output_name, (end_time - start_time)))
        except Exception as e:
            # Best effort per hour: record the failure and carry on.
            log.error("{}-{}".format(output_name, str(e)))
        

CPU times: user 40 µs, sys: 0 ns, total: 40 µs
Wall time: 47.7 µs


In [None]:
%%time

parent_dir = '/content/jupyter/201906'

# Entries under parent_dir that must not be processed. Add '20190601' here
# to skip that day as well.
skip_dirs = {'zipped', '20190629'}

# The file test has to be made on the full path: a bare name would be
# resolved against the current working directory and never filter anything.
# Filtering by membership also avoids the ValueError list.remove() raises
# when an entry to skip is absent.
dirs = sorted(
    entry for entry in listdir(parent_dir)
    if not isfile(join(parent_dir, entry)) and entry not in skip_dirs
)

for data_dir in dirs:
    log.info(f"Starting ---{data_dir}---")
    start_time = datetime.now()
    create_hourly_files(parent_dir + '/' + data_dir, "hourly_vehicle_update")
    end_time = datetime.now()
    log.info("Processing dir {} took {}".format(data_dir, (end_time - start_time)))


2020-04-03 06:22:41,607-ERROR: 20190602_04.pkl-Error with file /content/jupyter/201906/20190602/gtfs_bdfm_20190602_045828.gtfs <built-in method ParseFromString of FeedMessage object at 0x7f83e17628a0> returned a result with an error set
2020-04-03 06:24:05,201-ERROR: 20190602_06.pkl-Error with file /content/jupyter/201906/20190602/gtfs_7_20190602_062811.gtfs Error parsing message
2020-04-03 06:24:05,436-ERROR: 20190602_07.pkl-Error with file /content/jupyter/201906/20190602/gtfs_j_20190602_070153.gtfs <built-in method ParseFromString of FeedMessage object at 0x7f83e17628a0> returned a result with an error set
2020-04-03 06:24:22,384-ERROR: 20190602_08.pkl-Error with file /content/jupyter/201906/20190602/gtfs_g_20190602_080208.gtfs <built-in method ParseFromString of FeedMessage object at 0x7f83e17628a0> returned a result with an error set
2020-04-03 06:28:32,130-ERROR: 20190602_11.pkl-Error with file /content/jupyter/201906/20190602/gtfs_nqrw_20190602_115534.gtfs Error parsing message


In [10]:
# Scratch check: load a single G-train snapshot and parse its vehicle updates.
realtime_data = gu.load_gtfs(["/content/jupyter/201906/20190629/gtfs_g_20190629_093355.gtfs"])
df1 = parse_vehicle_updates(realtime_data)

Index(['id', 'trip_update.stop_time_update', 'trip_update.trip.route_id',
       'trip_update.trip.start_date', 'trip_update.trip.start_time',
       'trip_update.trip.trip_id', 'vehicle.current_stop_sequence',
       'vehicle.stop_id', 'vehicle.timestamp', 'vehicle.trip.route_id',
       'vehicle.trip.start_date', 'vehicle.trip.start_time',
       'vehicle.trip.trip_id'],
      dtype='object')


In [10]:
# Preview the first rows of the parsed G-train frame.
df1.head()

Unnamed: 0,id,trip_update.stop_time_update,trip_update.trip.route_id,trip_update.trip.start_date,trip_update.trip.start_time,trip_update.trip.trip_id,vehicle.current_stop_sequence,vehicle.stop_id,vehicle.timestamp,vehicle.trip.route_id,vehicle.trip.start_date,vehicle.trip.start_time,vehicle.trip.trip_id
1,000002G,,,,,,17.0,F27,,G,20190629,08:56:30,053650_G..S
3,000004G,,,,,,14.0,F21,,G,20190629,09:06:30,054650_G..S
5,000006G,,,,,,10.0,G35,1561815000.0,G,20190629,09:16:30,055650_G..S
7,000008G,,,,,,12.0,G33,1561815000.0,G,20190629,09:13:00,055300_G..N
9,000010G,,,,,,4.0,G29,1561815000.0,G,20190629,09:26:30,056650_G..S


In [12]:
# Same preview with every column whose name matches 'trip_update' removed.
df1.drop(list(df1.filter(regex='trip_update')), axis=1).head()

Unnamed: 0,id,vehicle.current_stop_sequence,vehicle.stop_id,vehicle.timestamp,vehicle.trip.route_id,vehicle.trip.start_date,vehicle.trip.start_time,vehicle.trip.trip_id
1,000002G,17.0,F27,,G,20190629,08:56:30,053650_G..S
3,000004G,14.0,F21,,G,20190629,09:06:30,054650_G..S
5,000006G,10.0,G35,1561815000.0,G,20190629,09:16:30,055650_G..S
7,000008G,12.0,G33,1561815000.0,G,20190629,09:13:00,055300_G..N
9,000010G,4.0,G29,1561815000.0,G,20190629,09:26:30,056650_G..S


In [28]:
# Scratch check: parse a single L-train snapshot to compare its columns with
# the G-train frame above (see the column listing printed below).
realtime_data = gu.load_gtfs(["/content/jupyter/201906/20190629/gtfs_L_20190629_035944.gtfs"])
df2 = parse_vehicle_updates(realtime_data)

Index(['id', 'trip_update.stop_time_update', 'trip_update.trip.route_id',
       'trip_update.trip.start_date', 'trip_update.trip.trip_id',
       'vehicle.current_status', 'vehicle.current_stop_sequence',
       'vehicle.timestamp', 'vehicle.trip.route_id', 'vehicle.trip.start_date',
       'vehicle.trip.trip_id'],
      dtype='object')


In [40]:
# Scratch check of the L-train file filter. Pattern.match() only matches at
# the start of the string, so this pattern (no leading '.*') filters the bare
# file name but lets the same file through when given as a full path, as the
# output below shows.
ls = ["gtfs_L_20190629_035944.gtfs", "gtfs_g_20190629_093355.gtfs", "/content/jupyter/201906/20190629/gtfs_L_20190629_035944.gtfs"]
file_name_pat = re.compile('gtfs_L_([0-9]+)_([0-9]+)\.gtfs')
[l for l in ls if not file_name_pat.match(l)]

['gtfs_g_20190629_093355.gtfs',
 '/content/jupyter/201906/20190629/gtfs_L_20190629_035944.gtfs']

In [27]:
# Preview the first rows of the parsed L-train frame.
df2.head()

Unnamed: 0,id,vehicle.current_status,vehicle.current_stop_sequence,vehicle.timestamp,vehicle.trip.route_id,vehicle.trip.start_date,vehicle.trip.trip_id
1,2,2.0,18.0,1561795000.0,L,20190629,020300_L..N
3,4,1.0,12.0,1561795000.0,L,20190629,021250_L..S
5,6,2.0,12.0,1561795000.0,L,20190629,022300_L..N
7,8,1.0,1.0,1561795000.0,L,20190629,023250_L..S
9,10,1.0,0.0,1561795000.0,L,20190629,024300_L..N
