# Notebook with GTFS methods

Goals: 

* Make a way to calculate the scheduled number of current active trips given a date, time, and route. 
    - Take datetime and find what services are active on that date 
    - Find what trips run on those services + route 
    - Find which of those trips are "in progress" per stop_times
* ~Output most common shape by route~

In [4]:
# imports 

import boto3
import os
import pandas as pd
import zipfile
import requests
import pendulum
from io import BytesIO
import shapely
import geopandas

In [5]:
# if "private", will assume you have write permissions and allow you to write; else will not attempt to write files
BUCKET_TYPE = "private"

In [6]:
# local 
# CTA_GTFS = zipfile.ZipFile('cta_gtfs_20220509.zip')
# s3
# follow https://pythonguides.com/download-zip-file-from-url-using-python/
# CTA_GTFS = zipfile.ZipFile(BytesIO(requests.get('https://chn-ghost-buses-public.s3.us-east-2.amazonaws.com/cta_static_gtfs/cta_gtfs_20220509.zip').content))
# cta website

# VERSION_ID = '20220718'

RTD_GTFS = zipfile.ZipFile('../../utils/utils/gtfs/google_transit_20231504.zip') # 

In [7]:
class GTFSFeed:
   """ Static GTFS management """
   def __init__(self, gtfs_zipfile):
        self.gtfs_zipfile = gtfs_zipfile
        self.feed_start = None
        self.feed_end = None
        try: 
            with self.gtfs_zipfile.open('stops.txt') as file:
                    self.stops = pd.read_csv(file, dtype = 'object')
                    print("stops.txt loaded")
            with self.gtfs_zipfile.open('stop_times.txt') as file:
                    self.stop_times = pd.read_csv(file, dtype = 'object')
                    print("stop_times.txt loaded")
            with self.gtfs_zipfile.open('routes.txt') as file:
                    self.routes = pd.read_csv(file, dtype = 'object')
                    print("routes.txt loaded")
            with self.gtfs_zipfile.open('trips.txt') as file:
                    self.trips = pd.read_csv(file, dtype = 'object')
                    print("trips.txt loaded")
        except KeyError as e:
            print("GTFS is missing required file")
            print(e)
        if 'calendar.txt' in self.gtfs_zipfile.namelist():
                with self.gtfs_zipfile.open('calendar.txt') as file:
                        self.calendar = pd.read_csv(file, dtype = 'object')
                        print("calendar.txt loaded")
        else:
            print("no calendar.txt found")
        if 'calendar_dates.txt' in self.gtfs_zipfile.namelist():
                with self.gtfs_zipfile.open('calendar_dates.txt') as file:
                        self.calendar_dates = pd.read_csv(file, dtype = 'object')
                        print("calendar_dates.txt loaded")
        else:
            print("no calendar_dates.txt found")
        if 'shapes.txt' in self.gtfs_zipfile.namelist():
                with self.gtfs_zipfile.open('shapes.txt') as file:
                        self.shapes = pd.read_csv(file, dtype = 'object')
                        print("shapes.txt loaded")
        else:
            print("no shapes.txt found")
        if 'feed_info.txt' in self.gtfs_zipfile.namelist():
                with self.gtfs_zipfile.open('feed_info.txt') as file:
                        self.feed_info = pd.read_csv(file, dtype = 'object')
                        print("feed_info.txt loaded")
                self.feed_start = pd.to_datetime(self.feed_info['feed_start_date'][0])
                self.feed_end = pd.to_datetime(self.feed_info['feed_end_date'][0])
        else:
            print("no feed_info.txt found")
            

In [8]:
data = GTFSFeed(RTD_GTFS)

stops.txt loaded
stop_times.txt loaded
routes.txt loaded
trips.txt loaded
calendar.txt loaded
calendar_dates.txt loaded
shapes.txt loaded
feed_info.txt loaded


In [9]:
data.feed_end

Timestamp('2023-08-19 00:00:00')

In [10]:
# TODO: Convert calen
data.calendar
# data.calendar_dates
# data.trips
# data.stop_times

Unnamed: 0,service_id,start_date,end_date,monday,tuesday,wednesday,thursday,friday,saturday,sunday
0,SA_merged_114569136,20230528,20230819,0,0,0,0,0,1,0
1,SA_3340,20230403,20230501,0,0,0,0,0,1,0
2,MT_3340,20230403,20230501,1,1,1,1,0,0,0
3,WK_3340,20230403,20230501,1,1,1,1,1,0,0
4,FR_merged_114569130,20230528,20230819,0,0,0,0,1,0,0
5,MT_merged_114569127,20230108,20230527,1,1,1,1,0,0,0
6,FR_merged_114569123,20230108,20230527,0,0,0,0,1,0,0
7,FR_3340,20230403,20230501,0,0,0,0,1,0,0
8,WK_merged_114569132,20230528,20230819,1,1,1,1,1,0,0
9,WK_merged_114569125,20230108,20230527,1,1,1,1,1,0,0


In [11]:
def get_hour(s):
    parts = s.split(':')
    assert len(parts)==3
    hour = int(parts[0])
    if hour >= 24:
        hour -= 24
    return hour

In [12]:

data.calendar['start_date_dt'] = pd.to_datetime(data.calendar['start_date'], format='%Y%m%d' ).dt.tz_localize('America/Denver')
data.calendar['end_date_dt'] = pd.to_datetime(data.calendar['end_date'], format='%Y%m%d' ).dt.tz_localize('America/Denver')
data.calendar_dates['date_dt'] = pd.to_datetime(data.calendar_dates['date'], format='%Y%m%d' ).dt.tz_localize('America/Denver')

# extract hour from stop_times timestamps 
data.stop_times['arrival_hour'] = data.stop_times.arrival_time.apply(lambda x: get_hour(x))
data.stop_times['departure_hour'] = data.stop_times.departure_time.apply(lambda x: get_hour(x))
data.calendar


Unnamed: 0,service_id,start_date,end_date,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date_dt,end_date_dt
0,SA_merged_114569136,20230528,20230819,0,0,0,0,0,1,0,2023-05-28 00:00:00-06:00,2023-08-19 00:00:00-06:00
1,SA_3340,20230403,20230501,0,0,0,0,0,1,0,2023-04-03 00:00:00-06:00,2023-05-01 00:00:00-06:00
2,MT_3340,20230403,20230501,1,1,1,1,0,0,0,2023-04-03 00:00:00-06:00,2023-05-01 00:00:00-06:00
3,WK_3340,20230403,20230501,1,1,1,1,1,0,0,2023-04-03 00:00:00-06:00,2023-05-01 00:00:00-06:00
4,FR_merged_114569130,20230528,20230819,0,0,0,0,1,0,0,2023-05-28 00:00:00-06:00,2023-08-19 00:00:00-06:00
5,MT_merged_114569127,20230108,20230527,1,1,1,1,0,0,0,2023-01-08 00:00:00-07:00,2023-05-27 00:00:00-06:00
6,FR_merged_114569123,20230108,20230527,0,0,0,0,1,0,0,2023-01-08 00:00:00-07:00,2023-05-27 00:00:00-06:00
7,FR_3340,20230403,20230501,0,0,0,0,1,0,0,2023-04-03 00:00:00-06:00,2023-05-01 00:00:00-06:00
8,WK_merged_114569132,20230528,20230819,1,1,1,1,1,0,0,2023-05-28 00:00:00-06:00,2023-08-19 00:00:00-06:00
9,WK_merged_114569125,20230108,20230527,1,1,1,1,1,0,0,2023-01-08 00:00:00-07:00,2023-05-27 00:00:00-06:00


In [13]:
# def format_dates_hours(data):
#     # convert string dates to actual datetimes in calendar.txt and calendar_dates.txt
#     data.calendar['start_date_dt'] = data.calendar['start_date'].apply(lambda x: pendulum.from_format(x, 'YYYYMMDD', tz='America/Denver'))
#     data.calendar['end_date_dt'] = data.calendar['end_date'].apply(lambda x: pendulum.from_format(x, 'YYYYMMDD', tz='America/Denver'))
#     data.calendar_dates['date_dt'] = data.calendar_dates['date'].apply(lambda x: pendulum.from_format(x, 'YYYYMMDD', tz='America/Denver'))
    
#     # extract hour from stop_times timestamps 
#     data.stop_times['arrival_hour'] = data.stop_times.arrival_time.apply(lambda x: get_hour(x))
#     data.stop_times['departure_hour'] = data.stop_times.departure_time.apply(lambda x: get_hour(x))
    
#     return data

In [14]:
# data = format_dates_hours(data)

In [15]:
calendar_date_range = pd.DataFrame(pd.date_range(data.feed_start, data.feed_end, tz = 'America/Denver'), columns = ['raw_date'])
    
#     # cross join calendar index with actual calendar to get all combos of possible dates & services 
calendar_cross = calendar_date_range.merge(data.calendar, how = "cross")

#     # extract day of week from date index date
calendar_cross['dayofweek'] = calendar_cross['raw_date'].dt.dayofweek
calendar_cross

    # take wide calendar data (one col per day of week) and make it long (one row per day of week)
scheduled_service = calendar_cross.melt(id_vars = ['raw_date', 'start_date', 'end_date', 'service_id', 'dayofweek'], var_name = 'cal_dayofweek', value_name = 'cal_val')

# #     # map the calendar input strings to day of week integers to align w pandas dayofweek output
scheduled_service['cal_daynum'] = scheduled_service['cal_dayofweek'].map({
    'monday': 0,
    'tuesday': 1,
    'wednesday': 2,
    'thursday': 3,
    'friday': 4,
    'saturday': 5,
    'sunday': 6
})
scheduled_service = scheduled_service[(scheduled_service.dayofweek == scheduled_service.cal_daynum) & 
                                   (scheduled_service.start_date <= scheduled_service.raw_date) &
                                   (scheduled_service.end_date >= scheduled_service.raw_date)]
    
# now merge in calendar dates to the datetime index to get overrides
scheduled_service = scheduled_service.merge(data.calendar_dates, how = 'outer', left_on = ['raw_date', 'service_id'], right_on = ['date_dt', 'service_id'])

# # now add a service happened flag for dates where the schedule indicates that this service occurred
# # i.e.: calendar has a service indicator of 1 and there's no exception type from calendar_dates
# # OR calendar_dates has exception type of 1
# # otherwise no service 
# # https://stackoverflow.com/questions/21415661/logical-operators-for-boolean-indexing-in-pandas
scheduled_service['scheduled_service_flag'] = ((scheduled_service['cal_val'] == '1') & 
                                        scheduled_service['exception_type'].isnull()) | (scheduled_service['exception_type'] == '1')
# Note: Really a "service_scheduled", not "service_happened"



# # now fill in rows where calendar_dates had a date outside the bounds of the datetime index, so raw_date is always populated
scheduled_service['raw_date'] = scheduled_service['raw_date'].fillna(scheduled_service['date_dt'])

# # filter to only rows where service occurred
scheduled_service = scheduled_service[scheduled_service.scheduled_service_flag]

# # join trips to only service that occurred
trips_scheduled = data.trips.merge(scheduled_service, how = 'left', on = 'service_id')


# 
# # get only the trip / hour combos that actually occurred
# Drop this, since it's assuming one-stop-per-trip
# trip_stop_hours = data.stop_times[['trip_id', 'arrival_hour']].drop_duplicates()

# # now join
# # result has one row per date + row from trips.txt (incl. route) + hour
# trip_summary = trips_happened.merge(trip_stop_hours, how = "left", on = "trip_id")

In [16]:
# Stops - reduce fields to not-derivable 
# stops_scheduled = 
trips_scheduled.head()
trips_scheduled_keep_cols = ['block_id',	'route_id',	'direction_id',	'trip_headsign',	'shape_id',	'service_id',	'trip_id',	'raw_date']
stop_time_keep_cols = ['trip_id',	'arrival_time',	'departure_time',	'stop_id',	'stop_sequence',	'stop_headsign',	'pickup_type',	'drop_off_type',	'shape_dist_traveled',	'timepoint'] 
# Note that 'shape_dist_traveled',	'timepoint' are not used for RTD, but could be useful elsewhere
data.stop_times
stops_scheduled = trips_scheduled[trips_scheduled_keep_cols].merge(data.stop_times[stop_time_keep_cols], on = 'trip_id' )

In [17]:
data.stop_times.astype({'stop_sequence': 'int32'}).groupby('trip_id')['stop_sequence'].agg('max').median() #.plot(kind='hist') # value_counts() #plot(kind='hist')

38.0

In [18]:
import swifter

In [19]:
# Transform trips + stops into easily-worked-with final datasets


# Stops:
# * Convert arrival_time and departure_time into proper datetimes. Note: Slow!
# Is there a quicker way to run this? String processing only, then to_datetime?? ~3 hours per column right now.
def combine_day_stop(raw_dt, stop_time):
    """ Only treat as string unless not possible elsewhere - to_datetime is incredibly slow in apply """
    try:
        if int(stop_time[0:2]) > 23:
            # Replace hour with (hour-24). Add one day to raw_dt.
            hour_replace = str(int(stop_time[0:2]) - 24).zfill(2)
            stop_time = hour_replace + stop_time[2:]
            arrival_time = pd.to_datetime(f"{raw_dt.date() + pd.Timedelta('1d')}T{stop_time}").tz_localize(raw_dt.tz)
        else:
            arrival_time = pd.to_datetime(f"{raw_dt.date()}T{stop_time}").tz_localize(raw_dt.tz) # f"{raw_dt.date()} {stop_time}" 
        return arrival_time
    except:
        return None
# Parallel-process "apply". Can't easily be vectorized.
stops_scheduled['arrival_datetime'] = stops_scheduled.swifter.apply(lambda x: combine_day_stop(x.raw_date, x.arrival_time), axis = 1)
stops_scheduled['departure_datetime'] = stops_scheduled.swifter.apply(lambda x: combine_day_stop(x.raw_date, x.departure_time), axis = 1)

# # stops_scheduled['arrival_datetime'] = stops_scheduled.apply(lambda x: combine_day_stop(x.raw_date, x.arrival_time), axis = 1)
# # stops_scheduled['departure_datetime'] = stops_scheduled.apply(lambda x: combine_day_stop(x.raw_date, x.departure_time), axis = 1)
# stops_scheduled['arrival_datetime'] = pd.to_datetime(stops_scheduled['arrival_datetime']).tz_localize('America/Denver')
# stops_scheduled['departure_datetime'] = pd.to_datetime(stops_scheduled['departure_datetime']).tz_localize('America/Denver')



Pandas Apply:   0%|          | 0/59346093 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/59346093 [00:00<?, ?it/s]

In [20]:
# Trips:
# * rename raw_date to scheduled_service_date
trips_scheduled = trips_scheduled[trips_scheduled_keep_cols]
trips_scheduled_rename_dict = {'raw_date':'scheduled_service_date'}
trips_scheduled = trips_scheduled.rename(trips_scheduled_rename_dict, axis = 1)

# Stops ctd
# * Rename "raw_date" to "service_date"
stops_scheduled_rename_dict = {'raw_date':'service_date'}
stops_scheduled = stops_scheduled.rename(stops_scheduled_rename_dict, axis = 1)


In [23]:
# Export
trips_scheduled.to_parquet('trips_scheduled.parquet')
stops_scheduled.to_parquet('stops_scheduled.parquet')


In [24]:
trips_scheduled.to_parquet(f's3://rtd-ghost-buses-{BUCKET_TYPE}/schedules/trips_scheduled.parquet', index = False)
stops_scheduled.to_parquet(f's3://rtd-ghost-buses-{BUCKET_TYPE}/schedules/stops_scheduled.parquet', index = False)
# Output
# stops_scheduled - dataframe with all possible scheduled stops - ['trip_id',	'arrival_time',	'departure_time',	'stop_id',	'stop_sequence',	'stop_headsign',	'pickup_type',	'drop_off_type',	'shape_dist_traveled',	'timepoint'] 
# trips_scheduled - dataframe with all possible scheduled trips - ['block_id',	'route_id',	'direction_id',	'trip_headsign',	'shape_id',	'service_id',	'trip_id',	'raw_date']

In [27]:
trips_scheduled.head()

Unnamed: 0,block_id,route_id,direction_id,trip_headsign,shape_id,service_id,trip_id,scheduled_service_date
0,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-05-29 00:00:00-06:00
1,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-07-04 00:00:00-06:00
2,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-05-28 00:00:00-06:00
3,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-06-04 00:00:00-06:00
4,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-06-11 00:00:00-06:00


In [28]:
stops_scheduled.head()

Unnamed: 0,block_id,route_id,direction_id,trip_headsign,shape_id,service_id,trip_id,service_date,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,arrival_datetime,departure_datetime
0,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-05-29 00:00:00-06:00,08:22:00,08:22:00,24858,1,,,1.0,,,2023-05-29 08:22:00-06:00,2023-05-29 08:22:00-06:00
1,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-05-29 00:00:00-06:00,08:22:45,08:22:45,25312,2,,,,,,2023-05-29 08:22:45-06:00,2023-05-29 08:22:45-06:00
2,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-05-29 00:00:00-06:00,08:23:40,08:23:40,17023,3,,,,,,2023-05-29 08:23:40-06:00,2023-05-29 08:23:40-06:00
3,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-05-29 00:00:00-06:00,08:24:24,08:24:24,17025,4,,,,,,2023-05-29 08:24:24-06:00,2023-05-29 08:24:24-06:00
4,b_76 4,76,0,US36 & Bfld,1241914,SU_merged_114569133,114458891,2023-05-29 00:00:00-06:00,08:25:08,08:25:08,16946,5,,,,,,2023-05-29 08:25:08-06:00,2023-05-29 08:25:08-06:00


In [22]:
# departure_datetime.compute()

In [35]:
# Sanity-check on arrival times. How many NaT? How many overlaps elsewhere?
# 112 NaT - can be neglected for now.
# Often have 99 stops at the same time. All unique, often on the quarter-hour 

# stops_scheduled['arrival_datetime'].value_counts()
# How many have 7:15 arrival? 99? Why?
stops_scheduled.loc[stops_scheduled["arrival_datetime"]==pd.to_datetime("2023-04-27 07:15:00-06:00")].sort_values('route_id').to_csv('duplicate_check.csv')

In [36]:
stops_scheduled['departure_datetime'].value_counts(dropna=False).sort_values()

2023-01-08 02:22:00-07:00      1
2023-01-08 02:22:47-07:00      1
2023-01-08 02:23:21-07:00      1
2023-01-08 02:23:59-07:00      1
2023-01-08 02:24:39-07:00      1
                            ... 
2023-08-20 02:19:00-06:00      1
2023-08-20 02:23:00-06:00      1
2023-08-20 02:28:00-06:00      1
2023-08-20 02:34:00-06:00      1
NaT                          112
Name: departure_datetime, Length: 15185296, dtype: int64

In [None]:
stops_scheduled.shape # 59 million scheduled stops!
# How many before Memorial Day? 37.7 million.
len(stops_scheduled.loc[stops_scheduled['raw_date'] < pd.to_datetime('2023-05-29 00:00:00-06:00')])

In [None]:
# Sanity-checks:
trips_scheduled
# raw_date + trip_id + service_id
trips_scheduled.sort_values(['raw_date', 'trip_id', 'service_id'])
# 1.5 million trips scheduled between Jan - May. Sanity-check:
trip_days = 160 # approximate
unique_routes = len(trips_scheduled['route_id'].unique()) *2 # Double for direction
len(trips_scheduled) / trip_days / unique_routes # ~40ish round-trips per route per day. High? Reasonable?


In [None]:
# What other sanity-checks? Plot time series of trips / day
trips_scheduled.sort_values(['raw_date', 'trip_id', 'service_id']).groupby('raw_date')['trip_id'].agg('count').plot()

In [None]:
# How many unique routes / day? 115ish weekday, 90ish weekend
trips_scheduled.sort_values(['raw_date', 'route_id', 'service_id']).groupby('raw_date')['route_id'].agg('nunique').plot()

In [None]:
# Visual inspection to double-check - Look at the 0
trips_scheduled.sort_values(['raw_date', 'route_id', 'trip_id', 'service_id']).loc[(trips_scheduled.route_id == '0') & (trips_scheduled.raw_date == '2023-01-08 00:00:00-07:00')] #.head()
# 123 rows. Double-checked against Transit, sanity-check passes

In [None]:
data.trips.loc[data.trips.trip_id.isin(['114346478','114346479','114346480','114346481'])]

In [None]:
data.trips.loc[data.trips.trip_id.isin(['114346478','114346479','114346480','114346481'])]
print(data.stop_times[data.stop_times.trip_id.isin(['114346478','114346479'])][['trip_id', 'departure_time']])

## Basic data transformations

Ex. creating actual timestamps

In [None]:
# def make_timestamp(s, date):
#     parts = s.split(':')
#     assert len(parts)==3
#     if int(parts[0]) > 23:
#         num_parts = [int(parts[0]) - 24, int(parts[1]), int(parts[2])]
#     else:
#         num_parts = [int(parts[0]), int(parts[1]), int(parts[2])]
#     return pendulum.datetime(year = date.year, month = date.month, day = date.day, hour = num_parts[0], minute = num_parts[1], second = num_parts[2])
        

In [None]:
# check that there are no dwell periods that cross hour boundary
# 476 rows - arrive at 59, leave a minute or two later. 476 instances for RTD. Why does this even matter?
# data.stop_times[data.stop_times.arrival_hour != data.stop_times.departure_hour]

In [None]:
# def make_trip_summary(data):
#     # construct a datetime index that has every day between calendar start and end 
#     calendar_date_range = pd.DataFrame(pd.date_range(min(data.calendar.start_date_dt), max(data.calendar.end_date_dt)), columns = ['raw_date'])
    
#     # cross join calendar index with actual calendar to get all combos of possible dates & services 
#     calendar_cross = calendar_date_range.merge(data.calendar, how = "cross")
    
#     # extract day of week from date index date
#     calendar_cross['dayofweek'] = calendar_cross['raw_date'].dt.dayofweek
    
#     # take wide calendar data (one col per day of week) and make it long (one row per day of week)
#     actual_service = calendar_cross.melt(id_vars = ['raw_date', 'start_date_dt', 'end_date_dt', 'start_date', 'end_date', 'service_id', 'dayofweek'], var_name = 'cal_dayofweek', value_name = 'cal_val')
    
#     # map the calendar input strings to day of week integers to align w pandas dayofweek output
#     actual_service['cal_daynum'] = actual_service['cal_dayofweek'].map({
#         'monday': 0,
#         'tuesday': 1,
#         'wednesday': 2,
#         'thursday': 3,
#         'friday': 4,
#         'saturday': 5,
#         'sunday': 6
#     })
    
#     # now check for rows that "work"
#     # i.e., the day of week matches between datetime index & calendar input
#     # and the datetime index is between the calendar row's start and end dates
#     actual_service = actual_service[(actual_service.dayofweek == actual_service.cal_daynum) & 
#                                    (actual_service.start_date_dt <= actual_service.raw_date) &
#                                    (actual_service.end_date_dt >= actual_service.raw_date)]
    
#     # now merge in calendar dates to the datetime index to get overrides
#     # TODO - localize to Mountain time
#     actual_service = actual_service.merge(data.calendar_dates, how = 'outer', left_on = ['raw_date', 'service_id'], right_on = ['date_dt', 'service_id'])
    
#     # now add a service happened flag for dates where the schedule indicates that this service occurred
#     # i.e.: calendar has a service indicator of 1 and there's no exception type from calendar_dates
#     # OR calendar_dates has exception type of 1
#     # otherwise no service 
#     # https://stackoverflow.com/questions/21415661/logical-operators-for-boolean-indexing-in-pandas
#     actual_service['service_happened'] = ((actual_service['cal_val'] == '1') & 
#                                           actual_service['exception_type'].isnull()) | (actual_service['exception_type'] == '1')

    
#     # now fill in rows where calendar_dates had a date outside the bounds of the datetime index, so raw_date is always populated
#     actual_service['raw_date'] = actual_service['raw_date'].fillna(actual_service['date_dt'])
    
#     # filter to only rows where service occurred
#     service_happened = actual_service[actual_service.service_happened]
    
#     # join trips to only service that occurred
#     trips_happened = data.trips.merge(service_happened, how = 'left', on = 'service_id')
    
#     # get only the trip / hour combos that actually occurred
#     trip_stop_hours = data.stop_times[['trip_id', 'arrival_hour']].drop_duplicates()
    
#     # now join
#     # result has one row per date + row from trips.txt (incl. route) + hour
#     trip_summary = trips_happened.merge(trip_stop_hours, how = "left", on = "trip_id")
    
#     return trip_summary
    
    

In [None]:
# trip_summary = make_trip_summary(data)

In [None]:
# print(trip_summary.dtypes)
# print(trip_summary.shape)
# trip_summary.head()

In [None]:
# data.feed_info.feed_version[0]

In [None]:
# VERSION_ID = data.feed_info.feed_version[0]
# def summarize_and_save(trip_summary): 
#     # now group to get trips by hour by date by route
#     route_daily_hourly_summary = trip_summary.groupby(by = ['raw_date', 'route_id', 'arrival_hour'])['trip_id'].count().reset_index()

#     route_daily_hourly_summary.rename(columns = {'arrival_hour': 'hour', 'trip_id': 'trip_count', 'raw_date': 'date'}, inplace = True)
#     route_daily_hourly_summary.date = route_daily_hourly_summary.date.dt.date
#     if BUCKET_TYPE == "private":
#         route_daily_hourly_summary.to_csv(f's3://rtd-ghost-buses-{BUCKET_TYPE}/schedule_summaries/route_level/schedule_route_daily_hourly_summary_{VERSION_ID}.csv', index = False)
    
#     # now group to get trips by hour by date by route by *direction*
#     route_dir_daily_hourly_summary = trip_summary.groupby(by = ['raw_date', 'route_id', 'direction_id', 'arrival_hour'])['trip_id'].count().reset_index()

#     route_dir_daily_hourly_summary.rename(columns = {'arrival_hour': 'hour', 'trip_id': 'trip_count', 'raw_date': 'date'}, inplace = True)
#     route_dir_daily_hourly_summary.date = route_dir_daily_hourly_summary.date.dt.date
#     if BUCKET_TYPE == "private":
#         route_dir_daily_hourly_summary.to_csv(f's3://rtd-ghost-buses-{BUCKET_TYPE}/schedule_summaries/route_dir_level/schedule_route_dir_daily_hourly_summary_{VERSION_ID}.csv', index = False)

In [None]:
# summarize_and_save(trip_summary)

## Most common shape by route

In [None]:
# get trip count by route, direction, shape id
trips_by_rte_direction = data.trips.groupby(['route_id', 'shape_id', 'direction_id'])['trip_id'].count().reset_index()

In [None]:
# keep only most common shape id by route, direction
# follow: https://stackoverflow.com/a/54041328
most_common_shapes = trips_by_rte_direction.sort_values('trip_id').drop_duplicates(['route_id','direction_id'],keep='last')

In [None]:
# get additional route attributes
most_common_shapes = most_common_shapes.merge(data.routes, how = 'left', on = 'route_id')

In [None]:
# make shapely points
# https://www.geeksforgeeks.org/apply-function-to-every-row-in-a-pandas-dataframe/
data.shapes['pt'] = data.shapes.apply(lambda row: shapely.geometry.Point((float(row['shape_pt_lon']), float(row['shape_pt_lat']))), axis = 1)

In [None]:
data.shapes['shape_pt_sequence'] = pd.to_numeric(data.shapes['shape_pt_sequence'])

In [None]:
# construct sorted list of shapely points
# custom aggregation function: https://stackoverflow.com/a/10964938

def make_linestring_of_points(sub_df):
    sorted_df = sub_df.sort_values(by = 'shape_pt_sequence')
    return shapely.geometry.LineString(list(sorted_df['pt']))

constructed_shapes = data.shapes.groupby('shape_id').apply(make_linestring_of_points).reset_index()

In [None]:
# merge in the other route attributes
final = most_common_shapes.merge(constructed_shapes, how = 'left', on = 'shape_id')

In [None]:
# make a "geometry" column for geopandas
final['geometry'] = final[0]

In [None]:
# construct the geopandas geodataframe
final_gdf = geopandas.GeoDataFrame(data = final)

In [None]:
# drop the column that's a list of shapely points
final_gdf = final_gdf.drop(0, axis = 1)

In [None]:
# https://gis.stackexchange.com/questions/11910/meaning-of-simplifys-tolerance-parameter
final_gdf['geometry'] = final_gdf['geometry'].simplify(.0001)

In [None]:
# save to file as geojson (this saves locally)
with open('route_shapes_simplified_linestring.geojson', 'w') as f:
    f.write(final_gdf.loc[(final_gdf['route_type'] == '3')].to_json())

# Exploratory