Read in pre-computed real-time (RT) and scheduled trip summaries and compare them by route and day type.

In [1]:
# required for pandas to read csv from aws
import s3fs
import os
import pandas as pd
import pendulum
import shapely
import geopandas

In [4]:
# Bucket access level; interpolated into the S3 bucket names used in this notebook.
# If "private", the notebook assumes you have write permissions and writes its results
# back to S3; for any other value it will not attempt to write files.
BUCKET_TYPE = "private"

In [5]:
# Schedule feed versions to compare against real-time data.
# Each entry holds:
#   schedule_version  - identifies the schedule summary CSV to load for that feed
#   feed_start_date / feed_end_date - 'YYYY-MM-DD' bounds of the real-time days loaded for it
# NOTE(review): the active entry's feed_end_date ('2022-08-19') is earlier than its
# feed_start_date ('2023-01-08') - this looks like a leftover value; confirm the intended end date.
schedule_feeds = [
    {'schedule_version': 'Merged-31664M31661-20230415-035754-Jan23 and May23',
    'feed_start_date': '2023-01-08',
    'feed_end_date': '2022-08-19'},
    
    # Earlier schedule versions, currently disabled:
    # {'schedule_version': '20220603',
    # 'feed_start_date': '2022-06-04',
    # 'feed_end_date': '2022-06-07'},
    
    # {'schedule_version': '20220608',
    # 'feed_start_date': '2022-06-09',
    # 'feed_end_date': '2022-07-08'},

    # {'schedule_version': '20220709',
    # 'feed_start_date': '2022-07-10',
    # 'feed_end_date': '2022-07-17'},

    # {'schedule_version': '20220718',
    # 'feed_start_date': '2022-07-19',
    # 'feed_end_date': '2022-07-20'} 
]

In [6]:
# For every schedule feed: load its schedule summary and the daily real-time summaries
# for the feed's date window, total trips per route per day on both sides, and compare
# the totals by route and day type (wk / sat / sun / hol). When BUCKET_TYPE is "private"
# the per-feed comparison is written back to S3.

# pandas `dt.dayofweek` value -> service day type (0-4 weekday, 5 Saturday, 6 Sunday).
DAY_TYPE_BY_DAYOFWEEK = {0: 'wk',
                         1: 'wk',
                         2: 'wk',
                         3: 'wk',
                         4: 'wk',
                         5: 'sat',
                         6: 'sun'}

# Dates relabelled as holidays ('hol') regardless of their weekday. Parsed to datetimes so
# the membership test compares like with like against the datetime64 `date` column.
# NOTE(review): these are 2022 dates - confirm they are still the right holidays for the
# feeds configured in schedule_feeds.
HOLIDAY_DATES = pd.to_datetime(['2022-05-31', '2022-07-04'])

for feed in schedule_feeds:
    start_date = feed['feed_start_date']
    end_date = feed['feed_end_date']
    # NOTE(review): nothing here checks that start_date <= end_date - confirm how an
    # inverted window should be handled.
    date_range = list(
        pendulum.period(
            pendulum.from_format(start_date, 'YYYY-MM-DD'),
            pendulum.from_format(end_date, 'YYYY-MM-DD'),
        ).range('days')
    )

    print(f"Loading schedule version {feed['schedule_version']}")
    schedule_raw = pd.read_csv(f's3://rtd-ghost-buses-{BUCKET_TYPE}/schedule_summaries/route_level/schedule_route_daily_hourly_summary_{feed["schedule_version"]}.csv')

    # Collect the daily frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0 and re-copies the accumulated frame on every iteration.
    daily_frames = []
    for day in date_range:
        date_str = day.to_date_string()
        print(f"Processing {date_str} at {pendulum.now().to_datetime_string()}")
        daily_data = pd.read_csv(f's3://rtd-ghost-buses-{BUCKET_TYPE}/bus_hourly_summary_v2/{date_str}.csv')
        daily_frames.append(daily_data)
    rt_raw = pd.concat(daily_frames) if daily_frames else pd.DataFrame()

    # basic reformatting (work on copies so the raw frames stay untouched)
    rt = rt_raw.copy()
    schedule = schedule_raw.copy()
    rt['date'] = pd.to_datetime(rt.data_date, format = '%Y-%m-%d')
    # the real-time data names the route column `rt`; align it with the schedule's `route_id`
    rt['route_id'] = rt['rt']
    schedule['date'] = pd.to_datetime(schedule.date, format = '%Y-%m-%d')

    # get total by route by day
    rt_daily_by_rte = rt.groupby(by = ['date', 'route_id'])['trip_count'].sum().reset_index()
    sched_daily_by_rte = schedule.groupby(by = ['date', 'route_id'])['trip_count'].sum().reset_index()

    # inner join: only (date, route) pairs present in both real-time and schedule are compared
    compare_daily_by_rte = rt_daily_by_rte.merge(sched_daily_by_rte, how = 'inner',
                                                 on = ['date', 'route_id'], suffixes = ['_rt', '_sched'])

    # compare by day of week
    compare_daily_by_rte['dayofweek'] = compare_daily_by_rte['date'].dt.dayofweek
    compare_daily_by_rte['day_type'] = compare_daily_by_rte['dayofweek'].map(DAY_TYPE_BY_DAYOFWEEK)

    # holidays override the weekday-based day type
    compare_daily_by_rte.loc[compare_daily_by_rte['date'].isin(HOLIDAY_DATES), 'day_type'] = 'hol'

    compare_by_day_type = compare_daily_by_rte.groupby(['route_id', 'day_type'])[['trip_count_rt', 'trip_count_sched']].sum().reset_index()

    # share of scheduled trips that were observed in the real-time data
    compare_by_day_type['ratio'] = compare_by_day_type['trip_count_rt'] / compare_by_day_type['trip_count_sched']

    # only write results when using the writable (private) bucket
    if BUCKET_TYPE == "private":
        compare_by_day_type.to_csv(f's3://rtd-ghost-buses-{BUCKET_TYPE}/schedule_rt_comparisons/route_level/schedule_v{feed["schedule_version"]}_realtime_rt_level_comparison_{feed["feed_start_date"]}_to_{feed["feed_end_date"]}.csv', index = False)
    
    

Loading schedule version Merged-31664M31661-20230415-035754-Jan23 and May23
Processing 2023-01-08 at 2023-05-12 19:38:45


FileNotFoundError: rtd-ghost-buses-private/bus_hourly_summary_v2/2023-01-08.csv

In [None]:
# build an overall summary from the versioned schedule comparisons
# Each per-feed comparison CSV is read back from the rtd-ghost-buses bucket, i.e. the same
# bucket and path the per-feed comparison loop writes to (the chn-ghost-buses bucket name
# used previously did not match the write location).
feed_comparisons = []
for feed in schedule_feeds:
    print(f"Processing {feed['schedule_version']}")
    feed_comp = pd.read_csv(f's3://rtd-ghost-buses-{BUCKET_TYPE}/schedule_rt_comparisons/route_level/schedule_v{feed["schedule_version"]}_realtime_rt_level_comparison_{feed["feed_start_date"]}_to_{feed["feed_end_date"]}.csv')
    print(f"Loaded {len(feed_comp)} rows")
    feed_comparisons.append(feed_comp)

# Concatenate once: DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
combined = pd.concat(feed_comparisons) if feed_comparisons else pd.DataFrame()
    

In [None]:
# Total the real-time and scheduled trip counts across all feeds per route and day type.
trip_count_cols = ['trip_count_rt', 'trip_count_sched']
summary_groups = combined.groupby(['route_id', 'day_type'])
summary = summary_groups[trip_count_cols].sum().reset_index()

In [None]:
# Real-time trips divided by scheduled trips for each route / day type.
summary['ratio'] = summary['trip_count_rt'].div(summary['trip_count_sched'])

In [None]:
# Persist the combined summary; only attempted for the writable ("private") bucket.
# NOTE(review): the date range in the file name is hard-coded - confirm it matches the
# feeds listed in schedule_feeds.
if BUCKET_TYPE == "private":
    combined_summary_path = f's3://rtd-ghost-buses-{BUCKET_TYPE}/schedule_rt_comparisons/route_level/combined_schedule_realtime_rt_level_comparison_2022-05-20_to_2022-07-20.csv'
    summary.to_csv(combined_summary_path, index = False)
