In [1]:
import os
import calendar as cal
from datetime import datetime
import pandas as pd

### Set data sources and parameters

In [2]:
apc_dir = r'C:\data\OneDrive\WSP O365\chattanooga-modeling - RTP TDM Update\Data\CARTA_PassengerData\CARTA APC DATA 2019'

gtfs_dir = r'C:\data\OneDrive\WSP O365\chattanooga-modeling - RTP TDM Update\Data\TransitUpdate2\GTFS'
# gtfs_dir = r'C:\apps\client_chattanooga_rtp_update\Data\GTFS\carta_2019_08_15'

datatypes = {'STOP_ID': str, 'ROUTE_NUMBER':str, 'stop_id': str}

In [3]:
months = [
    3,  ## March
    4,  ## April
    8,  ## August
    10,  ## October
    11,  ## November
    ]

preferred = 8,  ## August

apc_file = '{}2019_RIDECHECK_DATA.{}'

tod_windows = {
    'AM':{'from': '6', 'to': '9'},
    'PM':{'from': '15', 'to': '18'}
}

## APC to GTFS
route_xref = pd.DataFrame(data=[
    ['1.40',   'Route #1 am HC:1',  '1',   'ALTON PARK'],
    ['10.10',  'Route #10A',        '10A', 'AVON'],
    ['10.60',  'Route #10C',        '10C', 'CAMPBELL'],
    ['10.90',  'Route #10G 58:10G', '10G', 'GLENWOOD'],
    ['13.00',  'Route #13',         '13',  'ROSSVILLE'],
    ['14.00',  'Route #14',         '14',  'MOCS EXPRESS'],
    ['15.00',  'Route #15',         '15',  'ST. ELMO'],
    ['16.40',  'Route #16 In:16',   '16',  'NORTHGATE OB'],
    ['19.00',  'Route #19',         '19',  'CROMWELL ROAD'],
    ['2.00',   'Route #2',          '2',   'NORTH CHATT'],
    ['21.00',  'Route #21',         '21',  'GOLDEN GATEWAY'],
    ['28.00',  'Route #28',         '28',  'AMNICOLA HWY CHATT STATE'],
    ['33.00',  'Route #DTS:33',     '33',  'DOWNTOWN SHUTTLE'],
    ['34.00',  'Route #34',         '34',  'NORTH SHORE SHUTTLE'],
    ['4.00',   'Route #4',          '4',   'EASTGATE/HAMILTON PL'],
    ['7.00',   'Route #7',          '7',   'CHATTANOOGA HOUSING AUTHORITY'],
    ['780.00', '3:3',               '3',   'ENTERPRISE SOUTH'],
    ['8.00',   'Route #8',          '8',   'EASTDALE'],
    ['9.00',   'Route #9',          '9',   'EAST LAKE'],
    ],
    columns=['ROUTE_NUMBER', 'ROUTE_NAME', 'route_id', 'route_short_name']
)

##### Calculate Weekdays for each Month
Weekdays:
    Tue
    Wed
    Thurs

In [4]:
apc = {}
for m in months:
    weekdays = [d[0] for w in cal.Calendar().monthdays2calendar(2019, m) for d in w  if d[0]!=0 and 0 < d[1] < 4]
    month_name = calendar.month_name[m]
    
    apc[m] = {'month_name': month_name,
              'weekdays': weekdays}
    
    print('{}: {} Weekdays'.format(month_name, len(weekdays)))
    print(weekdays)

NameError: name 'calendar' is not defined

### Load APC Data

In [114]:
for m in months:
    mpref = apc[m]['month_name'][:3].upper()
    try:
        apc[m]['data'] = pd.read_csv(os.path.join(apc_dir, apc_file.format(mpref, 'TXT')), dtype=datatypes)
    except:
        apc[m]['data'] = pd.read_csv(os.path.join(apc_dir, apc_file.format(mpref, 'CSV')), dtype=datatypes)
    finally:
        print('APC Data for month of {} loaded'.format(apc[m]['month_name']))
    
    apc[m]['data'] = pd.merge(apc[m]['data'], route_xref[['ROUTE_NUMBER', 'route_id', 'route_short_name']], on='ROUTE_NUMBER', how='left')

  interactivity=interactivity, compiler=compiler, result=result)


APC Data for month of March loaded
APC Data for month of August loaded
APC Data for month of October loaded


##### Filter APC Data to Weekdays

In [115]:
for m in apc.keys():
    wdates = ['{}/{}/2019 0:00:00'.format(m, d) for d in apc[m]['weekdays']]
    apc[m]['data'] = apc[m]['data'][apc[m]['data']['SURVEY_DATE'].isin(wdates)].copy()

##### Check Daily Boardings

In [116]:
for m in apc.keys():
    daily_boards = []
    print(apc[m]['month_name'])
    for d in apc[m]['weekdays']:
        ddate = datetime.strptime('{}/{}/2019'.format(m, d), '%m/%d/%Y')
        surdate = '{}/{}/2019 0:00:00'.format(m, d)
        boards = apc[m]['data'][apc[m]['data']['SURVEY_DATE']==surdate]['PASSENGERS_ON'].sum()
        if boards > 0: daily_boards.append(boards) 
        
        print('\tDate: {:%d, %b %Y}'.format(ddate),
              'Boardings: {:,}'.format(boards))
        
    print('Daily Average: {:,.0f}\n'.format(sum(daily_boards)/len(daily_boards)))

March
	Date: 05, Mar 2019 Boardings: 4,711
	Date: 06, Mar 2019 Boardings: 4,737
	Date: 07, Mar 2019 Boardings: 5,346
	Date: 12, Mar 2019 Boardings: 5,851
	Date: 13, Mar 2019 Boardings: 5,118
	Date: 14, Mar 2019 Boardings: 4,381
	Date: 19, Mar 2019 Boardings: 5,243
	Date: 20, Mar 2019 Boardings: 5,805
	Date: 21, Mar 2019 Boardings: 5,558
	Date: 26, Mar 2019 Boardings: 6,034
	Date: 27, Mar 2019 Boardings: 5,617
	Date: 28, Mar 2019 Boardings: 5,545
Daily Average: 5,329

August
	Date: 01, Aug 2019 Boardings: 4,256
	Date: 06, Aug 2019 Boardings: 4,281
	Date: 07, Aug 2019 Boardings: 4,522
	Date: 08, Aug 2019 Boardings: 4,532
	Date: 13, Aug 2019 Boardings: 3,849
	Date: 14, Aug 2019 Boardings: 3,046
	Date: 15, Aug 2019 Boardings: 3,795
	Date: 20, Aug 2019 Boardings: 0
	Date: 21, Aug 2019 Boardings: 0
	Date: 22, Aug 2019 Boardings: 0
	Date: 27, Aug 2019 Boardings: 0
	Date: 28, Aug 2019 Boardings: 0
	Date: 29, Aug 2019 Boardings: 0
Daily Average: 4,040

October
	Date: 01, Oct 2019 Boardings: 4,8

### Load GTFS Data

In [83]:
gtfs = pd.read_csv(os.path.join(gtfs_dir, 'stops.txt'), dtype=datatypes)
gtfs = gtfs[['stop_id', 'stop_name']]

##### Verify APC to GTFS Stop_ID using intersection data
Note that non-matching records are still being counted with the 'ROUTE_NAME' attribute

In [99]:
def apc_gtfs_check(m):
    print('Comparing APC Data for month of {}'.format(apc[m]['month_name']))
    apc_stops = apc[m]['data'].groupby(['STOP_ID', 'MAIN_CROSS_STREET'])['STOP_ID'].count().reset_index(name='count')
    apc_stops['MAIN_CROSS_STREET'] = apc_stops['MAIN_CROSS_STREET'].str.replace('/', ' + ')

    stops_check = pd.merge(apc_stops, gtfs, left_on='STOP_ID', right_on='stop_id', how='left')

    matches = stops_check[stops_check['stop_id'].notna()].copy()
    not_matches = stops_check[stops_check['stop_id'].isna()].copy()

    print('APC Stations: {:,}'.format(len(apc_stops)))
    print('GTFS Stations: {:,}'.format(len(gtfs)))
    print('APC - GTFS Stations difference: {:,}'.format(len(apc_stops) - len(gtfs)))

    print('Station EXACT Matches: {:,} ratio: {:.2f}'.format(len(matches), len(matches) / len(apc_stops)))

    not_matches['MAIN_CROSS_STREET'] = not_matches['MAIN_CROSS_STREET'].str.replace(' ', '')
    not_matches['stop_name'] = not_matches['stop_name'].str.replace(' ', '')
    close_matches = not_matches[not_matches['MAIN_CROSS_STREET']==not_matches['stop_name']]
    print('Station CLOSE Matches: {:,}'.format(len(close_matches)))

    print('Station NOT Matches: {:,}'.format(len(not_matches)-len(close_matches)))

    null_stop_id = not_matches[not_matches['stop_name'].isna()]
    print('APC Stations ID NOT in GTFS: {:,}'.format(len(null_stop_id)))
    print('\n')

    # print(not_matches['MAIN_CROSS_STREET'].unique())

In [100]:
for m in apc.keys():
    apc_gtfs_check(m)

Comparing APC Data for month of March
APC Stations: 1,271
GTFS Stations: 1,181
APC - GTFS Stations difference: 90
Station EXACT Matches: 1,180 ratio: 0.93
Station CLOSE Matches: 0
Station NOT Matches: 91
APC Stations ID NOT in GTFS: 91


Comparing APC Data for month of August
APC Stations: 1,240
GTFS Stations: 1,181
APC - GTFS Stations difference: 59
Station EXACT Matches: 1,153 ratio: 0.93
Station CLOSE Matches: 0
Station NOT Matches: 87
APC Stations ID NOT in GTFS: 87


Comparing APC Data for month of October
APC Stations: 1,257
GTFS Stations: 1,181
APC - GTFS Stations difference: 76
Station EXACT Matches: 1,159 ratio: 0.92
Station CLOSE Matches: 0
Station NOT Matches: 98
APC Stations ID NOT in GTFS: 98




### Route level average data boardings by TOD
Populate table 'Transit Route Boardings':  
https://wsponline.sharepoint.com/:w:/r/sites/US-chattanoogamodeling/RTP%20TDM%20Update/Deliverables/Task%203_Model%20Validation/Chattanooga%20TPO%20Model%20Validation%20Report.docx

##### Daily average

##### Update Time Of Day periods

In [132]:
# CHECKS on TOD
apc[3]['data'][['SURVEY_DATE', 'TRIP_START_TIME', 'TIME_PERIOD', 'TIME_SCHEDULED', 'TIME_ACTUAL_ARRIVE', 'TIME_ACTUAL_DEPART',]].sample(5)
apc[3]['data']['TIME_PERIOD'].unique()

apc[3]['data'][(apc[3]['data']['TIME_PERIOD']=='AM Early')&(apc[3]['data']['ROUTE_NUMBER']==7)].groupby('SURVEY_DATE').count()
apc[3]['data'][(apc[3]['data']['TIME_PERIOD']=='PM Late')&(apc[3]['data']['ROUTE_NUMBER']==2)].groupby('SURVEY_DATE').count()

apc[3]['data'].groupby(['TIME_PERIOD']).agg({'TRIP_START_TIME':['min', 'max']})

Unnamed: 0_level_0,TRIP_START_TIME,TRIP_START_TIME
Unnamed: 0_level_1,min,max
TIME_PERIOD,Unnamed: 1_level_2,Unnamed: 2_level_2
AM Early,12/30/1899 4:21:00,12/30/1899 4:41:00
AM Peak,12/30/1899 4:46:00,12/30/1899 8:58:00
Midday,12/30/1899 10:00:00,12/30/1899 9:59:00
PM Late,12/30/1899 19:00:00,12/30/1899 23:55:00
PM Peak,12/30/1899 15:00:00,12/30/1899 18:57:00


In [138]:
apc[3]['data'][apc[3]['data']['TIME_PERIOD']=='Midday'][['TRIP_START_TIME']].sort_values('TRIP_START_TIME')
apc[3]['data'][apc[3]['data']['TIME_PERIOD']=='Midday']['TRIP_START_TIME'].unique()

array(['12/30/1899 14:21:00', '12/30/1899 14:51:00',
       '12/30/1899 12:51:00', '12/30/1899 14:05:00', '12/30/1899 9:00:00',
       '12/30/1899 9:06:00', '12/30/1899 9:40:00', '12/30/1899 9:46:00',
       '12/30/1899 10:20:00', '12/30/1899 10:26:00',
       '12/30/1899 11:00:00', '12/30/1899 11:06:00',
       '12/30/1899 11:40:00', '12/30/1899 11:46:00',
       '12/30/1899 12:20:00', '12/30/1899 12:26:00',
       '12/30/1899 13:00:00', '12/30/1899 13:06:00',
       '12/30/1899 13:40:00', '12/30/1899 13:46:00',
       '12/30/1899 14:20:00', '12/30/1899 14:26:00', '12/30/1899 9:15:00',
       '12/30/1899 9:45:00', '12/30/1899 10:15:00', '12/30/1899 10:50:00',
       '12/30/1899 11:25:00', '12/30/1899 12:00:00',
       '12/30/1899 10:45:00', '12/30/1899 11:15:00',
       '12/30/1899 11:50:00', '12/30/1899 9:26:00', '12/30/1899 11:55:00',
       '12/30/1899 12:30:00', '12/30/1899 10:31:00',
       '12/30/1899 11:20:00', '12/30/1899 13:15:00',
       '12/30/1899 13:50:00', '12/30/1899 14

In [128]:
apc[3]['data']

Unnamed: 0,SERIAL_NUMBER,SCHEDULE_ID,SCHEDULE_NAME,SIGNUP_NAME,SURVEY_DATE,SURVEY_STATUS,SURVEY_TYPE,SURVEY_SOURCE,PATTERN_ID,ROUTE_NUMBER,...,NR_BOARD,NR_ALIGHT,KNEELS,COMMENT_NUMBER,CHECKER_TIME,FIRST_LAST_STOP,route_id,route_short_name,trip_start_astime,TOD
111130,4667378,106,Aug18 (Weekday),August 19 2018,3/5/2019 0:00:00,2,1,3,910,4.00,...,,,0,,,1,4,EASTGATE/HAMILTON PL,14:21:00,OP
111131,4667378,106,Aug18 (Weekday),August 19 2018,3/5/2019 0:00:00,2,1,3,910,4.00,...,,,0,,,1,4,EASTGATE/HAMILTON PL,14:21:00,OP
111132,4667378,106,Aug18 (Weekday),August 19 2018,3/5/2019 0:00:00,2,1,3,910,4.00,...,,,0,,,2,4,EASTGATE/HAMILTON PL,14:21:00,OP
111133,4667378,106,Aug18 (Weekday),August 19 2018,3/5/2019 0:00:00,2,1,3,910,4.00,...,,,0,,,2,4,EASTGATE/HAMILTON PL,14:21:00,OP
111134,4667378,106,Aug18 (Weekday),August 19 2018,3/5/2019 0:00:00,2,1,3,910,4.00,...,,,0,,,2,4,EASTGATE/HAMILTON PL,14:21:00,OP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1046918,4694035,106,Aug18 (Weekday),August 19 2018,3/28/2019 0:00:00,2,1,3,908,4.00,...,,,0,,,2,4,EASTGATE/HAMILTON PL,17:45:00,PM
1046919,4694035,106,Aug18 (Weekday),August 19 2018,3/28/2019 0:00:00,2,1,3,908,4.00,...,,,0,,,2,4,EASTGATE/HAMILTON PL,17:45:00,PM
1046920,4694035,106,Aug18 (Weekday),August 19 2018,3/28/2019 0:00:00,2,1,3,908,4.00,...,,,0,,,2,4,EASTGATE/HAMILTON PL,17:45:00,PM
1046921,4694035,106,Aug18 (Weekday),August 19 2018,3/28/2019 0:00:00,2,1,3,908,4.00,...,,,0,,,3,4,EASTGATE/HAMILTON PL,17:45:00,PM


In [117]:
def update_tod(m):
    apc[m]['data']['trip_start_astime'] = \
    [datetime.strptime(t, '%m/%d/%Y %H:%M:%S').time() for t in apc[m]['data']['TRIP_START_TIME']]
    
    apc[m]['data']['TOD'] = 'OP'  ## All other times are set to Off-Peak
    
    for t in tod_windows.keys():
        apc[m]['data'].loc[apc[m]['data']['trip_start_astime'].between(
            datetime.strptime((tod_windows[t]['from']), '%H').time(),
            datetime.strptime((tod_windows[t]['to']), '%H').time()
            ), 'TOD'] = t

In [118]:
for m in apc.keys():
    update_tod(m)

##### Time of Day average

In [122]:
def tod_ave(m):
    print('Processing Time Of Day Average for month of {}'.format(apc[m]['month_name']))
    apc_pick = apc[m]['data'][['SURVEY_DATE', 'ROUTE_NUMBER', 'PASSENGERS_ON', 'route_id', 'route_short_name', 'TOD']]

    weekdays = apc_pick.groupby(['ROUTE_NUMBER', 'route_id', 'route_short_name', 'SURVEY_DATE', 'TOD'])['PASSENGERS_ON'].sum()\
                       .groupby(['ROUTE_NUMBER', 'route_id', 'route_short_name', 'TOD']).mean().unstack()

    # weekdays = round(weekdays).astype(int)
    weekdays = weekdays.reset_index()
    weekdays.columns=['ROUTE_NUMBER', 'route_id', 'route_short_name', 'AM', 'OP', 'PM']
    weekdays['ROUTE_NUMBER_num'] = weekdays['ROUTE_NUMBER'].astype(float)
    weekdays = weekdays.sort_values(by='ROUTE_NUMBER_num').reset_index(drop=True)

    display(weekdays.drop(columns='ROUTE_NUMBER_num'))

In [123]:
for m in apc.keys():
    tod_ave(m)

Processing Time Of Day Average for month of March


Unnamed: 0,ROUTE_NUMBER,route_id,route_short_name,AM,OP,PM
0,1.4,1,ALTON PARK,91.416667,313.583333,101.416667
1,2.0,2,NORTH CHATT,13.166667,55.416667,31.666667
2,4.0,4,EASTGATE/HAMILTON PL,300.75,1018.583333,392.666667
3,7.0,7,CHATTANOOGA HOUSING AUTHORITY,2.454545,21.75,9.75
4,8.0,8,EASTDALE,19.181818,44.636364,21.0
5,9.0,9,EAST LAKE,84.25,195.416667,88.166667
6,10.1,10A,AVON,37.083333,175.833333,62.916667
7,10.6,10C,CAMPBELL,17.083333,19.916667,30.916667
8,10.9,10G,GLENWOOD,40.166667,143.333333,40.416667
9,13.0,13,ROSSVILLE,49.272727,113.583333,53.272727


Processing Time Of Day Average for month of August


Unnamed: 0,ROUTE_NUMBER,route_id,route_short_name,AM,OP,PM
0,1.4,1,ALTON PARK,73.142857,247.571429,74.0
1,2.0,2,NORTH CHATT,6.0,27.571429,25.0
2,4.0,4,EASTGATE/HAMILTON PL,254.142857,640.285714,296.428571
3,7.0,7,CHATTANOOGA HOUSING AUTHORITY,1.0,15.714286,5.142857
4,8.0,8,EASTDALE,17.4,50.666667,18.5
5,9.0,9,EAST LAKE,84.571429,194.285714,69.714286
6,10.1,10A,AVON,41.857143,118.0,44.571429
7,10.6,10C,CAMPBELL,9.666667,19.857143,15.2
8,10.9,10G,GLENWOOD,33.571429,102.142857,28.428571
9,13.0,13,ROSSVILLE,64.333333,115.428571,52.833333


Processing Time Of Day Average for month of October


Unnamed: 0,ROUTE_NUMBER,route_id,route_short_name,AM,OP,PM
0,1.4,1,ALTON PARK,64.466667,219.4,80.666667
1,2.0,2,NORTH CHATT,12.0,32.230769,22.833333
2,4.0,4,EASTGATE/HAMILTON PL,196.066667,664.866667,260.333333
3,7.0,7,CHATTANOOGA HOUSING AUTHORITY,2.214286,17.466667,5.857143
4,8.0,8,EASTDALE,16.0,43.0,23.0
5,9.0,9,EAST LAKE,78.2,176.4,70.466667
6,10.1,10A,AVON,35.933333,127.666667,45.4
7,10.6,10C,CAMPBELL,12.0,22.866667,18.642857
8,10.9,10G,GLENWOOD,41.933333,102.2,27.333333
9,13.0,13,ROSSVILLE,52.142857,63.133333,47.533333


### Major Station Boardings

##### For each Day

##### Period Total