In [1]:
import pandas as pd
import os
from datetime import datetime

### Set data sources and parameters

In [2]:
apc_dir = r'C:\data\OneDrive\WSP O365\chattanooga-modeling - RTP TDM Update\Data\CARTA_PassengerData\CARTA APC DATA 2019'

gtfs_dir = r'C:\data\OneDrive\WSP O365\chattanooga-modeling - RTP TDM Update\Data\TransitUpdate2\GTFS'
# gtfs_dir = r'C:\apps\client_chattanooga_rtp_update\Data\GTFS\carta_2019_08_15'

datatypes = {'STOP_ID': str, 'ROUTE_NUMBER':str, 'stop_id': str}

In [3]:
## APC to GTFS
route_xref = pd.DataFrame(data=[
    ['1.40',   'Route #1 am HC:1',  '1',   'ALTON PARK'],
    ['10.10',  'Route #10A',        '10A', 'AVON'],
    ['10.60',  'Route #10C',        '10C', 'CAMPBELL'],
    ['10.90',  'Route #10G 58:10G', '10G', 'GLENWOOD'],
    ['13.00',  'Route #13',         '13',  'ROSSVILLE'],
    ['14.00',  'Route #14',         '14',  'MOCS EXPRESS'],
    ['15.00',  'Route #15',         '15',  'ST. ELMO'],
    ['16.40',  'Route #16 In:16',   '16',  'NORTHGATE OB'],
    ['19.00',  'Route #19',         '19',  'CROMWELL ROAD'],
    ['2.00',   'Route #2',          '2',   'NORTH CHATT'],
    ['21.00',  'Route #21',         '21',  'GOLDEN GATEWAY'],
    ['28.00',  'Route #28',         '28',  'AMNICOLA HWY CHATT STATE'],
    ['33.00',  'Route #DTS:33',     '33',  'DOWNTOWN SHUTTLE'],
    ['34.00',  'Route #34',         '34',  'NORTH SHORE SHUTTLE'],
    ['4.00',   'Route #4',          '4',   'EASTGATE/HAMILTON PL'],
    ['7.00',   'Route #7',          '7',   'CHATTANOOGA HOUSING AUTHORITY'],
    ['780.00', '3:3',               '3',   'ENTERPRISE SOUTH'],
    ['8.00',   'Route #8',          '8',   'EASTDALE'],
    ['9.00',   'Route #9',          '9',   'EAST LAKE'],
    ],
    columns=['ROUTE_NUMBER', 'ROUTE_NAME', 'route_id', 'route_short_name']
)

In [4]:
### AUGUST
apc_file = 'AUG2019_RIDECHECK_DATA.TXT'

## Tue, Wed, Thur
dates_pick = [
             1,
    6,   7,  8,
    13, 14, 15,
    20, 21, 22,  ## NOT in Data
    27, 28, 29   ## NOT in Data
]

dates_pick = ['8/{}/2019 0:00:00'.format(d) for d in dates_pick]

### Load APC and GTFS Data

In [5]:
apc = pd.read_csv(os.path.join(apc_dir, apc_file), dtype=datatypes)
apc = pd.merge(apc, route_xref[['ROUTE_NUMBER', 'route_id', 'route_short_name']], on='ROUTE_NUMBER', how='left')
# list(apc)
# apc = apc[['ROUTE_NUMBER', 'ROUTE_NAME','STOP_ID', 'MAIN_CROSS_STREET']].copy()

apc_stops = apc.groupby(['STOP_ID', 'MAIN_CROSS_STREET'])['STOP_ID'].count().reset_index(name='count')
apc_stops['MAIN_CROSS_STREET'] = apc_stops['MAIN_CROSS_STREET'].str.replace('/', ' + ')

gtfs = pd.read_csv(os.path.join(gtfs_dir, 'stops.txt'), dtype=datatypes)
gtfs = gtfs[['stop_id', 'stop_name']]

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
apc['SURVEY_DATE'].unique()

array(['8/1/2019 0:00:00', '8/2/2019 0:00:00', '8/3/2019 0:00:00',
       '8/4/2019 0:00:00', '8/5/2019 0:00:00', '8/6/2019 0:00:00',
       '8/7/2019 0:00:00', '8/8/2019 0:00:00', '8/9/2019 0:00:00',
       '8/10/2019 0:00:00', '8/11/2019 0:00:00', '8/12/2019 0:00:00',
       '8/13/2019 0:00:00', '8/14/2019 0:00:00', '8/15/2019 0:00:00',
       '8/16/2019 0:00:00', '8/17/2019 0:00:00', '8/23/2019 0:00:00',
       '8/25/2019 0:00:00'], dtype=object)

In [7]:
# apc[['ROUTE_NUMBER', 'ROUTE_NAME']].groupby(['ROUTE_NUMBER', 'ROUTE_NAME'])['ROUTE_NUMBER'].count().reset_index(name='_count')

##### Verify APC to GTFS Stop_ID using intersection data
Note that non-matching records are still being counted with the 'ROUTE_NAME' attribute

In [8]:
stops_check = pd.merge(apc_stops, gtfs, left_on='STOP_ID', right_on='stop_id', how='left')

matches = stops_check[stops_check['stop_id'].notna()]
not_matches = stops_check[stops_check['stop_id'].isna()]

print('APC Stations: {:,}'.format(len(apc_stops)))
print('GTFS Stations: {:,}'.format(len(gtfs)))
print('APC - GTFS Stations difference: {:,}'.format(len(apc_stops) - len(gtfs)))

print('Station EXACT Matches: {:,} ratio: {:.2f}'.format(len(matches), len(matches) / len(apc_stops)))

not_matches['MAIN_CROSS_STREET'] = not_matches['MAIN_CROSS_STREET'].str.replace(' ', '')
not_matches['stop_name'] = not_matches['stop_name'].str.replace(' ', '')
close_matches = not_matches[not_matches['MAIN_CROSS_STREET']==not_matches['stop_name']]
print('Station CLOSE Matches: {:,}'.format(len(close_matches)))

print('Station NOT Matches: {:,}'.format(len(not_matches)-len(close_matches)))

null_stop_id = not_matches[not_matches['stop_name'].isna()]
print('APC Stations ID NOT in GTFS: {:,}'.format(len(null_stop_id)))

# print(not_matches['MAIN_CROSS_STREET'].unique())

APC Stations: 1,240
GTFS Stations: 1,181
APC - GTFS Stations difference: 59
Station EXACT Matches: 1,153 ratio: 0.93
Station CLOSE Matches: 0
Station NOT Matches: 87
APC Stations ID NOT in GTFS: 87


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [9]:
# matches.sample(10)

### Route level average data boardings by TOD
Weekdays:
    Tue
    Wed
    Thurs
    
August 2019 (13 obs)

Populate table 'Transit Route Boardings':  
https://wsponline.sharepoint.com/:w:/r/sites/US-chattanoogamodeling/RTP%20TDM%20Update/Deliverables/Task%203_Model%20Validation/Chattanooga%20TPO%20Model%20Validation%20Report.docx

##### Daily average

In [10]:
apc_pick = apc[apc['SURVEY_DATE'].isin(dates_pick)]
apc_pick = apc[apc['PASSENGERS_ON'] > 0]

apc_pick = apc_pick[['SURVEY_DATE', 'ROUTE_NUMBER', 'PASSENGERS_ON', 'route_id', 'route_short_name']]

weekdays = apc_pick.groupby(['ROUTE_NUMBER', 'route_id', 'route_short_name', 'SURVEY_DATE'])['PASSENGERS_ON'].sum()\
                   .groupby(['ROUTE_NUMBER', 'route_id', 'route_short_name']).mean()

# weekdays = round(weekdays).astype(int)
weekdays = weekdays.reset_index()
weekdays.columns=['ROUTE_NUMBER', 'route_id', 'route_short_name', 'boards']
weekdays['ROUTE_NUMBER_num'] = weekdays['ROUTE_NUMBER'].astype(float)
weekdays = weekdays.sort_values(by='ROUTE_NUMBER_num').reset_index(drop=True)

weekdays.drop(columns='ROUTE_NUMBER_num')#.to_clipboard()

Unnamed: 0,ROUTE_NUMBER,route_id,route_short_name,boards
0,1.4,1,ALTON PARK,399.294118
1,2.0,2,NORTH CHATT,76.5
2,4.0,4,EASTGATE/HAMILTON PL,1112.666667
3,7.0,7,CHATTANOOGA HOUSING AUTHORITY,25.25
4,8.0,8,EASTDALE,77.636364
5,9.0,9,EAST LAKE,306.941176
6,10.1,10A,AVON,199.588235
7,10.6,10C,CAMPBELL,45.0
8,10.9,10G,GLENWOOD,161.588235
9,13.0,13,ROSSVILLE,213.0


##### Update Time Of Day periods

In [11]:
## CHECKS on TOD
# apc[['SURVEY_DATE', 'TRIP_START_TIME', 'TIME_PERIOD', 'TIME_SCHEDULED', 'TIME_ACTUAL_ARRIVE', 'TIME_ACTUAL_DEPART',]].sample(5)
# apc['TIME_PERIOD'].unique()

# test[(test['TIME_PERIOD']=='AM Early')&(test['ROUTE_NUMBER']==7)].groupby('SURVEY_DATE').count()
# test[(test['TIME_PERIOD']=='PM Late')&(test['ROUTE_NUMBER']==2)].groupby('SURVEY_DATE').count()

# apc.groupby(['TIME_PERIOD']).agg({'TRIP_START_TIME':['min', 'max']})

In [12]:
tod_windows = {
    'AM':{'from': '6', 'to': '9'},
    'PM':{'from': '15', 'to': '18'}
}

apc['trip_start_astime'] = [datetime.strptime(t, '%m/%d/%Y %H:%M:%S').time() for t in apc['TRIP_START_TIME']] 

apc['TOD'] = 'OP'  ## All other times are set to Off-Peak

for t in tod_windows.keys():
    apc.loc[apc['trip_start_astime'].between(
        datetime.strptime((tod_windows[t]['from']), '%H').time(),
        datetime.strptime((tod_windows[t]['to']), '%H').time()
        ), 'TOD'] = t

##### Time of Day average

In [13]:
apc_pick = apc[apc['SURVEY_DATE'].isin(dates_pick)]
apc_pick = apc_pick[['SURVEY_DATE', 'ROUTE_NUMBER', 'PASSENGERS_ON', 'route_id', 'route_short_name', 'TOD']]

weekdays = apc_pick.groupby(['ROUTE_NUMBER', 'route_id', 'route_short_name', 'SURVEY_DATE', 'TOD'])['PASSENGERS_ON'].sum()\
                   .groupby(['ROUTE_NUMBER', 'route_id', 'route_short_name', 'TOD']).mean().unstack()

# weekdays = round(weekdays).astype(int)
weekdays = weekdays.reset_index()
weekdays.columns=['ROUTE_NUMBER', 'route_id', 'route_short_name', 'AM', 'OP', 'PM']
weekdays['ROUTE_NUMBER_num'] = weekdays['ROUTE_NUMBER'].astype(float)
weekdays = weekdays.sort_values(by='ROUTE_NUMBER_num').reset_index(drop=True)

weekdays.drop(columns='ROUTE_NUMBER_num')#.to_clipboard()

Unnamed: 0,ROUTE_NUMBER,route_id,route_short_name,AM,OP,PM
0,1.4,1,ALTON PARK,73.142857,247.571429,74.0
1,2.0,2,NORTH CHATT,6.0,27.571429,25.0
2,4.0,4,EASTGATE/HAMILTON PL,254.142857,640.285714,296.428571
3,7.0,7,CHATTANOOGA HOUSING AUTHORITY,1.0,15.714286,5.142857
4,8.0,8,EASTDALE,17.4,50.666667,18.5
5,9.0,9,EAST LAKE,84.571429,194.285714,69.714286
6,10.1,10A,AVON,41.857143,118.0,44.571429
7,10.6,10C,CAMPBELL,9.666667,19.857143,15.2
8,10.9,10G,GLENWOOD,33.571429,102.142857,28.428571
9,13.0,13,ROSSVILLE,64.333333,115.428571,52.833333


### Major Station Boardings

##### For each Day

In [14]:
apc_pick = apc[apc['SURVEY_DATE'].isin(dates_pick)]
apc_pick[['STOP_ID', 'MAIN_CROSS_STREET', 'SURVEY_DATE']]
station_boards = apc_pick.groupby(['STOP_ID', 'MAIN_CROSS_STREET', 'SURVEY_DATE'])['SERIAL_NUMBER']\
                         .count().reset_index(name='BOARDS')
station_boards.sort_values(by='BOARDS', ascending=False).head(20)

Unnamed: 0,STOP_ID,MAIN_CROSS_STREET,SURVEY_DATE,BOARDS
236,100075,SPN,8/14/2019 0:00:00,235
237,100075,SPN,8/15/2019 0:00:00,225
239,100075,SPN,8/7/2019 0:00:00,225
234,100075,SPN,8/1/2019 0:00:00,221
6894,742,MARKET/TVA,8/7/2019 0:00:00,218
240,100075,SPN,8/8/2019 0:00:00,217
238,100075,SPN,8/6/2019 0:00:00,214
235,100075,SPN,8/13/2019 0:00:00,213
6890,742,MARKET/TVA,8/13/2019 0:00:00,211
6895,742,MARKET/TVA,8/8/2019 0:00:00,210


##### Period Total

In [15]:
apc_pick = apc[apc['SURVEY_DATE'].isin(dates_pick)]
apc_pick[['STOP_ID', 'MAIN_CROSS_STREET']]
station_boards = apc_pick.groupby(['STOP_ID', 'MAIN_CROSS_STREET'])['SERIAL_NUMBER'].count().reset_index(name='BOARDS')
station_boards.sort_values(by='BOARDS', ascending=False).head(20)

Unnamed: 0,STOP_ID,MAIN_CROSS_STREET,BOARDS
35,100075,SPN,1550
1031,742,MARKET/TVA,1418
416,145,MARKET/12TH,1313
359,1351,MARKET/4TH,1221
43,100133,Mkt 4th,1082
495,163,MARKET/11TH PATTEN TOWERS,1082
470,1555,MARKET/6TH,1080
258,12,MARKET/FAMILY DOLLAR,1076
36,100077,SPS,1060
463,1539,10TH/LIBRARY,1040


##### Some checks

In [16]:
for d in dates_pick:
    print('Date: {:%d, %b %Y}'.format(datetime.strptime(d, '%m/%d/%Y %H:%M:%S').date()),
          'Boardings: {:,}'.format(apc[apc['SURVEY_DATE']==d]['PASSENGERS_ON'].sum()))

Date: 01, Aug 2019 Boardings: 4,256
Date: 06, Aug 2019 Boardings: 4,281
Date: 07, Aug 2019 Boardings: 4,522
Date: 08, Aug 2019 Boardings: 4,532
Date: 13, Aug 2019 Boardings: 3,849
Date: 14, Aug 2019 Boardings: 3,046
Date: 15, Aug 2019 Boardings: 3,795
Date: 20, Aug 2019 Boardings: 0
Date: 21, Aug 2019 Boardings: 0
Date: 22, Aug 2019 Boardings: 0
Date: 27, Aug 2019 Boardings: 0
Date: 28, Aug 2019 Boardings: 0
Date: 29, Aug 2019 Boardings: 0
