In [None]:
import pandas as pd
import os

In [None]:
# Columns that must be read as text rather than inferred as numbers;
# shared by the read_csv / read_excel calls below.
d_types = dict.fromkeys(['RespondentID', 'Zipcode'], str)

In [None]:
# Load the "Left"-side geocoder responses. The file has no header row, so the
# four column names are supplied here.
geo_left = pd.read_csv(
    os.path.join('..', 'data', 'survey', '2015 on-board survey', 'geo_responses_Left_20201211-122631.txt'),
    dtype=d_types,
    header=None,
    names=['RespondentID', 'confidence', 'lat', 'lng'],
)

###Clean-up
# Strip the '[' that prefixes the first field and the ']' that ends the last
# one (presumably each line is a bracketed list -- verify against the file).
# regex=False makes these literal replacements: a lone '[' is not a valid
# regular expression and the default for `regex` differs between pandas versions.
geo_left['RespondentID'] = geo_left['RespondentID'].str.replace('[', '', regex=False)
# NOTE(review): 'lng' is still a string column after this step.
geo_left['lng'] = geo_left['lng'].str.replace(']', '', regex=False)

# Keep one row per respondent: flag the rows whose confidence equals the
# respondent's maximum, then break ties by keeping the first such row.
# The string 'max' is used because newer pandas deprecates passing the builtin.
idx = geo_left.groupby('RespondentID')['confidence'].transform('max') == geo_left['confidence']
geo_left_clean = geo_left[idx].drop_duplicates('RespondentID')

In [None]:
# Load the "Dest"-side geocoder responses. The file has no header row, so the
# four column names are supplied here.
geo_dest = pd.read_csv(
    os.path.join('..', 'data', 'survey', '2015 on-board survey', 'geo_responses_Dest_20201211-125615.txt'),
    dtype=d_types,
    header=None,
    names=['RespondentID', 'confidence', 'lat', 'lng'],
)

###Clean-up
# Strip the '[' that prefixes the first field and the ']' that ends the last
# one (presumably each line is a bracketed list -- verify against the file).
# regex=False makes these literal replacements: a lone '[' is not a valid
# regular expression and the default for `regex` differs between pandas versions.
geo_dest['RespondentID'] = geo_dest['RespondentID'].str.replace('[', '', regex=False)
# NOTE(review): 'lng' is still a string column after this step.
geo_dest['lng'] = geo_dest['lng'].str.replace(']', '', regex=False)

# Keep one row per respondent: flag the rows whose confidence equals the
# respondent's maximum, then break ties by keeping the first such row.
# The string 'max' is used because newer pandas deprecates passing the builtin.
idx = geo_dest.groupby('RespondentID')['confidence'].transform('max') == geo_dest['confidence']
geo_dest_clean = geo_dest[idx].drop_duplicates('RespondentID')

In [None]:
# Read the survey workbook, forcing the identifier columns listed in
# d_types to be read as text.
survey_all = pd.read_excel(
    os.path.join('..', 'data', 'survey', '2015 on-board survey', '01052016_Data_Cleaning_19.xlsx'),
    dtype=d_types,
)

# Remove trailing whitespace from every column header so that later
# look-ups by name match exactly.
survey_all.columns = [header.rstrip() for header in survey_all.columns]

In [None]:
# Attach the best geocode result for each side to the survey rows.
#
# Each geocode frame's value columns are renamed with an explicit side suffix
# before merging, so the resulting names (confidence_left, lat_left, lng_left,
# confidence_dest, lat_dest, lng_dest) do not depend on which columns happen
# to overlap between successive merges.
geo_value_cols = ['confidence', 'lat', 'lng']
geo_by_suffix = {'_left': geo_left_clean, '_dest': geo_dest_clean}

# Make the cell safe to re-run: discard geocode columns left over from a
# previous execution, otherwise the merge would collide with them.
stale_cols = [col + suffix for suffix in geo_by_suffix for col in geo_value_cols]
survey_all = survey_all.drop(columns=stale_cols, errors='ignore')

for suffix, geo_clean in geo_by_suffix.items():
    survey_all = pd.merge(
        survey_all,
        geo_clean.rename(columns={col: col + suffix for col in geo_value_cols}),
        on='RespondentID',
        how='left',
        # Each geocode frame holds at most one row per RespondentID; fail
        # loudly instead of silently duplicating survey rows if it does not.
        validate='many_to_one',
    )

In [None]:
# Show every column name available after the merges.
survey_all.columns.tolist()

In [None]:
# Working subset of the survey: route, reported origin ("Left") and
# destination address fields, plus the merged geocode columns.
# .copy() gives an independent frame, so the columns added to `survey` in
# later cells do not trigger SettingWithCopyWarning.
survey = survey_all[[
        'RespondentID',
        'Route',
        'Route - In/Out Bound',
        'Left From',
        'Left Other',
        'Left St No',
        'Left St Name',
        'Left Nearest Intersection',
        'Left Name',
        'Final Destination',
        'Destination-Other',
        'Dest St No',
        'Dest St Name',
        'Dest Nearest Intersection',
        'Dest Name',
        'Zipcode',

        'confidence_left',
        'lat_left',
        'lng_left',
        'confidence_dest',
        'lat_dest',
        'lng_dest'
]].copy()

In [None]:
# Print a geocoding summary for one side of the trip.
# Switch sides by swapping which pair of assignments below is active.
report_side = 'Left'
geo_df = geo_left_clean

# report_side = 'Dest'
# geo_df = geo_dest_clean

conf_col = 'confidence_{}'.format(report_side.lower())
st = '{} St Name'.format(report_side)
intrs = '{} Nearest Intersection'.format(report_side)

total_records = len(survey)


def _as_share(count):
    """Format `count` as a whole-number percentage of all survey records."""
    # An empty survey reports 0% instead of raising ZeroDivisionError.
    return '{0:.0%}'.format(count / total_records if total_records else 0)


# Compute each count once and reuse it for both the count and the percentage.
# Rows with a missing confidence fall into neither of the first two groups.
n_conf_pass = int((survey[conf_col] >= 8).sum())
n_conf_fail = int((survey[conf_col] < 8).sum())
no_street = survey[st].isna()
n_no_street = int(no_street.sum())
n_no_street_no_intrs = int((no_street & survey[intrs].isna()).sum())

print('{} Side Summary:'.format(report_side).upper())

print('Total records: ', '{:,}'.format(total_records))
print('Records processed in Geocode: ', '{:,}'.format(len(geo_df)))

# The threshold test is >= 8, so the label says "8 or greater".
print('Records with Geocode Confidence factor of 8 or greater: ', '{:,}'.format(n_conf_pass),
      _as_share(n_conf_pass))
print('Records with Geocode Confidence factor lower than 8: ', '{:,}'.format(n_conf_fail),
      _as_share(n_conf_fail))

print('Records with no Street Name: ', '{:,}'.format(n_no_street), _as_share(n_no_street))
print('Records with no Street Name and no Nearest Intersection: ',
      '{:,}'.format(n_no_street_no_intrs))

In [None]:
# Label every survey row with its geocoding outcome for each side:
#   'pass'              - confidence is 8 or higher
#   'no_pass'           - confidence is below 8
#   'insufficient_data' - no confidence value (no geocode row was merged in)
# The former `survey['Route'] = survey['Route']` self-assignment was a no-op
# and has been removed.
for side in ('left', 'dest'):
    confidence = survey['confidence_{}'.format(side)]
    result_col = 'Geocode_result_{}'.format(side)

    survey.loc[confidence >= 8, result_col] = 'pass'
    survey.loc[confidence < 8, result_col] = 'no_pass'
    survey.loc[confidence.isna(), result_col] = 'insufficient_data'


In [None]:
# Display the survey frame for a visual check.
survey

In [None]:
# Count respondents per Route by Left-side geocode result (with "All"
# margin totals) and copy the table to the system clipboard.
pd.pivot_table(
    survey[['RespondentID', 'Route', 'Geocode_result_left']],
    values='RespondentID',
    index='Route',
    columns=['Geocode_result_left'],
    aggfunc='count',
    margins=True,
).to_clipboard()

In [None]:
# Count respondents per Route by Dest-side geocode result (with "All"
# margin totals) and copy the table to the system clipboard.
pd.pivot_table(
    survey[['RespondentID', 'Route', 'Geocode_result_dest']],
    values='RespondentID',
    index='Route',
    columns=['Geocode_result_dest'],
    aggfunc='count',
    margins=True,
).to_clipboard()

In [None]:
# Write the survey frame next to the source data.
# The frame's index is written as the first CSV column (to_csv default).
survey.to_csv(
    path_or_buf=os.path.join('..', 'data', 'survey', '2015 on-board survey', 'survey_geocoded.csv'),
)