In [1]:
import pandas as pd
import os

In [2]:
d_types = {'RespondentID':str, 'Zipcode':str}

In [3]:
geo_left = pd.read_csv(os.path.join('..', 'data', 'survey', '2015 on-board survey', 'geo_responses_Left_20201211-122631.txt'),
                       dtype=d_types,
                       header=None,
                       names=['RespondentID', 'confidence', 'lat', 'lng']
                           )

###Clean-up
geo_left['RespondentID'] = geo_left['RespondentID'].str.replace('[', '')
geo_left['lng'] = geo_left['lng'].str.replace(']', '')

idx = geo_left.groupby('RespondentID')['confidence'].transform(max) == geo_left['confidence']
geo_left_clean = geo_left[idx].drop_duplicates('RespondentID')

In [4]:
geo_dest = pd.read_csv(os.path.join('..', 'data', 'survey', '2015 on-board survey', 'geo_responses_Dest_20201211-125615.txt'),
                       dtype=d_types,
                       header=None,
                       names=['RespondentID', 'confidence', 'lat', 'lng']
                           )

###Clean-up
geo_dest['RespondentID'] = geo_dest['RespondentID'].str.replace('[', '')
geo_dest['lng'] = geo_dest['lng'].str.replace(']', '')

idx = geo_dest.groupby('RespondentID')['confidence'].transform(max) == geo_dest['confidence']
geo_dest_clean = geo_dest[idx].drop_duplicates('RespondentID')

In [5]:
survey_all = pd.read_excel(os.path.join('..', 'data', 'survey', '2015 on-board survey', '01052016_Data_Cleaning_19.xlsx'),
                       dtype=d_types
                      )

survey_all.columns = [x.rstrip() for x in survey_all.columns]

In [6]:
survey_all = pd.merge(survey_all,
                      geo_left_clean,
                      on='RespondentID',
                      how='left',
                     )

survey_all = pd.merge(survey_all,
                      geo_dest_clean,
                      on='RespondentID',
                      how='left',
                      suffixes=['_left', '_dest']
                     )

In [7]:
list(survey_all)

['RespondentID',
 'CollectorID',
 'StartDate',
 'EndDate',
 'IP Address',
 'Route',
 'Route - In/Out Bound',
 'Time - Hour',
 'Time - Minute',
 'Time - AM/PM',
 'Time',
 'Left From',
 'Left Other',
 'Left St No',
 'Left St Name',
 'Left Nearest Intersection',
 'Left Name',
 'To Bus Walked',
 'To Bus Drove & parked a car',
 'To Bus Dropped off by someone',
 'To Bus Biked',
 'To Bus Rode with someone who parked',
 'To Bus- Other',
 'To Bus Time (mins) Total Buses',
 'Response',
 'Final Destination',
 'Destination-Other',
 'Dest St No',
 'Dest St Name',
 'Dest Nearest Intersection',
 'Dest Name',
 'Dest Walk',
 'Dest Drive a Car',
 'Dest Pick up by someone',
 'Dest Bike',
 'Dest Ride with someone who parked',
 'Dest Bus',
 'Dest Time to Walk/ Bike (minutes)',
 'Time Trip Began',
 'Payment',
 'How Often',
 'How Long',
 'Employment',
 'Employment Other',
 'Why-No other transport',
 'Why-Fuel cost',
 "Why-Don't like to drive",
 "Why-Can't drive",
 'Why-Other',
 'Alternate Drive a car',
 'Alt

In [8]:
survey = survey_all[[
        'RespondentID',
        'Route',
        'Route - In/Out Bound',
        'Left From',
        'Left Other',
        'Left St No',
        'Left St Name',
        'Left Nearest Intersection',
        'Left Name',
        'Final Destination',
        'Destination-Other',
        'Dest St No',
        'Dest St Name',
        'Dest Nearest Intersection',
        'Dest Name',
        'Zipcode',
    
        'confidence_left',
        'lat_left',
        'lng_left',
        'confidence_dest',
        'lat_dest',
        'lng_dest'
]]

In [9]:
report_side = 'Left'
geo_df = geo_left_clean

# report_side = 'Dest'
# geo_df = geo_dest_clean

conf_col = 'confidence_{}'.format(report_side.lower())
st = '{} St Name'.format(report_side)
intrs = '{} Nearest Intersection'.format(report_side)

print('{} Side Summary:'.format(report_side).upper())

print('Total records: ', '{:,.0f}'.format(len(survey)))
print('Records processed in Geocode: ', '{:,.0f}'.format(len(geo_df)))

pct8 = len(survey[survey[conf_col]>=8]) / len(survey)
print('Records with Geocode Confidence factor greater than 8: ', '{:,.0f}'.format(len(survey[survey[conf_col]>=8])),\
      '{0:.0%}'.format(pct8))

pctnot8 = len(survey[survey[conf_col]<8]) / len(survey)
print('Records with Geocode Confidence factor lower than 8: ', len(survey[survey[conf_col]<8]), '{0:.0%}'.format(pctnot8))

pct_nostreet = len(survey[survey[st].isna()]) / len(survey)
print('Records with no Street Name: ', len(survey[survey[st].isna()]), '{0:.0%}'.format(pct_nostreet))
print('Records with no Street Name and no Nearest Intersection: ',\
      len(survey[(survey[st].isna()) & (survey[intrs].isna())]))

LEFT SIDE SUMMARY:
Total records:  1,885
Records processed in Geocode:  1,717
Records with Geocode Confidence factor greater than 8:  1,565 83%
Records with Geocode Confidence factor lower than 8:  152 8%
Records with no Street Name:  438 23%
Records with no Street Name and no Nearest Intersection:  342


In [11]:
survey['Route'] = survey['Route']

survey.loc[survey['confidence_left']>=8, 'Geocode_result_left'] = 'pass'
survey.loc[survey['confidence_left']<8, 'Geocode_result_left'] = 'no_pass'
survey.loc[survey['confidence_left'].isna(), 'Geocode_result_left'] = 'insufficient_data'

survey.loc[survey['confidence_dest']>=8, 'Geocode_result_dest'] = 'pass'
survey.loc[survey['confidence_dest']<8, 'Geocode_result_dest'] = 'no_pass'
survey.loc[survey['confidence_dest'].isna(), 'Geocode_result_dest'] = 'insufficient_data'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
survey

Unnamed: 0,RespondentID,Route,Route - In/Out Bound,Left From,Left Other,Left St No,Left St Name,Left Nearest Intersection,Left Name,Final Destination,...,Dest Name,Zipcode,confidence_left,lat_left,lng_left,confidence_dest,lat_dest,lng_dest,Geocode_result_left,Geocode_result_dest
0,4199326829,10- East Chattanooga,In Bound,Home,,3308,Campbell Street,Campbell & Glass,,Work,...,,37406,9.0,35.072721,-85.240472,9.0,35.000155,-85.2919233,pass,pass
1,4206052159,10- East Chattanooga,In Bound,Home,,314,Riverside Drive,Riverside & Wilder,,Work,...,,37406,9.0,35.076006,-85.25788779999999,,,,pass,insufficient_data
2,4206066684,10- East Chattanooga,In Bound,Home,,3211,Campbell Street,Stuart & Chamberlain,,University / College,...,Erlanger,37406,9.0,35.072284,-85.24194399999999,9.0,35.048552,-85.28950130000001,pass,pass
3,4209630869,21- Golden Gateway,Out Bound,Home,,,D. Drive,,,Work,...,,37406,9.0,35.040771,-85.25995370000001,9.0,35.048697,-85.2902862,pass,pass
4,4209714464,8- Eastdale,Out Bound,Home,,,Tunnel Blvd,Tunnel Blvd & Shallowford,,Personal errands,...,,37410,9.0,35.038021,-85.24016259999999,9.0,35.032369,-85.18462749999999,pass,pass
5,4210021492,1- Alton Park,In Bound,Home,,,West Main,Main & Cypress,"Howard, 25th St",Work,...,,37402,9.0,35.026100,-85.30982499999999,9.0,35.026521,-85.30909369999999,pass,pass
6,4210049627,1- Alton Park,Out Bound,Home,,2459,5th Avenue,4th ave & Foust,,Work,...,,37407,9.0,35.015280,-85.2802724,9.0,35.026521,-85.30909369999999,pass,pass
7,4210067701,1- Alton Park,In Bound,Home,,4429,Fagan,,Wal-Mart & Bank,,...,,,9.0,35.015511,-85.3764317,,,,pass,insufficient_data
8,4210354314,1- Alton Park,In Bound,Home,,1230,Grove Street Ct,Market St & 10th St,College Hills Courts,Work,...,Kentucky Fried Chicken,37402,9.0,35.042905,-85.3185582,9.0,35.039208,-85.266752,pass,pass
9,4215237302,9- East Lake,In Bound,Home,,,,,East Lake Court,Personal errands,...,East Gate Shopping Center,,9.0,35.012783,-85.2821192,9.0,35.008456,-85.2151665,pass,pass


In [13]:
survey[['RespondentID', 'Route', 'Geocode_result_left']].pivot_table(
    columns=['Geocode_result_left'],
    index='Route',
    values='RespondentID',
    aggfunc='count',
    margins=True
).to_clipboard()

In [14]:
survey[['RespondentID', 'Route', 'Geocode_result_dest']].pivot_table(
    columns=['Geocode_result_dest'],
    index='Route',
    values='RespondentID',
    aggfunc='count',
    margins=True
).to_clipboard()

In [None]:
survey.to_csv(os.path.join('..', 'data', 'survey', '2015 on-board survey', 'survey_geocoded.csv'))