In [None]:
#default_exp frontier_and_remote

In [None]:
# Although the FAR dataset and the ZCTA-to-whatever relationship files both come from Census,
# the zip code areas presented by FAR are from ESRI and are not entirely compatible with the ZCTA areas.
# ESRI business location data come from InfoGroup. It isn't clear where the zip code data that FAR take
# from ESRI are from, but they almost certainly are postal zip codes.

In [None]:
# From Wikipedia:
# ZCTAs are generalized area representations of the United States Postal Service (USPS) ZIP code service areas, 
# but are not the same as ZIP codes. Individual USPS ZIP codes can cross state, place, county, census tract, 
# census block group and census block boundaries, so the Census Bureau asserts that "there is no correlation 
# between ZIP codes and Census Bureau geography".[1] Moreover, the USPS frequently realigns, merges, or splits 
# ZIP codes to meet changing needs. These changes are usually not reflected in the annual TIGER releases. 
# Each ZCTA is constructed by aggregating the Census 2010 blocks whose addresses use a given ZIP code. In 
# assembling census statistical units to create ZCTAs, the Census Bureau took the ZIP code used by the majority 
# of addresses in each census unit at the time the data was compiled. As a result, some addresses end up with a 
# ZCTA code that is different from their ZIP code. ZCTAs are not developed for ZIP codes that comprise only a 
# small number of addresses.[2] Several ZCTAs represent ZIPs that no longer exist due to realignment by the USPS.

# There are approximately 42,000 ZIP Codes and 32,000 ZCTAs. The main reason that there is not one ZCTA for every 
# ZIP Code is that PO Boxes are excluded in ZCTAs, since only populated areas are included in the Census data.

In [None]:
# Summary: problems using zip code as an analytical unit.

# Postal zip codes change to suit the Postal Service's needs, and those changes are not immediately reflected
# in other databases or in TIGER files.

# ZCTA (standardized zipcode-based tabulation areas) units are not the same as postal zip codes, as described
# above. 

# We do not know the source of InfoGroup's zip code data. It is probably self-identified by the enterprise and
# therefore probably a postal zip code.

# ZCTA relationship files do not include any relationship between ZCTAs and postal zip codes. 

# ESRI business location data comes from InfoGroup, which probably suggests that all their zip code data are
# postal zip codes. So the zip codes in the FAR data should be compatible with our InfoGroup data, although not 
# aligned in time. The FAR data comes otherwise from the 2010 Census.

# The Census Geocoder can associate ZCTAs with lat/long, which would be just the thing. But it allows only 1,000 
# individual requests at a time. That would mean 14,700 separate batch jobs to apply ZCTA IDs to the 14.7 mn
# InfoGroup records for 2017. That doesn't come close to scaling.

# Python has a free zipcode database (installable with pip) called 'uszipcode'. It refers only to
# postal zip codes. It does not take addresses as input but it does take lat/long, among many other parameters.
# https://uszipcode.readthedocs.io/index.html#id8

# Through the ZCTA relationship files we can associate ZCTAs with the OMB's 
# metropolitan/micropolitan/nonmetropolitan spatial units and draw a basemap of ZCTAs, but we don't have
# analytical data at the moment at that geographical level.

In [None]:
import pandas as pd

In [None]:
# Load the FAR ZIP-level table. dtype=object stops pandas from inferring numeric
# types, so ZIP values are kept as the text found in the file.
infile = 'data/FARcodesZIPdata2010WithAKandHI.csv'
df = pd.read_csv(infile,dtype=object)

In [None]:
# Left-pad ZIP codes with zeros to five characters so they compare equal to
# zero-padded ZIP strings in other tables.
# The earlier lambda tested `len(x) < 5 == 0`; Python chains that as
# `len(x) < 5 and 5 == 0`, which is always False, so nothing was ever padded.
# Series.str.zfill leaves values of length >= 5 unchanged and passes missing
# values through instead of raising.
# NOTE(review): outputs saved in this notebook predate the fix and will change on re-run.
df['ZIP'] = df['ZIP'].str.zfill(5)

In [None]:
# Show the column names of the FAR table.
df.columns

Index(['ZIP', 'state', 'name', 'far1', 'far2', 'far3', 'far4', 'gridpop',
       'sqmi', 'density', 'fr1pop', 'fr2pop', 'fr3pop', 'fr4pop', 'fr1pct',
       'fr2pct', 'fr3pct', 'fr4pct'],
      dtype='object')

In [None]:
# Preview the first rows of the FAR table.
df.head()

Unnamed: 0,ZIP,state,name,far1,far2,far3,far4,gridpop,sqmi,density,fr1pop,fr2pop,fr3pop,fr4pop,fr1pct,fr2pct,fr3pct,fr4pct
0,2,AK,Yukon Flats Nat Wildlife,1,1,1,1,607,95707,0.0,606.4550106,606.4550106,606.4550106,606.4550106,99.96791067,99.96791067,99.96791067,99.96791067
1,7,AK,Southerly North Slope Bo,1,1,1,1,82,65388,0.0,81.6690849,81.6690849,81.6690849,81.6690849,100.0,100.0,100.0,100.0
2,99723,AK,Barrow,1,1,1,0,4714,20301,0.2,4713.558186,4713.558186,4713.558186,682.6115643,100.0,100.0,100.0,14.48187414
3,99559,AK,Bethel,1,1,1,1,11984,18097,0.7,11983.63138,11983.63138,11983.63138,6710.332216,100.0,100.0,100.0,55.99581632
4,99573,AK,Copper Center,1,1,1,1,1896,17741,0.1,1896.342902,1896.342902,1896.342902,1896.342902,100.0,100.0,100.0,100.0


In [None]:
# Create a 'FAR Level' variable: 
# 0 if all far* variables are zero.
# 1 if far1==1 and all others 0.
# 2 if far2==1 and far3==0 and far4==0
# 3 if far3==1 and far4==0
# 4 if far4==1

In [None]:
# Keep the ZIP key plus the four FAR indicator flags, converting the flags
# (read as text) to integers so they can be summed and compared numerically.
far_flags = ['far1','far2','far3','far4']
df_zip = df[['ZIP'] + far_flags].astype({flag: int for flag in far_flags})

In [None]:
def farlevel(row):
    """Collapse the four FAR indicator flags of one record into a single level.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'far1', 'far2', 'far3' and 'far4' (for example a
        DataFrame row or a dict) whose values are numeric flags.

    Returns
    -------
    int or None
        0 when the flags sum to zero; otherwise the highest-numbered flag that
        equals 1 while all higher-numbered flags are zero (1 to 4). None when
        no rule matches, e.g. for flag values other than 0 and 1.
    """
    f1, f2, f3, f4 = (row[key] for key in ('far1', 'far2', 'far3', 'far4'))

    if f1 + f2 + f3 + f4 == 0:
        return 0
    if f1 == 1 and f2 + f3 + f4 == 0:
        return 1
    if f2 == 1 and f3 + f4 == 0:
        return 2
    if f3 == 1 and f4 == 0:
        return 3
    if f4 == 1:
        return 4
    # No rule matched: keep the original implicit None result.
    return None

In [None]:
# Classify every ZIP row with farlevel(); axis=1 hands the function one row at a time.
df_zip['FAR Level'] = df_zip.apply(farlevel,axis=1)

In [None]:
# Remove rows that are exact duplicates across all columns, then report the
# row count before and after.
# NOTE(review): a ZIP repeated with different FAR flags would survive this step
# and fan out a later join on ZIP -- confirm ZIP is unique afterwards.
rows_before = len(df_zip)
df_zip = df_zip.drop_duplicates()
rows_after = len(df_zip)
print(rows_before)
print(rows_after)

30337
30319


In [None]:
# Count how many ZIP rows fall at each FAR Level.
df_zip['FAR Level'].value_counts()

0    24999
4     2599
1     1167
2      967
3      587
Name: FAR Level, dtype: int64

In [None]:
# Load the record-level table that FAR codes will be attached to (presumably the
# 2017 InfoGroup records already joined to OMB/Census/HRSA attributes, judging by
# the file name). dtype=object keeps every column, including ZipCode, as text.
infile = 'data/df_2017_OMB_Census_HRSA.csv'
df_ig = pd.read_csv(infile,dtype=object)
# Row count of the loaded table.
len(df_ig)

14733437

In [None]:
# Left-pad ZipCode with zeros to five characters so it can be matched against
# zero-padded ZIP keys.
# The earlier lambda tested `len(x) < 5 == 0`; Python chains that as
# `len(x) < 5 and 5 == 0`, which is always False, so short ZIP codes were never
# padded and could not match. Series.str.zfill leaves values of length >= 5
# unchanged and passes missing values through instead of raising.
# NOTE(review): match counts saved in this notebook predate the fix and may change on re-run.
df_ig['ZipCode'] = df_ig['ZipCode'].str.zfill(5)

In [None]:
%%time
# Attach FAR flags and FAR Level to every InfoGroup record via a left join on
# zip code. Every left-hand record is kept; the indicator column `_merge`
# records whether a FAR row was found for it.
df_out = pd.merge(
    df_ig,
    df_zip,
    how='left',
    left_on='ZipCode',
    right_on='ZIP',
    indicator=True,
)

CPU times: user 1min 39s, sys: 21.1 s, total: 2min
Wall time: 1min 34s


In [None]:
# Tally join outcomes: 'both' = a FAR row matched the record's zip code,
# 'left_only' = no FAR row for that zip code.
df_out['_merge'].value_counts()

both          13265723
left_only      1467714
right_only           0
Name: _merge, dtype: int64

In [None]:
# Discard the join helpers: ZIP is the right-hand key (redundant with ZipCode
# where matched) and _merge is the join indicator.
df_out = df_out.drop(columns=['ZIP','_merge'])

In [None]:
# Restrict reporting to the 48 continental states (FAR codes apply only to them):
# keep rows flagged Continental == '1' and exclude State FIPS '11'
# (the FIPS code of the District of Columbia).
is_continental = df_out['Continental'] == '1'
not_fips_11 = df_out['State FIPS'] != '11'
df_select = df_out.loc[is_continental & not_fips_11].copy()
len(df_select)

14571195

In [None]:
# Share of in-scope records at each FAR level. The denominator `f` is every
# selected record, including those with no FAR Level assigned.
f = len(df_select)
print('***',str(f),'***')

level_column = df_select['FAR Level']
# Count the records at levels 1 through 4 in one pass, keeping the f1..f4 names.
f1, f2, f3, f4 = (int((level_column == level).sum()) for level in (1, 2, 3, 4))

for level_count in (f1, f2, f3, f4):
    print(level_count/f)

*** 14571195 ***
0.01518399829252165
0.011140541321422162
0.007552091643821938
0.008008128365587036


In [None]:
# Coverage and headline shares. `f` (total selected records) comes from the
# preceding cell.
level_column = df_select['FAR Level']

# Fraction of selected records with no FAR Level (zip code had no FAR match).
fmiss = int(level_column.isnull().sum())
print(str(fmiss/f))

# Fraction of selected records that do carry a FAR Level.
fnum = int((level_column >= 0).sum())
print(str(fnum/f))

print('\n'+'Stats only for in-scope (48 states) and non-missing.')
# Level 0 versus any level >= 1, as shares of the records that have a level.
f0 = int((level_column == 0).sum())
print('Not Far and remote:',str(f0/fnum))

ft = int((level_column >= 1).sum())
print('Far and remote:',str(ft/fnum))

0.10019150797172093
0.8998084920282791

Stats only for in-scope (48 states) and non-missing.
Not Far and remote: 0.9534514732919008
Far and remote: 0.04654852670809917


In [None]:
# Write the full merged table to CSV. Note this saves df_out (all records), not
# the df_select reporting subset.
# NOTE(review): index=None is presumably meant to omit the row index; pandas
# documents this argument as a bool, so index=False is the explicit spelling -- confirm.
outfile = 'data/df_2017_OMB_Census_HRSA_FAR.csv'
df_out.to_csv(outfile,index=None)