In [1]:
# default_exp frontier_and_remote

In [2]:
# hide
# Although both the FAR dataset and the ZCTA-to-whatever relationship files both come from 
# Census, the zip code areas presented by FAR are from ESRI and are not entirely compatible 
# with the ZCTA areas.
# ESRI business location data come from InfoGroup. It isn't clear where the zip code data 
# that FAR take from ESRI are from, but they almost certainly are postal zip codes.

In [3]:
# hide
# From Wikipedia:
# ZCTAs are generalized area representations of the United States Postal Service (USPS) ZIP code service areas, 
# but are not the same as ZIP codes. Individual USPS ZIP codes can cross state, place, county, census tract, 
# census block group and census block boundaries, so the Census Bureau asserts that "there is no correlation 
# between ZIP codes and Census Bureau geography".[1] Moreover, the USPS frequently realigns, merges, or splits 
# ZIP codes to meet changing needs. These changes are usually not reflected in the annual TIGER releases. 
# Each ZCTA is constructed by aggregating the Census 2010 blocks whose addresses use a given ZIP code. In 
# assembling census statistical units to create ZCTAs, the Census Bureau took the ZIP code used by the majority 
# of addresses in each census unit at the time the data was compiled. As a result, some addresses end up with a 
# ZCTA code that is different from their ZIP code. ZCTAs are not developed for ZIP codes that comprise only a 
# small number of addresses.[2] Several ZCTAs represent ZIPs that no longer exist due to realignment by the USPS.

# There are approximately 42,000 ZIP Codes and 32,000 ZCTAs. The main reason that there is not one ZCTA for every 
# ZIP Code is that PO Boxes are excluded in ZCTAs, since only populated areas are included in the Census data.

In [4]:
# hide
# Summary: problems using zip code as an analytical unit.

# Postal zip codes change to suit the Postal Services needs and those changes are not immediately reflected
# in other data bases or in TIGER files.

# ZCTA (standardized zipcode-based tablulation areas) units are not the same as postal zip codes, as described
# above. 

# We do not know the source of InfoGroup's zip code data. It is probably self-identified by the enterprise and
# therefore probably a postal zip code.

# ZCTA relationship files do not include any relationship between ZCTAs and postal zip codes. 

# ESRI business location data comes from InfoGroup, which probably suggests that all their zip code data are
# postal zip codes. So the zip codes in the FAR data should be compatible with our InfoGroup data, although not 
# aligned in time. The FAR data comes otherwise from the 2010 Census.

# The Census Geocoder can associate ZCTAs with lat/long, which would be just the thing. But it allows only 1,000 
# individual requests at a time. That would mean 14,700 separate batch jobs to apply ZCTA IDs to the 14.7 mn
# InfoGroup records for 2017. That doesn't come close to scaling.

# Python has a free zipcode database (installable with pip) called 'uszipcode'. It refers only to
# postal zip codes. It does not take addresses as input but it does take lat/long, among many other parameters.
# https://uszipcode.readthedocs.io/index.html#id8

# Through the ZCTA relationship files we can associate ZCTAs with the OMB's 
# metropolitan/micropolitan/nonmetropolitan spatial units and draw a basemap of ZCTAs, but we don't have
# analytical data at the moment at that geographical level.

In [5]:
import pandas as pd

In [6]:
def farlevel(row):
    if sum([row['far1'],row['far2'],row['far3'],row['far4']]) == 0:
        return 0
    elif row['far1'] == 1 and sum([row['far2'],row['far3'],row['far4']]) == 0:
        return 1
    elif row['far2'] == 1 and sum([row['far3'],row['far4']]) == 0:
        return 2
    elif row['far3'] == 1 and row['far4'] == 0:
        return 3
    elif row['far4'] == 1:
        return 4
    
def log_far_levels(dfx):
    f = len(dfx)
    f0 = len(dfx[dfx['FAR Level']==0])
    print('Not Far and remote in InfoGroup file:',str(((f0/f) * 100))+'%',file=logfile)
    print('Distribution of FAR types for enterprises with non-zero FAR Level:',file=logfile)
    df = dfx[dfx['FAR Level']>0]
    f = len(df)
    f1 = len(df[df['FAR Level'] == 1])
    print('\tFAR Level 1:',str(((f1/f) * 100))+'%',file=logfile)
    f2 = len(df[df['FAR Level'] == 2])
    print('\tFAR Level 2:',str(((f2/f) * 100))+'%',file=logfile)
    f3 = len(df[df['FAR Level'] == 3])
    print('\tFAR Level 3:',str(((f3/f) * 100))+'%',file=logfile)
    f4 = len(df[df['FAR Level'] == 4])
    print('\tFAR Level 4:',str(((f4/f) * 100))+'%',file=logfile)

In [7]:
far_file = '/InfoGroup/data/rurality/FARcodesZIPdata2010WithAKandHI.csv'
df = pd.read_csv(far_file,dtype=object)
df['ZIP'] = df['ZIP'].apply(lambda x: x.zfill(5) if len(x) < 5 == 0 else x)
df_zip = df[['ZIP','far1','far2','far3','far4']].copy()
df_zip[['far1','far2','far3','far4']] = df_zip[['far1','far2','far3','far4']].astype(int)
df_zip['FAR Level'] = df_zip.apply(farlevel,axis=1)
df_zip = df_zip.drop_duplicates()

In [8]:
logfile = open('006-FAR.log','w')

In [9]:
for yr in range(1997,2018):
    print(f'{yr}:',file=logfile)
    infile = f'/InfoGroup/data/rurality/InfoGroup_{yr}_nb05.csv'
    df_ig = pd.read_csv(infile,dtype=object)
    df_ig['ZipCode'] = df_ig['ZipCode'].apply(lambda x: x.zfill(5) if len(x) < 5 == 0 else x)
    
    df_out = df_ig.merge(df_zip,how='left',left_on='ZipCode',right_on='ZIP',indicator=True)
    df_out.drop(columns=['ZIP','_merge'],inplace=True)
    
    # FAR codes apply only to the continental states. To select just those for reporting:
    # df_select = df_out[(df_out['Continental']=='1') & (df_out['State FIPS'] != '11')].copy()
    
    # Log the distribution of FAR levels by year.
    log_far_levels(df_out)
    
    # Write out new file.
    outfile = f'/InfoGroup/data/rurality/InfoGroup_{yr}_nb06.csv'
    df_out.to_csv(outfile,index=None)

In [10]:
logfile.close()

In [8]:
import pandas as pd
#df = pd.read_csv('/InfoGroup/data/rurality/InfoGroup_2017_nb07.csv',dtype=object)
#df.dropna(inplace=True)
df = df[df['FAR Level']!='0.0']
df[['far1','far2','far3','far4','FAR Level']].tail(20)

Unnamed: 0,far1,far2,far3,far4,FAR Level
7951067,1.0,1.0,0.0,0.0,2.0
7953024,1.0,1.0,0.0,0.0,2.0
7988609,1.0,0.0,0.0,0.0,1.0
8514297,1.0,0.0,0.0,0.0,1.0
8514446,1.0,0.0,0.0,0.0,1.0
8650286,1.0,1.0,1.0,1.0,4.0
8975232,1.0,0.0,0.0,0.0,1.0
8977125,1.0,0.0,0.0,0.0,1.0
8991782,1.0,0.0,0.0,0.0,1.0
9392516,1.0,0.0,0.0,0.0,1.0
