In [None]:
#default_exp HRSA_tracts

In [None]:
# "There are measurement challenges with both the Census and OMB definitions. Some policy 
# experts note that the Census definition classifies quite a bit of suburban area as rural. 
# The OMB definition includes rural areas in Metropolitan counties including, for example, 
# the Grand Canyon which is located in a Metro county. Consequently, one could argue that the 
# Census Bureau standard includes an overcount of the rural population whereas the OMB standard 
# represents an undercount." 

# To get the locations that are rural by HRSA definition, add these census tracts in metro/micro 
# counties to the non-metro/micro counties in the OMB definition. (HRSA is the Health Resources 
# and Services Administration. A unit of HRSA is the Federal Office of Rural Health Policy 
# (FORHP). FORHP is the unit that defines 'rural' in a way that is different from OMB and Census.)

# The HRSA definition, in other words, is a refinement on the OMB definition. The Census 
# definition is unrelated to either and later notebooks show that it incorporates the largest 
# number of InfoGroup firms and the largest quantity of rural employment.

# From https://www.hrsa.gov/rural-health/about-us/definition/index.html (the first paragraph 
# is also from this source):
# "The FORHP accepts all non-Metro counties as rural and uses an additional method of determining 
# rurality called the Rural-Urban Commuting Area (RUCA) codes." RUCA codes are computed from 
# Census data. See 
# https://www.ers.usda.gov/topics/rural-economy-population.aspx and 
# https://www.ers.usda.gov/data-products/rural-urban-commuting-area-codes/

In [None]:
# This is not the same universe as the ACP project. They "focus on the counties in which more than 80% of a given 
# county falls into the HRSA rural definition." This is not clear. 80% of what? Probably area, but in any case, 
# their definition applies to whole counties.

# To the extent that our analysis is based on individual businesses, we can locate observations within
# areas that are wholly rural; i.e., non-metro counties and rural census tracts in metro counties as defined by
# HRSA/FORHP. 

# At the bottom of this notebook we add flags to identify a record as 'rural' in each of the three 
# (OMB, Census, HRSA) criteria.

In [None]:
import PyPDF2
import pandas as pd

In [None]:
# Local copy of the FORHP list of rural-eligible areas; its tract listing is parsed below.
# Source: https://www.hrsa.gov/sites/default/files/hrsa/ruralhealth/resources/forhpeligibleareas.pdf
# NOTE(review): absolute path -- the notebook only runs where this directory exists.
infile = '/InfoGroup/rural/data/FORHP_eligibleareas.pdf'

In [None]:
# Open the PDF in binary mode. The handle is deliberately left open here: it is
# closed only after the page-extraction loop further down has finished,
# presumably because the reader still needs it while pages are being read.
pdfFileObj = open(infile, 'rb')
# Reader object wrapping the open handle; pages are fetched from it below.
# NOTE(review): PdfFileReader / numPages / getPage / extractText look like the
# legacy PyPDF2 names -- confirm they exist in the installed PyPDF2 version.
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)

In [None]:
# Number of pages reported by the reader; used as the upper bound of the
# page-extraction loop below.
npages = pdfReader.numPages
# Echo the count as the cell output.
npages

48

In [None]:
# Pull the extracted text of every page, in page order, into one list
# (pages[i] holds the text of the PDF's page i+1).
pages = [pdfReader.getPage(page_no).extractText() for page_no in range(npages)]

# Every page has been read, so the underlying file handle can be released.
pdfFileObj.close()

In [None]:
# Sanity check: one extracted text string per PDF page.
len(pages)

48

In [None]:
# Inspect the raw extracted text of the first page that carries tract data.
pages[19]
# Tract data begins at pages[19] (the page whose header reads "20 of 48").

'20 o\nf 48  Updated \n12/31/2018 \nSection II\n ALABAMA\n  Baldwin County\n  01003010100 \n 01003010200 \n\n 01003010500 \n\n 01003010600 \n\n 01003011000 \n\n 01003011401 \n\n 01003011403 \n\n 01003011406 \n\n 01003011407 \n\n 01003011408 \n\n 01003011501 \n\n 01003011502 \n\n 01003011601 \n Blount County\n  01009050101 \n 01009050102 \n\n 01009050300 \n\n 01009050400 \n Chilton County\n  01021060101 \n 01021060102 \n\n 01021060200 \n\n 01021060500 \n\n 01021060600 \n\n 01021060700 \n Elmore County\n  01051030400  \n\n Etowah County\n  01055011001 \n 01055011002 \n Geneva County\n  01061050100  \n Henry County\n  01067030300  \n\n Lawrence County\n  01079979400 \n 01079979800  \n Limestone County\n  01083020201 \n 01083020202 \n\n 01083020300 \n\n 01083020401 \n\n 01083020402 \n\n 01083020500 \n\n 01083020600 \n\n 01083020700 \n\n 01083020900 \n\n 01083021000 \n\n 01083021100 \n Mobile County\n  01097007202  \n\n Morgan \nCounty\n  01103005600  \n\n Pickens\n County\n  01107050200  \

In [None]:
# Dump the extracted text of the tract-listing pages (pages[19] through
# pages[47]) back to back into one text file for line-oriented parsing.
with open('/InfoGroup/rural/data/tract_data.txt','w') as fout:
    fout.writelines(pages[page_no] for page_no in range(19,48))

In [None]:
# Extract just the tract IDs into a list. First 5 digits of Tract IDs are the same as the State/County FIPS code.
#
# Read back the file written by the previous cell ('tract_data.txt'). This cell
# used to open 'tract_data.tmp', which no visible cell creates, so a fresh
# Restart & Run All either failed or silently parsed a stale file.
# NOTE(review): if 'tract_data.tmp' was a hand-edited copy of the .txt dump,
# add that cleaning step to the notebook instead of relying on the manual file.
tracts = []
with open('/InfoGroup/rural/data/tract_data.txt','r') as fin:
    for line in fin:
        # In the extracted text, tract ID lines begin with a space; anything
        # else (page headers, wrapped fragments) is skipped.
        if not line.startswith(' '):
            continue
        candidate = line.strip()
        # Whitespace-only lines strip to '' and are ignored; state and county
        # names do not start with a digit and are ignored as well.
        if candidate and candidate[0].isnumeric():
            tracts.append(candidate)

In [None]:
# Report how many tract IDs were parsed, then persist them one per line.
print(len(tracts))
with open('/InfoGroup/rural/data/rural_census_tracts.lis','w') as fout:
    fout.writelines(tract_id + '\n' for tract_id in tracts)

2302


In [None]:
# Load the InfoGroup records for the chosen year. Judging by the file name and
# the 'rural_OMB' column used below, earlier notebooks already added the OMB and
# Census rural flags -- TODO confirm.
year = 2017
infile = 'data/df_%d_OMB_Census.csv' % year
# dtype=object keeps every column as read, so identifier columns are not
# type-inferred here; they are converted explicitly in a later cell.
df = pd.read_csv(infile,dtype=object)

In [None]:
# Replace missing identifiers with the sentinel strings that later cells test
# for (999999 for the tract, 99999 for the county FIPS code).
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df[...]: that form is chained assignment, which recent
# pandas versions warn about and which leaves `df` unchanged under copy-on-write.
df['Census Tract'] = df['Census Tract'].fillna('999999')
df['FIPS Code'] = df['FIPS Code'].fillna('99999')

In [None]:
# 'Census Tract' and 'FIPS Code' arrive from the csv as strings that look like
# floats (e.g., 12345.0), so a direct cast to int is not possible: go through
# float first, then to int. 'rural_OMB' is cast to int in the second step.
df = (
    df
    .astype({'Census Tract': 'float', 'FIPS Code': 'float'})
    .astype({'Census Tract': 'int', 'FIPS Code': 'int', 'rural_OMB': 'int'})
)

In [None]:
# A census tract ID comparable to the 11-digit ones in the HRSA list is a combination of the
# 'FIPS Code' and 'Census Tract' columns in df (InfoGroup), as long as neither is missing.

In [None]:
# Records whose tract carries the 999999 placeholder, i.e. was missing in the csv.
dfx = df.loc[df['Census Tract'].eq(999999)]
# Total record count, then the count of records without a tract.
print(len(df), len(dfx), sep='\n')
# Only a very small number of InfoGroup records lack a 'Census Tract' value.

14733437
18


In [None]:
import numpy as np
def combo(row):
    """Build the 11-digit census tract ID for one record.

    The ID is the 5-digit state/county FIPS code followed by the 6-digit tract
    code, both zero-padded. Padding matters because both columns are held as
    integers: without it Baldwin County, AL (FIPS 01003) tract 010100 became
    '100310100', which can never match the HRSA string '01003010100'.

    Parameters
    ----------
    row : mapping
        Must provide 'FIPS Code' and 'Census Tract' as integers (or values
        that int() accepts).

    Returns
    -------
    str or float
        The 11-character tract ID, or np.nan when either component holds its
        missing-value sentinel (99999 / 999999).
    """
    fips = row['FIPS Code']
    tract = row['Census Tract']
    if fips == 99999 or tract == 999999:
        return np.nan
    # int() normalizes numpy integers and whole-number floats before formatting.
    return '%05d%06d' % (int(fips), int(tract))

In [None]:
%%time
# Build the 11-digit tract ID column-wise instead of with a row-wise
# df.apply(combo, axis=1), which took about 18 minutes on this frame.
# Each part is zero-padded (5-digit FIPS + 6-digit tract) so the result has the
# same shape as the HRSA IDs; plain str(int) concatenation drops leading zeros
# and never matches tracts in states with FIPS codes below 10.
# Rows holding either missing-value sentinel get NaN, as combo() returns.
df['Full Census Tract'] = (
    df['FIPS Code'].astype(int).astype(str).str.zfill(5)
    + df['Census Tract'].astype(int).astype(str).str.zfill(6)
).where(df['FIPS Code'].ne(99999) & df['Census Tract'].ne(999999))

CPU times: user 17min 56s, sys: 12.2 s, total: 18min 9s
Wall time: 18min 9s


In [None]:
# Records for which no full tract ID could be built; the count should equal the
# number of records with a missing tract or FIPS code.
dfy = df.loc[df['Full Census Tract'].isna()]
len(dfy)

18

In [None]:
# Compare the number of records with the number of distinct tract IDs among them.
fct = df['Full Census Tract'].tolist()
fct_set = set(fct)
print(len(fct), len(fct_set), sep='\n')

14733437
82385


In [None]:
# Set of the HRSA tract IDs parsed from the PDF; rur3() below tests membership
# against it once per record.
rural_tracts = set(tracts)

In [None]:
# If this equals len(tracts), the parsed list contained no duplicate IDs.
len(rural_tracts)

2302

In [None]:
def rur3(row):
    """Return 1 if the record is rural under the HRSA definition, else 0.

    A record qualifies when its 'rural_OMB' flag equals 1 or when its
    'Full Census Tract' ID is a member of the module-level `rural_tracts` set.
    """
    is_rural = row['rural_OMB'] == 1 or row['Full Census Tract'] in rural_tracts
    return 1 if is_rural else 0

In [None]:
%%time
# HRSA rural flag: 1 when the record is rural under OMB or its tract is in the
# HRSA tract list, else 0. This is the same rule as rur3(), evaluated on whole
# columns; the row-wise df.apply(rur3, axis=1) took about 12 minutes.
# Rows with a missing tract ID (NaN) are never in the list and fall back to the
# OMB flag alone.
df['rural_HRSA'] = (
    df['rural_OMB'].eq(1) | df['Full Census Tract'].isin(rural_tracts)
).astype(int)

CPU times: user 11min 42s, sys: 9.64 s, total: 11min 52s
Wall time: 11min 51s


In [None]:
# Number of records per value of the HRSA rural flag (1 = rural, 0 = not rural).
df['rural_HRSA'].value_counts()

0    13740765
1      992672
Name: rural_HRSA, dtype: int64

In [None]:
# The same breakdown expressed as a percentage of all records.
df['rural_HRSA'].value_counts(normalize=True) * 100

0    93.262455
1     6.737545
Name: rural_HRSA, dtype: float64

In [None]:
# Rural businesses by the HRSA definition of rural are 6.74% of the total.

In [None]:
# Persist the frame, now carrying the extra 'Full Census Tract' and
# 'rural_HRSA' columns, for use by later notebooks.
outfile = 'data/df_%d_OMB_Census_HRSA.csv' % year
# NOTE(review): index=None presumably suppresses the index column the same way
# index=False does -- confirm, and prefer the documented boolean.
df.to_csv(outfile,index=None)

In [None]:
# -----------------------

In [None]:
# InfoGroup for 2017 has data on 82,385 distinct census tract IDs (computed above). The HRSA/FORHP 
# file of rural units lists 2,302 rural census tracts in addition to all those in non-Metro counties. 
# The Census Bureau's Census Tract-to-PUMA relationship file identifies 74,091 census tracts (below).

In [None]:
# A census tract is identified by state FIPS + county FIPS + tract number, so
# read only those three columns and then collapse repeated combinations.
infile = '/home/tflory/Relationship_Files/Census_Tract_to_PUMA.csv'
ct_df = pd.read_csv(infile, usecols=['STATEFP','COUNTYFP','TRACTCE'])
ct_df = ct_df.drop_duplicates()

In [None]:
# Count the tracts whose state FIPS code is 56 or lower (presumably the 50
# states plus DC; higher codes would be territories -- confirm).
# The comparison yields one boolean per row, so len() of it is simply the total
# row count whatever the condition says; summing the mask counts only the rows
# that satisfy it. Any figure previously read off this cell was the row total.
int((ct_df['STATEFP'] <= 56).sum())

74091