In [None]:
# Make this the definitive list of HRSA/FORHP rural census tracts rather than those in notebook 4(1).

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('data/nonmetrocountiesandcts2016.csv', dtype=object)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4545 entries, 0 to 4544
Data columns (total 6 columns):
ST            4545 non-null object
CountyName    4545 non-null object
CT            2534 non-null object
RUCA 2010     2534 non-null object
CTY FIPS      4545 non-null object
Memo          28 non-null object
dtypes: object(6)
memory usage: 213.2+ KB


In [None]:
rural_tracts_df = df[~df['CT'].isnull()]
len(rural_tracts_df)

2534

In [None]:
# How well do these 2534 tracts overlap with the smaller number in the separate pdf document
# that was processed in notebook 4(1)?

In [None]:
tracts = []
with open('/InfoGroup/rural/data/rural_census_tracts.lis', 'r') as fin:
    for tract in fin:
        tracts.append(tract.rstrip())
ser = pd.DataFrame(tracts, columns=(['CT']))
print(ser.head())
print(len(ser))

            CT
0  01003010100
1  01003010200
2  01003010500
3  01003010600
4  01003011000
2302


In [None]:
overlap = rural_tracts_df.merge(ser, on='CT', how='outer', indicator=True)
overlap['_merge'].value_counts()

both          2215
left_only      319
right_only      87
Name: _merge, dtype: int64

In [None]:
# So the pdf lists 87 tracts that are not in the 2016 spreadsheet, and the
# spreadsheet lists 319 that are not in the pdf.
# The pdf includes only tracts with RUCA codes 4-10, except for a few anomalous cases
# that I think are all in Alaska.
#
# What are the RUCA scores for the tracts in the 2016 'nonmetro tracts and counties'
# spreadsheet? Keep in mind that these are selected tracts within metro counties. All tracts
# in nonmetro counties are considered rural, apparently without respect to any RUCA codes.
rural_tracts_df['RUCA 2010'].value_counts()

4     994
10    534
7     461
5     205
2     116
6      89
8      71
9      57
3       7
Name: RUCA 2010, dtype: int64

# BELOW IS THE IMPORTANT STUFF.

In [None]:
import pandas as pd

In [None]:
# Examine ALL census tracts in the 'ruca2010revised' spreadsheet to replicate the
# criteria of rurality and compile a complete list of census tracts, not just those
# in metro counties.
#
# The spreadsheet with this data is linked on
# https://www.ers.usda.gov/data-products/rural-urban-commuting-area-codes/
full_df = pd.read_csv('data/ruca2010revised.csv', dtype=object)

In [None]:
full_df = full_df.astype({'Tract': 'int',
                          'Primary RUCA Code 2010': 'int',
                          'Land Area (square miles) 2010': 'float',
                          'Population Density (per square mile) 2010': 'float'})

In [None]:
# The following criteria identify tracts "that are extremely large and where
# use of RUCA codes alone fails to account for distance to services and sparse population. In
# response to these concerns, ORHP has also designated as rural census tracts with RUCA codes 2
# or 3 that are at least 400 square miles in area with a population density of no more than 35
# people."
exceptional_tracts = full_df[((full_df['Primary RUCA Code 2010'] == 2) |
                              (full_df['Primary RUCA Code 2010'] == 3)) &
                             (full_df['Population Density (per square mile) 2010'] <= 35.0) &
                             (full_df['Land Area (square miles) 2010'] >= 400.0)]

In [None]:
print(len(exceptional_tracts))
print(exceptional_tracts['Population Density (per square mile) 2010'].max())
print(exceptional_tracts['Land Area (square miles) 2010'].min())
print(exceptional_tracts['Primary RUCA Code 2010'].value_counts())

175
21.9
402.8
2    162
3     13
Name: Primary RUCA Code 2010, dtype: int64


In [None]:
# Now all other rural census tracts: "Census tracts with
# RUCA codes 4 through 10 are considered rural for the purposes of Rural Health grants. "
regular_tracts = full_df[full_df['Primary RUCA Code 2010'] >= 4]
print(len(regular_tracts))
print(regular_tracts['Primary RUCA Code 2010'].value_counts())

13764
4     4261
10    3461
7     2165
5     1972
8      827
6      411
9      343
99     324
Name: Primary RUCA Code 2010, dtype: int64


In [None]:
all_rural_tracts = pd.concat([regular_tracts, exceptional_tracts])
print(len(all_rural_tracts))
print(len(full_df))

13939
74002


In [None]:
# These are the 2010 census tracts that qualify as rural by the HRSA/FORHP definition: 18.8%
# of the total.
len(all_rural_tracts)/(len(full_df)) * 100.0

18.835977406016053

In [None]:
all_rural_tracts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13939 entries, 12 to 73016
Data columns (total 9 columns):
State-County FIPS Code                       13939 non-null object
State                                        13939 non-null object
County                                       13939 non-null object
Tract                                        13939 non-null int64
Primary RUCA Code 2010                       13939 non-null int64
Secondary RUCA Code 2010                     13939 non-null object
Tract Population 2010                        13939 non-null object
Land Area (square miles) 2010                13939 non-null float64
Population Density (per square mile) 2010    13579 non-null float64
dtypes: float64(2), int64(2), object(5)
memory usage: 1.1+ MB


In [None]:
all_rural_tracts.to_csv('/InfoGroup/rural/points-in-polygons/data/rural_HRSA_updated_tracts.csv',index=None)

In [None]:
hrsa_tracts = set(all_rural_tracts['Tract'].tolist())
type(hrsa_tracts)

set

In [None]:
# Compare this to the number and percentage of rural tracts as defined by purely spatial
# relationships: a tract is rural if its spatial centroid is not contained within the polygon
# of a census urban area.
# 31.3% of census tracts are rural by the spatial definition. That's 73,056 census tracts of
# which 22,858 are rural. The UA and tract lists are from 2017.
# The HRSA/FORHP RUCA scores use 2010 data.

In [None]:
# Now get the enterprise counts and employment numbers from InfoGroup for both of these
# rurality concepts.

In [None]:
infile = 'data/df_2017_OMB_Census_HRSA_FAR.csv'
df = pd.read_csv(infile, dtype=object)

In [None]:
# 'Full Census Tract' is the 11-digit tract ID
df['Full Census Tract'].fillna('999999999', inplace=True)
df['Full Census Tract'] = df['Full Census Tract'].astype(int)

In [None]:
df['rural_updated_HRSA'] = df['Full Census Tract'].apply(
    lambda x: 1 if x in hrsa_tracts else 0)
df['rural_updated_HRSA'].value_counts()

0    13139051
1     1594386
Name: rural_updated_HRSA, dtype: int64

In [None]:
print('Percentage of 2017 InfoGroup enterprises that are rural according to updated HRSA standard: ', end='')
rural = len(df[df['rural_updated_HRSA'] == 1])
total = len(df)
print(str(rural/total * 100.0))

Percentage of 2017 InfoGroup enterprises that are rural according to updated HRSA standard: 10.821548291820843


In [None]:
df['Employee Size (5) - Location'] = df['Employee Size (5) - Location'].astype(float)
total_emp = df['Employee Size (5) - Location'].sum()
print("Total employment:", str(total_emp))

Total employment: 159762888.0


In [None]:
total_updated_HRSA = df[df['rural_updated_HRSA']
                        == 1]['Employee Size (5) - Location'].sum()
pct = (total_updated_HRSA / total_emp) * 100.0
print('Updated HRSA rural percentage employment:', str(pct))

Updated HRSA rural percentage employment: 9.456448984572688


In [None]:
# Now the same for the tract-spatial standard.
df_spatial = pd.read_csv(
    '/InfoGroup/rural/points-in-polygons/data/all_tracts.csv', dtype=object)
df_spatial['rural_tract'].value_counts()

0    50198
1    22858
Name: rural_tract, dtype: int64

In [None]:
spatial_tracts = df_spatial[df_spatial['rural_tract']
                            == '1']['GEOID'].astype(int).tolist()

In [None]:
% % time
df['rural_spatial_tracts'] = df['Full Census Tract'].apply(
    lambda x: 1 if x in spatial_tracts else 0)
df['rural_spatial_tracts'].value_counts()

CPU times: user 1h 17min 10s, sys: 616 ms, total: 1h 17min 11s
Wall time: 1h 17min 9s


0    12940369
1     1793068
Name: rural_spatial_tracts, dtype: int64

In [None]:
# Percentage of InfoGroup enterprises located in a spatially defined rural census tract.
1793068/len(df) * 100.0

12.170059165420804

In [None]:
total_spatial_tracts = df[df['rural_spatial_tracts']
                          == 1]['Employee Size (5) - Location'].sum()
pct = (total_spatial_tracts / total_emp) * 100.0
print('Spatially defined rural census tracts percentage employment:', str(pct))

Spatially defined rural census tracts percentage employment: 10.553179909967577


In [None]:
df.to_csv('data/df_2017_OMB_Census_HRSA_FAR_tracts.csv', index=None)

In [None]:
# How much overlap is there between the HRSA-revised and the spatial lists of rural census tracts?

In [None]:
import pandas as pd
df = pd.read_csv('data/df_2017_OMB_Census_HRSA_FAR_tracts.csv', dtype=object)

In [None]:
df.columns

Index(['Company', 'Address Line 1', 'City', 'State', 'ZipCode', 'County Code',
       'Primary SIC Code', 'SIC6_Descriptions', 'Primary NAICS Code',
       'NAICS8 Descriptions', 'Employee Size (5) - Location',
       'Sales Volume (9) - Location', 'Business Status Code',
       'Industry Specific First Byte', 'Year Established', 'ABI',
       'Subsidiary Number', 'Parent Number', 'Parent Actual Employee Size',
       'Parent Actual Sales Volume', 'Census Tract', 'Census Block',
       'Latitude', 'Longitude', 'CBSA Code', 'CBSA Level', 'FIPS Code',
       'State FIPS', 'Continental', 'NAICS2', 'NAICS2 desc', 'CBSA Level desc',
       'rural_OMB', 'rural_Census_general', 'NAICS6', 'NAICS6 desc', 'UA',
       'rural_Census', 'Full Census Tract', 'rural_HRSA', 'far1', 'far2',
       'far3', 'far4', 'FAR Level', 'rural_updated_HRSA',
       'rural_spatial_tracts'],
      dtype='object')

In [None]:
revised_set = set(df[df['rural_updated_HRSA'] == '1']['ABI'].tolist())
spatial_set = set(df[df['rural_spatial_tracts'] == '1']['ABI'].tolist())

In [None]:
print(len(revised_set))
print(len(spatial_set))

1594386
1793068


In [None]:
% % time
l = 0
r_not_in_s = []
for r in revised_set:
    if r not in spatial_set:
        l += 1
        r_not_in_s.append(r)
print(str(l), 'ABIs in revised_HRSA rural enterprises that are not among spatial rural enterprises.')

436434 ABIs in revised_HRSA rural enterprises that are not among spatial rural enterprises.
CPU times: user 661 ms, sys: 23.4 ms, total: 684 ms
Wall time: 679 ms


In [None]:
% % time
n = 0
s_not_in_r = []
for s in spatial_set:
    if s not in revised_set:
        n += 1
        s_not_in_r.append(r)
print(str(n), 'ABIs in spatial rural enterprises that are not among revised_HRSA rural enterprises.')

635116 ABIs in spatial rural enterprises that are not among revised_HRSA rural enterprises.
CPU times: user 744 ms, sys: 23.9 ms, total: 768 ms
Wall time: 763 ms


In [None]:
total_set = set(list(revised_set) + list(spatial_set))

In [None]:
len(total_set)

2229502

In [None]:
len(df)

14733437

In [None]:
df['Employee Size (5) - Location'] = df['Employee Size (5) - Location'].astype(float)
total_tracts = df[(df['rural_spatial_tracts'] == '1') | (
    df['rural_updated_HRSA'] == '1')]['Employee Size (5) - Location'].sum()
pct = (total_tracts / 159762888.0) * 100.0
print('Total rural census tracts percentage employment:', str(pct))

Total rural census tracts percentage employment: 13.362190848728272
