In [None]:
# Explore rural census tracts in metro counties (CBSAs) and urban areas (UAs).

In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon

In [None]:
# Let pandas print up to 999 rows before truncating displayed frames/series.
pd.set_option('display.max_rows', 999)

In [None]:
# All tracts: polygon layer read from the 2017 USA tract shapefile.
# `dtype` is a pandas.read_csv-style option, not a geopandas.read_file parameter; it was only
# forwarded to the underlying I/O engine as an unrecognised option, so it is omitted here.
gdf = gpd.read_file('/InfoGroup/rural/map_files/tracts/tl_2017_USA_tract.shp')

In [None]:
# List the attribute columns of the tract layer.
gdf.columns

Index(['STATEFP', 'COUNTYFP', 'TRACTCE', 'GEOID', 'NAME', 'NAMELSAD', 'MTFCC',
       'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry'],
      dtype='object')

In [None]:
# Compute the centroid of each tract polygon.
# GeoSeries.centroid is vectorised and already yields shapely Points, so the per-row apply and
# the redundant Point(...) re-wrapping of an existing Point are unnecessary.
# NOTE(review): recent geopandas versions may warn about centroids in a geographic CRS; the
# value is the same planar shapely centroid that the original per-row call produced.
gdf['centroid'] = gdf['geometry'].centroid

In [None]:
# Preview the first rows, including the 'centroid' column.
gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry,centroid
0,46,75,91600,46075091600,916,Census Tract 916,G5020,S,2511453379,3147180,43.9519896,-100.6861386,"POLYGON ((-101.064626 43.96611799999999, -101....",POINT (-100.6897219989547 43.96060830334499)
1,46,129,965200,46129965200,9652,Census Tract 9652,G5020,S,50637139,17634531,45.5481591,-100.4222642,"POLYGON ((-100.486583 45.537934, -100.486571 4...",POINT (-100.4204913301959 45.55065746393002)
2,46,129,965100,46129965100,9651,Census Tract 9651,G5020,S,1784710819,75580712,45.4278339,-100.0075629,"POLYGON ((-100.387018 45.53725, -100.386639 45...",POINT (-100.0172511111368 45.42551049826866)
3,46,91,950800,46091950800,9508,Census Tract 9508,G5020,S,2170630077,124680563,45.7370443,-97.5808695,"POLYGON ((-97.979636 45.603137, -97.979635 45....",POINT (-97.5985004429327 45.75867076685663)
4,46,109,940700,46109940700,9407,Census Tract 9407,G5020,S,1293755958,38170551,45.8123364,-96.9277941,"POLYGON ((-97.228313 45.934417, -97.2283109999...",POINT (-96.92711283028811 45.81412865434089)


In [None]:
# Urban-area (UA) polygon layer. Two candidate shapefiles exist; judging by the shared "uac10"
# suffix both presumably describe the 2010-definition urban areas and differ only in the
# vintage of the file (2017 vs 2010) — TODO confirm.
# 2017-vintage file (not used):
#ua_gdf = gpd.read_file('map_files/tl_2017_us_uac10.shp')
# 2010-vintage file (used here):
ua_gdf = gpd.read_file('map_files/tl_2010_us_uac10.shp')
# Show the attribute columns of the UA layer.
ua_gdf.columns

Index(['UACE10', 'GEOID10', 'NAME10', 'NAMELSAD10', 'LSAD10', 'MTFCC10',
       'UATYP10', 'FUNCSTAT10', 'ALAND10', 'AWATER10', 'INTPTLAT10',
       'INTPTLON10', 'geometry'],
      dtype='object')

In [None]:
# Count the UA records by their UATYP10 type code.
ua_gdf['UATYP10'].value_counts()

C    3095
U     497
Name: UATYP10, dtype: int64

In [None]:
# Row counts of the tract layer and the UA layer, one per line.
print(len(gdf), len(ua_gdf), sep='\n')

73056
3592


## See the comments below.

In [None]:
# Work on a 10% random sample of each layer so the brute-force matching on the sample
# finishes quickly. A fixed random_state makes the sample (and everything derived from it)
# reproducible across kernel restarts.
xgdf = gdf.sample(frac=0.1, random_state=0).copy()
xua_gdf = ua_gdf.sample(frac=0.1, random_state=0).copy()

In [None]:
# Row counts of the sampled tract layer and the sampled UA layer, one per line.
print(len(xgdf), len(xua_gdf), sep='\n')

7306
359


In [None]:
# Assign a UA code: record [tract GEOID, UA GEOID10, UA type] for every sampled tract whose
# centroid lies within a sampled UA polygon.
# The UA rows are materialised once up front; the original re-ran xua_gdf.iterrows()
# (building a fresh row Series per UA) for every single tract. Pair order is unchanged:
# tracts in frame order, and within each tract the UAs in frame order.
ua_records = list(zip(xua_gdf['geometry'], xua_gdf['GEOID10'], xua_gdf['UATYP10']))
relationship = [
    [tract_geoid, ua_geoid, ua_type]
    for tract_geoid, tract_centroid in zip(xgdf['GEOID'], xgdf['centroid'])
    for ua_polygon, ua_geoid, ua_type in ua_records
    if tract_centroid.within(ua_polygon)
]

In [None]:
# Tabulate the matched pairs. Column names use underscores so they agree with the
# 'tract_GEOID' / 'UA_GEOID10' names that the later merge and column handling expect
# (the original 'tract GEOID' / 'UA GEOID10' spellings would not match them).
rel_df = pd.DataFrame(relationship, columns=['tract_GEOID', 'UA_GEOID10', 'UATYP10'])

In [None]:
# Preview the sampled tract-to-UA pairs.
rel_df.head()

In [None]:
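# Intentionally left disabled: this path is the rel_df.csv that is read back further down
# (the full-run output), so writing the sample-based rel_df here would overwrite it.
#rel_df.to_csv('/InfoGroup/rural/points-in-polygons/data/rel_df.csv',index=None)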

In [None]:
# The relationship list and rel_df built above were created from a sample of the input files.
# The same logic was applied to the full UA and tract files by the tracts.py program in
# /InfoGroup/rural/points-in-polygons/; that run took 14.67 clock hours to complete.
# The cells below read the rel_df.csv file created by that program execution. It contains
# 50,198 pairs of tract and UA codes: the cases in which the centroid of a tract was found
# to be contained within the boundaries of a census urban area.
# Matching back to the full list of census tracts, we can identify as rural those tracts
# whose centroids do not fall within any UA.

In [None]:
# Load the tract-to-UA pairs from the full run. This rebinds rel_df, replacing the
# sample-based frame built earlier. dtype=object reads every column as-is instead of
# letting pandas infer numeric types for the code columns.
rel_df = pd.read_csv('/InfoGroup/rural/points-in-polygons/data/rel_df.csv',dtype=object)

In [None]:
# Number of tract/UA pairs in the full-run file.
len(rel_df)

50198

In [None]:
## All tracts: re-read the tract layer so the merge starts from a fresh frame
## (this copy does not carry the 'centroid' column added earlier).
# `dtype` is not a geopandas.read_file parameter; it was only forwarded to the underlying
# I/O engine as an unrecognised option, so it is omitted here.
gdf = gpd.read_file('/InfoGroup/rural/map_files/tracts/tl_2017_USA_tract.shp')

In [None]:
# Merge the two dataframes. The resulting dataframe will have the rural tracts flagged by a
# missing value for 'UA_GEOID10' (and by '_merge' == 'left_only').
# validate='one_to_one' makes the merge fail loudly if a GEOID is duplicated on either side,
# which would otherwise silently inflate the row count that the rural share is computed from.
merged = gdf.merge(
    rel_df,
    how='left',
    left_on='GEOID',
    right_on='tract_GEOID',
    indicator=True,
    validate='one_to_one',
)

In [None]:
# 'both' = tract matched a UA pair; 'left_only' = tract with no UA match.
merged['_merge'].value_counts()

both          50198
left_only     22858
right_only        0
Name: _merge, dtype: int64

In [None]:
# Tracts that found no UA partner in the merge; show how many there are and a preview.
rur = merged.loc[merged['_merge'] == 'left_only']
print(len(rur), rur.head(), sep='\n')

22858
  STATEFP COUNTYFP TRACTCE        GEOID  NAME           NAMELSAD  MTFCC  \
0      46      075  091600  46075091600   916   Census Tract 916  G5020   
1      46      129  965200  46129965200  9652  Census Tract 9652  G5020   
2      46      129  965100  46129965100  9651  Census Tract 9651  G5020   
3      46      091  950800  46091950800  9508  Census Tract 9508  G5020   
4      46      109  940700  46109940700  9407  Census Tract 9407  G5020   

  FUNCSTAT       ALAND     AWATER     INTPTLAT      INTPTLON  \
0        S  2511453379    3147180  +43.9519896  -100.6861386   
1        S    50637139   17634531  +45.5481591  -100.4222642   
2        S  1784710819   75580712  +45.4278339  -100.0075629   
3        S  2170630077  124680563  +45.7370443  -097.5808695   
4        S  1293755958   38170551  +45.8123364  -096.9277941   

                                            geometry tract_GEOID UA_GEOID10  \
0  POLYGON ((-101.064626 43.96611799999999, -101....         NaN        NaN   


In [None]:
# Flag tracts with no UA match (missing UA_GEOID10 after the left merge): 1 = rural, 0 = not.
# isna() tests for missing values directly instead of comparing the stringified value with
# 'nan', which only works when the missing marker happens to stringify that way.
merged['rural_spatial_tract'] = merged['UA_GEOID10'].isna().astype(int)

In [None]:
# Tally rural (1) versus non-rural (0) tracts.
merged['rural_spatial_tract'].value_counts()

0    50198
1    22858
Name: rural_spatial_tract, dtype: int64

In [None]:
# Inspect the columns present after the merge and flagging steps.
merged.columns

Index(['STATEFP', 'COUNTYFP', 'TRACTCE', 'GEOID', 'NAME', 'NAMELSAD', 'MTFCC',
       'FUNCSTAT', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry',
       'tract_GEOID', 'UA_GEOID10', '_merge', 'rural_spatial_tract'],
      dtype='object')

In [None]:
# Drop the merge bookkeeping columns. Re-assigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run once the columns are already gone.
merged = merged.drop(columns=['tract_GEOID', '_merge'], errors='ignore')

In [None]:
# Row count of the merged frame.
len(merged)

73056

In [None]:
# Keep only the tracts flagged rural and write them to CSV without an index column.
rural_only = merged.loc[merged['rural_spatial_tract'] == 1]
rural_only.to_csv(
    '/InfoGroup/rural/points-in-polygons/data/rural_spatial_tracts.csv',
    index=False,
)

In [None]:
# Write the full tract table (rural flag included) to CSV without an index column.
merged.to_csv(
    '/InfoGroup/rural/points-in-polygons/data/all_tracts.csv',
    index=False,
)

In [None]:
# Share of all tracts that are flagged rural.
rural_only.shape[0] / merged.shape[0]

0.312883267630311

In [None]:
# 31.3% of census tracts are rural by this definition; i.e., that the tract's spatial centroid does not fall
# anywhere within the boundaries of a census urban area.
#
# Do the FORHP rural census tracts (defined entirely differently) overlap considerably with this complete
# list of rural census tracts?

In [None]:
# Read the FORHP rural tract GEOIDs, one per line. The with-block closes the file handle,
# which the original bare open(...) inside the comprehension never did.
with open("data/rural_census_tracts.lis") as tract_file:
    rural_tracts = [line.rstrip("\n") for line in tract_file]
len(rural_tracts)

2302

In [None]:
# One-column frame of FORHP tract GEOIDs, ready to merge on 'GEOID'.
forhp_tracts = pd.DataFrame({'GEOID': rural_tracts})

In [None]:
# Outer-join all tracts against the FORHP list on GEOID. In the indicator column:
#   left_only  = tract in the tract file that is not on the FORHP list
#   both       = FORHP tract that is also present in the tract file
#   right_only = FORHP GEOID with no matching tract in the tract file
overlap_merge = merged.merge(forhp_tracts,on='GEOID',how='outer',indicator=True)
overlap_merge['_merge'].value_counts()

left_only     70849
both           2207
right_only       95
Name: _merge, dtype: int64

In [None]:
# NOTE(review): the merge above is against *all* 73,056 tracts, not only the rural-flagged
# ones, so 'right_only' = 95 means that 95 of the 2,302 FORHP tract GEOIDs have no matching
# GEOID in the tract file. It does not show how many FORHP tracts fail the centroid-in-UA
# criterion; for that, inspect 'rural_spatial_tract' on the 2,207 matched ('both') rows.
95/2302
# That's 4.1% of the FORHP list without a matching GEOID in the tract file.

0.04126846220677671

In [None]:
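# The tract file used in this procedure is from 2017. NOTE(review): this notebook loads the
# 2010 UA file (tl_2010_us_uac10.shp), with the 2017 one commented out — confirm which UA
# file the full tracts.py run used.
# The FORHP calculations apply to 2010, with revisions to the second-tier codes in 2019.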