In [None]:
# default_exp urban_rural_tracts

In [None]:
# Double-check the FORHP list of census tracts that have rural characteristics but that are
# contained within a metro county.

In [None]:
import pandas as pd
import geopandas as gpd

In [None]:
# Comparisons and creation of GeoDataFrame and shapefile for rural census tracts, both types.

In [None]:
# Folder containing the tract-level CSV inputs used by this comparison.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is kept because
# other cells may refer to it.
dir = '~/notebooks/InfoGroup/rurality/points-in-polygons/data/'

# dtype=object keeps every column as text, so tract IDs are not parsed as numbers.
hrsa_rural, spatial_rural, all_tracts = (
    pd.read_csv(dir + csv_name, dtype=object)
    for csv_name in ('rural_HRSA_updated_tracts.csv',
                     'rural_spatial_tracts.csv',
                     'all_tracts.csv')
)
# Alternative source for all_tracts (the 2017 census shapefile), currently disabled:
#all_tracts = gpd.read_file(dir + '../../map_files/tracts/tl_2017_USA_tract.shp',dtype=object)

In [None]:
# Row counts of the three input tables, one per line.
print(len(hrsa_rural), len(spatial_rural), len(all_tracts), sep='\n')

In [None]:
# Flag every row of the HRSA list. The flag is the string '1' (not the integer 1), which is
# what the later filters compare against.
hrsa_rural = hrsa_rural.assign(rural_HRSA_updated_tract='1')

In [None]:
# Number of rows in all_tracts whose spatial-rural flag equals '1'.
int((all_tracts['rural_spatial_tract'] == '1').sum())

In [None]:
# Count how many tract geometries are MULTIPOLYGONs (n) versus everything else (m).
# all_tracts was read from CSV with dtype=object, so each geometry value is text and a plain
# substring test is enough.
# Series.iteritems() was removed in pandas 2.0, so the row loop is replaced by a vectorized
# test that works on old and new pandas alike. regex=False makes it a literal substring match;
# na=False counts a missing geometry as "not MULTIPOLYGON" instead of raising a TypeError.
is_multipolygon = all_tracts['geometry'].str.contains('MULTIPOLYGON', regex=False, na=False)
n = int(is_multipolygon.sum())
m = len(is_multipolygon) - n
print(str(n),str(m))
# This is why we can't output a shapefile from all_tracts or any merged file below:
# some MULTIPOLYGONs among the POLYGONs.

In [None]:
# Outer-join the two rural tract lists on tract ID (GEOID in the spatial list, Tract in the
# HRSA list). indicator=True adds a `_merge` column recording which list(s) each row came from.
rurals_merge = pd.merge(
    spatial_rural,
    hrsa_rural,
    how='outer',
    left_on='GEOID',
    right_on='Tract',
    indicator=True,
)

In [None]:
# Tally of rows found in both lists, only the spatial list (left_only), or only the HRSA
# list (right_only).
rurals_merge['_merge'].value_counts()
# 27384 of 74027 [unique tract numbers in RUCA list (2010) and shapefile (2017)] = 37.0%

In [None]:
# all_tracts and spatial_rural variables come from the census shapefile and include geometry.
# hrsa_rural comes from the RUCA file and does not include geometry:
#          ruca_df = pd.read_csv('data/ruca2010revised.csv', dtype=object)
# Now create a single frame with all the variables and both flags.

In [None]:
# Load the RUCA table with every column as text, then report its row count and column names.
ruca_df = pd.read_csv('/InfoGroup/data/rurality/ruca2010revised.csv', dtype=object)
print(len(ruca_df), ruca_df.columns, sep='\n')

In [None]:
# Preview the first rows of the RUCA table.
ruca_df.head()

In [None]:
# The census shapefile for 2017 has 73,056 tracts; the RUCA file is from 2010.
# Row count of the RUCA table, for comparison.
ruca_df.shape[0]

In [None]:
# GEOID is the tract ID from the shapefile; "Tract" is the tract ID in the RUCA file.
# Outer-join the two tables and tally the `_merge` indicator to see how much they overlap.
# A separate file for rural tracts only is created afterwards.
all_tracts_merged = pd.merge(
    ruca_df,
    all_tracts,
    how='outer',
    left_on='Tract',
    right_on='GEOID',
    indicator=True,
)
all_tracts_merged['_merge'].value_counts()

In [None]:
# List the columns of the combined RUCA + shapefile table.
all_tracts_merged.columns

In [None]:
# Remove the earlier merge indicator so the merge below can add its own `_merge` column.
# Rebinding with errors='ignore' (instead of inplace=True) lets this cell be re-run without
# a KeyError once the column is already gone.
all_tracts_merged = all_tracts_merged.drop(columns='_merge', errors='ignore')
# Attach the HRSA rural flag by tract ID; rows with no match in the HRSA list get NaN.
all_tracts_2017 = all_tracts_merged.merge(hrsa_rural[['Tract','rural_HRSA_updated_tract']],
                                         how='outer',on='Tract',indicator=True)
# Tally how many rows matched the HRSA list.
all_tracts_2017['_merge'].value_counts()

In [None]:
# Check the column dtypes of the combined table.
all_tracts_2017.dtypes

In [None]:
# Drop the merge indicator, which is only needed for the overlap tally. Rebinding with
# errors='ignore' (instead of inplace=True) keeps the cell safe to re-run after the column
# has already been removed.
all_tracts_2017 = all_tracts_2017.drop(columns='_merge', errors='ignore')

In [None]:
# Since we can't write a shapefile from geopandas from this file with both MULTIPOLYGONs and
# POLYGONs, we'll have to see if ArcMap can read in a csv file.
# The DataFrame index is written too, as an unnamed first column (to_csv default).
all_tracts_2017.to_csv('~/notebooks/InfoGroup/rurality/map_files/all_tracts_2017.csv')

In [None]:
# Select just the rural tracts from the USA shapefile. Maybe that will work in ArcMap.
# NOTE(review): the shapefile path is tied to one user's home directory — confirm it before
# running this anywhere else.
usa_shapefile = '/home/tflory/notebooks/InfoGroup/rurality/map_files/USA_census_tracts.shp'
usa_gdf = gpd.read_file(usa_shapefile,driver='ESRI Shapefile')
# Read the CSV back through the same '~'-relative path it is written to by to_csv, so the
# writer and this reader cannot point at different files when the home directory is not
# /home/tflory. Only the tract ID and the two rural flags are needed, all as text.
all_tracts_2017_df = pd.read_csv('~/notebooks/InfoGroup/rurality/map_files/all_tracts_2017.csv',
                                usecols=['Tract','rural_spatial_tract','rural_HRSA_updated_tract'],
                                dtype=object)

In [None]:
# Inspect both inputs before joining them: the column names of each, then a preview of the
# shapefile rows.
print(usa_gdf.columns, all_tracts_2017_df.columns, usa_gdf.head(), sep='\n')

In [None]:
# Outer-join the shapefile rows (keyed by FIPS) with the rural flags (keyed by Tract).
# indicator=True adds a `_merge` column showing which side(s) each row came from.
usa_gdf_merged = usa_gdf.merge(all_tracts_2017_df,left_on='FIPS',right_on='Tract',how='outer',
                              indicator=True)
# Tally matched versus unmatched rows.
usa_gdf_merged['_merge'].value_counts()

In [None]:
# Drop the merge indicator now that the overlap has been inspected. Rebinding with
# errors='ignore' (instead of inplace=True) keeps the cell safe to re-run after the column
# has already been removed.
usa_gdf_merged = usa_gdf_merged.drop(columns='_merge', errors='ignore')
usa_gdf_merged.columns

In [None]:
# Keep the rows flagged '1' by either rural definition (HRSA list or spatial flag).
is_hrsa_rural = usa_gdf_merged['rural_HRSA_updated_tract'] == '1'
is_spatial_rural = usa_gdf_merged['rural_spatial_tract'] == '1'
usa_rural_tracts = gpd.GeoDataFrame(usa_gdf_merged[is_hrsa_rural | is_spatial_rural],
                                    dtype=object)

In [None]:
# Check the column dtypes of the rural-only GeoDataFrame.
usa_rural_tracts.dtypes

In [None]:
# Rows with missing geometry have to be removed. Print the row count before, the number of
# rows lacking geometry, and the row count after.
geometry_missing = usa_rural_tracts['geometry'].isnull()
print(len(usa_rural_tracts))
print(int(geometry_missing.sum()))
usa_rural_tracts.dropna(subset=['geometry'], inplace=True)
print(len(usa_rural_tracts))

In [None]:
# Write the rural tracts out as an ESRI shapefile.
# (A crs_wkt='epsg4326' argument is left disabled on the call below.)
rural_shapefile = '/home/tflory/notebooks/InfoGroup/rurality/map_files/usa_rural_tracts.shp'
usa_rural_tracts.to_file(rural_shapefile, driver='ESRI Shapefile') # crs_wkt='epsg4326')

In [None]:
# End comparisons, etc. 

In [None]:
# This comes from the HRSA website:
# https://www.hrsa.gov/sites/default/files/hrsa/ruralhealth/aboutus/definition/nonmetrocountiesandcts2016.xlsx
# It lists more census tracts than are listed as rural-within-metro counties in the pdf:
# https://www.hrsa.gov/sites/default/files/hrsa/ruralhealth/resources/forhpeligibleareas.pdf
#
# We're using this slightly larger list because the filename implies that it is 3 years more
# recent, although the text on the website says 2013. If this is not an updated list, it is
# unclear why the two lists should be different. But this one is more comprehensive and
# systematically presented as well as, probably, more recent, so we're using this one.
#
# The CSV is read with dtype=object, so every column stays text.
df = pd.read_csv('/InfoGroup/data/rurality/nonmetrocountiesandcts2016.csv', dtype=object)

In [None]:
# Summarize the columns and non-null counts of the HRSA list.
df.info()

In [None]:
# Keep the rows that carry a census tract value (CT): the rural census tracts in metropolitan
# counties. Tracts in micropolitan counties are not included. Also not included are the rural
# census tracts in metropolitan counties in which ALL census tracts are rural by the
# HRSA/FORHP reckoning; there are 28 such counties (see 'data/nonmetrocountiesandcts2016.xlsx').
# The data source used here does not list those tracts individually. They can be added later
# by getting the individual tract IDs from the overall list and filtering by county. For the
# purposes of this analysis they are not useful, since we're interested in rural census tracts
# that interact with vibrant or substantial urban areas.
rural_tracts_df = df[df['CT'].notnull()]
rural_tracts_df.shape[0]

In [None]:
# Preview the first rows of the tract-level selection.
rural_tracts_df.head()

In [None]:
# How many counties are represented? Count the distinct county FIPS codes
# (dropna=False so a missing code would still be counted as one value).
rural_tracts_df['CTY FIPS'].nunique(dropna=False)

In [None]:
# Pull out the rows whose county name is 'Dane'.
in_dane = rural_tracts_df['CountyName'] == 'Dane'
dane_df = rural_tracts_df.loc[in_dane]

In [None]:
# Display the rows selected for 'Dane'.
dane_df