## Determine if the water system is rural or urban

This notebook determines whether each public water system (PWS) is urban (>=50% of the system's area falls within a Census-defined urban area) or rural (<50% of the system's area falls within a Census-defined urban area). The end result of this notebook is a DataFrame with the water system ID (PWID) and whether it is rural or urban, which is exported to CSV. This information will be used in the regression analysis.

In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np

In [3]:
# Read the Census urban-area polygons from the TIGER shapefile
urban_area_path = '../data/TIGER/tl_2016_us_uac10.shp'
urban_area = gpd.read_file(urban_area_path)

# Preview the first rows to check which columns were loaded
urban_area.head()

Unnamed: 0,UACE10,GEOID10,NAME10,NAMELSAD10,LSAD10,MTFCC10,UATYP10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,geometry
0,24310,24310,"Dixon, IL","Dixon, IL Urban Cluster",76,G3500,C,S,25524689,938058,41.8529507,-89.4817439,"POLYGON ((-89.498589 41.854668, -89.498538 41...."
1,27847,27847,"Escanaba, MI","Escanaba, MI Urban Cluster",76,G3500,C,S,46488558,283456,45.7274565,-87.0824457,"POLYGON ((-87.120975 45.79325499999999, -87.12..."
2,18100,18100,"Clintonville, WI","Clintonville, WI Urban Cluster",76,G3500,C,S,5854721,502397,44.6232203,-88.7611283,"POLYGON ((-88.78650499999999 44.629957, -88.78..."
3,6166,6166,"Bedford, IN","Bedford, IN Urban Cluster",76,G3500,C,S,30403132,2314,38.856653,-86.5012383,"(POLYGON ((-86.518316 38.79547, -86.518253 38...."
4,75270,75270,"Riverdale, CA","Riverdale, CA Urban Cluster",76,G3500,C,S,2306823,0,36.431071,-119.8620544,"POLYGON ((-119.869132 36.430832, -119.870931 3..."


In [7]:
# Read the NJ public water system (PWS) layer
pws_path = '../data/NJDEP/NJPWS.shp'
pws_geodata = gpd.read_file(pws_path)

In [9]:
# Put both layers in the same coordinate system before overlaying them.
# The urban areas are reprojected to whatever CRS the PWS layer carries
# (rather than to a hard-coded EPSG code), so the two layers cannot drift
# apart if the PWS source file ever changes. All later area calculations
# are done in this CRS.
urban_area = urban_area.to_crs(pws_geodata.crs)

# Fail loudly on a mismatch instead of relying on someone reading the printout
assert urban_area.crs == pws_geodata.crs, "urban_area and pws_geodata CRS differ"
print(urban_area.crs, pws_geodata.crs)

{'init': 'epsg:3424'} {'init': 'epsg:3424'}


In [10]:
# Overlay the PWS polygons with the urban areas, keeping only the parts
# of the water systems that fall inside an urban area
urban_pws_intersection = gpd.overlay(
    df1=pws_geodata,
    df2=urban_area,
    how='intersection',
)

In [11]:
# Area of every intersection piece, in the units of the layer's CRS
intersection_areas = urban_pws_intersection.geometry.area
urban_pws_intersection['urban_area'] = intersection_areas

# Only the system ID, the urban area and the geometry are needed from here on
keep_columns = ['PWID', 'urban_area', 'geometry']
urban_pws_int = urban_pws_intersection[keep_columns]
urban_pws_int.head()

Unnamed: 0,PWID,urban_area,geometry
0,NJ0314001,9054935.0,"POLYGON ((429152.770478677 476819.609905608, 4..."
6,NJ0307002,5953241.0,"POLYGON ((441972.9741689265 483150.4371389374,..."
102,NJ0303001,212909000.0,(POLYGON ((436626.6899140105 491361.7796401083...
120,NJ0318002,17482160.0,"POLYGON ((422943.0493023433 451971.7819766924,..."
264,NJ0323001,21110250.0,(POLYGON ((445841.6839511767 455734.1146751046...


In [12]:
# Collapse the intersection pieces to one row per PWID, summing urban_area,
# then move PWID back from the index into a regular column
urban_pws_int = (
    urban_pws_int
    .dissolve(by='PWID', aggfunc='sum')
    .reset_index()
)
urban_pws_int.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 487 entries, 0 to 486
Data columns (total 3 columns):
PWID          487 non-null object
geometry      487 non-null object
urban_area    487 non-null float64
dtypes: float64(1), object(2)
memory usage: 11.5+ KB


In [13]:
# Attach the per-system urban totals to the full list of water systems.
# Both inputs carry a 'geometry' column, so the merged frame holds them as
# geometry_x (PWS layer) and geometry_y (dissolved intersection).
pws_simple = pws_geodata[['PWID', 'geometry']]
pws_urban_rural = pd.merge(
    left=pws_simple,
    right=urban_pws_int,
    on=['PWID'],
    how='outer',
    validate="one_to_one",
)

# Check the row count and how many systems have no urban match
pws_urban_rural.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 574 entries, 0 to 573
Data columns (total 4 columns):
PWID          574 non-null object
geometry_x    574 non-null object
geometry_y    487 non-null object
urban_area    487 non-null float64
dtypes: float64(1), object(3)
memory usage: 22.4+ KB


In [14]:
# Water systems that did not intersect any urban area have no row in the
# intersection table, so their urban_area is NaN after the merge; treat
# that as zero urban area.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place form acts on an intermediate Series and
# does not update the DataFrame when pandas Copy-on-Write is in effect.
pws_urban_rural['urban_area'] = pws_urban_rural['urban_area'].fillna(0)

# Confirm that urban_area no longer contains nulls
pws_urban_rural.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 574 entries, 0 to 573
Data columns (total 4 columns):
PWID          574 non-null object
geometry_x    574 non-null object
geometry_y    487 non-null object
urban_area    574 non-null float64
dtypes: float64(1), object(3)
memory usage: 22.4+ KB


In [15]:
# Keep only the PWS geometry: discard the intersection geometry
# (geometry_y) and give geometry_x its plain 'geometry' name back
pws_urban_rural = (
    pws_urban_rural
    .drop(columns='geometry_y')
    .rename(columns={'geometry_x': 'geometry'})
)

pws_urban_rural.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 574 entries, 0 to 573
Data columns (total 3 columns):
PWID          574 non-null object
geometry      574 non-null object
urban_area    574 non-null float64
dtypes: float64(1), object(2)
memory usage: 17.9+ KB


In [16]:
# Total area of each water system polygon
pws_total_area = pws_urban_rural.geometry.area
pws_urban_rural['pws_area'] = pws_total_area

In [17]:
# Share of each system's area that is urban. Despite the column name, the
# value is a plain ratio (urban_area / pws_area), not multiplied by 100.
urban_share = pws_urban_rural['urban_area'].div(pws_urban_rural['pws_area'])
pws_urban_rural['perc_urban'] = urban_share

In [18]:
# Label each PWS as urban or rural.
# The definition stated at the top of this notebook is urban when >= 50% of
# the area is urban and rural when < 50%, so the comparison has to be
# inclusive: a system at exactly 0.50 is urban, not rural.
URBAN_SHARE_THRESHOLD = 0.50
pws_urban_rural['urban_rural'] = np.where(
    pws_urban_rural['perc_urban'] >= URBAN_SHARE_THRESHOLD, 'urban', 'rural'
)

In [19]:
# Keep just the system ID and its classification; the geometry and the
# intermediate area columns are not needed in the exported table
unneeded_columns = ['geometry', 'pws_area', 'urban_area', 'perc_urban']
pws_urban_rural = pws_urban_rural.drop(columns=unneeded_columns)
pws_urban_rural.head()

Unnamed: 0,PWID,urban_rural
0,NJ0314001,urban
1,NJ0320002,urban
2,NJ0301001,rural
3,NJ0307002,urban
4,NJ0325001,rural


In [20]:
# Write the PWID / urban_rural table to disk, leaving out the index column
output_csv = '../data/Created/SpatialAnalysis/pws_urban_rural.csv'
pws_urban_rural.to_csv(output_csv, index=False)