*This notebook assigns county demographic data to the New Jersey water systems based on the primary county in which the water system is located.*

In [97]:
import requests
import pandas as pd
import geopandas as gpd
import io
import numpy as np

## Explore and clean file containing water system characteristics and violation data
See sdwis_data.ipynb for more information on this dataset.

In [98]:
sdwa_vios = pd.read_csv('../data/Created/SDWIS/sdwa_vios_complete.csv')
sdwa_vios = sdwa_vios.rename(columns={'PWSID': "PWID"})

In [99]:
# Convert violation fields to integer type
sdwa_vios['health_violations'] = sdwa_vios['health_violations'].astype('int64')
sdwa_vios['all_violations'] = sdwa_vios['all_violations'].astype('int64')

In [100]:
# If there are multiple counties, select the first one
def select_first_word(county_list):
    return county_list.split(',')[0]

sdwa_vios['county'] = sdwa_vios['COUNTIES_SERVED'].apply(select_first_word)

In [101]:
sdwa_vios.head()

Unnamed: 0,PWID,PRIMARY_SOURCE_CODE,OWNER_TYPE_CODE,SERVICE_CONNECTIONS_COUNT,COUNTIES_SERVED,all_violations,health_violations,primary_source,owner_type,county
0,NJ1432001,GW,L,10,Morris,30,0,Ground water,Local Government,Morris
1,NJ1435002,GW,L,4350,Morris,5,0,Ground water,Local Government,Morris
2,NJ1503001,GW,L,2253,Ocean,1,1,Ground water,Local Government,Ocean
3,NJ1522001,GW,L,933,Ocean,1,0,Ground water,Local Government,Ocean
4,NJ1005001,GW,L,4600,Hunterdon,4,0,Ground water,Local Government,Hunterdon


## Join Census county data to water system data

In [102]:
# Open Census data
cen_data_county = pd.read_csv('../data/Created/census_county.csv')
cen_data_county.head()

Unnamed: 0,total_pop,white_pop,pov_pop,state,county
0,627551,473037,47055,34,25
1,373362,189649,40450,34,21
2,143570,125183,7573,34,37
3,154952,73427,26781,34,11
4,333316,192408,15745,34,35


In [103]:
# Add columns showing percentage of people under the poverty line and percentage people of color
cen_data_county['POC_pop'] = cen_data_county['total_pop'] - cen_data_county['white_pop']
cen_data_county['perc_POC'] = (cen_data_county['POC_pop'] / cen_data_county['total_pop']) * 100
cen_data_county['perc_pov'] = (cen_data_county['pov_pop'] / cen_data_county['total_pop']) * 100

In [104]:
cen_data_county.head()

Unnamed: 0,total_pop,white_pop,pov_pop,state,county,POC_pop,perc_POC,perc_pov
0,627551,473037,47055,34,25,154514,24.621744,7.498195
1,373362,189649,40450,34,21,183713,49.205061,10.83399
2,143570,125183,7573,34,37,18387,12.806993,5.274779
3,154952,73427,26781,34,11,81525,52.613067,17.283417
4,333316,192408,15745,34,35,140908,42.274598,4.723746


In [105]:
# Add county names to the Census data for joining
county_fips_codes = {'1': 'Atlantic',
                    '3': 'Bergen',
                    '5': 'Burlington',
                    '7': 'Camden',
                    '9': 'Cape May',
                    '11': 'Cumberland',
                    '13': 'Essex',
                    '15': 'Gloucester',
                    '17': 'Hudson',
                    '19': 'Hunterdon',
                    '21': 'Mercer',
                    '23': 'Middlesex',
                    '25': 'Monmouth',
                    '27': 'Morris',
                    '29': 'Ocean',
                    '31': 'Passaic',
                    '33': 'Salem',
                    '35': 'Somerset',
                    '37': 'Sussex',
                    '39': 'Union',
                    '41': 'Warren'}

In [106]:
# Function to convert the county FIPS code to county name
def cfc_to_name(code):
    return county_fips_codes[str(code)]

cen_data_county['county'] = cen_data_county['county'].apply(cfc_to_name).astype('object')

In [107]:
# Join the two datasets
pws_cen_county = pd.merge(sdwa_vios, cen_data_county, 
                          on='county', how='left', validate='many_to_one')

In [108]:
pws_county_complete[pws_county_complete.total_pop.isna()]

Unnamed: 0,PWID,geometry,PRIMARY_SOURCE_CODE,OWNER_TYPE_CODE,SERVICE_CONNECTIONS_COUNT,COUNTIES_SERVED,all_violations,health_violations,primary_source,owner_type,...,total_pop,white_pop,pov_pop,state,POC_pop,perc_POC,perc_pov,perc_urban,perc_rural,urban_rural


## Join new DataFrame to public water system spatial file

In [109]:
# Import NJ public water system geospatial data
pws_geodata = gpd.read_file('../data/NJDEP/New_Jersey__Public_Community_Water_Purveyor_Service_Areas.shp')

In [110]:
# Join the two datasets
pws_county_geodata = pd.merge(pws_geodata[['PWID', 'geometry']], 
                              pws_cen_county, on='PWID' )

## Join rural/urban data to the new file and save 

In [111]:
urban_rural = pd.read_csv('../data/Census/DEC_10_SF1_H2_with_ann.csv')

In [112]:
# Reassign header names
urban_rural.columns = urban_rural.iloc[0]

In [113]:
urban_rural = urban_rural.iloc[1:]
urban_rural.head()

Unnamed: 0,Id,Id2,Geography,Total:,Urban:,Urban: - Inside urbanized areas,Urban: - Inside urban clusters,Rural,Not defined for this file
1,0500000US34001,34001,"Atlantic County, New Jersey",126647,112952,107877,5075,13695,0
2,0500000US34003,34003,"Bergen County, New Jersey",352388,352067,352067,0,321,0
3,0500000US34005,34005,"Burlington County, New Jersey",175615(r22678),164536,152961,11575,11079,0
4,0500000US34007,34007,"Camden County, New Jersey",204943(r22684),201571,201551,20,3372,0
5,0500000US34009,34009,"Cape May County, New Jersey",98309,89379,89379,0,8930,0


In [114]:
# Function for simplifying county name
def remove_last_three_words(test):
    return test.rsplit(' ', 3)[0]

In [115]:
# Cleaning/reorganizing data
urban_rural['county'] = urban_rural['Geography'].apply(remove_last_three_words)
urban_rural['urban'] = urban_rural['Urban:'].astype('int64')
urban_rural['rural'] = urban_rural['Rural'].astype('int64')

urban_rural['perc_urban'] = urban_rural['urban'] / (urban_rural['rural'] + urban_rural['urban'])
urban_rural['perc_rural'] =  urban_rural['rural'] / (urban_rural['rural'] + urban_rural['urban'])

In [116]:
urban_rural_clean = urban_rural[['county','perc_urban','perc_rural']]
urban_rural_clean.head()

Unnamed: 0,county,perc_urban,perc_rural
1,Atlantic,0.891865,0.108135
2,Bergen,0.999089,0.000911
3,Burlington,0.936913,0.063087
4,Camden,0.983547,0.016453
5,Cape May,0.909164,0.090836


In [117]:
# Add a variable to identify the PWS as urban or rural
urban_rural_clean['urban_rural'] = np.where(urban_rural_clean['perc_urban']> 0.50, 'urban', 'rural')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [118]:
# Join the datasets
pws_county_complete = pd.merge(pws_county_geodata, urban_rural_clean, 
                               on='county', how='left', validate='many_to_many')

In [119]:
pws_county_complete.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 456 entries, 0 to 455
Data columns (total 21 columns):
PWID                         456 non-null object
geometry                     456 non-null object
PRIMARY_SOURCE_CODE          456 non-null object
OWNER_TYPE_CODE              456 non-null object
SERVICE_CONNECTIONS_COUNT    456 non-null int64
COUNTIES_SERVED              456 non-null object
all_violations               456 non-null int64
health_violations            456 non-null int64
primary_source               456 non-null object
owner_type                   456 non-null object
county                       456 non-null object
total_pop                    456 non-null float64
white_pop                    456 non-null float64
pov_pop                      456 non-null float64
state                        456 non-null float64
POC_pop                      456 non-null float64
perc_POC                     456 non-null float64
perc_pov                     456 non-null float64
p

In [120]:
# Export to GeoJSON
# First, need to upcast everything into the "multi-polygon" type

from shapely import geometry
upcast_dispatch = {geometry.Point: geometry.MultiPoint, 
                   geometry.LineString: geometry.MultiLineString, 
                   geometry.Polygon: geometry.MultiPolygon}

def maybe_cast_to_multigeometry(geom):
    caster = upcast_dispatch.get(type(geom), lambda x: x[0])
    return caster([geom])

pws_county_complete['geometry'] = pws_county_complete['geometry'].apply(maybe_cast_to_multigeometry)

In [121]:
pws_county_complete.to_file('../data/Created/SpatialAnalysis/county_vios.geojson', driver = 'GeoJSON')