# Water Quality: Data Wrangle

In [30]:
import pandas as pd
import geopandas as gpd
import numpy as np
import geopy
import re
import matplotlib.pyplot as plt
from string import punctuation
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
import warnings

In [31]:
# Load the 'Initial' sheet of the workbook. The three reading columns are read
# as text because they contain censored values such as "<1.0" that must be
# cleaned before they can be converted to numbers.
wdf = pd.read_excel(
    'data/water_qual.xlsx',
    sheet_name='Initial',
    header=2,                 # column names sit on the third row (0-based index 2)
    skipfooter=6,             # drop the last six rows of the sheet
    usecols=list(range(5)),   # first five columns only
    dtype={column: 'str' for column in ('1st Draw', '2-3 Minute', '5 Minute')},
)
wdf.head(3)

Unnamed: 0,Date Sampled,Address,1st Draw,2-3 Minute,5 Minute
0,2020-01-26 07:42:00,X N Bishop St,<1.0,<1.0,<1.0
1,2016-11-08 05:30:00,XX N Lasalle St,<1,<1,<1
2,2019-11-07 06:18:00,X W Brayton St,<1.0,6.9,<1.0


In [32]:
# Columns holding the three lead readings (ppb) taken for each initial test.
sample_cols = ['1st Draw', '2-3 Minute', '5 Minute']

print(f'The earliest sampling date for an initial test is {wdf["Date Sampled"].min()} ')
print(f'and the most recent sampling date is {wdf["Date Sampled"].max()}.')
print()
print(f'There are a total of {wdf.shape[0]} rows in the initial data set.')

# Replace any "<1.0" or similar censored value with "1.0".
# Raw string: "\s" and "\." are regex escapes, not Python string escapes.
# NOTE(review): the pattern also matches values such as "<10" or "<15"
# (anything starting with "<0" or "<1") -- confirm none occur in the sheet.
wdf = wdf.replace(to_replace=r"^<\s?[01]\.?.*", value="1.0", regex=True)
# Drop any NAs in first column - must decide whether to drop NAs from subsequent columns.
wdf = wdf.dropna(subset=["1st Draw"])
# Remove observations that merely state a residence has been referred to addl. testing
# (i.e. rows whose first-draw entry starts with a non-digit character).
is_text_note = wdf['1st Draw'].str.contains(r"^\D", regex=True)
wdf = wdf.drop(index=wdf[is_text_note].index)
# Convert to float. `astype` with a column mapping returns a frame with new
# float64 columns; assigning through `.loc[:, cols] = ...` writes into the
# existing object-dtype columns on pandas >= 2.0 and leaves them as object,
# which breaks the numeric summaries below.
wdf = wdf.astype({column: 'float64' for column in sample_cols})
# Replace any observations that were float <1.0 with 1.0 (missing values stay missing).
wdf[sample_cols] = wdf[sample_cols].clip(lower=1.0)

# NOTE(review): the figure printed here is the number of remaining rows; the
# addresses are not de-duplicated, so "unique residences" is an assumption.
print(f'These represent initial test samples from {wdf.shape[0]} unique residences, ')
print('after removing observations that mark incorrectly taken samples and ')
print('subsequent testing results.')
print()
print('Summary statistics for the 1st sample drawn in initial testing:')
print(wdf['1st Draw'].describe())
print()
print('Summary statistics for the 2nd sample (2-3 Minute Mark) drawn in initial testing:')
print(wdf['2-3 Minute'].describe())
print()
print('Summary statistics for the final sample (5 Minute Mark) drawn in initial testing:')
print(wdf['5 Minute'].describe())

# Flag households where any of the three readings is at or above 15 ppb.
wdf["Threshold"] = (wdf[sample_cols] >= 15).any(axis=1)
n_over_threshold = wdf["Threshold"].sum()
pct_over_threshold = round(n_over_threshold / wdf.shape[0] * 100, 2)
print()
print(f'A total of {n_over_threshold} (' +
      f'{pct_over_threshold}' +
      '%) households had at least one ')
print('water sample test contain 15.0 or more lead ppb in initial testing.')

The earliest sampling date for an initial test is 2016-01-27 00:00:00 
and the most recent sampling date is 2021-01-07 13:00:00.

There are a total of 23882 rows in the initial data set.
These represent initial test samples from 22668 unique residences, 
after removing observations that mark incorrectly taken samples and 
subsequent testing results.

Summary statistics for the 1st sample drawn in initial testing:
count    22668.000000
mean         3.640967
std         13.724032
min          1.000000
25%          1.000000
50%          2.000000
75%          3.800000
max        730.000000
Name: 1st Draw, dtype: float64

Summary statistics for the 2nd sample (2-3 Minute Mark) drawn in initial testing:
count    22667.000000
mean         4.112260
std          6.835936
min          1.000000
25%          1.000000
50%          2.200000
75%          5.400000
max        460.000000
Name: 2-3 Minute, dtype: float64

Summary statistics for the final sample (5 Minute Mark) drawn in initial testing:
c

In [33]:
# Cleaning addresses for geocoding.
# Masked house numbers are turned into concrete ones, e.g.
#   "XX N Lasalle St" -> "10 N Lasalle St", "X N Bishop St" -> "1 N Bishop St",
#   and any remaining "XX" becomes "00".
# The result is assigned back to the column: calling `.replace(..., inplace=True)`
# on `wdf.Address` operates on a selection of the frame, which warns on
# pandas 2.x and does not update `wdf` at all under Copy-on-Write.
# NOTE(review): the last rule replaces the digit together with the X
# (e.g. "5X" -> "0") -- confirm that dropping the digit is intended.
wdf['Address'] = wdf['Address'].replace(
    {r'^XX\s': '10 ', r'^X\s': '1 ', 'XX': '00', '[0-9]X': '0'},
    regex=True,
)
# Trim leading/trailing punctuation characters (whitespace is left untouched).
wdf['Address'] = wdf['Address'].str.strip(punctuation)
# Append city/state/country so the geocoder resolves the street within Chicago.
wdf['Address'] = wdf['Address'] + ', Chicago, IL, USA'

In [34]:
# Geocoding Addresses

# Geocoding the 20,000+ observations takes a long time (the geocoder below is
# rate-limited to one request per second). Load data/wdf_points.csv instead to
# continue working with the already-geocoded dataset.

# locator = Nominatim(user_agent="myGeocoder")
# location = locator.geocode("Chicago, IL, USA")

# geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# wdf['location'] = wdf['Address'].apply(geocode)
# wdf['point'] = wdf['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# wdf[['latitude', 'longitude', 'altitude']] = pd.DataFrame(wdf['point'].tolist(), index=wdf.index)
# wdf.drop(columns=['altitude', 'point'], inplace=True)

# wdf.to_csv('data/wdf_points.csv')

In [35]:
# Reload the previously geocoded data set; the first CSV column is the index.
wdf = pd.read_csv('data/wdf_points.csv', index_col=[0])

# Per-household summary of the three readings.
readings = wdf[["1st Draw", "2-3 Minute", '5 Minute']]
wdf["avg_reading"] = readings.mean(axis=1)
wdf["max_reading"] = readings.max(axis=1)

# 0/1 indicator columns: highest reading at or above 15 (t_high) and 5 (t_med).
for flag_column, cutoff in (("t_high", 15), ("t_med", 5)):
    wdf[flag_column] = np.where(wdf['max_reading'] >= cutoff, 1, 0)

# Manually cleaning some observations: hand-entered (longitude, latitude)
# overrides, keyed by the exact address string stored in the data set.
manual_coordinates = {
    "1800 W Chicago Ave , Chicago, IL, USA": (-87.67228848673774, 41.896391737686976),
    "1100 W 17th St, Chicago, IL, USA": (-87.65310150616693, 41.862919274995626),
    "1600 N 18th St , Chicago, IL, USA": (-87.85520742698857, 41.906865692166896),
    "2400 W Harrison Ave, Chicago, IL, USA": (-87.68634655975251, 41.87425757200666),
    "800 N Elizabeth St , Chicago, IL, USA": (-87.65992222906638, 41.89661964867207),
}
for address, (fixed_longitude, fixed_latitude) in manual_coordinates.items():
    at_address = wdf["Address"] == address
    wdf.loc[at_address, ["longitude"]] = fixed_longitude
    wdf.loc[at_address, ["latitude"]] = fixed_latitude

In [36]:
# Converting to geopandas dataframe.
# Each row becomes a point built from its longitude (x) and latitude (y).
# The coordinates are decimal degrees, so the CRS is declared as WGS 84
# (EPSG:4326); without it the layer carries no projection information and
# GIS tools cannot place or reproject it reliably.
# NOTE(review): assumes the geocoder returned WGS 84 coordinates -- confirm.
wdf_g = gpd.GeoDataFrame(
    wdf,
    geometry=gpd.points_from_xy(wdf.longitude, wdf.latitude),
    crs="EPSG:4326",
)

In [37]:
# Export the geocoded points so they can be used outside the notebook.
# NOTE(review): the Shapefile format limits field names to 10 characters, so
# longer column names (e.g. 'avg_reading') are likely truncated on write --
# check the attribute names in the exported file.
wdf_g.to_file(filename='data/water_quality_clean.shp')

  wdf_g.to_file('data/water_quality_clean.shp')
