Water Quality Data Cleaning Module

In [2]:
import pandas as pd
import geopandas as gpd
import geopy
import re
import matplotlib.pyplot as plt
from string import punctuation
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

In [16]:
wdf = pd.read_excel('data/water_qual.xlsx', header=2, sheet_name='Initial', 
                    usecols=[0, 1, 2, 3, 4], skipfooter=6, 
                    dtype={'1st Draw':"str", '2-3 Minute': 'str', '5 Minute': 'str'})
wdf.head(3)

Unnamed: 0,Date Sampled,Address,1st Draw,2-3 Minute,5 Minute
0,2020-01-26 07:42:00,X N Bishop St,<1.0,<1.0,<1.0
1,2016-11-08 05:30:00,XX N Lasalle St,<1,<1,<1
2,2019-11-07 06:18:00,X W Brayton St,<1.0,6.9,<1.0


In [17]:
# Sequential data
wdf_s = pd.read_excel('data/water_qual.xlsx', header=2, sheet_name='Sequential', 
                    usecols=[0, 1, 2, 12, 13], skipfooter=6, 
                    dtype={'1st Draw':"str", '3 Minute': 'str', '5 Minute': 'str'})

In [18]:
# Data Exploration and Cleaning
print(f'Date ranges for data are: {wdf["Date Sampled"].min()} to {wdf["Date Sampled"].max()}')
print(f'There are {wdf.shape[0]:,} observations in the dataset.')

# Replace any "<1.0" or similar with 1.0
wdf.replace(to_replace="^<\s?[01]\.?.*", value="1.0", inplace=True, regex=True)

wdf.dropna(subset=["1st Draw"], inplace=True)

# Remove observations that merely state a residence has been referred to addl. testing
wdf.drop(wdf[wdf['1st Draw'].str.contains("^\D", regex=True)].index.values, inplace=True)

# Convert to float
wdf.loc[:, '1st Draw': '5 Minute'] = (wdf.loc[:,'1st Draw':'5 Minute'].astype('float64'))

# Replace any observations that were float <1.0 with 1.0. 
wdf.loc[:,'1st Draw':'5 Minute'] = (wdf.loc[:,'1st Draw':'5 Minute']
    .mask(wdf.loc[:,'1st Draw':'5 Minute'] < 1, other=1.0))

Date ranges for data are: 2016-01-27 00:00:00 to 2021-01-07 13:00:00
There are 23,882 observations in the dataset.


In [19]:
FLprint('Summary statistics for the 1st sample drawn in initial testing:')
print(wdf['1st Draw'].describe())
print()
print('Summary statistics for the 2nd sample (2-3 Minute Mark) drawn in initial testing:')
print(wdf['2-3 Minute'].describe())
print()
print('Summary statistics for the final sample (5 Minute Mark) drawn in initial testing:')
print(wdf['5 Minute'].describe())

wdf["Threshold"] = (wdf.loc[:,'1st Draw':'5 Minute'] >= 15).any(axis=1)
print()
print(f'A total of {wdf["Threshold"].sum():,} (' + 
      f'{round(wdf["Threshold"].sum() / wdf.shape[0] * 100, 2)}' + 
      f'%) observations had at least one ')
print(f'water sample test contain 15.0 or more lead ppb in initial testing.')

Summary statistics for the 1st sample drawn in initial testing:
count    22668.000000
mean         3.640967
std         13.724032
min          1.000000
25%          1.000000
50%          2.000000
75%          3.800000
max        730.000000
Name: 1st Draw, dtype: float64

Summary statistics for the 2nd sample (2-3 Minute Mark) drawn in initial testing:
count    22667.000000
mean         4.112260
std          6.835936
min          1.000000
25%          1.000000
50%          2.200000
75%          5.400000
max        460.000000
Name: 2-3 Minute, dtype: float64

Summary statistics for the final sample (5 Minute Mark) drawn in initial testing:
count    22667.000000
mean         2.267388
std          3.051567
min          1.000000
25%          1.000000
50%          1.200000
75%          2.500000
max        240.000000
Name: 5 Minute, dtype: float64

A total of 1,007 (4.44%) observations had at least one 
water sample test contain 15.0 or more lead ppb in initial testing.


In [20]:
# Cleaning addresses
wdf.Address.replace({'^XX\s': '10 ', '^X\s': '1 ', 
                     'XX':'00', '[0-9]X':'0'}, regex=True, inplace=True)
wdf.Address = wdf.Address.str.strip(punctuation)
wdf.Address += ', Chicago, IL, USA'

In [None]:
# Geocoding
locator = Nominatim(user_agent="myGeocoder")
location = locator.geocode("Chicago, IL, USA")

geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
wdf['location'] = wdf['Address'].apply(geocode)
wdf['point'] = wdf['location'].apply(lambda loc: tuple(loc.point) if loc else None)
wdf[['latitude', 'longitude', 'altitude']] = pd.DataFrame(wdf['point'].tolist(), index=wdf.index)
wdf.drop('altitude', 'point', axis=1, inplace=True)

In [55]:
print(f'''There are {wdf['latitude'].isna().sum()} NaNs coordinates in the dataset. 
Additional cleaning is needed.''')

There are 585 NaNs coordinates in the dataset. 
Additional cleaning is needed.


In [59]:
wdf.to_csv('data/wdf_points.csv')

In [49]:
wdf_g = gpd.GeoDataFrame(
    wdf, geometry=gpd.points_from_xy(wdf.longitude, wdf.latitude))

In [3]:
wdf_g = pd.read_csv('data/wdf_points.csv')

In [5]:
# Visual Checks and additional cleaning
wdf_g.head(10)

Unnamed: 0.1,Unnamed: 0,Date Sampled,Address,1st Draw,2-3 Minute,5 Minute,Threshold,location,latitude,longitude,geometry
0,0,2020-01-26 07:42:00,"1 N Bishop St, Chicago, IL, USA",1.0,1.0,1.0,False,"1, North Bishop Street, Near West Side, Chicag...",41.881667,-87.663587,POINT (-87.66358700000001 41.881667)
1,1,2016-11-08 05:30:00,"10 N Lasalle St, Chicago, IL, USA",1.0,1.0,1.0,False,"10, North LaSalle Street, Loop, Chicago, Cook ...",41.882197,-87.632477,POINT (-87.63247739373433 41.88219746140331)
2,2,2019-11-07 06:18:00,"1 W Brayton St, Chicago, IL, USA",1.0,6.9,1.0,False,"1, West Brayton Street, Cookes Subdivision, We...",41.665199,-87.622381,POINT (-87.62238063102942 41.6651985)
3,3,2020-06-07 07:58:00,"1 W Superior St, Chicago, IL, USA",1.0,1.0,1.0,False,"One Superior Place, 1, West Superior Street, C...",41.895201,-87.628955,POINT (-87.62895534670797 41.8952011)
4,4,2020-10-26 07:45:00,"1 W Superior St, Chicago, IL, USA",1.0,1.0,1.0,False,"One Superior Place, 1, West Superior Street, C...",41.895201,-87.628955,POINT (-87.62895534670797 41.8952011)
5,5,2019-05-02 07:40:00,"100 E 14th St, Chicago, IL, USA",1.0,1.0,1.0,False,"1400 Museum Park, 100, East 14th Street, The G...",41.864485,-87.623687,POINT (-87.62368747497932 41.86448535)
6,6,2016-06-05 08:45:00,"100 E Bellevue Pl, Chicago, IL, USA",1.0,1.0,1.0,False,"Bellevue Place, 100, East Bellevue Place, Gold...",41.901914,-87.625663,POINT (-87.62566332779593 41.90191365)
7,7,2019-11-02 08:40:00,"100 E 14th St , Chicago, IL, USA",1.0,1.0,1.0,False,"1400 Museum Park, 100, East 14th Street, The G...",41.864485,-87.623687,POINT (-87.62368747497932 41.86448535)
8,8,2018-08-22 18:15:00,"100 E 14th St , Chicago, IL, USA",1.0,1.0,1.0,False,"1400 Museum Park, 100, East 14th Street, The G...",41.864485,-87.623687,POINT (-87.62368747497932 41.86448535)
9,9,2018-08-18 10:20:00,"100 E 14th St , Chicago, IL, USA",1.0,1.0,1.0,False,"1400 Museum Park, 100, East 14th Street, The G...",41.864485,-87.623687,POINT (-87.62368747497932 41.86448535)
