In [1]:
library(tidyverse)

── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1     ✔ purrr   0.2.4
✔ tibble  1.4.1     ✔ dplyr   0.7.4
✔ tidyr   0.7.2     ✔ stringr 1.2.0
✔ readr   1.1.1     ✔ forcats 0.2.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


In [2]:
library(lubridate)
library(stringr)


Attaching package: ‘lubridate’

The following object is masked from ‘package:base’:

    date



In [3]:
# Read the Austin water-quality CSV from its remote URL; readr guesses the
# column types and reports them in the message printed below.
water <- read_csv(file = 'http://594442.youcanlearnit.net/austinwater.csv')

Parsed with column specification:
cols(
  .default = col_character(),
  RESULT = col_double(),
  SAMPLE_SITE_NO = col_integer(),
  DEPTH_IN_METERS = col_double(),
  DATA_REF_NO = col_integer(),
  LAT_DD_WGS84 = col_double(),
  LON_DD_WGS84 = col_double(),
  SAMPLE_REF_NO = col_integer()
)
See spec(...) for full column specifications.


In [4]:
# Keep only the seven columns used in this analysis and give them camelCase
# names. NOTE: this rebinds `water`, so the raw upper-case columns are gone
# afterwards and the cell cannot be re-run without reloading the CSV first.
water <- with(water, tibble(
  siteName      = SITE_NAME,
  siteType      = SITE_TYPE,
  sampleTime    = SAMPLE_DATE,
  parameterType = PARAM_TYPE,
  parameter     = PARAMETER,
  result        = RESULT,
  unit          = UNIT
))

In [5]:
# Compact per-column overview (type plus the first few values) of the
# trimmed-down table.
glimpse(x = water)

Observations: 1,122,275
Variables: 7
$ siteName      <chr> "Old Mill (Sunken Gardens) Spring", "Old Mill (Sunken...
$ siteType      <chr> "Spring", "Spring", "Spring", "Spring", "Spring", "Sp...
$ sampleTime    <chr> "07/18/2013 02:58:00 PM", "07/18/2013 02:58:00 PM", "...
$ parameterType <chr> "Benthic Macroinvertebrates", "Benthic Macroinvertebr...
$ parameter     <chr> "ANNELIDA (WORM/LEECHES/PLANARIA)", "CRAYFISH (ADULT)...
$ result        <dbl> 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0...
$ unit          <chr> "1=Present; 0=Absent", "Relative Abundance (1): <20 (...


In [6]:
# Every distinct parameter name measured in the dataset.
unique(water[['parameter']])


In [7]:
# Distinct parameter names that contain 'PH'. str_subset() filters the
# character vector directly (dropping NA entries, just as which() did), so the
# whole tibble no longer has to be row-subset merely to read one column back.
unique(str_subset(water$parameter, 'PH'))

In [8]:
# Every distinct parameter category, used to decide which groups to keep.
unique(water[['parameterType']])

In [10]:
# Restrict the data to the two parameter categories of interest.
filtered_water <- subset(
  water,
  parameterType %in% c('Alkalinity/Hardness/pH', 'Conventionals')
)

In [11]:
# Which individual parameters remain inside the two selected categories?
unique(filtered_water[['parameter']])

In [12]:
# Only two of these parameters are needed (discuss PH and temp choices), so
# narrow the table down to them.

filtered_water <- subset(filtered_water,
                         parameter %in% c('PH', 'WATER TEMPERATURE'))

glimpse(filtered_water)

Observations: 51,797
Variables: 7
$ siteName      <chr> "North Branch of Sycamore Creek @ Foster Ranch Rd", "...
$ siteType      <chr> "Stream", "Stream", "Spring", "Spring", "Stream", "St...
$ sampleTime    <chr> "01/21/2015 12:00:00 PM", "01/21/2015 12:00:00 PM", "...
$ parameterType <chr> "Alkalinity/Hardness/pH", "Conventionals", "Alkalinit...
$ parameter     <chr> "PH", "WATER TEMPERATURE", "PH", "WATER TEMPERATURE",...
$ result        <dbl> 8.35, 11.73, 7.82, 18.29, 8.15, 8.09, 8.47, 8.12, 8.6...
$ unit          <chr> "Standard units", "Deg. Celsius", "Standard units", "...


In [13]:
# First summary of the filtered table; the text columns are still plain
# character vectors here, so only their length and class are reported.
summary(object = filtered_water)

   siteName           siteType          sampleTime        parameterType     
 Length:51797       Length:51797       Length:51797       Length:51797      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
                                                                            
  parameter             result              unit          
 Length:51797       Min.   :      0.1   Length:51797      
 Class :character   1st Qu.:      7.7   Class :character  
 Mode  :character   Median :      9.8   Mode  :character  
                    Mean   :     45.7                     
                    3rd Qu.:     21.1                     
                    Max.   :11

In [14]:
# The categorical text columns are more useful as factors: summary() can then
# tabulate how often each level occurs. siteName is left as character.
filtered_water <- mutate_at(
  filtered_water,
  vars(siteType, parameterType, parameter, unit),
  as.factor
)

summary(filtered_water)


   siteName                                      siteType    
 Length:51797       Stream                           :24091  
 Class :character   Lake                             :16852  
 Mode  :character   Spring                           : 7248  
                    Well                             : 1607  
                    Soil                             : 1591  
                    Non-spatial or Protected Location:  218  
                    (Other)                          :  190  
  sampleTime                       parameterType               parameter    
 Length:51797       Alkalinity/Hardness/pH:25624   PH               :25624  
 Class :character   Conventionals         :26173   WATER TEMPERATURE:26173  
 Mode  :character                                                           
                                                                            
                                                                            
                                          

In [15]:
# sampleTime is still text such as "07/18/2013 02:58:00 PM"; parse it into a
# proper date-time.
# NOTE(review): no tz is passed, so lubridate's default time zone is applied.
# Confirm that is acceptable for what are presumably local sampling times.
filtered_water[['sampleTime']] <- mdy_hms(filtered_water[['sampleTime']])

summary(filtered_water)

   siteName                                      siteType    
 Length:51797       Stream                           :24091  
 Class :character   Lake                             :16852  
 Mode  :character   Spring                           : 7248  
                    Well                             : 1607  
                    Soil                             : 1591  
                    Non-spatial or Protected Location:  218  
                    (Other)                          :  190  
   sampleTime                                 parameterType  
 Min.   :1986-05-01 13:35:00   Alkalinity/Hardness/pH:25624  
 1st Qu.:1997-05-29 09:31:00   Conventionals         :26173  
 Median :2002-03-12 09:30:00                                 
 Mean   :2003-06-19 12:12:39                                 
 3rd Qu.:2009-12-16 11:40:00                                 
 Max.   :2017-02-09 14:00:00                                 
                                                             
        

In [16]:
# Why would a pH or temperature reading be recorded in feet? Pull the
# offending rows to take a look.
filtered_water[which(filtered_water$unit == 'Feet'), ]

siteName,siteType,sampleTime,parameterType,parameter,result,unit
Lanier Well,Well,2012-09-25 10:25:00,Conventionals,WATER TEMPERATURE,78.92,Feet


In [17]:
# Looks like that is supposed to be Fahrenheit: the single 'Feet' row is a
# WATER TEMPERATURE reading of 78.92. Only the unit label is changed here;
# the numeric result is left untouched.
#
# `unit` was converted to a factor earlier, and assigning a value that is not
# one of the factor's levels stores NA (with just a warning). Make sure the
# target label exists as a level before using it.
if (is.factor(filtered_water$unit)) {
  levels(filtered_water$unit) <- union(levels(filtered_water$unit), 'Deg. Fahrenheit')
}
convert <- which(filtered_water$unit == 'Feet')
filtered_water$unit[convert] <- 'Deg. Fahrenheit'

In [18]:
# Re-check the summary now that the 'Feet' unit has been relabelled.
summary(object = filtered_water)

   siteName                                      siteType    
 Length:51797       Stream                           :24091  
 Class :character   Lake                             :16852  
 Mode  :character   Spring                           : 7248  
                    Well                             : 1607  
                    Soil                             : 1591  
                    Non-spatial or Protected Location:  218  
                    (Other)                          :  190  
   sampleTime                                 parameterType  
 Min.   :1986-05-01 13:35:00   Alkalinity/Hardness/pH:25624  
 1st Qu.:1997-05-29 09:31:00   Conventionals         :26173  
 Median :2002-03-12 09:30:00                                 
 Mean   :2003-06-19 12:12:39                                 
 3rd Qu.:2009-12-16 11:40:00                                 
 Max.   :2017-02-09 14:00:00                                 
                                                             
        

In [22]:
# What about the MG/L? Show every row tagged MG/L, then only the PH ones.
filtered_water[which(filtered_water$unit == 'MG/L'), ]
filtered_water[which(filtered_water$unit == 'MG/L' & filtered_water$parameter == 'PH'), ]

# The other PH rows are reported in 'Standard units', so relabel the PH rows
# tagged MG/L to match.
convert <- which(filtered_water$parameter == 'PH' & filtered_water$unit == 'MG/L')
filtered_water$unit[convert] <- 'Standard units'


siteName,siteType,sampleTime,parameterType,parameter,result,unit


siteName,siteType,sampleTime,parameterType,parameter,result,unit


In [24]:

# Relabel the water-temperature rows that were logged with unit MG/L.
#
# `unit` is a factor, and assigning a value that is not one of its levels
# stores NA (with just a warning), so make sure the Fahrenheit label exists as
# a level before it is used below.
if (is.factor(filtered_water$unit)) {
  levels(filtered_water$unit) <- union(levels(filtered_water$unit), 'Deg. Fahrenheit')
}

subset(filtered_water, unit=='MG/L')

# Readings above 70 are treated as Fahrenheit (presumably because such a value
# is implausible as a Celsius water temperature). The parameter check keeps
# this cell correct even if the PH/MG/L rows have not been relabelled yet.
subset(filtered_water, unit=='MG/L' & parameter=='WATER TEMPERATURE' & result>70)
convert <- which(filtered_water$unit=='MG/L' &
                 filtered_water$parameter=='WATER TEMPERATURE' &
                 filtered_water$result>70)
filtered_water$unit[convert] <- 'Deg. Fahrenheit'

# Any water-temperature row still tagged MG/L is taken to be Celsius.
subset(filtered_water, unit=='MG/L' & parameter=='WATER TEMPERATURE')
convert <- which(filtered_water$unit=='MG/L' &
                 filtered_water$parameter=='WATER TEMPERATURE')
filtered_water$unit[convert] <- 'Deg. Celsius'

summary(filtered_water)

siteName,siteType,sampleTime,parameterType,parameter,result,unit


siteName,siteType,sampleTime,parameterType,parameter,result,unit


siteName,siteType,sampleTime,parameterType,parameter,result,unit


   siteName                                      siteType    
 Length:51797       Stream                           :24091  
 Class :character   Lake                             :16852  
 Mode  :character   Spring                           : 7248  
                    Well                             : 1607  
                    Soil                             : 1591  
                    Non-spatial or Protected Location:  218  
                    (Other)                          :  190  
   sampleTime                                 parameterType  
 Min.   :1986-05-01 13:35:00   Alkalinity/Hardness/pH:25624  
 1st Qu.:1997-05-29 09:31:00   Conventionals         :26173  
 Median :2002-03-12 09:30:00                                 
 Mean   :2003-06-19 12:12:39                                 
 3rd Qu.:2009-12-16 11:40:00                                 
 Max.   :2017-02-09 14:00:00                                 
                                                             
        