In [1]:
%matplotlib inline

import pytz
import requests
import matplotlib.pyplot as plt
import pandas as pd
import folium

import ulmo
from ulmo.util import convert_datetime

# Time series Part 1: Point timeseries data

Credit: this notebook was adapted from https://github.com/waterhackweek/tsdata_access (Emilio Mayorga, Yifan Cheng)

## Table of contents
* 1. Study domain
 - 1.1. Background information
 - 1.2. Research questions?
* 2. Access USGS NWIS data using Ulmo
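 - 2.1. Collect all sites information within this river basin
 - 2.2. Map the sites on an interactive map, with Folium
 - 2.3. Download USGS data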
* 3. Site-specific data analysis
 - 3.1. Time Series Plot
 - 3.2. Data Selecting and Indexing
 - 3.3. Basic Statistics
 - 3.4. Data Aggregation (Groupby/Resample)

# 1. Study domain
## Yakima River Basin
HUC8 code: 17030001

In [2]:
# TODO: plot the study domain with geopandas.
# NOTE(review): this cell currently contains no plotting code, only this comment.

# 2. Access USGS NWIS data using Ulmo

Currently, [ulmo](https://github.com/ulmo-dev/ulmo) supports the following datasets / services:

* California Department of Water Resources Historical Data
* Climate Prediction Center Weekly Drought
* CUAHSI WaterOneFlow
* Lower Colorado River Authority Hydromet and Water Quality Data
* NASA Daymet weather data
* National Climatic Data Center Climate Index Reference Sequential (CIRS)
* National Climatic Data Center Global Historical Climate Network Daily
* National Climatic Data Center Global Summary of the Day
* Texas Weather Connection Daily Keetch-Byram Drought Index (KBDI)
* US Army Corps of Engineers - Tulsa District Water Control
* **USGS National Water Information System (NWIS)**
* USGS Emergency Data Distribution Network services
* USGS Earth Resources Observation Systems (EROS) services
* USGS National Elevation Dataset (NED) services

Upper Yakima HUC8 code: 17030001 

In [2]:
# HUC8 code of the Upper Yakima basin; used below to restrict the NWIS site query
HUC8code = '17030001'

## 2.1. Collect all sites information within this river basin

In [3]:
# Fetch the metadata of every NWIS site inside the basin. With service=None the
# request log printed below shows both the dv and the iv endpoints being queried.
nwis_sites = ulmo.usgs.nwis.get_sites(huc=HUC8code, service=None)

making request for sites: http://waterservices.usgs.gov/nwis/dv/
processing data from request: https://waterservices.usgs.gov/nwis/dv/?format=waterml&hucs=17030001
making request for sites: http://waterservices.usgs.gov/nwis/iv/
processing data from request: https://waterservices.usgs.gov/nwis/iv/?format=waterml&hucs=17030001


In [4]:
# Check what get_sites returned: the container type and how many sites it holds
type(nwis_sites), len(nwis_sites)

(dict, 20)

In [5]:
# List the dictionary keys (one entry per site, keyed by its site code)
nwis_sites.keys()

dict_keys(['12473980', '12473985', '12474500', '12475000', '12476000', '12477000', '12479000', '12479500', '12480000', '12480500', '12481000', '12481500', '12483500', '12483600', '12483800', '12484480', '12484500', '12485500', '12486500', '12487000'])

In [35]:
# Site used for the data download below; in the site table this code is
# listed as "YAKIMA RIVER AT CLE ELUM, WA"
site_code = '12479500'

In [7]:
# Show the metadata record of the selected site.
# NOTE(review): the saved output reports 'code': '12483500', which does not match
# site_code ('12479500') — the output looks stale; re-run the cell to refresh it.
nwis_sites[site_code]

{'code': '12483500',
 'name': 'MANASTASH CREEK NEAR ELLENSBURG, WA',
 'network': 'NWIS',
 'agency': 'USGS',
 'location': {'latitude': '46.96651326',
  'longitude': '-120.6956286',
  'srs': 'EPSG:4326'},
 'timezone_info': {'uses_dst': True,
  'dst_tz': {'abbreviation': 'PDT', 'offset': '-07:00'},
  'default_tz': {'abbreviation': 'PST', 'offset': '-08:00'}},
 'county': '53037',
 'huc': '17030001',
 'site_type': 'ST',
 'state_code': '53'}

In [8]:
# Copy the coordinates out of each site's nested 'location' dict so that they
# become top-level fields of the site record
for site_info in nwis_sites.values():
    coords = site_info['location']
    site_info['longitude'] = coords['longitude']
    site_info['latitude'] = coords['latitude']

In [14]:
# Tabulate the site dictionary: every site code becomes a column and every
# metadata field a row
nwis_sites_df = pd.DataFrame(data=nwis_sites)
nwis_sites_df.head(5)

Unnamed: 0,12473980,12473985,12474500,12475000,12476000,12477000,12479000,12479500,12480000,12480500,12481000,12481500,12483500,12483600,12483800,12484480,12484500,12485500,12486500,12487000
code,12473980,12473985,12474500,12475000,12476000,12477000,12479000,12479500,12480000,12480500,12481000,12481500,12483500,12483600,12483800,12484480,12484500,12485500,12486500,12487000
name,"GOLD CREEK ABOVE KEECHELUS LAKE NEAR HYAK, WA","BOX CANYON CREEK NEAR HYAK, WA","YAKIMA RIVER NEAR MARTIN, WA","CABIN CREEK NEAR EASTON, WA","KACHESS RIVER NEAR EASTON, WA","YAKIMA RIVER AT EASTON, WA","CLE ELUM RIVER NEAR ROSLYN, WA","YAKIMA RIVER AT CLE ELUM, WA","TEANAWAY RIVER BELOW FORKS NEAR CLE ELUM, WA","TEANAWAY RIVER NEAR CLE ELUM, WA","SWAUK CREEK NEAR CLE ELUM, WA","CASCADE CANAL NEAR ELLENSBURG, WA","MANASTASH CREEK NEAR ELLENSBURG, WA","WILSON CREEK NEAR ELLENSBURG, WA","NANEUM CREEK NEAR ELLENSBURG, WA","CHERRY CREEK AT THRALL, WA","YAKIMA RIVER AT UMTANUM, WA","SELAH-MOXEE CANAL NEAR SELAH, WA","TAYLOR DITCH NEAR SELAH, WA","YAKIMA RIVER AT SELAH GAP NEAR NORTH YAKIMA, WA"
network,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS,NWIS
agency,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS,USGS
location,"{'latitude': '47.3917792', 'longitude': '-121....","{'latitude': '47.35900289', 'longitude': '-121...","{'latitude': '47.32122417', 'longitude': '-121...","{'latitude': '47.2415036', 'longitude': '-121....","{'latitude': '47.2612262', 'longitude': '-121....","{'latitude': '47.2387267', 'longitude': '-121....","{'latitude': '47.24456209', 'longitude': '-121...","{'latitude': '47.19123078', 'longitude': '-120...","{'latitude': '47.2465092', 'longitude': '-120....","{'latitude': '47.19567717', 'longitude': '-120...","{'latitude': '47.1637336', 'longitude': '-120....","{'latitude': '47.1095676', 'longitude': '-120....","{'latitude': '46.96651326', 'longitude': '-120...","{'latitude': '47.1262365', 'longitude': '-120....","{'latitude': '47.1267922', 'longitude': '-120....","{'latitude': '46.9259594', 'longitude': '-120....","{'latitude': '46.8626264', 'longitude': '-120....","{'latitude': '46.6892934', 'longitude': '-120....","{'latitude': '46.6795711', 'longitude': '-120....","{'latitude': '46.63095987', 'longitude': '-120..."


In [15]:
# Transpose the frame so sites run down the rows and metadata fields across the columns
nwis_sites_df = nwis_sites_df.transpose()
nwis_sites_df.head(n=2)

Unnamed: 0,code,name,network,agency,location,timezone_info,county,huc,site_type,state_code,longitude,latitude
12473980,12473980,"GOLD CREEK ABOVE KEECHELUS LAKE NEAR HYAK, WA",NWIS,USGS,"{'latitude': '47.3917792', 'longitude': '-121....","{'uses_dst': True, 'dst_tz': {'abbreviation': ...",53037,17030001,ST,53,-121.381483,47.3917792
12473985,12473985,"BOX CANYON CREEK NEAR HYAK, WA",NWIS,USGS,"{'latitude': '47.35900289', 'longitude': '-121...","{'uses_dst': True, 'dst_tz': {'abbreviation': ...",53037,17030001,ST,53,-121.2456489,47.35900289


In [16]:
# Final cleanups: make the coordinates numeric and discard the nested-dict columns.
# The frame is rebuilt by assignment (no inplace mutation) and the drop tolerates
# columns that are already gone, so re-running this cell does not raise a KeyError.
nwis_sites_df = (
    nwis_sites_df
    .astype({'latitude': float, 'longitude': float})
    .drop(columns=['location', 'timezone_info'], errors='ignore')
)
nwis_sites_df.head(2)

Unnamed: 0,code,name,network,agency,county,huc,site_type,state_code,longitude,latitude
12473980,12473980,"GOLD CREEK ABOVE KEECHELUS LAKE NEAR HYAK, WA",NWIS,USGS,53037,17030001,ST,53,-121.381483,47.391779
12473985,12473985,"BOX CANYON CREEK NEAR HYAK, WA",NWIS,USGS,53037,17030001,ST,53,-121.245649,47.359003


In [17]:
# Count how many sites fall under each site type code
nwis_sites_df['site_type'].value_counts()

ST        17
ST-CA      2
ST-DCH     1
Name: site_type, dtype: int64

Site type codes are defined at http://help.waterdata.usgs.gov/site_tp_cd. For our purposes:

* ST: Stream
* ST-CA: Canal
* ST-DCH: Ditch

We could also have issued the nwis.get_sites query by passing a site_type value or list, like this:

nwis_sites = ulmo.usgs.nwis.get_sites(huc=HUC8code, site_type=['ST'], service=None)

## 2.2. Let's map the sites on an interactive map, with Folium

In [18]:
# Base map; one marker per site is added below.
# NOTE(review): the 'Stamen terrain' tile set may not be bundled with newer folium
# releases — confirm the name still resolves in your environment.
m = folium.Map(tiles='Stamen terrain')

for idx, site in nwis_sites_df.iterrows():
    # Plain stream sites ('ST') are drawn in blue, every other site type in gray
    marker_color = 'blue' if site['site_type'] == 'ST' else 'gray'
    popup_html = "<b>{0}</b> {1}. Site type: {2}. {3}".format(
        site['code'], site['name'], site['site_type'], site['network'])
    marker = folium.Marker(
        location=[site['latitude'], site['longitude']],
        icon=folium.Icon(color=marker_color),
        popup=popup_html,
    )
    marker.add_to(m)

# Zoom the map to the bounds of everything that has been added to it
m.fit_bounds(m.get_bounds())

m

## 2.3. Download USGS data

Use functions from ulmo: ulmo.usgs.nwis.get_site_data()
   * site_code: USGS site number (str)
   * parameter_code: Variable code (str) ([Link to all USGS parameter codes](https://help.waterdata.usgs.gov/codes-and-parameters/parameters), [Link to USGS parameters commonly used in hydrology](https://help.waterdata.usgs.gov/parameter_cd?group_cd=PHY))
        * 00010: Temperature, water, degrees Celsius
        * 00011: Temperature, water, degrees Fahrenheit
        * 00060: Discharge, cubic feet per second
   * statistic_code: Statistic code (str) ([Link to all USGS statistic codes](https://help.waterdata.usgs.gov/stat_code))
        * 00001: Maximum
        * 00002: Minimum
        * 00003: Mean

In [50]:
# Request settings for the NWIS download (codes as listed in the text above)
param_code = '00060'        # discharge, cubic feet per second
stat_code = '00003'         # mean
start_date = '1950-01-01'
end_date = '1980-12-31'

# Fetch the daily series of the selected site with ulmo
site_data = ulmo.usgs.nwis.get_site_data(
    site_code=site_code,
    parameter_code=param_code,
    service='daily',
    statistic_code=stat_code,
    start=start_date,
    end=end_date,
    methods="all",
)

processing data from request: https://waterservices.usgs.gov/nwis/dv/?format=waterml&site=12479500&parameterCd=00060&statCd=00003&startDT=1950-01-01&endDT=1980-12-31


In [51]:
# Convert the downloaded values into a date-indexed DataFrame whose single
# column is named after the site code and holds the values as floats.

# Build the lookup key from the same codes that were used for the request rather
# than repeating the literals, so this cell keeps working if they are changed.
series_key = param_code + ':' + stat_code

df = pd.DataFrame(site_data[series_key]['values'])            # one row per record
df[site_code] = df['value'].astype(float)                     # numeric copy of 'value'
df['date'] = pd.to_datetime(df['datetime'])                   # parse the timestamps
df = df.set_index(['date'])                                   # use the parsed dates as index
df = df.drop(['datetime', 'value', 'qualifiers'], axis=1)     # keep only the site column

In [52]:
# Display the date-indexed series (the rendered output below is abbreviated with "...")
df

Unnamed: 0_level_0,12479500
date,Unnamed: 1_level_1
1950-01-01,886.0
1950-01-02,807.0
1950-01-03,737.0
1950-01-04,807.0
1950-01-05,835.0
...,...
1978-09-25,1570.0
1978-09-26,1500.0
1978-09-27,1590.0
1978-09-28,1500.0


# 3. Site-specific data analysis

## 3.1. Frequently used packages
* [NumPy](https://numpy.org/) (Deal with **Array**)
    * Fundamental package for scientific computing
    * All of the following packages depend on NumPy
* [Pandas](https://pandas.pydata.org/) (Deal with **DataFrame**)
* [SciPy](https://www.scipy.org/) 
    * Built on NumPy
* [Scikit Learn](https://scikit-learn.org/stable/)
    * Built on NumPy, SciPy, and matplotlib
    * Simple and efficient tools for predictive data analysis
        * Classification, Regression, Clustering...
* ...
