## Housekeeping

In [7]:
# Shell escape: print the notebook's current working directory
!pwd

/Users/ben/OneDrive - King County/python_data_science/ocean_hack_week


In [1]:
# Load libraries
import getpass
import json
import os
import re
import time
import warnings

# Silence warnings before the third-party imports below so that warnings
# raised while importing them are suppressed as well.
warnings.filterwarnings('ignore')

import pandas as pd
import requests
import xarray as xr

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one; several cells in this notebook rely on this to show multiple results.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Enter credentials
# API credentials must not be hard-coded in a notebook: anything saved here is
# readable by everyone who can see the file. They are read from the
# OOI_API_USERNAME / OOI_API_TOKEN environment variables, falling back to an
# interactive prompt that does not echo what is typed.
# NOTE(review): the username/token that used to be committed in this cell
# should be treated as compromised and rotated.
username = os.environ.get('OOI_API_USERNAME') or getpass.getpass('OOI API username: ')
token = os.environ.get('OOI_API_TOKEN') or getpass.getpass('OOI API token: ')

## Get the data from the server
Specify data identifiers

In [3]:
# Instrument to request: seafloor oxygen of the in-shore surface mooring,
# identified by its reference designator, delivery method and data stream.
refdes, method, stream = (
    'CE06ISSM-MFD37-03-DOSTAD000',
    'recovered_inst',
    'dosta_abcdjm_ctdbp_instrument_recovered',
)

# Requested time window; according to the original author this range
# encompasses all three deployments of this instrument.
beginDT, endDT = '1990-01-01T01:01:01.500Z', '2018-09-01T01:01:01.500Z'

#### Build the GET request URL and send the request to the M2M API endpoint.

In [4]:
# Base URL of the M2M sensor-inventory endpoint.
base_url = 'https://ooinet.oceanobservatories.org/api/m2m/12576/sensor/inv'

# The reference designator is split into three URL path segments. The last
# segment itself contains a hyphen (e.g. '03-DOSTAD000'), so split on the first
# two hyphens only. This replaces fixed-width slicing, which silently produced
# wrong segments for any designator whose fields had different lengths.
site, node, instrument = refdes.split('-', 2)
data_request_url = '/'.join((base_url, site, node, instrument, method, stream))
print(data_request_url)

# Time window of the request, sent as query parameters.
params = {
    'beginDT': beginDT,
    'endDT': endDT,
}

# The timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get(data_request_url, params=params, auth=(username, token), timeout=60)
# Fail here, with the HTTP status, on bad credentials or a rejected request,
# instead of with a confusing KeyError when `data` is used in a later cell.
r.raise_for_status()
data = r.json()

https://ooinet.oceanobservatories.org/api/m2m/12576/sensor/inv/CE06ISSM/MFD37/03-DOSTAD000/recovered_inst/dosta_abcdjm_ctdbp_instrument_recovered


#### Examine the response
Notice the `timeCalculation` key: it gives a rough estimate of how long it will take to fulfill your data request.

In [5]:
# Each bare expression below is echoed as its own output
# (ast_node_interactivity is set to "all" in the setup cell).
type(data)  # Python type of the decoded JSON response
data.keys()  # top-level fields of the response
data['allURLs'][0]  # first entry of the 'allURLs' list
# Pretty-print the complete response
print(json.dumps(data, indent=2))

dict

dict_keys(['requestUUID', 'outputURL', 'allURLs', 'sizeCalculation', 'timeCalculation', 'numberOfSubJobs'])

'https://opendap.oceanobservatories.org/thredds/catalog/ooi/ben.larson@noaa.gov/20180824T163446-CE06ISSM-MFD37-03-DOSTAD000-recovered_inst-dosta_abcdjm_ctdbp_instrument_recovered/catalog.html'

{
  "requestUUID": "ca66c692-afbc-484b-a7ea-5ab8d5cb3d47",
  "outputURL": "https://opendap.oceanobservatories.org/thredds/catalog/ooi/ben.larson@noaa.gov/20180824T163446-CE06ISSM-MFD37-03-DOSTAD000-recovered_inst-dosta_abcdjm_ctdbp_instrument_recovered/catalog.html",
  "allURLs": [
    "https://opendap.oceanobservatories.org/thredds/catalog/ooi/ben.larson@noaa.gov/20180824T163446-CE06ISSM-MFD37-03-DOSTAD000-recovered_inst-dosta_abcdjm_ctdbp_instrument_recovered/catalog.html",
    "https://opendap.oceanobservatories.org/async_results/ben.larson@noaa.gov/20180824T163446-CE06ISSM-MFD37-03-DOSTAD000-recovered_inst-dosta_abcdjm_ctdbp_instrument_recovered"
  ],
  "sizeCalculation": 5707523,
  "timeCalculation": 60,
  "numberOfSubJobs": 529
}


#### Navigating your THREDDS directory.
Check if the request has completed.

In [6]:
%%time
# Build the URL of the status file and poll it until the server confirms the
# request is complete. The code treats an HTTP 200 for status.txt as "done".
check_complete = data['allURLs'][1] + '/status.txt'
max_attempts = 1800  # one-second pause between attempts, i.e. at least ~30 minutes
for _ in range(max_attempts):
    try:
        # The timeout prevents one hung request from defeating the attempt limit.
        r = requests.get(check_complete, timeout=30)
    except requests.exceptions.Timeout:
        # A slow response is not a failure; simply try again.
        continue
    if r.status_code == requests.codes.ok:
        print('request completed')
        break
    time.sleep(1)
else:
    # Reached only when every attempt was used up without a `break`. Stop here
    # rather than letting later cells run against an unfinished request.
    raise TimeoutError(
        'request not completed after %d status checks: %s' % (max_attempts, check_complete)
    )

request completed
CPU times: user 1.95 s, sys: 133 ms, total: 2.08 s
Wall time: 2min 1s


Check out the THREDDS directory.

In [8]:
# First entry of 'allURLs': the THREDDS catalog page for this request
data['allURLs'][0]

'https://opendap.oceanobservatories.org/thredds/catalog/ooi/ben.larson@noaa.gov/20180824T163446-CE06ISSM-MFD37-03-DOSTAD000-recovered_inst-dosta_abcdjm_ctdbp_instrument_recovered/catalog.html'

Check out the Apache directory

In [9]:
# Second entry of 'allURLs': the async_results directory for this request
data['allURLs'][1]

'https://opendap.oceanobservatories.org/async_results/ben.larson@noaa.gov/20180824T163446-CE06ISSM-MFD37-03-DOSTAD000-recovered_inst-dosta_abcdjm_ctdbp_instrument_recovered'

Parse the THREDDS server to get a list of all NetCDF files. Each deployment is separated into a separate NetCDF file.

In [10]:
# THREDDS catalog page returned by the data request
url = data['allURLs'][0]
# Prefix under which the listed files are served (dodsC endpoint)
tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'

# Download the catalog page; fail loudly on an HTTP error instead of parsing
# an error page as if it were the catalog.
catalog_response = requests.get(url, timeout=60)
catalog_response.raise_for_status()
catalog_html = catalog_response.text

# Regular expression to get all the urls
# (not used below; kept in case later cells rely on it)
urls = re.findall(r'href=[\'"]?([^\'" >]+)', catalog_html)

# Find the dataset paths with an .nc extension. The dot is escaped so that
# only a literal '.nc' ends a match; unescaped, any character followed by 'nc'
# could end the match early and truncate the path. Every match therefore
# already ends in '.nc' and needs no separate endswith() filter.
nc_paths = re.findall(r'(ooi/.*?\.nc)', catalog_html)

# Keep only paths whose character just before '.nc' is a digit, presumably the
# end of the timestamp in a data file name (TODO confirm). A new list is built
# because removing items from a list while iterating over it skips elements.
nc_paths = [path for path in nc_paths if path[-4].isdecimal()]

# Join with '/' explicitly: os.path.join would insert backslashes on Windows,
# which is wrong for URLs.
datasets = ['/'.join((tds_url, path)) for path in nc_paths]

1.0

In [11]:
# Inspect the result
len(datasets)
datasets[2]

12

'https://opendap.oceanobservatories.org/thredds/dodsC/ooi/ben.larson@noaa.gov/20180824T163446-CE06ISSM-MFD37-03-DOSTAD000-recovered_inst-dosta_abcdjm_ctdbp_instrument_recovered/deployment0005_CE06ISSM-MFD37-03-DOSTAD000-recovered_inst-dosta_abcdjm_ctdbp_instrument_recovered_20160927T164503-20170328T093510.nc'

### Further prune the data to remove co-located sensor data

In [12]:
# Keep, in their original order, only the URLs that do not mention the
# '03-CTDBPC000' sensor.
datasets_sub = [nc_url for nc_url in datasets if '03-CTDBPC000' not in nc_url]
        

#### Explore the data in Python using xarray

In [13]:
# Open the pruned list of NetCDF URLs as a single xarray dataset
ds = xr.open_mfdataset(datasets_sub)

In [14]:
# Display the dataset summary
ds

<xarray.Dataset>
Dimensions:                       (obs: 67633)
Coordinates:
  * obs                           (obs) int64 0 1 2 3 4 5 6 7 8 9 10 11 12 ...
    time                          (obs) datetime64[ns] dask.array<shape=(67633,), chunksize=(11787,)>
    int_ctd_pressure              (obs) float64 dask.array<shape=(67633,), chunksize=(11787,)>
    lat                           (obs) float64 dask.array<shape=(67633,), chunksize=(11787,)>
    lon                           (obs) float64 dask.array<shape=(67633,), chunksize=(11787,)>
Data variables:
    deployment                    (obs) int32 dask.array<shape=(67633,), chunksize=(11787,)>
    id                            (obs) |S64 dask.array<shape=(67633,), chunksize=(11787,)>
    ctd_time                      (obs) datetime64[ns] dask.array<shape=(67633,), chunksize=(11787,)>
    driver_timestamp              (obs) datetime64[ns] dask.array<shape=(67633,), chunksize=(11787,)>
    ingestion_timestamp           (obs) datetime64[n

In [None]:
# Open the pruned file list (`datasets_sub`), as the earlier exploration cell
# does. The unfiltered `datasets` list still contains the '03-CTDBPC000' files
# that the pruning cell deliberately removed.
ds = (
    xr.open_mfdataset(datasets_sub)
    # Index by timestamp instead of observation number.
    .swap_dims({'obs': 'time'})
    .chunk({'time': 100})
    # Data from different deployments can overlap, so sort everything by time stamp.
    .sortby('time')
)
ds