# Welcome to ESS-DIVE's Search Sample Identifiers Jupyter Notebook

This Jupyter Notebook will help data users search ESS-DIVE datasets by Sample Identifiers (IGSN, Sample IDs, Site IDs):

    1. Set Up 
    2. (Optional) Search Using Dataset API 
    3. Search Using DeepDive API 
    4. Visualizing Data
    5. Download relevant files

Written By: Leo Herrera

Acknowledgements: This notebook builds from Danielle Christianson's Search & Download notebook.


## 1. Set Up

In [16]:
# Set up libraries 
# This notebook requires Python 3.
# ===================================

import csv
import datetime as dt
import io
import json
import os
import pandas as pd
import requests

from ipywidgets import widgets, interact
from IPython.display import display, display_html
from pathlib import Path
from urllib.request import Request, urlopen, urlretrieve
from zipfile import ZipFile


import requests
from io import BytesIO
import pprint
from urllib.parse import urlencode
import numpy as np
pp = pprint.PrettyPrinter(indent=4)


#===================

## 1. Set Up/Configure 

1. Go to ESS-DIVE (https://data.ess-dive.lbl.gov/data), login with your ORCID, and copy your authentication token from your account settings page.
2. Enter your authentication token into the widget that appears below
3. Run the following code cell

   _Always re-run this code cell when you update your token. Tokens expire every 24 hours._

<b>Run this cell and a widget will appear below. Paste your ORCID authentication token into the widget and move on to the next cell. Do not rerun this cell, or your authentication token will disappear.</b>

In [3]:
# Placeholder shown in the widget until the user pastes a real token.
my_token = "<put_your_token_here>"

# Text box the user pastes the ESS-DIVE authentication token into.
token_text = widgets.Text(value=my_token, description="Token:")
display(token_text)

Text(value='<put_your_token_here>', description='Token:')

In [17]:
# Base URL of the ESS-DIVE Dataset API.
# Use HTTPS: the bearer token is sent in the Authorization header of requests
# built from this URL, so it must not travel over plain HTTP.
essdive_api_url = 'https://api.ess-dive.lbl.gov'

# Base URL used for direct object access on the ESS-DIVE data portal.
essdive_direct_url = 'https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/'

# Snapshot of whatever is currently typed into the token widget.
# Re-run this cell after changing the widget value.
token = token_text.value

### Configure local storage for downloads (if desired)

Enter the local directory path in which you want to save downloaded files.

In [18]:
# Directory in which downloaded files are saved. Edit this path for your machine.
local_dir = Path('/Users/YLH/ESS-DIVE/essdive-tutorials/downloads')

# ===================================
# Report whether the directory exists and, if it does, what it already contains.
if not local_dir.exists():
    print(f'Cannot find local directory {local_dir}. Please reenter valid directory path.')
else:
    print(f'Success! Local directory {local_dir} configured for downloads')
    print('===================================')
    # Ignore the macOS Finder metadata file when listing the contents.
    current_files = [entry for entry in os.listdir(local_dir) if entry != '.DS_Store']
    if not current_files:
        print('Local directory is currently empty.')
    else:
        print(f'Local directory contains: {current_files}')

# Log of downloads, filled in by download_selected_files.
download_file_log = {}
print('===================================')
print(
    'Downloaded files will be logged in the dictionary object "download_file_log".\n'
    'You can save this dictionary as a file later in the notebook.\n'
    'The filename, file url, and datetime accessed are recorded as a tuple in the "downloaded_files" element.'
)

Success! Local directory /Users/YLH/ESS-DIVE/essdive-tutorials/downloads configured for downloads
Local directory contains: ['WHONDRS_YDE22_Data_Package.zip', 'XRF_FTICR_Manuscript_Data_Package.zip', '6_SupplementalData.zip', 'SSS_Ecosystem_Respiration_Data_Package.zip', 'v2_RC2_TemporalStudy_2021_2022_SampleData.zip']
Downloaded files will be logged in the dictionary object "download_file_log".
You can save this dictionary as a file later in the notebook.
The filename, file url, and datetime accessed are recorded as a tuple in the "downloaded_files" element.


In [63]:
# Run these general functions
# ===================================
        
def assess_datasets_flmd_dd_csv_files(dataset_details_list):
    """
    Split datasets into those that contain a csv flmd file and those that do not.

    For every dataset, the csv entries of its 'distribution' list are sorted into
    two dictionaries (file name -> content url) that are written back onto the
    dataset dictionary:
      - 'flmd_url':  csv files whose name contains 'flmd'
      - 'csv_files': all other csv files
    Entries without a csv encoding format or without a content url are skipped.

    Parameters:
    - dataset_details_list: list of dataset detail dictionaries. Each dictionary
      is updated in place.

    Returns:
    - (flmd_dataset_details, no_flmd_dataset_details): two lists holding the same
      dataset dictionaries, both in the order of the input list.
    """
    flmd_dataset_details = []
    no_flmd_dataset_details = []

    for dataset in dataset_details_list:
        flmd_url = {}
        csv_files = {}

        # A dataset without a distribution list is treated as having no files.
        for f in dataset.get('distribution') or []:
            # A missing encoding format is treated as "not csv" instead of crashing.
            encoding_format = f.get('encodingFormat') or ''
            filename = f.get('name')
            url = f.get('contentUrl')

            if 'csv' not in encoding_format or url is None:
                continue

            if filename and 'flmd' in filename:
                flmd_url[filename] = url
            else:
                csv_files[filename] = url

        dataset.update({
            'flmd_url': flmd_url,
            'csv_files': csv_files
        })

        if flmd_url:
            flmd_dataset_details.append(dataset)
        else:
            print(f"No flmd found for dataset: {dataset.get('name')}")
            no_flmd_dataset_details.append(dataset)

    print("=====================================")

    if flmd_dataset_details:
        print(f'flmd found in {len(flmd_dataset_details)} datasets')
    else:
        print('No datasets in the search results have flmds.')

    return flmd_dataset_details, no_flmd_dataset_details


def get_dataset_details(dataset_url):
    """
    Request the details of one dataset from the ESS-DIVE Dataset API.

    The request is authorized with the module-level `token`.

    Parameters:
    - dataset_url: full URL of the dataset's package record.

    Returns:
    - The 'dataset' element of the JSON response on HTTP 200, otherwise None
      (a diagnostic message is printed in that case).
    """
    try:
        dataset_response = requests.get(dataset_url, headers={"Authorization": f"Bearer {token}"})
    except Exception as e:
        # No dataset name is available here (the request itself failed),
        # so report the URL that was requested.
        print(f"{dataset_url} did not have a successful return: {e}")
        return None

    response_status = dataset_response.status_code

    if response_status == 200:
        dataset_json = dataset_response.json()['dataset']
        print(f"--- Acquired details for {dataset_json.get('name')}")
        return dataset_json

    if response_status:
        print(f"Response status {response_status}: {dataset_response.text}")
    else:
        print("Response status unavailable. Response cannot be interpreted. Debug required.")
    return None


def get_request(filename, f_url, stream=True):
    """
    Send a GET request for a file and return the response.

    Parameters:
    - filename: name used only in the printed diagnostic messages.
    - f_url: URL to request.
    - stream: passed through to requests.get; when True the body is not
      downloaded until it is read from the returned response.

    Returns:
    - The requests response object when the status code is 200, otherwise None
      (a diagnostic message is printed in that case).
    """
    # The HTTP header is spelled "User-Agent"; a key named "user_agent" would be
    # sent as an unrelated custom header and leave the real User-Agent unchanged.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0',
               'content-type': 'application/json'}
    try:
        r = requests.get(f_url, headers=headers, verify=True, stream=stream)
    except Exception as e:
        print(f"{filename} request unsuccessful: {e}")
        return None

    status_code = r.status_code
    if status_code == 200:
        return r

    print(f"{filename} request returned {status_code}")
    return None
    
    
def make_store(file_request, use_idx=True, print_headers=True):
    """
    Read a csv response line by line and collect its rows in a dictionary.

    Parameters:
    - file_request: object providing iter_lines(decode_unicode=True), e.g. the
      response returned by get_request.
    - use_idx: when True rows are keyed 'Index <n>'; when False rows are keyed by
      the value of their 'File_Name' column.
    - print_headers: when True the column headers are printed.

    Returns:
    - (headers, file_store): the list of column headers and the dictionary of
      rows. A file with no data rows yields an empty store; a completely empty
      file also yields an empty header list.
    """
    file_store = {}
    csv_reader = csv.DictReader(file_request.iter_lines(decode_unicode=True))

    for idx, row in enumerate(csv_reader):
        key = f'Index {idx}' if use_idx else row.get('File_Name')
        file_store[key] = row

    # Take the headers from the reader rather than from the last row, so that a
    # file without data rows does not fail on an unbound loop variable.
    headers = list(csv_reader.fieldnames or [])
    if print_headers:
        print(f"File headers: {headers}")
    return headers, file_store


def make_pandas_df(file_url, header_rows=1, print_headers=True):
    """
    Load an online csv file into a pandas DataFrame.

    The first `header_rows` lines of the file are skipped before parsing, so the
    column names come from the line that follows them.

    Parameters:
    - file_url: location of the csv file, passed straight to pandas.read_csv.
    - header_rows: number of leading lines to skip.
    - print_headers: when True the column names are printed.

    Returns:
    - (headers, p_df): the list of column names and the DataFrame.
    """
    frame = pd.read_csv(file_url, skiprows=header_rows)
    column_names = list(frame.columns)

    if print_headers:
        print(f"File headers: {column_names}")

    return column_names, frame


def inspect_dataset_distribution(dataset_detail, file_type='all'):
    """
    Print the files listed in a dataset's 'distribution'.

    Parameters:
    - dataset_detail: dataset dictionary with 'name' and 'distribution' entries.
    - file_type: 'all' to list every file, otherwise a substring that a file's
      'encodingFormat' must contain for the file to be listed.

    The index printed for each file is its position in the full distribution
    list, not its position among the listed files.
    """
    print(dataset_detail.get('name'))
    print('========================================')

    list_everything = file_type == 'all'
    shown = 0

    for position, entry in enumerate(dataset_detail.get('distribution')):
        label = entry.get('name')
        link = entry.get('contentUrl')
        encoding = entry.get('encodingFormat')
        if list_everything or file_type in encoding:
            print(f'Index {position}: {label}\n  encoding: {encoding}\n  url: {link}')
            shown += 1

    if not shown:
        print(f'No files found that match the file_type: "{file_type}" criteria.')
            
            
def retrieve_file_from_essdive(file_url, file_path):
    """
    Download the file at `file_url` and save it at `file_path`.

    Parameters:
    - file_url: URL of the file to retrieve.
    - file_path: destination path, including the file name.

    Returns:
    - (True, None) when the download succeeded, otherwise (False, message) where
      the message describes the failure.
    """
    try:
        urlretrieve(file_url, file_path)
    except Exception as err:
        return False, f'File at url: {file_url} was not saved: {err}'
    else:
        return True, None
    

def download_selected_files(dataset_detail, file_indices, file_dir=local_dir, log_store=download_file_log, 
                            is_csv_zipped=False, zip_download=None, zip_member_fn=None):
    """
    Save selected files of a dataset locally and record them in a download log.

    Parameters:
    - dataset_detail: dataset dictionary with '@id', 'name', 'citation' and
      'distribution' entries.
    - file_indices: positions in the distribution list of the files to save.
    - file_dir: directory to save into (defaults to the configured local_dir).
    - log_store: dictionary keyed by dataset id that receives one
      (file name, file url, timestamp) tuple per saved file under
      'downloaded_files'. A new dictionary is used when None is passed.
    - is_csv_zipped: when False each selected file is downloaded from its
      content url. When True, `zip_member_fn` is extracted from `zip_download`
      into a directory named after the selected file instead.
    - zip_download: open ZipFile object; required when is_csv_zipped is True.
    - zip_member_fn: name of the member to extract; required when is_csv_zipped
      is True.

    Returns:
    - The dataset id, or None when zip extraction was requested without the
      ZipFile object or the member name.
    """
    dist = dataset_detail.get('distribution') or []
    ds_id = dataset_detail.get('@id')
    citation = dataset_detail.get('citation')
    ds_name = dataset_detail.get('name')

    if log_store is None:
        log_store = {}

    log_store.setdefault(ds_id, {'@id': ds_id, 'name': ds_name, 'citation': citation, 'downloaded_files': []})
    ds_file_log = log_store.get(ds_id).get('downloaded_files')

    # Honor the caller's directory instead of always using the global local_dir.
    target_dir = Path(file_dir)

    print(f'Saving files in {target_dir}')
    print("-------------------------------------")

    for idx, file_info in enumerate(dist):
        if idx not in file_indices:
            continue

        msg = None
        is_downloaded = None
        download_ts = None

        fn = file_info.get('name')
        file_path = target_dir / fn
        fn_url = file_info.get('contentUrl')

        if not is_csv_zipped:
            # The timestamp is taken when the download is started.
            download_ts = dt.datetime.now().isoformat()
            is_downloaded, msg = retrieve_file_from_essdive(fn_url, file_path)

        else:
            if not zip_download or not zip_member_fn:
                print('ZipFile object and zipped member file name are required. Try again.')
                return None
            try:
                # file_path is used as the extraction directory here, so the
                # member ends up at <file_dir>/<zip file name>/<member name>.
                zip_download.extract(zip_member_fn, path=file_path)
                if (file_path / zip_member_fn).exists():
                    is_downloaded = True
                    download_ts = dt.datetime.now().isoformat()
                else:
                    msg = f'Extraction of {zip_member_fn} from {fn} was not successful.'
            except Exception as e:
                msg = f'ERROR attempting to extract {zip_member_fn} from {fn}: {e}'

        if is_downloaded:
            print(f'--- {fn} downloaded')
            ds_file_log.append((fn, fn_url, download_ts))
        else:
            print(msg)

    print("-------------------------------------")
    print(f'Remember to cite these files! Dataset DOI {ds_id}')
    return ds_id    


def inspect_zip_file_contents(dataset_detail, file_idx):
    """
    Download a zip file from a dataset's distribution and print its member names.

    Parameters:
    - dataset_detail: dataset dictionary with a 'distribution' list.
    - file_idx: position of the zip file in the distribution list.

    Returns:
    - (fn, zip_download): the file name and the in-memory ZipFile object, or
      None when no file exists at `file_idx`.
    """
    dist = dataset_detail.get('distribution') or []
    try:
        file_info = dist[file_idx]
    except IndexError:
        # An out-of-range index is reported below instead of raising.
        file_info = None

    if not file_info:
        print('File index not found. Please try again.')
        return

    fn = file_info.get('name')
    if 'zip' not in (file_info.get('encodingFormat') or ''):
        # NOTE(review): this is only a warning; the download below is still
        # attempted and ZipFile will raise if the content is not a zip archive.
        print(f'{fn} is not encoded as a zip file. Please select a different file.')

    fn_url = file_info.get('contentUrl')
    # Read the whole archive into memory and close the connection right away.
    with urlopen(fn_url) as resp:
        zip_download = ZipFile(io.BytesIO(resp.read()))

    print(f'{fn} contents:')
    print('=================================')
    for idx, file_member in enumerate(zip_download.namelist()):
        print(f'Index {idx}: {file_member}')

    return fn, zip_download


def read_zipped_csv(zip_file_obj, csv_file_name, header_rows=1):
    """
    Read one csv member of an open zip archive into a pandas DataFrame.

    Parameters:
    - zip_file_obj: open ZipFile object that contains the csv file.
    - csv_file_name: name of the member to read.
    - header_rows: number of leading lines to skip before the column header.

    Returns:
    - The DataFrame parsed from the member, decoded as windows-1254.
    """
    # Read from the archive that was passed in, not from a global variable.
    with zip_file_obj.open(csv_file_name) as member:
        csv_df = pd.read_csv(member, encoding='windows-1254', skiprows=header_rows)
    return csv_df
    
    
# Confirm that the helper functions defined above have been loaded.
print('Functions loaded.')

#-------------------------------------------
def in_deep_dive(doi):
    """
    Check whether the DeepDive API returns any results for a DOI.

    Parameters:
    - doi: DOI string sent as the 'doi' query parameter.

    Returns:
    - True when the 'results' element of the response is non-empty, False when
      it is empty, and None when the API does not answer with status 200 (the
      response text is printed in that case).
    """
    query_string = urlencode({'rowStart': 1, 'pageSize': 100, 'doi': doi})

    r = requests.get(f"https://fusion.ess-dive.lbl.gov/api/v1/deepdive?{query_string}")
    if r.status_code != 200:
        print("ERROR")
        print(r.text)
        return None

    return bool(r.json()['results'])

def search_datasets(go):
    """
    Run the dataset search held in the global `get_packages_response` URL.

    The request is authorized with the global `token`. On success the parsed
    JSON is stored in the global `response_json`.

    Parameters:
    - go: not read; the result is returned instead.

    Returns:
    - True on HTTP 200, otherwise None (the response text is printed).
    """
    global response_json

    auth_headers = {"Authorization": f"bearer {token}"}
    response = requests.get(get_packages_response, headers=auth_headers)

    if response.status_code != 200:
        # There was an error
        print("There was an error. Stop here and debug the issue. Email ess-dive-support@lbl.gov if you need assistance. \n")
        print(response.text)
        return None

    # Success
    response_json = response.json()
    print("Success! Continue to look at the search results")
    return True
        
def view_search_results():
    """
    Print a summary of the search results stored in the global `response_json`.

    For every result the index, dataset name, view URL and publication date are
    printed, followed by a note when in_deep_dive reports results for it.
    """
    total_found = response_json['total']
    print(f"Datasets found: {total_found}")

    if total_found > 100:
        print("The search API cannot return more than 100 results at a time. See documentation for how to paginate.")

    for position, hit in enumerate(response_json['result']):
        package = hit.get('dataset')
        print('________________')
        print(f'Index: {position}')
        print(package.get('name'))
        print(hit.get('viewUrl'))
        print(package.get('datePublished'))

        # Characters from position 35 onward of the view URL are passed as the DOI.
        doi_part = hit.get('viewUrl')[35:]
        if in_deep_dive(doi_part):
            print('In the DeepDive!')
            
def construct_query(essdive_api_url, text=None, keywords=None, providerName=None, creator=None, datePublished=None, rowStart=None, pageSize=None):
    """
    Build the URL for a public dataset search on the ESS-DIVE packages endpoint.

    Parameters:
    - essdive_api_url: base URL of the API, without a trailing slash.
    - text, keywords, providerName, creator, datePublished: optional search
      values; a parameter is added to the query only when its value is not None.
    - rowStart: first row to return (1 when None).
    - pageSize: number of rows to return (100 when None).

    Returns:
    - The query URL as a string, with all values URL-encoded.
    """
    params = {
        'rowStart': 1 if rowStart is None else rowStart,
        'pageSize': 100 if pageSize is None else pageSize,
        'isPublic': 'true',
    }

    # Each optional term is added once, and only when it was supplied. This avoids
    # sending the literal text "None" or a duplicated text parameter.
    optional_terms = {
        'text': text,
        'keywords': keywords,
        'providerName': providerName,
        'creator': creator,
        'datePublished': datePublished,
    }
    params.update({key: value for key, value in optional_terms.items() if value is not None})

    # urlencode escapes characters such as spaces, quotes and '&' in the values.
    return f"{essdive_api_url}/packages?{urlencode(params)}"

def extract_csv_names_and_urls(data):
    """
    Collect the csv files listed in a dataset dictionary.

    Parameters:
    - data: dataset dictionary; its 'distribution' entry is expected to be a list
      of file dictionaries.

    Returns:
    - A dictionary mapping the name of every distribution entry whose
      'encodingFormat' equals 'text/csv' to its content URL. The dictionary is
      empty when 'distribution' is missing or is not a list.
    """
    if 'distribution' not in data:
        return {}

    entries = data['distribution']
    if not isinstance(entries, list):
        return {}

    # Entries that are not dictionaries or are not csv encoded are ignored.
    return {
        entry['name']: entry['contentUrl']
        for entry in entries
        if isinstance(entry, dict) and entry.get('encodingFormat') == 'text/csv'
    }
            
def find_common_elements(array1, array2):
    """Return the elements of array1 that also occur in array2, in array1's order."""
    return [candidate for candidate in array1 if candidate in array2]

def find_unique_elements(array1, array2):
    """Return the elements of array1 that do not occur in array2, in array1's order."""
    return [candidate for candidate in array1 if candidate not in array2]

def remove_duplicates(arr):
    """
    Return the items of `arr` with repeated items dropped.

    The first occurrence of every item is kept and the original order is
    preserved. Items are compared with `in`, so they do not need to be hashable.
    """
    kept = []
    for entry in arr:
        if entry in kept:
            continue
        kept.append(entry)
    return kept


def combine_arrays(arr1, arr2):
    """
    Pair up the elements of two equally long arrays.

    Returns:
    - A list of two-element lists [arr1[i], arr2[i]], or the string
      "Arrays must have the same length" when the lengths differ.
    """
    size = len(arr1)
    if size != len(arr2):
        return "Arrays must have the same length"

    # Elements are fetched by position index from both arrays.
    return [[arr1[position], arr2[position]] for position in range(size)]

def search_array(arr, target):
    """
    Find the position of the first item in `arr` that equals `target`.

    Returns:
    - The index of the first match, or None when there is no match (an error
      message is printed in that case).
    """
    for i, item in enumerate(arr):
        if item == target:
            return i

    # Report the miss before returning; this message used to sit after the
    # return statement and could never be printed.
    print("Error, no matching value found in the array.")
    return None

def slice_string(string):
    """Return the part of `string` after its last "/", or the whole string when it has none."""
    # Splitting once from the right leaves the text after the last "/" as the
    # final piece; without a "/" the only piece is the string itself.
    return string.rsplit("/", 1)[-1]

def slice_series(series):
    """Return, for every string in the pandas Series, the part after its last "/"."""
    pieces = series.str.split("/")
    last_piece = pieces.str[-1]
    return last_piece

def get_string_before_first_slash(input_string):
    """Return the part of `input_string` before its first "/", or the whole string when it has none."""
    head, _separator, _rest = input_string.partition('/')
    return head

def get_string_after_first_slash(s):
    """
    Return everything that follows the first slash of the input string.

    :param s: Input string
    :return: Substring after the first slash, or the entire string if it
             contains no slash.
    """
    _head, separator, remainder = s.partition('/')
    # An empty separator means that no slash was found.
    return remainder if separator else s

# Click handler for the "Display File" button
def on_button_clicked(b):
    """Show the file at the row currently entered in the global index widget."""
    # `b` is the argument passed by the click callback; it is not used here.
    selected_index = index_widget.value
    display_file(selected_index, df)

def display_file(index, df):
    """
    Download and display the data file referenced by one row of DeepDive results.

    The dataset id is read from df["version"][index] and the file reference from
    df["data_file"][index]. The dataset's details are requested from the ESS-DIVE
    API using the global `token`, the referenced file is looked up by name in the
    dataset's distribution list, downloaded, and shown as a pandas DataFrame.

    The text before the first "/" of the file reference is matched against the
    distribution file names. For a zip file, the text after the first "/" is used
    as the member name inside the archive; any other file is read directly as csv.

    Parameters:
    - index: row label to look up in `df`.
    - df: DataFrame of DeepDive results with 'version' and 'data_file' columns.

    Returns:
    - None. Results and problems are reported with display/print.
    """
    dataset_id = df["version"][index]
    data_file = df['data_file'][index]

    # Get dataset details
    dataset_details_url = f'https://api.ess-dive.lbl.gov/packages/{dataset_id}'
    try:
        dataset_response = requests.get(dataset_details_url, headers={"Authorization": f"Bearer {token}"})
    except Exception as e:
        # Report the dataset id; no dataset name is known when the request fails.
        print(f"{dataset_id} did not have a successful return: {e}")
        return None

    response_status = dataset_response.status_code
    if response_status != 200:
        print(f"Response status {response_status}: {dataset_response.text}")
        return None

    dataset_details = dataset_response.json()['dataset']
    if not dataset_details:
        print(f"{dataset_id} did not have a successful return: no dataset details in the response.")
        return None

    # Match the searched file with an entry of the dataset's distribution
    dist = dataset_details.get('distribution') or []
    file_names = [file_info.get('name') for file_info in dist]
    package_file_name = get_string_before_first_slash(data_file)

    try:
        file_index = file_names.index(package_file_name)
    except ValueError:
        print(f"'{package_file_name}' is not in the datafile list.")
        print("There was an error, move on to the next section.")
        # Stop here: there is no file to download.
        return None

    file_info = dist[file_index]
    fn = file_info.get('name')
    fn_url = file_info.get('contentUrl')

    # Download the file into memory and close the connection right away.
    with urlopen(fn_url) as resp:
        payload = resp.read()

    if 'zip' in (file_info.get('encodingFormat') or ''):
        zip_download = ZipFile(io.BytesIO(payload))
        csv_file_name = get_string_after_first_slash(data_file)
        if csv_file_name not in zip_download.namelist():
            print(f"'{csv_file_name}' was not found in zip file {fn}.")
            return None
        print(f'Attempting to read: {csv_file_name} from zip file {fn}')

        def open_csv():
            # Fresh handle on the archive member for each parsing attempt.
            return zip_download.open(csv_file_name)
    else:
        # Not a zip archive: the downloaded bytes are the csv file itself.
        print(f'Attempting to read: {fn}')

        def open_csv():
            return io.BytesIO(payload)

    print('=============================================================================')

    metadata_df = pd.read_csv(open_csv(), encoding='windows-1254', skiprows=0)

    # A first header that starts with "#" means the file opens with one line of
    # additional information: show it, then parse again without that line.
    first_header = list(metadata_df.columns.values)[0]
    if first_header.startswith("#"):
        print("Addtional Information:", first_header[1:])
        metadata_df = pd.read_csv(open_csv(), encoding='windows-1254', skiprows=1)

    # Show every row and column for this display only, without changing the
    # notebook-wide pandas display options.
    with pd.option_context('display.max_columns', None, 'display.max_rows', None):
        display(metadata_df)
        
        

Functions loaded.


# 2. (Optional) Search using Dataset API

    Pros: The Dataset API can access more datasets within ESS-DIVE 

### Check if your sample identifier is in the metadata of any dataset in ESS-DIVE
Use the ESS-DIVE Dataset API to search for datasets of interest.

You can search for datasets using any of the following parameters:
- Dataset Creator (creator)
- Date Published (datePublished)
- Project Name (providerName)
- Any text (text)
- Keywords (keywords)
- Public datasets only (isPublic)

**See additional details for dataset search in the ESS-DIVE package API technical documentation:** https://api.ess-dive.lbl.gov/#/Data%20Package/listPackages.

Use the [ESS-DIVE's project list](https://docs.google.com/spreadsheets/d/179SOyv42wXbP4owWZtUg3RqhW9dPOyENYcVYuUCcqwg/edit?usp=sharing) to find the options for project names.

In [64]:
# The default search uses the wildcard term "*IEWFS0002" (assigned at the bottom of this cell).
# To search for something else, change the value of `text` to your desired search term.

# For an exact match, wrap the string in escaped quotes, e.g. "\"10.15485/1729719\"" is an exact match,
# while "10.15485/1729719" is any match.




# More search examples
#===========================================================
#text = "*IEWDR00OJ" # * will perform a wildcard search
#text = "*IEWFS0002"
#text = "*1729719"
#text = "IGSN"
#text = "\"W20\"" #(Site ID)

text= "*IEWFS0002"

In [14]:
# To search with any of these parameters, replace None with a value.
# Leave None assigned to every parameter you do not want to search with.
keywords = None
providerName = None
creator = None
datePublished = None  # "<[YYYY TO YYYY-MM-DD]>" # Not the same as data coverage
# ==========================================================
rowStart = 1
pageSize = 100

# Build the search URL from the values above and the `text` set in the previous cell.
get_packages_response = construct_query(
    essdive_api_url,
    text=text,
    keywords=keywords,
    providerName=providerName,
    creator=creator,
    datePublished=datePublished,
    rowStart=rowStart,
    pageSize=pageSize,
)

# Run the search; the results are listed only when it succeeded.
go = search_datasets(False)
if go:
    view_search_results()

Success! Continue to look at the search results
Datasets found: 2
________________
Index: 0
Soil Nitrogen, Water Content, Microbial Biomass, and Archaeal, Bacterial and Fungal Communities from the East River Watershed, Colorado collected in 2016-2017
https://data.ess-dive.lbl.gov/view/doi:10.15485/1577267
2019
________________
Index: 1
Sample Collection Metadata for Soil Cores from the East River Watershed, Colorado collected in 2017
https://data.ess-dive.lbl.gov/view/doi:10.21952/WTR/1573029
2019


# 3. Search using the Deep-Dive API 

### Put a Sample Identifier (IGSN/Sample ID/Site ID) of interest in the Field Value box

In [26]:
# Field name search examples: method, latitude, sample, stream
# Text box for searching by field (column header) name; it is created here but not displayed by this cell.
field_name = widgets.Text(
    value='',
    description='Field Name:',
    disabled=False
)
# Text box for searching by dataset DOI; it is created here but not displayed by this cell.
doi = widgets.Text(
    value='',
    description='DOI:',
    disabled=False
)
# Text box for searching by field value, e.g. a sample identifier.
field_value_text = widgets.Text(
    value='',
    description='Field Value Text',
    disabled=False
)

# Only the field value box is shown by this cell.
display(field_value_text)

# Example searches

# Field Value Text = S22RR (Site ID)
# Field Value Text = S19S_0037 (Sample ID)
# Field Value Text = 10.58052/IEPRS00CY (IGSN)
# Field Value Text = 10.58052/IEWDR01TV (IGSN)
# doi = doi:10.15485/2246724
# doi = doi:10.15485/2204421

# NOTE: Once your search term is entered, do not rerun this cell or your inputs will disappear.

Text(value='', description='Field Value Text')

### If a dataset in the results above says "In the DeepDive!", you can search for it here. Make sure `display(doi)` in the cell below is uncommented (no '#' in front of it), then run the cell and the DOI widget will appear. <br>
(Note) When searching by doi follow this format: doi:10.15485/1824222

In [27]:
#=================================
# Uncomment the lines below if you'd like to search by different parameters

#display(field_name) # Search by header names
# Show the DOI text box.
display(doi)

# Example DOI: doi:10.15485/2204421 (CSV-only dataset, no zip)

Text(value='', description='DOI:')

## Run this code to see the results of your Deep Dive search
This cell is displaying all the datafiles in which your search term appears.

### Use the display button to quickly look within files and check if data is relevant. 
- Change the index to match the datafile you want to look inside of. 
- If too many files are being displayed rerun this block of code to remove them.

In [82]:
# Query the DeepDive API with whichever search boxes were filled in.
# Case-insensitive search
params = {'rowStart': 1, 'pageSize': 25}
if doi.value:
    params['doi'] = doi.value
    print("Searching for: ", doi.value)
if field_name.value:
    params['fieldName'] = field_name.value
    print("Searching for: ", field_name.value)
if field_value_text.value:
    params['fieldValueText'] = field_value_text.value
    print("Searching for: ", field_value_text.value)

query_string = urlencode(params)
# Uncomment to see query string
#print(query_string)

r = requests.get(f"https://fusion.ess-dive.lbl.gov/api/v1/deepdive?{query_string}")
if r.status_code == 200:
    # Look at search results.
    # The JSON text is wrapped in StringIO because passing a literal JSON string
    # to pandas.read_json is deprecated.
    results = r.json()['results']
    df = pd.read_json(io.StringIO(json.dumps(results)))

    # The button and index box are created only once `df` holds this search's
    # results, because the click handler reads `df` and `index_widget`.
    index_widget = widgets.IntText(value=0, description='Index:', disabled=False)
    button = widgets.Button(description="Display File")
    button.on_click(on_button_clicked)
    # Show the button and the index box next to each other.
    display(widgets.HBox([button, index_widget]))

    display(df)

else:
    print("ERROR")
    print(r.text)


Searching for:  doi:10.15485/2204421


HBox(children=(Button(description='Display File', style=ButtonStyle()), IntText(value=0, description='Index:')…

Unnamed: 0,field_name,unit,definition,data_type,total_record_count,missing_values_count,values_summary,doi,version,data_file,data_file_url
0,Latitude,decimal_degrees_WGS84,Latitude of sensor measurement.,,4,0,{'unique': ['-9999']},doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,InstallationMethods.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
1,Longitude,decimal_degrees_WGS84,Longitude of sensor measurement.,,4,0,{'unique': ['-9999']},doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,InstallationMethods.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
2,InstallationMethod_ID,,Unique user-determined ID for a method listed ...,,4,0,"{'unique': ['Tow_01', 'Tow_02', 'Tow_03', 'Tow...",doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,InstallationMethods.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
3,InstallationMethod_Description,,Free text field to describe the sensor install...,,4,0,{'unique': ['Sensor towed from boat along the ...,doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,InstallationMethods.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
4,DateTime,YYYY-MM-DD hh:mm:ss,Date and time of measurement. UTC offset is re...,,46138,0,,doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,February2021_Data.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
5,Latitude,decimal_degrees_WGS84,Latitude of sensor measurement.,,46138,0,,doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,February2021_Data.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
6,Longitude,decimal_degrees_WGS84,Longitude of sensor measurement.,,46138,0,"{'unique': ['-119.3426', '-119.2598', '-119.35...",doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,February2021_Data.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
7,Deep_Water_Temperature,degrees_celsius,Temperature of water just above riverbed.,,46138,0,"{'unique': ['2.63617', '2.64566', '2.6443', '2...",doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,February2021_Data.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
8,Surface_Water_Temperature,degrees_celsius,Temperature of water just below the river surf...,,46138,0,"{'unique': ['2.88638', '2.91352', '2.89453', '...",doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,February2021_Data.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...
9,Deep_Electrical_Conductivity,microsiemens_per_cm,Electrical conductivity of water just above th...,,46138,0,,doi:10.15485/2204421,ess-dive-d118621673376b6-20231031T065920243,February2021_Data.csv,https://fusion.ess-dive.lbl.gov/api/v1/deepdiv...


# 3. List File Names that Include the Identifier you Searched For

In [66]:
# Dataset identifiers and file names of every DeepDive result row.
dataset_id = np.array(df['version'])
data_file_name = np.array(slice_series(df['data_file']))

# One [dataset identifier, file name] pair per distinct combination.
combined_array = remove_duplicates(combine_arrays(dataset_id, data_file_name))

# Column titles for the list printed below.
print("                  This is the file identifer:                        This is the filename:")

for i, item in enumerate(combined_array):
    # Print the list position followed by the [identifier, file name] pair.
    print(f"Index: {i}", item)
    

                  This is the file identifer:                        This is the filename:
Index: 0 ['ess-dive-d118621673376b6-20231031T065920243', 'InstallationMethods.csv']
Index: 1 ['ess-dive-d118621673376b6-20231031T065920243', 'February2021_Data.csv']
Index: 2 ['ess-dive-d118621673376b6-20231031T065920243', 'July2021_Data.csv']


### Get dataset title and file information

In [74]:
# Change the combined array index to match the dataset you're interested in
dataset_index = 1

#=======================
dataset_id = combined_array[dataset_index][0]
#dataset_id = 'ess-dive-2569191b32b447d-20230809T173212651'
# Find dataset identifier from search above or via Search Webpage
dataset_details_url = f'https://api.ess-dive.lbl.gov/packages/{dataset_id}'

dataset_detail = get_dataset_details(dataset_details_url)

# get_dataset_details returns None when the request fails; in that case there
# is nothing to list.
if dataset_detail is None:
    print(f'No details were retrieved for dataset {dataset_id}. Check your token and the dataset index, then rerun this cell.')
else:
    inspect_dataset_distribution(dataset_detail, 'all')

--- Acquired details for Riverbed and Near-Surface Water Quality Data, Hanford Reach, Columbia River, February 2021 - April 2022
Riverbed and Near-Surface Water Quality Data, Hanford Reach, Columbia River, February 2021 - April 2022
Index 0: April2022_HoleData.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-505312ec069ddd0-20231031T065654531
Index 1: February2021_Data.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-b66e5f0539585dd-20231031T065654543
Index 2: April2022_Data.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-387425f9b98db53-20231031T065654523
Index 3: InstallationMethods.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-8baedb632fa881d-20231031T065654549
Index 4: July2021_Data.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-cb03bbf00138398-20231

## View dataset CSV files 
### If no CSV file is found, move down to Section 3.5

In [75]:
# ==============================================
# Map each CSV file name in the selected dataset to its download URL.
# Falling back to an empty dict keeps the listing loop below a no-op when the
# helper reports nothing (including a None result), instead of raising right
# after the "No csv files" message has been printed.
csv_files = extract_csv_names_and_urls(dataset_detail) or {}

if not csv_files:
    print('No csv files. Try Zip File Option below.')

# csv_index[i] is the file name printed as "Index i"; the next cell uses it to
# turn a chosen index back into a file name.
csv_index = list(csv_files)
for idx, (fn, url) in enumerate(csv_files.items()):
    print(f'Index {idx}: {fn}\n{url}')


Index 0: April2022_HoleData.csv
https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-505312ec069ddd0-20231031T065654531
Index 1: February2021_Data.csv
https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-b66e5f0539585dd-20231031T065654543
Index 2: April2022_Data.csv
https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-387425f9b98db53-20231031T065654523
Index 3: InstallationMethods.csv
https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-8baedb632fa881d-20231031T065654549
Index 4: July2021_Data.csv
https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-cb03bbf00138398-20231031T065654559
Index 5: FLMD.csv
https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-0676459a454fc11-20231031T065654546
Index 6: dd.csv
https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-9b114a25948bdcb-20231031T065654539


### Select and load the sample metadata csv file

In [80]:
# Index (from the CSV listing printed by the previous cell) of the file to load.
metadata_file_idx = 5
# ==================================
# Translate the chosen index into a file name, then into its download URL.
fn = csv_index[metadata_file_idx]
fn_url = csv_files.get(fn)

if not fn_url:
    print('Something is amiss! Could not find file_url. Try again.')
else:
    try:
        headers, metadata_df = make_pandas_df(fn_url, print_headers=False)
        # The Download cell branches on this flag. Reset it here so a plain-CSV
        # load is never downloaded through the zip path because of a stale True
        # left by the zipped-CSV cell (or fails because the flag was never set).
        is_csv_zipped = False
        print(f'{fn} was loaded as a pandas dataframe.')
        display(metadata_df)
    except Exception as e:
        # Best-effort cell: report the problem and let the user pick another file.
        print(f'Error while attempted to read the {fn_url} into a pandas dataframe. Try again.\nError: {e}')


FLMD.csv was loaded as a pandas dataframe.


Unnamed: 0,FLMD.csv,File-level metadata associated with dataset.
0,dd.csv,Data dictionary defining column headers and ot...
1,InstallationMethods.csv,Description of methods associated with install...
2,February2021_Data.csv,This file contains all data collected during t...
3,July2021_Data.csv,This file contains all data collected during t...
4,April2022_Data.csv,This file contains the longitudinal profiles c...
5,April2022_HoleData.csv,This file contains the additional data collect...


# 3.5 Zip File
### The file you're interested in might be inside a zip file. Run the cell below to list the dataset's files.

In [403]:
# Run this cell when the sample metadata CSV was not found among the plain CSV files.

# ===================================
# Print the dataset's files, then gather their names in distribution order so a
# file name can later be converted to its position with file_names.index(...).
inspect_dataset_distribution(dataset_detail, 'all')
dist = dataset_detail.get('distribution')
file_names = [entry.get('name') for entry in dist]

Riverbed and Near-Surface Water Quality Data, Hanford Reach, Columbia River, February 2021 - April 2022
Index 0: InstallationMethods.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-8baedb632fa881d-20231031T065654549
Index 1: dd.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-9b114a25948bdcb-20231031T065654539
Index 2: February2021_Data.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-b66e5f0539585dd-20231031T065654543
Index 3: April2022_HoleData.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-505312ec069ddd0-20231031T065654531
Index 4: April2022_Data.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-387425f9b98db53-20231031T065654523
Index 5: July2021_Data.csv
  encoding: text/csv
  url: https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/object/ess-dive-cb03bbf0

### Select zip file to inspect

In [404]:
# Locate the zip file that holds the data file selected via `dataset_index`.
# The position is looked up automatically from the 'data_file' value, whose part
# before the first slash is matched against the dataset's file names.
# NOTE(review): `dataset_index` is chosen against `combined_array` but is used
# here to index `df['data_file']` directly - confirm both use the same indexing.
data_file_path = df['data_file'][dataset_index]
zip_name = get_string_before_first_slash(data_file_path)

try:
    file_index = file_names.index(zip_name)
except ValueError:
    # The original handler referenced an undefined name (`index`) and raised
    # NameError instead of printing this message.
    file_index = None
    print(f"'{zip_name}' is not in the datafile list.")
    print("There was an error, move to the CSV section.")

# Position of the zip file in the dataset distribution; read by the Download cell.
zip_file_idx = file_index
# ===================================
# Only inspect the archive when the lookup succeeded; passing None on to the
# helper would just fail after the guidance above has already been printed.
if zip_file_idx is not None:
    fn, zip_download = inspect_zip_file_contents(dataset_detail, zip_file_idx)

    print('===================================   ')
    print("Desired filename:", get_string_after_first_slash(data_file_path))

InstallationMethods.csv is not encoded as a zip file. Please select a different file.


BadZipFile: File is not a zip file

# 4. Display File Contents in a pandas DataFrame

In [383]:
# Run this cell when the CSV file sits inside a zip file.

# Find the position of the wanted file among the zip's members. The member name
# is the part of the 'data_file' value after its first slash.
zip_members = zip_download.namelist()
wanted_member = get_string_after_first_slash(df['data_file'][dataset_index])
csv_file_index = zip_members.index(wanted_member)

# Number of rows to skip when reading; adjust if needed.
# The Sample ID and Metadata RF specifies 1 header row.
header_rows = 0

### Run this cell to see the contents of the data file in a DataFrame.

In [384]:
# Lift pandas' display limits so no column or row of the table is elided.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# ===================================
# Resolve the zip member selected by csv_file_index in the previous cell.
csv_file_name = zip_download.namelist()[csv_file_index]
print(f'Attempting to read: {csv_file_name} from zip file {fn}')
print('=============================================================================')

# Identifier columns of interest: IGSN, Sample_ID, Sample_Name, Location_ID, Site_ID

metadata_df = read_zipped_csv(zip_download, csv_file_name, header_rows)

if metadata_df is None:
    print('ERROR: Sample metadata file was not successfully loaded.')
else:
    # Flag read by the Download cell to select the zip-member download path.
    is_csv_zipped = True
    # Column labels, used by the column-selection cell to name chosen indices.
    headers = list(metadata_df.columns)
    display(metadata_df)


Attempting to read: v2_RCSFA_Geospatial_Site_Information.csv from zip file 6_SupplementalData.zip


Unnamed: 0,Site_ID,Latitude,Longitude,IGSN,Description,Physiographic_Feature_Name,COMID,RawCOMID_Used,Country,Methods_and_Flags,Submission_Contact_Name,Submission_Contact_Email
0,BLU-BLU,44.1623,-122.332,10.58052/IEWDR00WM,stream,Blue River,23773405,True,United States,COMID_01,James Stegen,james.stegen@pnnl.gov
1,BLU-TID,44.2179,-122.265,10.58052/IEWDR00WN,stream,Blue River below Tidbits Creek,23773429,True,United States,COMID_02,James Stegen,james.stegen@pnnl.gov
2,BOU1,45.6449,-73.4774,10.58052/IEWDR01BX,stream,St. Lawrence River,-9999,,Canada,COMID_08,James Stegen,james.stegen@pnnl.gov
3,BOU2,45.6462,-73.4637,10.58052/IEWDR01BW,stream,St. Lawrence River,-9999,,Canada,COMID_08,James Stegen,james.stegen@pnnl.gov
4,BOU3,45.645,-73.4529,10.58052/IEWDR01BV,stream,St. Lawrence River,-9999,,Canada,COMID_08,James Stegen,james.stegen@pnnl.gov
5,BOU4,45.5685,-73.5079,10.58052/IEWDR01BO,stream,St. Lawrence River,-9999,,Canada,COMID_08,James Stegen,james.stegen@pnnl.gov
6,BR08,44.6006,-75.6606,10.58052/IEWDR01AQ,stream,St. Lawrence River,-9999,,Canada,COMID_08,James Stegen,james.stegen@pnnl.gov
7,BR09,44.63438,-75.61223,10.58052/IEWDR01AO,stream,St. Lawrence River,-9999,,Canada,COMID_08,James Stegen,james.stegen@pnnl.gov
8,BR11,44.79,-75.3714,10.58052/IEWDR01AX,stream,St. Lawrence River,-9999,,Canada,COMID_08,James Stegen,james.stegen@pnnl.gov
9,BR12,44.8463,-75.2996,10.58052/IEWDR01AZ,stream,St. Lawrence River,-9999,,Canada,COMID_08,James Stegen,james.stegen@pnnl.gov


### Select specific columns to display

In [385]:
# Positions of the columns to show, taken from the table displayed above.
metadata_columns_idxs = [0, 1, 2, 3]

# ===================================

# Show only the chosen columns (all rows), selected by position.
selected_columns = metadata_df.iloc[:, metadata_columns_idxs]
display(selected_columns)
print('==============================')
# Echo each chosen position next to its column label.
for position in metadata_columns_idxs:
    column_label = headers[position]
    print(f'Index {position} --- {column_label}')

Unnamed: 0,Site_ID,Latitude,Longitude,IGSN
0,BLU-BLU,44.1623,-122.332,10.58052/IEWDR00WM
1,BLU-TID,44.2179,-122.265,10.58052/IEWDR00WN
2,BOU1,45.6449,-73.4774,10.58052/IEWDR01BX
3,BOU2,45.6462,-73.4637,10.58052/IEWDR01BW
4,BOU3,45.645,-73.4529,10.58052/IEWDR01BV
5,BOU4,45.5685,-73.5079,10.58052/IEWDR01BO
6,BR08,44.6006,-75.6606,10.58052/IEWDR01AQ
7,BR09,44.63438,-75.61223,10.58052/IEWDR01AO
8,BR11,44.79,-75.3714,10.58052/IEWDR01AX
9,BR12,44.8463,-75.2996,10.58052/IEWDR01AZ


Index 0 --- Site_ID
Index 1 --- Latitude
Index 2 --- Longitude
Index 3 --- IGSN


# 5. Download
<b>Run steps 3 and 4 before downloading.</b>

Download files that are interesting

Look at the data download `encodingFormat` to see if it is a zip (application/zip) file or a csv (text/csv)

In [389]:


# ===================================
# Download the file that was loaded in the earlier steps.
# `is_csv_zipped` is set by the cell that loaded the file; it decides whether the
# plain CSV is fetched directly or the zip file containing it is fetched.
if not is_csv_zipped:
    # Plain CSV: find the position of the chosen file name in the dataset's
    # full distribution list.
    fn = csv_index[metadata_file_idx]
    all_file_idx = None

    for idx, file_info in enumerate(dataset_detail.get('distribution')):
        if file_info.get('name') == fn:
            all_file_idx = idx
            break
    # Compare against None explicitly: a match at position 0 is a valid result,
    # but a plain truthiness test would report it as "not found".
    if all_file_idx is not None:
        ds_doi = download_selected_files(dataset_detail, [all_file_idx], local_dir)
    else:
        print('Could not find requested file.')
else:
    # Zipped CSV: pass the zip's position plus the already-opened archive and the
    # member name chosen in the zip cells.
    ds_doi = download_selected_files(dataset_detail, [zip_file_idx], local_dir, is_csv_zipped=is_csv_zipped,
                                     zip_download=zip_download, zip_member_fn=csv_file_name)


Saving files in /Users/YLH/ESS-DIVE/essdive-tutorials/downloads
-------------------------------------
--- 6_SupplementalData.zip downloaded
-------------------------------------
Remember to cite these files! Dataset DOI doi:10.15485/1999774


End of Tutorial