In [1]:
from erddapy import ERDDAP
import netCDF4 as nc
import pandas as pd
from connec_functions import GDB
from collections import defaultdict

from accessibility import check_endpoint
from summarize import * #json_keys, data_frame

### EMSO ERIC ERDDAP server

In [2]:
# analysed endpoint:
endpoint_url = "https://erddap.emso.eu/erddap"

#make connection
emsoERDDAP = ERDDAP(server=endpoint_url, protocol="tabledap")

In [None]:
if check_endpoint(endpoint_url):
    print("The endpoint is machine-accessible.")
else:
    print("The endpoint is not machine-accessible.")

### Analysis results:

- datasets offered by the ERDDAP server directly return data  
(compared to other ERDDAP server where you need an additional step to retrieve actual files)
- requires knowledge on netCDF files 
- content of netCDF files:
    - in some cases, there is use of OrcID and urls for creator  
        --> good but inconsistent, incomplete (e.g. ROR-id for institutes) 
        --> why not for contributors, ...?  
- allDatasets dataset is a dataset that contains metadata about all the datasets offered by the ERDDAP server, the properties (~ i.e. columns) with which this metadata is described is the same between ERDDAP servers (cf. comparing to EMSO-ERIC ERDDAP server)  

**Exploration of the allDatasets dataset**  
~ retrieving a dataset that lists all available datasets on the ERDDAP server. The returned DataFrame will contain metadata for each dataset available on that server  
~ essentially a catalog of all datasets hosted on the server, including essential metadata that allows you to identify and filter the datasets of interest  

In [3]:
# set dataset ID
emsoERDDAP.dataset_id = "allDatasets"

#Get data (as 2D dataframe)
EMSO_alldatasets_df = emsoERDDAP.to_pandas()

In [None]:
#explore columns
EMSO_alldatasets_df.columns

In [None]:
# view
EMSO_alldatasets_df
# save
EMSO_alldatasets_df.to_csv("properties/EMSO_ERDDAP_overview_metadata.csv", index=False)

In [None]:
print(f"There are {len(EMSO_alldatasets_df['datasetID'].drop_duplicates())} datasets offered by the EMSO ERDDAP server")

In [None]:
#See if other metadata listed in allDatasets dataset
for line in EMSO_alldatasets_df.summary:
    print(line)

Exploration of **search information** for each data offered through the ERDDAP server

In [None]:
# get search information
searchinfo_df = pd.read_csv(emsoERDDAP.get_search_url(response="csv"))

In [None]:
searchinfo_df.columns

In [None]:
searchinfo_df

Exploration of **metadata information** available for each dataset offered through the ERDDAP server

In [None]:
# Metadata information for each dataset
metadatainfo_df = pd.DataFrame(columns=["Row Type", "Variable Name", "Attribute Name", "Data Type", "Value", "DatasetID"])
# get metadata for each dataset
for datasetID in EMSO_alldatasets_df['datasetID']:
    emsoERDDAP.dataset_id = datasetID
    try:
        _df = pd.read_csv(emsoERDDAP.get_info_url(response="csv")) #metadata retrieved via info_url
        _df["DatasetID"] = datasetID
        metadatainfo_df = pd.concat([metadatainfo_df, _df], ignore_index=True)
    except Exception as e:
        print(f"there was an error for {datasetID}: '{e}'")
        continue

# save to csv file
metadatainfo_df.to_csv("properties/EMSO_ERDDAP_dataset_metadata.csv", index=False)

metadatainfo_df

In [None]:
# explore returned metadata
metadatainfo_df.groupby(['DatasetID', 'Row Type']).nunique()

In [None]:
# Explore the dimensions of each dataset (Row Type = 'dimension')
dim_info = metadatainfo_df[ metadatainfo_df['Row Type'] == 'dimension'].groupby(['DatasetID', 'Variable Name']).nunique()

dim_info.to_csv("properties/EMSO_ERDDAP_dataset_dimensions.csv", index=False)

dim_info

In [None]:
# Explore the variables of each dataset (Row Type = 'variable')
#note: in netCDFs, attributes represent additional information 
var_info = metadatainfo_df[ metadatainfo_df['Row Type'] == 'variable'].groupby(['DatasetID', 'Variable Name']).agg({
        'Data Type': 'unique',
        'Value': 'unique'
    }).reset_index()

var_info.to_csv("properties/EMSO_ERDDAP_dataset_variables.csv", index=False)

var_info

In [None]:
# Explore the attributes of each dataset (Row Type = 'attribute')
metadatainfo_df[ metadatainfo_df['Row Type'] == 'attribute'].groupby(['DatasetID', 'Attribute Name']).nunique()

In [None]:
# and the variables/dimensions with which they're associated 
attr_info = metadatainfo_df[ metadatainfo_df['Row Type'] == 'attribute'].groupby(['DatasetID', 'Attribute Name', 'Variable Name']).agg({
        'Data Type': 'unique',
        'Value': 'unique'
    }).reset_index()

attr_info.to_csv("properties/EMSO_ERDDAP_dataset_attributes.csv", index=False)

attr_info

Exploration of **data** from each dataset offered through the ERDDAP server, by accessing as pd.DataFrame  
(less efficient than accessing as ncCF)

In [4]:
#list datasets
print(f"there are {len(EMSO_alldatasets_df['datasetID'].drop_duplicates())} datasets available via EMSO ERDDAP")

# Connect to server 
erddap = ERDDAP(server="https://erddap.emso.eu/erddap", protocol="tabledap")

summary_full_df = pd.DataFrame()
#first half (otherwise code takes too long to run)
for datasetID in EMSO_alldatasets_df['datasetID'][:len(EMSO_alldatasets_df) // 2]:
    print(f"Processing dataset: {datasetID}")
    
    try:
        # Set the dataset ID
        erddap.dataset_id = datasetID
        
        # Fetch data as a 2D dataframe with timeout handling
        dataset_df = erddap.to_pandas()
        
        # Process data        
        summ_df = data_frame(dataset_df, datasetID)
        summary_full_df = pd.concat([summary_full_df, summ_df])
        
    except Exception as e:
        print(f"There was an error for {datasetID}: {e}")


summary_full_df
#Property ~ variables (attributes & dimensions not clear)

there are 171 datasets available via EMSO ERDDAP
Processing dataset: allDatasets
Processing dataset: EMSO_OBSEA_CTD_30min
Processing dataset: EMSO_OBSEA_Besos_Buoy_Airmar_200WX_30min
Processing dataset: EMSO_OBSEA_Besos_Buoy_Airmar_200WX_full
Processing dataset: E2M3A_METEO
Processing dataset: E2M3A_MRDT
Processing dataset: E2M3A_PCO2PROA
Processing dataset: E2M3A_PCO2PROW
Processing dataset: E2M3A_SAMI
Processing dataset: E2M3A_SBE16PLS
Processing dataset: E2M3A_SBE37O
Processing dataset: E2M3A_CTD_meteo_CO2_pH_NRT
Processing dataset: E2M3A_2021_2022_TS
Processing dataset: Emso_Azores_Chemini_IRON
Processing dataset: EMSO-AZORES_TCM3-1_2016-2017
Processing dataset: EMSO-AZORES_TCM3-1_2017-2018
Processing dataset: EMSO-AZORES_TCM3-1_2018-2019
Processing dataset: EMSO-AZORES_TCM3-1_2021-2022
Processing dataset: EMSO-AZORES_TCM3-2_2017-2018
Processing dataset: EMSO-AZORES_TCM3-2_2018-2019
Processing dataset: EMSO-AZORES_TCM3-2_2019-2020
Processing dataset: EMSO-AZORES_TCM3-2_2020-2021
Pr

Unnamed: 0,DatasetID,Property,Count,Types,Example,UniqueValues
0,allDatasets,datasetID,171,object,allDatasets,{}
1,allDatasets,accessible,171,object,public,{public}
2,allDatasets,institution,171,object,Axiom Docker Install,{}
3,allDatasets,dataStructure,171,object,table,{table}
4,allDatasets,cdm_data_type,171,object,Other,"{Other, Point, TimeSeries}"
...,...,...,...,...,...,...
3,EMSO-AZORES_EGIM_Turbidity_2017-2018,longitude (degrees_east),25469,float64,-32.27562,{-32.27562}
4,EMSO-AZORES_EGIM_Turbidity_2017-2018,depth (m),25469,float64,1700.0,{1700.0}
5,EMSO-AZORES_EGIM_Turbidity_2017-2018,DEPH_QC,25469,int64,7,{7}
6,EMSO-AZORES_EGIM_Turbidity_2017-2018,TUR4 (NTU),25469,float64,294.0,{}


In [5]:
# write first half to csv file
summary_full_df.to_csv("properties/EMSO_ERDDAP_dataset_data_metadata.csv", index=False)

In [6]:
#second half
for datasetID in EMSO_alldatasets_df['datasetID'][(len(EMSO_alldatasets_df) // 2):]:
    print(f"Processing dataset: {datasetID}")
    
    try:
        # Set the dataset ID
        erddap.dataset_id = datasetID
        
        # Fetch data as a 2D dataframe with timeout handling
        dataset_df = erddap.to_pandas()
        
        # Process data        
        summ_df = data_frame(dataset_df, datasetID)
        summary_full_df = pd.concat([summary_full_df, summ_df], ignore_index=True)
        #overwrite each time, try to get info from as much datasets as possible before error
        summary_full_df.to_csv("properties/EMSO_ERDDAP_dataset_data_metadata2.csv", index=False)
    except Exception as e:
        print(f"There was an error for {datasetID}: {e}")

summary_full_df
summary_full_df.to_csv("properties/EMSO_ERDDAP_dataset_data_metadata2.csv", index=False)
#note: doesn't list netCDF attributes

Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2011-2012
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2013-2014
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2014-2015
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2016-2017
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2017-2018
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2018-2019
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2019-2020
Processing dataset: EMSO-AZORES_Seamon-East_Optode-O2_2013-2014
Processing dataset: EMSO-AZORES_Seamon-East_Optode-O2_2015-2016
Processing dataset: EMSO-AZORES_Seamon-East_Optode-O2_2016-2017
Processing dataset: EMSO-AZORES_Seamon-East_Optode-O2_2012-2013
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2011-2012
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2012-2013
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2013-2014
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2015-2016
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2016-2017
Processing datase

: 

: 

Exploration of **data** from each dataset offered through the ERDDAP server, by accessing as ncCF  
(listing the global attributes, dimensions and variables)  

--> returns the same information as does metadata retrieved through the get_info_url  
(_df = pd.read_csv(emsoERDDAP.get_info_url(response="csv")))

Comparing the properties between ERDDAP servers (ARGO & EMSO ERDDAP servers)

~ similar properties but 100% same, which can lead to confusion & result in error when combining data