In [1]:
from erddapy import ERDDAP
import pandas as pd
import requests
import netCDF4 as nc
from collections import defaultdict

from accessibility import check_endpoint
from summarize import * #json_keys, data_frame

### ARGO ERDDAP server

In [2]:
# Base URL of the ERDDAP server analysed in this notebook
endpoint_url = "https://erddap.ifremer.fr/erddap"

# Client for that server, using the tabledap protocol
argoERDDAP = ERDDAP(
    server=endpoint_url,
    protocol="tabledap",
)

In [None]:
# Report whether the endpoint passed the accessibility check
is_accessible = check_endpoint(endpoint_url)
message = (
    "The endpoint is machine-accessible."
    if is_accessible
    else "The endpoint is not machine-accessible."
)
print(message)

### Analysis results:

- data is findable, but not always accessible
    - for some listed datasets, an additional step is needed to open the actual netCDF files
    (the documentation describes this as "OpenDAP on top of NetCDF files", but the extra step is still needed)
    - at the moment of running the code, some datasets were not accessible
    - the 'log in' column in the 'allDatasets' dataset indicates that not all data is freely available

- data granularity
    - some datasets (e.g. ArgoFloats) contain file paths in the 'file' column --> retrieval of the actual data requires additional steps
    - other datasets directly return measurement data
      

- semantics:
    - interoperability at a higher level --> the allDatasets dataset offers metadata about the datasets that are available through the ERDDAP server. The properties/columns used to describe this metadata are the same as in other ERDDAP servers (e.g. the EMSO-ERIC ERDDAP server).  
    - the other datasets offered by the ERDDAP server contain different properties --> the columns/properties are similar but are not defined through external standard terms
        - this makes it easier to combine different data incorrectly  



**Exploration of the allDatasets dataset**  
~ retrieving a dataset that lists all available datasets on the ERDDAP server. The returned DataFrame will contain metadata for each dataset available on that server  
~ essentially a catalog of all datasets hosted on the server, including essential metadata that allows you to identify and filter the datasets of interest  

In [3]:
# Point the client at the "allDatasets" dataset
argoERDDAP.dataset_id = "allDatasets"

# Download that dataset as a pandas DataFrame (one 2D table)
argoERDDAP_allDatasets_df = argoERDDAP.to_pandas()

In [None]:
# List the column labels of the allDatasets frame
argoERDDAP_allDatasets_df.columns

In [None]:
# Save the catalogue first ...
argoERDDAP_allDatasets_df.to_csv("properties/ARGO_ERDDAP_overview_metadata.csv", index=False)

# ... then view it. A notebook cell only renders its LAST expression, so the
# frame has to come after the save (to_csv returns None and displays nothing).
argoERDDAP_allDatasets_df

In [None]:
# The 'summary' column holds further free-text metadata (apparently intended for human reading)
argoERDDAP_allDatasets_df["summary"]

Exploration of **search information** for each dataset offered through the ERDDAP server

In [None]:
# Build the CSV search URL from the client, then load the response into a frame
search_csv_url = argoERDDAP.get_search_url(response="csv")
searchinfo_df = pd.read_csv(search_csv_url)

In [None]:
# List the column labels of the search-information frame
searchinfo_df.columns

In [None]:
# Display the search-information frame
searchinfo_df

Exploration of **metadata** available for each dataset offered through the ERDDAP server

In [None]:
# Metadata information for each dataset
# Column layout used when no dataset could be fetched at all
metadata_columns = ["Row Type", "Variable Name", "Attribute Name", "Data Type", "Value", "DatasetID"]

# Collect one frame per dataset and concatenate once at the end: concatenating
# onto an empty seed frame inside the loop is deprecated in recent pandas and
# re-copies all accumulated rows on every iteration.
metadata_frames = []
for datasetID in argoERDDAP_allDatasets_df['datasetID']:
    argoERDDAP.dataset_id = datasetID
    try:
        _df = pd.read_csv(argoERDDAP.get_info_url(response="csv")) #metadata retrieved via info_url
    except Exception as e:
        # best effort: report the failing dataset and move on to the next one
        print(f"there was an error for {datasetID}: '{e}'")
        continue
    # tag every metadata row with the dataset it belongs to
    _df["DatasetID"] = datasetID
    metadata_frames.append(_df)

if metadata_frames:
    metadatainfo_df = pd.concat(metadata_frames, ignore_index=True)
else:
    # nothing could be fetched: keep an empty frame with the expected columns
    metadatainfo_df = pd.DataFrame(columns=metadata_columns)

# save to csv file
metadatainfo_df.to_csv("properties/ARGO_ERDDAP_dataset_metadata.csv", index=False)

metadatainfo_df

In [None]:
# Per dataset and row type: number of distinct values in each remaining column
overview_keys = ['DatasetID', 'Row Type']
metadatainfo_df.groupby(overview_keys).nunique()

In [None]:
# Explore the dimensions of each dataset (Row Type = 'dimension')
dimension_rows = metadatainfo_df[metadatainfo_df['Row Type'] == 'dimension']
# distinct-value counts per (dataset, dimension); the group keys end up in the index
dim_info = dimension_rows.groupby(['DatasetID', 'Variable Name']).nunique()

# Keep the index when saving: it holds DatasetID and Variable Name. Writing with
# index=False would leave a CSV of counts with nothing to identify the rows.
dim_info.to_csv("properties/ARGO_ERDDAP_dataset_dimensions.csv", index=True)

dim_info

In [None]:
# Explore the variables of each dataset (Row Type = 'variable')
#note: in netCDFs, attributes represent additional information
variable_rows = metadatainfo_df[metadatainfo_df['Row Type'] == 'variable']
# distinct-value counts per (dataset, variable); the group keys end up in the index
var_info = variable_rows.groupby(['DatasetID', 'Variable Name']).nunique()

# Keep the index when saving: it holds DatasetID and Variable Name. Writing with
# index=False would leave a CSV of counts with nothing to identify the rows.
var_info.to_csv("properties/ARGO_ERDDAP_dataset_variables.csv", index=True)

var_info

In [None]:
# Attribute rows only (Row Type = 'attribute'), counted per dataset and attribute name
attribute_rows = metadatainfo_df.loc[metadatainfo_df['Row Type'] == 'attribute']
attribute_rows.groupby(['DatasetID', 'Attribute Name']).nunique()

In [None]:
# Attributes together with the variables/dimensions with which they're associated
attribute_rows = metadatainfo_df[metadatainfo_df['Row Type'] == 'attribute']
# distinct-value counts per (dataset, attribute, variable); the group keys end up in the index
attr_info = attribute_rows.groupby(['DatasetID', 'Attribute Name', 'Variable Name']).nunique()

# Keep the index when saving: it holds DatasetID, Attribute Name and Variable Name.
# Writing with index=False would leave a CSV of counts with nothing to identify the rows.
attr_info.to_csv("properties/ARGO_ERDDAP_dataset_attributes.csv", index=True)

attr_info

Exploration of **data** available for each dataset offered through the ERDDAP server, by accessing as pd.DataFrame  
(less efficient than accessing as ncCF)

In [4]:
# Report how many distinct dataset IDs the catalogue lists
print(f"there are {len(argoERDDAP_allDatasets_df['datasetID'].drop_duplicates())} datasets available via ARGO ERDDAP")

# Separate client for the data downloads, built from the same `endpoint_url`
# as the rest of the notebook instead of repeating the server address.
erddap = ERDDAP(server=endpoint_url, protocol="tabledap")

# Column layout used when no dataset could be summarised at all
summary_columns = ["DatasetID", "Property", "Count", "Types", "Example"]
# One summary frame per successfully processed dataset. Collected in a list so
# that nothing is ever concatenated onto an empty seed frame (deprecated in
# recent pandas).
summary_frames = []

#first half (otherwise code takes too long to run)
half_count = len(argoERDDAP_allDatasets_df) // 2
for datasetID in argoERDDAP_allDatasets_df['datasetID'].iloc[:half_count]:
    print(f"Processing dataset: {datasetID}")

    try:
        # Set the dataset ID
        erddap.dataset_id = datasetID

        # Fetch data as a 2D dataframe.
        # NOTE: no timeout is set explicitly in this cell; failures are only
        # handled by the except clause below.
        dataset_df = erddap.to_pandas()

        # Summarise the downloaded data with the project's `data_frame` helper
        summary_frames.append(data_frame(dataset_df, datasetID))

        # Checkpoint after every dataset so partial results survive an interrupted run
        pd.concat(summary_frames, ignore_index=True).to_csv("properties/ARGO_ERDDAP_dataset_data.csv", index=False)
    except Exception as e:
        # best effort: report the failing dataset and continue with the next one
        print(f"There was an error for {datasetID}: {e}")

if summary_frames:
    df_summary_full = pd.concat(summary_frames, ignore_index=True)
else:
    # nothing could be processed: keep an empty frame with the expected columns
    df_summary_full = pd.DataFrame(columns=summary_columns)

df_summary_full

#Property ~ variables (attributes & dimensions not clear)

there are 53 datasets available via ARGO ERDDAP
Processing dataset: allDatasets
Processing dataset: OACP-Argo-Global
There was an error for OACP-Argo-Global: Error {
    code=404;
    message="Not Found: Currently unknown datasetID=OACP-Argo-Global";
}

Processing dataset: ArgoFloats


Extra step needed to retrieve actual data