In [1]:
from erddapy import ERDDAP
import pandas as pd
import requests
import netCDF4 as nc
from collections import defaultdict

from accessibility import check_endpoint
from summarize import * #json_keys, data_frame

### ARGO ERDDAP server

In [2]:
# Base URL of the ERDDAP server analysed in this notebook
endpoint_url = "https://erddap.ifremer.fr/erddap"

# Client bound to that server, using the tabledap protocol
argoERDDAP = ERDDAP(
    server=endpoint_url,
    protocol="tabledap",
)

In [3]:
# Report whether the helper considers the endpoint reachable by machines
print(
    "The endpoint is machine-accessible."
    if check_endpoint(endpoint_url)
    else "The endpoint is not machine-accessible."
)

Checking endpoint: https://erddap.ifremer.fr/erddap
Endpoint is online: 200
Content type may not be machine-readable: text/html;charset=UTF-8
The endpoint is machine-accessible.


**Exploration of the allDatasets dataset**  
~ retrieving a dataset that lists all available datasets on the ERDDAP server. The returned DataFrame will contain metadata for each dataset available on that server  
~ essentially a catalog of all datasets hosted on the server, including essential metadata that allows you to identify and filter the datasets of interest  

In [4]:
# Select the "allDatasets" catalogue dataset on the shared client
argoERDDAP.dataset_id = "allDatasets"

# Download it as a 2D DataFrame; later cells iterate over its 'datasetID' column
argoERDDAP_allDatasets_df = argoERDDAP.to_pandas()

In [5]:
# Explore columns: list the metadata fields the catalogue provides per dataset
argoERDDAP_allDatasets_df.columns

Index(['datasetID', 'accessible', 'institution', 'dataStructure',
       'cdm_data_type', 'class', 'title', 'minLongitude (degrees_east)',
       'maxLongitude (degrees_east)', 'longitudeSpacing (degrees_east)',
       'minLatitude (degrees_north)', 'maxLatitude (degrees_north)',
       'latitudeSpacing (degrees_north)', 'minAltitude (m)', 'maxAltitude (m)',
       'minTime (UTC)', 'maxTime (UTC)', 'timeSpacing (seconds)', 'griddap',
       'subset', 'tabledap', 'MakeAGraph', 'sos', 'wcs', 'wms', 'files',
       'fgdc', 'iso19115', 'metadata', 'sourceUrl', 'infoUrl', 'rss', 'email',
       'testOutOfDate', 'outOfDate', 'summary'],
      dtype='object')

In [9]:
# Save the catalogue as CSV, without the DataFrame index.
# NOTE(review): nothing visible in this notebook creates the "properties/"
# directory - confirm it exists before running.
argoERDDAP_allDatasets_df.to_csv("properties/ARGO_ERDDAP_overview_metadata.csv", index=False)
# View the catalogue as the cell output
argoERDDAP_allDatasets_df

Unnamed: 0,datasetID,accessible,institution,dataStructure,cdm_data_type,class,title,minLongitude (degrees_east),maxLongitude (degrees_east),longitudeSpacing (degrees_east),...,fgdc,iso19115,metadata,sourceUrl,infoUrl,rss,email,testOutOfDate,outOfDate,summary
0,allDatasets,public,Ifremer,table,Other,EDDTableFromAllDatasets,* The List of All Active Datasets in this ERDD...,,,,...,,,https://erddap.ifremer.fr/erddap/info/allDatas...,https://erddap.ifremer.fr/erddap,https://erddap.ifremer.fr/erddap,https://erddap.ifremer.fr/erddap/rss/allDatase...,https://erddap.ifremer.fr/erddap/subscriptions...,,,This dataset is a table which has a row of inf...
1,OACP-Argo-Global,public,LOPS/Ifremer,grid,Grid,EDDGridFromNcFiles,2000-2015 climatology of the Subtropical Mode ...,-242.156,129.844,1.0,...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/OACP-Arg...,(local files),https://doi.org/10.17882/56503,https://erddap.ifremer.fr/erddap/rss/OACP-Argo...,https://erddap.ifremer.fr/erddap/subscriptions...,,,Maps of properties from OAC-P estimates. Therm...
2,ArgoFloats,public,Argo,table,TrajectoryProfile,EDDTableFromMultidimNcFiles,Argo Float Measurements,-179.99942,180.0,,...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/ArgoFloa...,(local files),https://argo.ucsd.edu/,https://erddap.ifremer.fr/erddap/rss/ArgoFloat...,https://erddap.ifremer.fr/erddap/subscriptions...,,,Argo float vertical profiles from Coriolis Glo...
3,ArgoFloats-synthetic-BGC,public,Argo,table,TrajectoryProfile,EDDTableFromMultidimNcFiles,Argo float synthetic vertical profiles : BGC data,-999.999,181.706,,...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/ArgoFloa...,(local files),http://www.argodatamgt.org/Documentation,https://erddap.ifremer.fr/erddap/rss/ArgoFloat...,https://erddap.ifremer.fr/erddap/subscriptions...,now-5days,1.198491,Argo float synthetic vertical profiles : BGC data
4,ArgoFloats-reference,public,Argo,table,Other,EDDTableFromMultidimNcFiles,Argo Reference Measurements,-180.0,179.999487,,...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/ArgoFloa...,(local files),http://www.argodatamgt.org/DMQC/Reference-data...,https://erddap.ifremer.fr/erddap/rss/ArgoFloat...,https://erddap.ifremer.fr/erddap/subscriptions...,,,Argo float vertical profiles to be used in DMQ...
5,ArgoFloats-index,public,Argo,table,Other,EDDTableFromAsciiFiles,ArgoFloats index,-999.999,346.813,,...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/ArgoFloa...,ftp://ftp.ifremer.fr/ifremer/argo/ar_index_glo...,http://www.argodatamgt.org/DMQC/Reference-data...,https://erddap.ifremer.fr/erddap/rss/ArgoFloat...,https://erddap.ifremer.fr/erddap/subscriptions...,,,Argo detailed index. Gathers data available at...
6,ArgoFloats-reference-CTD,log in,Argo,table,Other,EDDTableFromMultidimNcFiles,CTD Reference Measurements,,,,...,,,,,,,,,,"Conductivity, Temperature, Depth (CTD) Referen..."
7,SST_Anomalies_Caledonie,public,Jet Propulsion Laboratory,grid,Grid,EDDGridFromNcFiles,"Daily MUR SST, Final product",155.01,175.01,0.01,...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/SST_Anom...,(local files),https://podaac.jpl.nasa.gov/ws/metadata/datase...,https://erddap.ifremer.fr/erddap/rss/SST_Anoma...,https://erddap.ifremer.fr/erddap/subscriptions...,now-109days,9.027284,"A merged, multi-sensor L4 Foundation Sea Surfa..."
8,OS_DYFAMED_1994-2014_D_TSO2,public,IMEV Villefranche-sur-mer,grid,Profile,EDDGridFromNcFiles,EMSO Ligure DYFAMED Time Series from 1994 to 2...,,,,...,,,https://erddap.ifremer.fr/erddap/info/OS_DYFAM...,(local files),???,https://erddap.ifremer.fr/erddap/rss/OS_DYFAME...,https://erddap.ifremer.fr/erddap/subscriptions...,,,EMSO Ligure DYFAMED Time Series from 1994 to 2...
9,drifter_hourly_qc,public,NOAA Atlantic Oceanographic and Meteorological...,table,Trajectory,EDDTableFromErddap,Global Drifter Program - 1 Hour Interpolated Q...,-180.0,180.0,,...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/drifter_...,(local files),https://www.aoml.noaa.gov/phod/dac/dirall.html,https://erddap.ifremer.fr/erddap/rss/drifter_h...,https://erddap.ifremer.fr/erddap/subscriptions...,,,Global Drifter Program hourly drifting buoy co...


In [12]:
# The 'summary' column holds free-text descriptions (written metadata that
# appears to be intended for human readers rather than machines)
argoERDDAP_allDatasets_df["summary"]

0     This dataset is a table which has a row of inf...
1     Maps of properties from OAC-P estimates. Therm...
2     Argo float vertical profiles from Coriolis Glo...
3     Argo float synthetic vertical profiles : BGC data
4     Argo float vertical profiles to be used in DMQ...
5     Argo detailed index. Gathers data available at...
6     Conductivity, Temperature, Depth (CTD) Referen...
7     A merged, multi-sensor L4 Foundation Sea Surfa...
8     EMSO Ligure DYFAMED Time Series from 1994 to 2...
9     Global Drifter Program hourly drifting buoy co...
10    Global Drifter Program 6-hourly drifting buoy ...
11    The data set consists of maps of total velocit...
12    Surface ocean velocities estimated from High F...
13    Global Ocean - In Situ Observation Copernicus....
14      This file is the raw output of Ariane software.
15    This file contains time series of simulated pa...
16    This file contains time series of 2D-histogram...
17    This file contains trajectories of simulat

Exploration of **search information** for each dataset offered through the ERDDAP server

In [13]:
# Build the CSV search URL from the client, then load the result into a DataFrame
search_csv_url = argoERDDAP.get_search_url(response="csv")
searchinfo_df = pd.read_csv(search_csv_url)

In [14]:
# List the fields returned by the search endpoint
searchinfo_df.columns

Index(['griddap', 'Subset', 'tabledap', 'Make A Graph', 'wms', 'files',
       'Accessible', 'Title', 'Summary', 'FGDC', 'ISO 19115', 'Info',
       'Background Info', 'RSS', 'Email', 'Institution', 'Dataset ID'],
      dtype='object')

In [15]:
# Display the search results table as the cell output
searchinfo_df

Unnamed: 0,griddap,Subset,tabledap,Make A Graph,wms,files,Accessible,Title,Summary,FGDC,ISO 19115,Info,Background Info,RSS,Email,Institution,Dataset ID
0,,https://erddap.ifremer.fr/erddap/tabledap/allD...,https://erddap.ifremer.fr/erddap/tabledap/allD...,https://erddap.ifremer.fr/erddap/tabledap/allD...,,,public,* The List of All Active Datasets in this ERDD...,This dataset is a table which has a row of inf...,,,https://erddap.ifremer.fr/erddap/info/allDatas...,https://erddap.ifremer.fr/erddap,,,Ifremer,allDatasets
1,,,https://erddap.ifremer.fr/erddap/tabledap/Argo...,https://erddap.ifremer.fr/erddap/tabledap/Argo...,,,public,Argo Float Measurements,Argo float vertical profiles from Coriolis Glo...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/ArgoFloa...,https://argo.ucsd.edu/,https://erddap.ifremer.fr/erddap/rss/ArgoFloat...,https://erddap.ifremer.fr/erddap/subscriptions...,Argo,ArgoFloats
2,,https://erddap.ifremer.fr/erddap/tabledap/Argo...,https://erddap.ifremer.fr/erddap/tabledap/Argo...,https://erddap.ifremer.fr/erddap/tabledap/Argo...,,,public,Argo float synthetic vertical profiles : BGC data,Argo float synthetic vertical profiles : BGC d...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/ArgoFloa...,http://www.argodatamgt.org/Documentation,https://erddap.ifremer.fr/erddap/rss/ArgoFloat...,https://erddap.ifremer.fr/erddap/subscriptions...,Argo,ArgoFloats-synthetic-BGC
3,,,https://erddap.ifremer.fr/erddap/tabledap/Argo...,https://erddap.ifremer.fr/erddap/tabledap/Argo...,,,public,Argo Reference Measurements,Argo float vertical profiles to be used in DMQ...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/ArgoFloa...,http://www.argodatamgt.org/DMQC/Reference-data...,https://erddap.ifremer.fr/erddap/rss/ArgoFloat...,https://erddap.ifremer.fr/erddap/subscriptions...,Argo,ArgoFloats-reference
4,,https://erddap.ifremer.fr/erddap/tabledap/Argo...,https://erddap.ifremer.fr/erddap/tabledap/Argo...,https://erddap.ifremer.fr/erddap/tabledap/Argo...,,,public,ArgoFloats index,Argo detailed index. Gathers data available at...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/ArgoFloa...,http://www.argodatamgt.org/DMQC/Reference-data...,https://erddap.ifremer.fr/erddap/rss/ArgoFloat...,https://erddap.ifremer.fr/erddap/subscriptions...,Argo,ArgoFloats-index
5,,,,,,,log in,CTD Reference Measurements,"Conductivity, Temperature, Depth (CTD) Referen...",,,,,,,Argo,ArgoFloats-reference-CTD
6,,https://erddap.ifremer.fr/erddap/tabledap/drif...,https://erddap.ifremer.fr/erddap/tabledap/drif...,https://erddap.ifremer.fr/erddap/tabledap/drif...,,https://erddap.ifremer.fr/erddap/files/drifter...,public,Global Drifter Program - 1 Hour Interpolated Q...,Global Drifter Program hourly drifting buoy co...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/drifter_...,https://www.aoml.noaa.gov/phod/dac/dirall.html,https://erddap.ifremer.fr/erddap/rss/drifter_h...,https://erddap.ifremer.fr/erddap/subscriptions...,NOAA Atlantic Oceanographic and Meteorological...,drifter_hourly_qc
7,,https://erddap.ifremer.fr/erddap/tabledap/drif...,https://erddap.ifremer.fr/erddap/tabledap/drif...,https://erddap.ifremer.fr/erddap/tabledap/drif...,,https://erddap.ifremer.fr/erddap/files/drifter...,public,Global Drifter Program - 6 Hour Interpolated Q...,Global Drifter Program 6-hourly drifting buoy ...,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/drifter_...,https://www.aoml.noaa.gov/phod/dac/dirall.html,https://erddap.ifremer.fr/erddap/rss/drifter_6...,https://erddap.ifremer.fr/erddap/subscriptions...,NOAA Atlantic Oceanographic and Meteorological...,drifter_6hour_qc
8,,https://erddap.ifremer.fr/erddap/tabledap/cope...,https://erddap.ifremer.fr/erddap/tabledap/cope...,https://erddap.ifremer.fr/erddap/tabledap/cope...,,https://erddap.ifremer.fr/erddap/files/coperni...,public,"Global Ocean, In Situ Observation Copernicus (...",Global Ocean - In Situ Observation Copernicus....,https://erddap.ifremer.fr/erddap/metadata/fgdc...,https://erddap.ifremer.fr/erddap/metadata/iso1...,https://erddap.ifremer.fr/erddap/info/copernic...,https://archimer.ifremer.fr/doc/00024/13500,https://erddap.ifremer.fr/erddap/rss/copernicu...,https://erddap.ifremer.fr/erddap/subscriptions...,IFREMER,copernicus-fos
9,,https://erddap.ifremer.fr/erddap/tabledap/aria...,https://erddap.ifremer.fr/erddap/tabledap/aria...,https://erddap.ifremer.fr/erddap/tabledap/aria...,,https://erddap.ifremer.fr/erddap/files/ariane_...,public,Numerical lagrangian data issued from river so...,This file is the raw output of Ariane software...,,,https://erddap.ifremer.fr/erddap/info/ariane_t...,https://doi.org/10.12770/8dea4c5c-f2c2-4771-93...,https://erddap.ifremer.fr/erddap/rss/ariane_tr...,https://erddap.ifremer.fr/erddap/subscriptions...,IRD,ariane_trajectories_qualitative


Exploration of **metadata** available for each dataset offered through the ERDDAP server

In [16]:
# Metadata information for each dataset.
# Column layout of ERDDAP's info CSV plus the 'DatasetID' tag added below;
# only needed to build an empty result when no dataset could be fetched.
metadata_columns = ["Row Type", "Variable Name", "Attribute Name", "Data Type", "Value", "DatasetID"]

# Collect one frame per dataset and concatenate once after the loop, rather
# than growing a DataFrame with pd.concat on every iteration (quadratic, and
# concatenating onto an empty frame is deprecated in recent pandas).
metadata_frames = []
for datasetID in argoERDDAP_allDatasets_df['datasetID']:
    # NOTE: this re-points the shared client; it stays on the last dataset
    # visited once the loop finishes.
    argoERDDAP.dataset_id = datasetID
    try:
        # metadata retrieved via info_url
        dataset_info = pd.read_csv(argoERDDAP.get_info_url(response="csv"))
    except Exception as e:
        # Best effort: some datasets cannot be read (the recorded run shows an
        # HTTP 401 for a dataset that requires a login); report and move on.
        print(f"there was an error for {datasetID}: '{e}'")
        continue
    dataset_info["DatasetID"] = datasetID
    metadata_frames.append(dataset_info)

if metadata_frames:
    metadatainfo_df = pd.concat(metadata_frames, ignore_index=True)
else:
    metadatainfo_df = pd.DataFrame(columns=metadata_columns)

# save to csv file
metadatainfo_df.to_csv("properties/ARGO_ERDDAP_dataset_metadata.csv", index=False)
# view
metadatainfo_df

there was an error for ArgoFloats-reference-CTD: 'HTTP Error 401: '


Unnamed: 0,Row Type,Variable Name,Attribute Name,Data Type,Value,DatasetID
0,attribute,NC_GLOBAL,cdm_data_type,String,Other,allDatasets
1,attribute,NC_GLOBAL,Conventions,String,"COARDS, CF-1.6, ACDD-1.3",allDatasets
2,attribute,NC_GLOBAL,creator_email,String,enzo.pauvy@ifremer.fr,allDatasets
3,attribute,NC_GLOBAL,creator_name,String,Sismer-helpdesk,allDatasets
4,attribute,NC_GLOBAL,creator_url,String,https://erddap.ifremer.fr/erddap,allDatasets
...,...,...,...,...,...,...
10104,attribute,elevation,sdn_parameter_urn,String,SDN:P01::ALATZZ01,gebco2021
10105,attribute,elevation,sdn_uom_name,String,Metres,gebco2021
10106,attribute,elevation,sdn_uom_urn,String,SDN:P06::ULAA,gebco2021
10107,attribute,elevation,standard_name,String,height_above_mean_sea_level,gebco2021


In [17]:
# Explore returned metadata: for every (DatasetID, Row Type) pair, count the
# distinct values found in each of the remaining columns
metadatainfo_df.groupby(['DatasetID', 'Row Type']).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Variable Name,Attribute Name,Data Type,Value
DatasetID,Row Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ArgoFloats,attribute,60,52,4,174
ArgoFloats,variable,59,0,4,0
ArgoFloats-index,attribute,9,32,4,45
ArgoFloats-index,variable,8,0,4,0
ArgoFloats-reference,attribute,13,38,4,61
...,...,...,...,...,...
histo_coord,dimension,3,0,2,3
histo_coord,variable,24,0,1,1
traj_lonlat_init_t0,attribute,7,28,5,39
traj_lonlat_init_t0,dimension,2,0,1,2


In [18]:
# Dimensions per dataset: keep only rows whose Row Type is 'dimension', then
# gather the distinct data types and values for each (dataset, dimension) pair
dim_info = (
    metadatainfo_df
    .loc[metadatainfo_df['Row Type'].eq('dimension')]
    .groupby(['DatasetID', 'Variable Name'])
    .agg({'Data Type': 'unique', 'Value': 'unique'})
    .reset_index()
)

# Persist the overview, then show it as the cell output
dim_info.to_csv("properties/ARGO_ERDDAP_dataset_metadata_dimensions.csv", index=False)

dim_info

Unnamed: 0,DatasetID,Variable Name,Data Type,Value
0,HF_75c7_5b60_95d8,depth,[short],"[nValues=1, onlyValue=0.0]"
1,HF_75c7_5b60_95d8,latitude,[float],"[nValues=29, evenlySpaced=true, averageSpacing..."
2,HF_75c7_5b60_95d8,longitude,[float],"[nValues=33, evenlySpaced=true, averageSpacing..."
3,HF_75c7_5b60_95d8,time,[double],"[nValues=40240, evenlySpaced=false, averageSpa..."
4,HF_ac49_84ad_3eb6,depth,[short],"[nValues=1, onlyValue=0.0]"
...,...,...,...,...
125,histo_coord,latitude,[float],"[nValues=720, evenlySpaced=false, averageSpaci..."
126,histo_coord,longitude,[float],"[nValues=660, evenlySpaced=false, averageSpaci..."
127,histo_coord,time,[double],"[nValues=1521, evenlySpaced=true, averageSpaci..."
128,traj_lonlat_init_t0,frame,[int],"[nValues=1522, evenlySpaced=true, averageSpaci..."


In [19]:
# Variables per dataset: keep only rows whose Row Type is 'variable', then
# gather the distinct data types and values for each (dataset, variable) pair.
# Note: in netCDF files, attributes carry the additional information.
var_info = (
    metadatainfo_df
    .loc[metadatainfo_df['Row Type'].eq('variable')]
    .groupby(['DatasetID', 'Variable Name'])
    .agg({'Data Type': 'unique', 'Value': 'unique'})
    .reset_index()
)

# Persist the overview, then show it as the cell output
var_info.to_csv("properties/ARGO_ERDDAP_dataset_metadata_variables.csv", index=False)

var_info

Unnamed: 0,DatasetID,Variable Name,Data Type,Value
0,ArgoFloats,chla,[float],[nan]
1,ArgoFloats,chla_qc,[String],[nan]
2,ArgoFloats,config_mission_number,[int],[nan]
3,ArgoFloats,cycle_number,[int],[nan]
4,ArgoFloats,data_center,[String],[nan]
...,...,...,...,...
958,histo_coord,hist_stop_coast_TELUK_BENOA,[ushort],"[time, latitude, longitude]"
959,traj_lonlat_init_t0,traj_iU_init_t0,[float],"[frame, ntraj]"
960,traj_lonlat_init_t0,traj_jV_init_t0,[float],"[frame, ntraj]"
961,traj_lonlat_init_t0,traj_lat_init_t0,[float],"[frame, ntraj]"


In [20]:
# Attributes per dataset: restrict to 'attribute' rows and count the distinct
# values of every other column for each (dataset, attribute name) pair
(
    metadatainfo_df
    .loc[metadatainfo_df['Row Type'].eq('attribute')]
    .groupby(['DatasetID', 'Attribute Name'])
    .nunique()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Row Type,Variable Name,Data Type,Value
DatasetID,Attribute Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ArgoFloats,C_format,1,9,1,2
ArgoFloats,Conventions,1,1,1,1
ArgoFloats,Easternmost_Easting,1,1,1,1
ArgoFloats,FORTRAN_format,1,9,1,2
ArgoFloats,Northernmost_Northing,1,1,1,1
...,...,...,...,...,...
traj_lonlat_init_t0,standard_name,1,4,1,4
traj_lonlat_init_t0,standard_name_vocabulary,1,1,1,1
traj_lonlat_init_t0,summary,1,1,1,1
traj_lonlat_init_t0,title,1,1,1,1


In [21]:
# Attributes together with the variable/dimension they are attached to:
# distinct data types and values per (dataset, attribute, variable) triple
attr_info = (
    metadatainfo_df
    .loc[metadatainfo_df['Row Type'].eq('attribute')]
    .groupby(['DatasetID', 'Attribute Name', 'Variable Name'])
    .agg({'Data Type': 'unique', 'Value': 'unique'})
    .reset_index()
)

# Persist the overview, then show it as the cell output
attr_info.to_csv("properties/ARGO_ERDDAP_dataset_metadata_attributes.csv", index=False)

attr_info

Unnamed: 0,DatasetID,Attribute Name,Variable Name,Data Type,Value
0,ArgoFloats,C_format,pres,[String],[%7.1f]
1,ArgoFloats,C_format,pres_adjusted,[String],[%7.1f]
2,ArgoFloats,C_format,pres_adjusted_error,[String],[%7.1f]
3,ArgoFloats,C_format,psal,[String],[%9.3f]
4,ArgoFloats,C_format,psal_adjusted,[String],[%9.3f]
...,...,...,...,...,...
9011,traj_lonlat_init_t0,title,NC_GLOBAL,[String],[Numerical lagrangian data issued from river s...
9012,traj_lonlat_init_t0,units,frame,[String],[count]
9013,traj_lonlat_init_t0,units,ntraj,[String],[count]
9014,traj_lonlat_init_t0,units,traj_lat_init_t0,[String],[degrees_north]


Exploration of **data** available for each dataset offered through the ERDDAP server, by accessing as pd.DataFrame  
(less efficient than accessing as ncCF)

In [22]:
#list datasets
dataset_ids = argoERDDAP_allDatasets_df['datasetID']
print(f"there are {len(dataset_ids.drop_duplicates())} datasets available via ARGO ERDDAP")

# Connect to server. A separate client is used so the loop below does not
# change argoERDDAP.dataset_id; the URL is reused from endpoint_url instead of
# being hard-coded a second time.
erddap = ERDDAP(server=endpoint_url, protocol="tabledap")

summary_columns = ["DatasetID", "Property", "Count", "Types", "Example"]
dataset_summaries = []  # one summary frame per successfully fetched dataset
df_summary_full = pd.DataFrame(columns=summary_columns)

#first half (otherwise code takes too long to run)
for datasetID in dataset_ids[:len(argoERDDAP_allDatasets_df) // 2]:
    print(f"Processing dataset: {datasetID}")

    try:
        # Set the dataset ID
        erddap.dataset_id = datasetID

        # Fetch data as a 2D dataframe. No timeout is configured here; any
        # failure is reported by the except clause below. The recorded run
        # shows a 404 for a dataset the catalogue lists as a grid, which is
        # requested through tabledap like every other dataset.
        dataset_df = erddap.to_pandas()

        # Process data
        dataset_summaries.append(data_frame(dataset_df, datasetID))

        # Rebuild the combined summary from the collected frames, which avoids
        # concatenating onto an empty DataFrame. The expected columns come
        # first, followed by any extra columns data_frame() may return.
        combined = pd.concat(dataset_summaries, ignore_index=True)
        extra_columns = [col for col in combined.columns if col not in summary_columns]
        df_summary_full = combined.reindex(columns=summary_columns + extra_columns)

        # Checkpoint after every dataset so partial results survive an
        # interrupted (long) run
        df_summary_full.to_csv("properties/ARGO_ERDDAP_dataset_data.csv", index=False)
    except Exception as e:
        print(f"There was an error for {datasetID}: {e}")

df_summary_full

#Property ~ variables (attributes & dimensions not clear)

there are 53 datasets available via ARGO ERDDAP
Processing dataset: allDatasets
Processing dataset: OACP-Argo-Global
There was an error for OACP-Argo-Global: Error {
    code=404;
    message="Not Found: Currently unknown datasetID=OACP-Argo-Global";
}

Processing dataset: ArgoFloats


Extra step needed to retrieve actual data

In [12]:
# Point the shared client at one concrete dataset
argoERDDAP.dataset_id = "ArgoFloats-reference"

# Download its contents as a 2D DataFrame and display them as the cell output
argoERDDAP_ArgoFloats_reference_df = argoERDDAP.to_pandas()
argoERDDAP_ArgoFloats_reference_df

### Analysis results:

- requires knowledge on ERDDAP servers & netCDF file format  
- ERDDAP server offers 52 datasets (the `allDatasets` catalogue lists 53 rows, one of which is the catalogue itself)  
- mainly NetCDF data
- exploration of 'ARGO_ERDDAP_dataset_data.csv' and 'ARGO_ERDDAP_dataset_metadata.csv' (which show information about the properties in each dataset, such as type, example, unique values, ...) shows:
    - All data is findable, not always accessible:
        - presence of 'log in' as a value of the 'accessible' column indicates that not all data is openly available.  
    - values for license property mention known, standard licenses but no use of their URLs  
    - opportunity for more use of persistent identifiers (e.g. station codes, ROR-id for institutes, ...)
- some datasets (directly) return data, while others require an additional step to retrieve the actual data:
    - for example, 'ArgoFloats' dataset contains file paths in 'file' column
    - 'files' column in 'ARGO_ERDDAP_overview_metadata.csv' indicates that some data is retrieved through an additional step  
    - documentation does mention "OpenDAP on top of NetCDF files"  

- see the EMSO-ERIC-ERDDAPserver notebook for a comparison of data properties between the two analysed ERDDAP servers.



