In [3]:
from erddapy import ERDDAP
import netCDF4 as nc
import pandas as pd
from connec_functions import GDB
from collections import defaultdict

from accessibility import check_endpoint
from summarize import * #json_keys, data_frame

### EMSO ERIC ERDDAP server

In [53]:
# ERDDAP endpoint analysed in this notebook
endpoint_url = "https://erddap.emso.eu/erddap"

# client bound to that endpoint, configured for the tabledap protocol
emsoERDDAP = ERDDAP(
    server=endpoint_url,
    protocol="tabledap",
)

In [3]:
# report whether check_endpoint() considers the endpoint machine-accessible
print(
    "The endpoint is machine-accessible."
    if check_endpoint(endpoint_url)
    else "The endpoint is not machine-accessible."
)

Checking endpoint: https://erddap.emso.eu/erddap
Endpoint is online: 200
Content type may not be machine-readable: text/html;charset=UTF-8
The endpoint is machine-accessible.


**Exploration of the allDatasets dataset**  
~ retrieves the `allDatasets` dataset, which lists every dataset available on the ERDDAP server; the returned DataFrame contains one row of metadata per dataset  
~ it is essentially a catalog of all datasets hosted on the server, including the essential metadata needed to identify and filter the datasets of interest  

In [54]:
# set dataset ID: select the server's "allDatasets" catalogue on the client
emsoERDDAP.dataset_id = "allDatasets"

#Get data (as 2D dataframe) for the dataset currently selected on the client
EMSO_alldatasets_df = emsoERDDAP.to_pandas()

In [5]:
#explore columns: the metadata fields the catalogue provides for each dataset
EMSO_alldatasets_df.columns

Index(['datasetID', 'accessible', 'institution', 'dataStructure',
       'cdm_data_type', 'class', 'title', 'minLongitude (degrees_east)',
       'maxLongitude (degrees_east)', 'longitudeSpacing (degrees_east)',
       'minLatitude (degrees_north)', 'maxLatitude (degrees_north)',
       'latitudeSpacing (degrees_north)', 'minAltitude (m)', 'maxAltitude (m)',
       'minTime (UTC)', 'maxTime (UTC)', 'timeSpacing (seconds)', 'griddap',
       'subset', 'tabledap', 'MakeAGraph', 'sos', 'wcs', 'wms', 'files',
       'fgdc', 'iso19115', 'metadata', 'sourceUrl', 'infoUrl', 'rss', 'email',
       'testOutOfDate', 'outOfDate', 'summary'],
      dtype='object')

In [6]:
# save the catalogue (without the row index); this file is read back later in the
# notebook for the ARGO/EMSO comparison
# NOTE(review): to_csv does not create the "properties" directory — presumably it already exists
EMSO_alldatasets_df.to_csv("properties/EMSO_ERDDAP_overview_metadata.csv", index=False)
# view
EMSO_alldatasets_df

Unnamed: 0,datasetID,accessible,institution,dataStructure,cdm_data_type,class,title,minLongitude (degrees_east),maxLongitude (degrees_east),longitudeSpacing (degrees_east),...,fgdc,iso19115,metadata,sourceUrl,infoUrl,rss,email,testOutOfDate,outOfDate,summary
0,allDatasets,public,Axiom Docker Install,table,Other,EDDTableFromAllDatasets,* The List of All Active Datasets in this ERDD...,,,,...,,,https://erddap.emso.eu/erddap/info/allDatasets...,https://localhost:8443/erddap,https://erddap.emso.eu/erddap,https://erddap.emso.eu/erddap/rss/allDatasets.rss,https://erddap.emso.eu/erddap/subscriptions/ad...,,,This dataset is a table which has a row of inf...
1,EMSO_OBSEA_CTD_30min,public,Polytechnic University of Catalonia,table,Point,EDDTableFromErddap,CTD data at OBSEA Underwater Observatory 30 mi...,1.752570,1.752570,,...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/EMSO_OBSEA_...,(local files),https://edmo.seadatanet.org/report/2150,https://erddap.emso.eu/erddap/rss/EMSO_OBSEA_C...,https://erddap.emso.eu/erddap/subscriptions/ad...,,,CTD data measured at OBSEA underwater observatory
2,EMSO_OBSEA_Besos_Buoy_Airmar_200WX_30min,public,Polytechnic University of Catalonia,table,Point,EDDTableFromErddap,Data from Airmar 200 WX weather station deploy...,1.752570,1.752570,,...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/EMSO_OBSEA_...,(local files),https://edmo.seadatanet.org/report/2150,https://erddap.emso.eu/erddap/rss/EMSO_OBSEA_B...,https://erddap.emso.eu/erddap/subscriptions/ad...,,,Weather station from an Airmar 200WX deployed...
3,EMSO_OBSEA_Besos_Buoy_Airmar_200WX_full,public,Polytechnic University of Catalonia,table,Point,EDDTableFromErddap,Data from Airmar 200 WX weather station deploy...,1.752570,1.752570,,...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/EMSO_OBSEA_...,(local files),https://edmo.seadatanet.org/report/2150,https://erddap.emso.eu/erddap/rss/EMSO_OBSEA_B...,https://erddap.emso.eu/erddap/subscriptions/ad...,,,Weather station from an Airmar 200WX deployed...
4,E2M3A_METEO,public,National Institute of Oceanography and Applied...,table,TimeSeries,EDDTableFromErddap,"E2M3A METEO timeSeries, NRT in situ Observations",18.082417,18.082417,,...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/E2M3A_METEO...,(source database),https://nodc.ogs.it,https://erddap.emso.eu/erddap/rss/E2M3A_METEO.rss,https://erddap.emso.eu/erddap/subscriptions/ad...,,,"E2M3A METEO timeSeries, NRT in situ Observations"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,W1M3A_deploy04,public,Consiglio Nazionale delle Ricerche,table,TimeSeries,EDDTableFromErddap,W1M3A data (201705-201806),9.111700,9.118163,,...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/W1M3A_deplo...,(local files),http://www.w1m3a.cnr.it,https://erddap.emso.eu/erddap/rss/W1M3A_deploy...,https://erddap.emso.eu/erddap/subscriptions/ad...,,,Data from W1M3A observatory (062015-062016)
167,W1M3A_deploy05,public,Consiglio Nazionale delle Ricerche,table,TimeSeries,EDDTableFromErddap,W1M3A data (202010-202107),9.111700,9.118163,,...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/W1M3A_deplo...,(local files),http://www.w1m3a.cnr.it,https://erddap.emso.eu/erddap/rss/W1M3A_deploy...,https://erddap.emso.eu/erddap/subscriptions/ad...,,,Data from W1M3A observatory (062015-062016)
168,W1M3A_deploy06,public,Consiglio Nazionale delle Ricerche,table,TimeSeries,EDDTableFromErddap,W1M3A data (202107-202204),9.111700,9.118163,,...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/W1M3A_deplo...,(local files),http://www.w1m3a.cnr.it,https://erddap.emso.eu/erddap/rss/W1M3A_deploy...,https://erddap.emso.eu/erddap/subscriptions/ad...,,,Data from W1M3A observatory (062015-062016)
169,W1M3A_deploy07,public,Consiglio Nazionale delle Ricerche,table,TimeSeries,EDDTableFromErddap,W1M3A data (202310-......),9.106570,9.135168,,...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/W1M3A_deplo...,(local files),http://www.w1m3a.cnr.it,https://erddap.emso.eu/erddap/rss/W1M3A_deploy...,https://erddap.emso.eu/erddap/subscriptions/ad...,,,Data from W1M3A observatory (062015-062016)


In [7]:
# number of distinct dataset IDs in the catalogue
# NOTE(review): the catalogue's first row is "allDatasets" itself, so it is part of this count
n_datasets = len(EMSO_alldatasets_df['datasetID'].drop_duplicates())
print(f"There are {n_datasets} datasets offered by the EMSO ERDDAP server")

There are 171 datasets offered by the EMSO ERDDAP server


In [8]:
# Print every dataset's free-text summary to see whether it carries extra metadata
for summary_text in EMSO_alldatasets_df["summary"]:
    print(summary_text)

This dataset is a table which has a row of information for each dataset currently active in this ERDDAP.
CTD data measured at OBSEA underwater observatory
Weather station from  an Airmar 200WX deployed at OBSEA's Besos Buoy, NW mediterranean sea
Weather station from  an Airmar 200WX deployed at OBSEA's Besos Buoy, NW mediterranean sea full sensor data
E2M3A METEO timeSeries, NRT in situ Observations
E2M3A MRDT timeSeries, NRT in situ Observations
E2M3A PCO2PROA timeSeries, NRT in situ Observations
E2M3A PCO2PROW timeSeries, NRT in situ Observations
E2M3A SAMI timeSeries, NRT in situ Observations
E2M3A SBE16PLS timeSeries, NRT in situ Observations
E2M3A SBE37O timeSeries, NRT in situ Observations
E2M3A timeSeries ALL INSTRUMENTS, NRT in situ Observations
E2M3A timeSeries, data collected from 2021 to 2022
This dataset contains dissolved iron concentrations ((Fe(II) + Fe(III); \u00b5mol/l) acquired between September 2013 and July 2017 using the CHEMINI Fe, a CHEmical MINIaturized analyser

Exploration of **search information** for each dataset offered through the ERDDAP server

In [9]:
# get search information: read the CSV served at the client's search URL into a DataFrame
searchinfo_df = pd.read_csv(emsoERDDAP.get_search_url(response="csv"))

In [10]:
# fields returned by the search response
searchinfo_df.columns

Index(['griddap', 'Subset', 'tabledap', 'Make A Graph', 'wms', 'files',
       'Title', 'Summary', 'FGDC', 'ISO 19115', 'Info', 'Background Info',
       'RSS', 'Email', 'Institution', 'Dataset ID'],
      dtype='object')

In [11]:
# view the search results
searchinfo_df

Unnamed: 0,griddap,Subset,tabledap,Make A Graph,wms,files,Title,Summary,FGDC,ISO 19115,Info,Background Info,RSS,Email,Institution,Dataset ID
0,,https://erddap.emso.eu/erddap/tabledap/allData...,https://erddap.emso.eu/erddap/tabledap/allData...,https://erddap.emso.eu/erddap/tabledap/allData...,,,* The List of All Active Datasets in this ERDD...,This dataset is a table which has a row of inf...,,,https://erddap.emso.eu/erddap/info/allDatasets...,https://erddap.emso.eu/erddap,,,Axiom Docker Install,allDatasets
1,,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,,,CTD data at OBSEA Underwater Observatory 30 mi...,CTD data measured at OBSEA underwater observat...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/EMSO_OBSEA_...,https://edmo.seadatanet.org/report/2150,https://erddap.emso.eu/erddap/rss/EMSO_OBSEA_C...,https://erddap.emso.eu/erddap/subscriptions/ad...,Polytechnic University of Catalonia,EMSO_OBSEA_CTD_30min
2,,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,,,Data from Airmar 200 WX weather station deploy...,Weather station from an Airmar 200WX deployed...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/EMSO_OBSEA_...,https://edmo.seadatanet.org/report/2150,https://erddap.emso.eu/erddap/rss/EMSO_OBSEA_B...,https://erddap.emso.eu/erddap/subscriptions/ad...,Polytechnic University of Catalonia,EMSO_OBSEA_Besos_Buoy_Airmar_200WX_30min
3,,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,https://erddap.emso.eu/erddap/tabledap/EMSO_OB...,,,Data from Airmar 200 WX weather station deploy...,Weather station from an Airmar 200WX deployed...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/EMSO_OBSEA_...,https://edmo.seadatanet.org/report/2150,https://erddap.emso.eu/erddap/rss/EMSO_OBSEA_B...,https://erddap.emso.eu/erddap/subscriptions/ad...,Polytechnic University of Catalonia,EMSO_OBSEA_Besos_Buoy_Airmar_200WX_full
4,,,https://erddap.emso.eu/erddap/tabledap/E2M3A_M...,https://erddap.emso.eu/erddap/tabledap/E2M3A_M...,,,"E2M3A METEO timeSeries, NRT in situ Observations","E2M3A METEO timeSeries, NRT in situ Observatio...",https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/E2M3A_METEO...,https://nodc.ogs.it,https://erddap.emso.eu/erddap/rss/E2M3A_METEO.rss,https://erddap.emso.eu/erddap/subscriptions/ad...,National Institute of Oceanography and Applied...,E2M3A_METEO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166,,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,,,W1M3A data (201705-201806),Data from W1M3A observatory (062015-062016)\n\...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/W1M3A_deplo...,http://www.w1m3a.cnr.it,https://erddap.emso.eu/erddap/rss/W1M3A_deploy...,https://erddap.emso.eu/erddap/subscriptions/ad...,Consiglio Nazionale delle Ricerche,W1M3A_deploy04
167,,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,,,W1M3A data (202010-202107),Data from W1M3A observatory (062015-062016)\n\...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/W1M3A_deplo...,http://www.w1m3a.cnr.it,https://erddap.emso.eu/erddap/rss/W1M3A_deploy...,https://erddap.emso.eu/erddap/subscriptions/ad...,Consiglio Nazionale delle Ricerche,W1M3A_deploy05
168,,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,,,W1M3A data (202107-202204),Data from W1M3A observatory (062015-062016)\n\...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/W1M3A_deplo...,http://www.w1m3a.cnr.it,https://erddap.emso.eu/erddap/rss/W1M3A_deploy...,https://erddap.emso.eu/erddap/subscriptions/ad...,Consiglio Nazionale delle Ricerche,W1M3A_deploy06
169,,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,https://erddap.emso.eu/erddap/tabledap/W1M3A_d...,,,W1M3A data (202310-......),Data from W1M3A observatory (062015-062016)\n\...,https://erddap.emso.eu/erddap/metadata/fgdc/xm...,https://erddap.emso.eu/erddap/metadata/iso1911...,https://erddap.emso.eu/erddap/info/W1M3A_deplo...,http://www.w1m3a.cnr.it,https://erddap.emso.eu/erddap/rss/W1M3A_deploy...,https://erddap.emso.eu/erddap/subscriptions/ad...,Consiglio Nazionale delle Ricerche,W1M3A_deploy07


Exploration of **metadata information** available for each dataset offered through the ERDDAP server

In [55]:
# Metadata information for each dataset, read from the CSV served at the client's info URL
metadata_columns = ["Row Type", "Variable Name", "Attribute Name", "Data Type", "Value", "DatasetID"]

# Collect one frame per dataset and concatenate once at the end: calling pd.concat
# inside the loop re-copies every row gathered so far on each iteration, and
# concatenating onto an empty frame can trigger a FutureWarning in recent pandas.
metadata_frames = []
for datasetID in EMSO_alldatasets_df['datasetID']:
    emsoERDDAP.dataset_id = datasetID
    try:
        _df = pd.read_csv(emsoERDDAP.get_info_url(response="csv"))  # metadata retrieved via info_url
    except Exception as e:
        # best effort: report the failing dataset and carry on with the remaining ones
        print(f"there was an error for {datasetID}: '{e}'")
        continue
    _df["DatasetID"] = datasetID  # remember which dataset each metadata row belongs to
    metadata_frames.append(_df)

if metadata_frames:
    metadatainfo_df = pd.concat(metadata_frames, ignore_index=True)
else:
    # every request failed: keep the expected (empty) table layout
    metadatainfo_df = pd.DataFrame(columns=metadata_columns)

# save to csv file
metadatainfo_df.to_csv("properties/EMSO_ERDDAP_dataset_metadata.csv", index=False)

metadatainfo_df

Unnamed: 0,Row Type,Variable Name,Attribute Name,Data Type,Value,DatasetID
0,attribute,NC_GLOBAL,cdm_data_type,String,Other,allDatasets
1,attribute,NC_GLOBAL,Conventions,String,"COARDS, CF-1.6, ACDD-1.3",allDatasets
2,attribute,NC_GLOBAL,creator_email,String,nobody@example.com,allDatasets
3,attribute,NC_GLOBAL,creator_name,String,Axiom Docker Install,allDatasets
4,attribute,NC_GLOBAL,creator_url,String,https://erddap.emso.eu/erddap,allDatasets
...,...,...,...,...,...,...
80364,attribute,depth_QC,actual_range,byte,"7, 7",EMSO_OBSEA_AWAC_waves_full
80365,attribute,depth_QC,conventions,String,OceanSITES QC Flags,EMSO_OBSEA_AWAC_waves_full
80366,attribute,depth_QC,flag_meanings,String,unknown;good_data;probably_good_data;potential...,EMSO_OBSEA_AWAC_waves_full
80367,attribute,depth_QC,flag_values,String,0;1;2;3;4;7;8;9,EMSO_OBSEA_AWAC_waves_full


In [13]:
# explore returned metadata: number of distinct values in each remaining column,
# per dataset and per row type
metadatainfo_df.groupby(['DatasetID', 'Row Type']).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Variable Name,Attribute Name,Data Type,Value
DatasetID,Row Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BB_505_ADCP,attribute,17,71,4,140
BB_505_ADCP,variable,16,0,4,0
BB_567_SBE56,attribute,8,72,3,107
BB_567_SBE56,variable,7,0,3,0
BB_584_SBE56,attribute,8,72,3,105
...,...,...,...,...,...
smartbay_obs_fluorometer_ecofl,variable,12,0,4,0
smartbay_obs_hour_mean,attribute,59,136,3,515
smartbay_obs_hour_mean,variable,58,0,4,0
smartbay_obs_pco2_contros,attribute,11,129,3,180


In [58]:
# Dimensions of each dataset: rows whose Row Type is 'dimension', with the distinct
# data types and values collected per (DatasetID, Variable Name)
dimension_rows = metadatainfo_df.loc[metadatainfo_df['Row Type'].eq('dimension')]
dim_info = (
    dimension_rows
    .groupby(['DatasetID', 'Variable Name'])
    .agg({'Data Type': 'unique', 'Value': 'unique'})
    .reset_index()
)

# persist for the cross-server comparison further down
dim_info.to_csv("properties/EMSO_ERDDAP_dataset_metadata_dimensions.csv", index=False)

dim_info

Unnamed: 0,DatasetID,Variable Name,Data Type,Value


In [15]:
# Variables of each dataset: rows whose Row Type is 'variable', with the distinct
# data types and values collected per (DatasetID, Variable Name)
# note: in netCDFs, attributes represent additional information
variable_rows = metadatainfo_df.loc[metadatainfo_df['Row Type'].eq('variable')]
var_info = (
    variable_rows
    .groupby(['DatasetID', 'Variable Name'])
    .agg({'Data Type': 'unique', 'Value': 'unique'})
    .reset_index()
)

# NOTE(review): this file name has a double underscore ("dataset__metadata"), unlike the
# sibling dimension/attribute files — confirm the cell that reads it back uses the same name
var_info.to_csv("properties/EMSO_ERDDAP_dataset__metadata_variables.csv", index=False)

var_info

Unnamed: 0,DatasetID,Variable Name,Data Type,Value
0,BB_505_ADCP,CurrVelE_ADCP,[double],[nan]
1,BB_505_ADCP,CurrVelN_ADCP,[double],[nan]
2,BB_505_ADCP,CurrVelUp_ADCP,[double],[nan]
3,BB_505_ADCP,CurrVel_QC,[byte],[nan]
4,BB_505_ADCP,ECHO_BEAM_1,[double],[nan]
...,...,...,...,...
3922,smartbay_obs_pco2_contros,pco2_corrected,[float],[nan]
3923,smartbay_obs_pco2_contros,pco2_corrected_qc,[int],[nan]
3924,smartbay_obs_pco2_contros,site_bathy_depth,[double],[nan]
3925,smartbay_obs_pco2_contros,station_id,[String],[nan]


In [16]:
# Explore the attributes of each dataset (Row Type = 'attribute'):
# number of distinct values in each remaining column, per dataset and attribute name
metadatainfo_df[ metadatainfo_df['Row Type'] == 'attribute'].groupby(['DatasetID', 'Attribute Name']).nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Row Type,Variable Name,Data Type,Value
DatasetID,Attribute Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BB_505_ADCP,Conventions,1,1,1,1
BB_505_ADCP,Easternmost_Easting,1,1,1,1
BB_505_ADCP,Northernmost_Northing,1,1,1,1
BB_505_ADCP,Southernmost_Northing,1,1,1,1
BB_505_ADCP,Westernmost_Easting,1,1,1,1
...,...,...,...,...,...
smartbay_obs_pco2_contros,update_interval,1,1,1,1
smartbay_obs_pco2_contros,valid_max,1,3,1,3
smartbay_obs_pco2_contros,valid_min,1,3,1,3
smartbay_obs_pco2_contros,wfd_waterbody_name,1,1,1,1


In [17]:
# Attributes together with the variables/dimensions they are attached to: distinct
# data types and values per (DatasetID, Attribute Name, Variable Name)
attribute_rows = metadatainfo_df.loc[metadatainfo_df['Row Type'].eq('attribute')]
attr_info = (
    attribute_rows
    .groupby(['DatasetID', 'Attribute Name', 'Variable Name'])
    .agg({'Data Type': 'unique', 'Value': 'unique'})
    .reset_index()
)

attr_info.to_csv("properties/EMSO_ERDDAP_dataset_metadata_attributes.csv", index=False)

attr_info

Unnamed: 0,DatasetID,Attribute Name,Variable Name,Data Type,Value
0,BB_505_ADCP,Conventions,NC_GLOBAL,[String],"[OceanSITES v1.4,SeaDataNet_1.0,COARDS,CF-1.6,..."
1,BB_505_ADCP,Easternmost_Easting,NC_GLOBAL,[double],[17.19352]
2,BB_505_ADCP,Northernmost_Northing,NC_GLOBAL,[double],[41.3413]
3,BB_505_ADCP,Southernmost_Northing,NC_GLOBAL,[double],[41.3413]
4,BB_505_ADCP,Westernmost_Easting,NC_GLOBAL,[double],[17.19352]
...,...,...,...,...,...
76437,smartbay_obs_pco2_contros,valid_min,depth,[double],[0.0]
76438,smartbay_obs_pco2_contros,valid_min,latitude,[double],[-90.0]
76439,smartbay_obs_pco2_contros,valid_min,longitude,[double],[-180.0]
76440,smartbay_obs_pco2_contros,wfd_waterbody_name,NC_GLOBAL,[String],[Outer Galway Bay]


Exploration of **data** from each dataset offered through the ERDDAP server, by accessing as pd.DataFrame  
(less efficient than accessing as ncCF)

In [4]:
#list datasets
print(f"there are {len(EMSO_alldatasets_df['datasetID'].drop_duplicates())} datasets available via EMSO ERDDAP")

# Connect to server
erddap = ERDDAP(server="https://erddap.emso.eu/erddap", protocol="tabledap")

# first half of the catalogue only (otherwise the cell takes too long to run)
first_half_ids = EMSO_alldatasets_df['datasetID'][:len(EMSO_alldatasets_df) // 2]

# Gather the per-dataset summaries in a list and concatenate once after the loop:
# pd.concat inside the loop re-copies all previous rows on every iteration.
summary_frames = []
for datasetID in first_half_ids:
    print(f"Processing dataset: {datasetID}")

    try:
        # Set the dataset ID
        erddap.dataset_id = datasetID

        # Fetch the dataset as a 2D dataframe (this cell does no explicit timeout handling)
        dataset_df = erddap.to_pandas()

        # Summarise the dataset; data_frame presumably comes from the summarize module
        summ_df = data_frame(dataset_df, datasetID)
        summary_frames.append(summ_df)

    except Exception as e:
        # best effort: report the failing dataset and continue with the next one
        print(f"There was an error for {datasetID}: {e}")

# ignore_index=True gives one continuous row index instead of repeating each
# dataset's own 0..n labels
if summary_frames:
    summary_full_df = pd.concat(summary_frames, ignore_index=True)
else:
    summary_full_df = pd.DataFrame()

# write first half to csv file
summary_full_df.to_csv("properties/EMSO_ERDDAP_dataset_data_metadata.csv", index=False)
# view
# Property ~ variables (attributes & dimensions not clear)
summary_full_df

there are 171 datasets available via EMSO ERDDAP
Processing dataset: allDatasets
Processing dataset: EMSO_OBSEA_CTD_30min
Processing dataset: EMSO_OBSEA_Besos_Buoy_Airmar_200WX_30min
Processing dataset: EMSO_OBSEA_Besos_Buoy_Airmar_200WX_full
Processing dataset: E2M3A_METEO
Processing dataset: E2M3A_MRDT
Processing dataset: E2M3A_PCO2PROA
Processing dataset: E2M3A_PCO2PROW
Processing dataset: E2M3A_SAMI
Processing dataset: E2M3A_SBE16PLS
Processing dataset: E2M3A_SBE37O
Processing dataset: E2M3A_CTD_meteo_CO2_pH_NRT
Processing dataset: E2M3A_2021_2022_TS
Processing dataset: Emso_Azores_Chemini_IRON
Processing dataset: EMSO-AZORES_TCM3-1_2016-2017
Processing dataset: EMSO-AZORES_TCM3-1_2017-2018
Processing dataset: EMSO-AZORES_TCM3-1_2018-2019
Processing dataset: EMSO-AZORES_TCM3-1_2021-2022
Processing dataset: EMSO-AZORES_TCM3-2_2017-2018
Processing dataset: EMSO-AZORES_TCM3-2_2018-2019
Processing dataset: EMSO-AZORES_TCM3-2_2019-2020
Processing dataset: EMSO-AZORES_TCM3-2_2020-2021
Pr

Unnamed: 0,DatasetID,Property,Count,Types,Example,UniqueValues
0,allDatasets,datasetID,171,object,allDatasets,{}
1,allDatasets,accessible,171,object,public,{public}
2,allDatasets,institution,171,object,Axiom Docker Install,{}
3,allDatasets,dataStructure,171,object,table,{table}
4,allDatasets,cdm_data_type,171,object,Other,"{Other, Point, TimeSeries}"
...,...,...,...,...,...,...
3,EMSO-AZORES_EGIM_Turbidity_2017-2018,longitude (degrees_east),25469,float64,-32.27562,{-32.27562}
4,EMSO-AZORES_EGIM_Turbidity_2017-2018,depth (m),25469,float64,1700.0,{1700.0}
5,EMSO-AZORES_EGIM_Turbidity_2017-2018,DEPH_QC,25469,int64,7,{7}
6,EMSO-AZORES_EGIM_Turbidity_2017-2018,TUR4 (NTU),25469,float64,294.0,{}


In [6]:
# second half of the catalogue; keeps extending the summary_full_df built by the previous cell,
# so the CSV written here also contains whatever that frame already held
second_half_ids = EMSO_alldatasets_df['datasetID'][(len(EMSO_alldatasets_df) // 2):]
checkpoint_csv = "properties/EMSO_ERDDAP_dataset_data_metadata2.csv"

for datasetID in second_half_ids:
    print(f"Processing dataset: {datasetID}")

    try:
        # select the dataset on the client and fetch it as a 2D dataframe
        erddap.dataset_id = datasetID
        dataset_df = erddap.to_pandas()

        # summarise it and append the result to the running table
        summ_df = data_frame(dataset_df, datasetID)
        summary_full_df = pd.concat([summary_full_df, summ_df], ignore_index=True)

        # checkpoint after every dataset (overwriting the file) so that as much as
        # possible is on disk if a later dataset fails
        summary_full_df.to_csv(checkpoint_csv, index=False)
    except Exception as e:
        print(f"There was an error for {datasetID}: {e}")


# final write, then view
# note: doesn't list netCDF attributes
summary_full_df.to_csv(checkpoint_csv, index=False)
summary_full_df

Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2011-2012
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2013-2014
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2014-2015
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2016-2017
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2017-2018
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2018-2019
Processing dataset: EMSO-AZORES_CHEMINI_Total-Iron_2019-2020
Processing dataset: EMSO-AZORES_Seamon-East_Optode-O2_2013-2014
Processing dataset: EMSO-AZORES_Seamon-East_Optode-O2_2015-2016
Processing dataset: EMSO-AZORES_Seamon-East_Optode-O2_2016-2017
Processing dataset: EMSO-AZORES_Seamon-East_Optode-O2_2012-2013
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2011-2012
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2012-2013
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2013-2014
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2015-2016
Processing dataset: EMSO-AZORES_Wetlabs_Turbidity_2016-2017
Processing datase

: 

: 

View of some datasets

In [68]:
# set dataset ID
emsoERDDAP.dataset_id = "EMSO_OBSEA_CTD_30min"

#Get data (as 2D dataframe)
# NOTE(review): the variable name refers to "AWAC_waves_full", but the dataset selected above
# is "EMSO_OBSEA_CTD_30min" — consider renaming once later uses of this name are checked
EMSO_EMSO_OBSEA_AWAC_waves_full_df = emsoERDDAP.to_pandas()
EMSO_EMSO_OBSEA_AWAC_waves_full_df

Unnamed: 0,time (UTC),latitude (degrees_north),longitude (degrees_east),depth (m),sensor_id,PSAL (Dimensionless),PRES (Decibars),SVEL (Metres per second),CNDC (Siemens per metre),TEMP (Degrees Celsius),PSAL_QC,PRES_QC,SVEL_QC,CNDC_QC,TEMP_QC,latitude_QC,longitude_QC,depth_QC
0,2022-01-01T00:00:00Z,41.18212,1.75257,20.0,16P57353-6479,34.0903,19.236,1505.240,4.17190,14.7835,3,1,1,1,1,7,7,7
1,2022-01-01T00:30:00Z,41.18212,1.75257,20.0,16P57353-6479,34.0896,19.241,1505.250,4.17190,14.7843,3,1,1,1,1,7,7,7
2,2022-01-01T01:00:00Z,41.18212,1.75257,20.0,16P57353-6479,34.0892,19.252,1505.250,4.17196,14.7854,3,1,1,1,1,7,7,7
3,2022-01-01T01:30:00Z,41.18212,1.75257,20.0,16P57353-6479,34.0891,19.275,1505.250,4.17193,14.7852,3,1,1,1,1,7,7,7
4,2022-01-01T02:00:00Z,41.18212,1.75257,20.0,16P57353-6479,34.0838,19.290,1505.240,4.17141,14.7857,3,1,1,1,1,7,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240635,2022-12-31T21:30:00Z,41.18212,1.75257,20.0,16P57353-6479,38.1734,,1507.588,4.63894,15.1012,1,9,1,1,1,7,7,7
240636,2022-12-31T22:00:00Z,41.18212,1.75257,20.0,16P57353-6479,38.1732,,1507.578,4.63860,15.0982,1,9,1,1,1,7,7,7
240637,2022-12-31T22:30:00Z,41.18212,1.75257,20.0,16P57353-6479,38.1730,,1507.557,4.63786,15.0914,1,9,1,1,1,7,7,7
240638,2022-12-31T23:00:00Z,41.18212,1.75257,20.0,16P57353-6479,38.1728,,1507.563,4.63806,15.0935,1,9,1,1,1,7,7,7


In [69]:
#Get data (as ncCF)
# No dataset_id is set in this cell, so the request uses whatever dataset is currently
# selected on emsoERDDAP (the preceding cell selects "EMSO_OBSEA_CTD_30min").
EMSO_EMSO_OBSEA_AWAC_waves_full_nc = emsoERDDAP.to_ncCF()
EMSO_EMSO_OBSEA_AWAC_waves_full_nc

<class 'netCDF4._netCDF4.Dataset'>
root group (NETCDF3_CLASSIC data model, file format NETCDF3):
    cdm_data_type: Point
    Conventions: OceanSITES;EMSO, CF-1.6
    data_mode: R
    data_type: OceanSITES time-series data
    date_created: 2023-10-25T14:49:16Z
    date_modified: 2023-10-25T14:49:16Z
    Easternmost_Easting: 1.75257
    emso_facility: OBSEA
    featureType: Point
    format_version: 1.4
    geospatial_lat_max: 41.18212
    geospatial_lat_min: 41.18212
    geospatial_lat_units: degrees_north
    geospatial_lon_max: 1.75257
    geospatial_lon_min: 1.75257
    geospatial_lon_units: degrees_east
    geospatial_vertical_max: 20.0
    geospatial_vertical_min: 20.0
    geospatial_vertical_positive: down
    geospatial_vertical_units: m
    history: 2024-08-26T06:43:08Z (local files)
2024-08-26T06:43:08Z https://data.obsea.es/erddap/tabledap/EMSO_OBSEA_CTD_30min.ncCF
    id: EMSO_OBSEA_CTD_30min
    infoUrl: https://edmo.seadatanet.org/report/2150
    institution: Polytechnic 

#### Comparing the properties between ERDDAP servers (ARGO & EMSO ERDDAP servers)

Comparing metadata of each server's 'allDatasets' dataset

In [93]:
# Function to summarize DataFrame
def summarize_dataframe(df, sample_size=5):
    """Build a one-row-per-column overview of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    sample_size : int, default 5
        Maximum number of distinct values listed per column in
        'Sample Unique Values' (taken in order of first appearance).

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields 'Column', 'Data Type',
        'Number of Unique Values' (missing values not counted),
        'Number of Missing Values' and 'Sample Unique Values'.
        A frame without columns yields an empty result that still has these fields.
    """
    summary_columns = [
        'Column',
        'Data Type',
        'Number of Unique Values',
        'Number of Missing Values',
        'Sample Unique Values',
    ]
    summary = [
        {
            'Column': col,
            'Data Type': df[col].dtype,
            'Number of Unique Values': df[col].nunique(),
            'Number of Missing Values': df[col].isna().sum(),
            # unique() keeps NaN, so a missing value can show up in the sample
            'Sample Unique Values': df[col].unique()[:sample_size],
        }
        for col in df.columns
    ]

    # Naming the columns explicitly keeps the layout stable when `summary` is empty
    return pd.DataFrame(summary, columns=summary_columns)

In [98]:
# Load ARGO general metadata
# NOTE(review): this file is not written anywhere in the visible part of this notebook —
# presumably produced by a companion ARGO notebook
ARGO_general_metadata = pd.read_csv("properties/ARGO_ERDDAP_overview_metadata.csv")
# Load EMSO general metadata (the 'allDatasets' catalogue saved earlier in this notebook)
EMSO_general_metadata = pd.read_csv("properties/EMSO_ERDDAP_overview_metadata.csv")

In [102]:
# Index.equals() compares the column labels and their order, and simply returns False
# when the two frames have a different number of columns; an element-wise `==` between
# two Index objects raises a ValueError in that case.
if ARGO_general_metadata.columns.equals(EMSO_general_metadata.columns):
    print("'allDatasets' dataset of both ERDDAP servers contains the same metadata properties")
else:
    print("'allDatasets' dataset of both ERDDAP servers contains different metadata properties")

'allDatasets' dataset of both servers contain the same metadata properties


In [100]:
# Get summary: one row per column of the ARGO 'allDatasets' metadata
ARGO_summary_df = summarize_dataframe(ARGO_general_metadata)
ARGO_summary_df

Unnamed: 0,Column,Data Type,Number of Unique Values,Number of Missing Values,Sample Unique Values
0,datasetID,object,53,0,"[allDatasets, OACP-Argo-Global, ArgoFloats, Ar..."
1,accessible,object,2,0,"[public, log in]"
2,institution,object,12,0,"[Ifremer, LOPS/Ifremer, Argo, Jet Propulsion L..."
3,dataStructure,object,2,0,"[table, grid]"
4,cdm_data_type,object,7,0,"[Other, Grid, TrajectoryProfile, Profile, Traj..."
5,class,object,7,0,"[EDDTableFromAllDatasets, EDDGridFromNcFiles, ..."
6,title,object,52,0,[* The List of All Active Datasets in this ERD...
7,minLongitude (degrees_east),float64,23,6,"[nan, -242.156, -179.99942, -999.999, -180.0]"
8,maxLongitude (degrees_east),float64,22,6,"[nan, 129.844, 180.0, 181.706, 179.99948666666..."
9,longitudeSpacing (degrees_east),float64,10,20,"[nan, 1.0, 0.01, 0.0622870625, 0.1652301818181..."


In [96]:
# Get summary: one row per column of the EMSO 'allDatasets' metadata
EMSO_summary_df = summarize_dataframe(EMSO_general_metadata)
EMSO_summary_df

Unnamed: 0,Column,Data Type,Number of Unique Values,Number of Missing Values,Sample Unique Values
0,datasetID,object,171,0,"[allDatasets, EMSO_OBSEA_CTD_30min, EMSO_OBSEA..."
1,accessible,object,1,0,[public]
2,institution,object,21,0,"[Axiom Docker Install, Polytechnic University ..."
3,dataStructure,object,1,0,[table]
4,cdm_data_type,object,3,0,"[Other, Point, TimeSeries]"
5,class,object,2,0,"[EDDTableFromAllDatasets, EDDTableFromErddap]"
6,title,object,136,0,[* The List of All Active Datasets in this ERD...
7,minLongitude (degrees_east),float64,59,12,"[nan, 1.75257, 18.0824167, -32.275, -32.2757]"
8,maxLongitude (degrees_east),float64,59,12,"[nan, 1.75257, 18.0824167, -32.275, -32.2757]"
9,longitudeSpacing (degrees_east),float64,0,171,[nan]


Comparing metadata of each server's datasets

In [6]:
# Load ARGO properties from ARGO ERDDAP server
ARGO_props = pd.read_csv("properties/ARGO_ERDDAP_dataset_metadata.csv")
# Load EMSO properties from EMSO ERDDAP server (the per-dataset metadata saved earlier in this notebook)
EMSO_props = pd.read_csv("properties/EMSO_ERDDAP_dataset_metadata.csv")

Comparing Dimension properties

In [59]:
# Per dimension name: how many ARGO rows carry it (Count) and which datasets they belong to
# (Values_list). According to the original note, the same rows can be obtained with
# ARGO_props[ ARGO_props['Row Type'] == "dimension" ].
ARGO_dimensions = (
    pd.read_csv("properties/ARGO_ERDDAP_dataset_metadata_dimensions.csv")
    .groupby('Variable Name')
    .agg(
        Count=('DatasetID', 'size'),
        Values_list=('DatasetID', list),
    )
    .reset_index()
)
ARGO_dimensions

Unnamed: 0,Variable Name,Count,Values_list
0,PRES,1,[OS_DYFAMED_1994-2014_D_TSO2]
1,depth,29,"[HF_75c7_5b60_95d8, HF_ac49_84ad_3eb6, SDC_BAL..."
2,frame,1,[traj_lonlat_init_t0]
3,latitude,33,"[HF_75c7_5b60_95d8, HF_ac49_84ad_3eb6, OACP-Ar..."
4,longitude,33,"[HF_75c7_5b60_95d8, HF_ac49_84ad_3eb6, OACP-Ar..."
5,ntraj,1,[traj_lonlat_init_t0]
6,time,32,"[HF_75c7_5b60_95d8, HF_ac49_84ad_3eb6, OS_DYFA..."


In [60]:
# Per dimension name: how many EMSO rows carry it (Count) and which datasets they belong to
# (Values_list). According to the original note, the same rows can be obtained with
# EMSO_props[ EMSO_props['Row Type'] == "dimension" ].
EMSO_dimensions = (
    pd.read_csv("properties/EMSO_ERDDAP_dataset_metadata_dimensions.csv")
    .groupby('Variable Name')
    .agg(
        Count=('DatasetID', 'size'),
        Values_list=('DatasetID', list),
    )
    .reset_index()
)
EMSO_dimensions

Unnamed: 0,Variable Name,Count,Values_list


In [46]:
# Dimension names found on both ERDDAP servers: keep the ARGO names that also
# appear in the EMSO table (index labels come from the ARGO table).
common_dimensions = (
    ARGO_dimensions
    .loc[ARGO_dimensions['Variable Name'].isin(EMSO_dimensions['Variable Name']), 'Variable Name']
    .drop_duplicates()
)
common_dimensions

Series([], Name: Variable Name, dtype: object)

In [47]:
# Dimension names found only on the ARGO ERDDAP server: keep the ARGO names
# that do not appear in the EMSO table.
dimensions_unique_to_ARGO = (
    ARGO_dimensions
    .loc[~ARGO_dimensions['Variable Name'].isin(EMSO_dimensions['Variable Name']), 'Variable Name']
    .drop_duplicates()
)
dimensions_unique_to_ARGO

0         PRES
1        depth
2        frame
3     latitude
4    longitude
5        ntraj
6         time
Name: Variable Name, dtype: object

In [48]:
# Dimension names found only on the EMSO ERDDAP server: keep the EMSO names
# that do not appear in the ARGO table.
dimensions_unique_to_EMSO = (
    EMSO_dimensions
    .loc[~EMSO_dimensions['Variable Name'].isin(ARGO_dimensions['Variable Name']), 'Variable Name']
    .drop_duplicates()
)
dimensions_unique_to_EMSO

Series([], Name: Variable Name, dtype: object)

In [61]:
# One-row overview of the dimension comparison: the number of entries in each
# of the five tables/series built for the dimension properties.
props_dim_summ = pd.DataFrame({
    label: [len(entries.index)]
    for label, entries in (
        ("ARGO_dim", ARGO_dimensions),
        ("EMSO_dim", EMSO_dimensions),
        ("common_dim", common_dimensions),
        ("ARGO_unique_dim", dimensions_unique_to_ARGO),
        ("EMSO_unique_dim", dimensions_unique_to_EMSO),
    )
})
props_dim_summ

Unnamed: 0,ARGO_dim,EMSO_dim,common_dim,ARGO_unique_dim,EMSO_unique_dim
0,7,0,0,7,0


Comparing Variable properties

In [27]:
# Variable rows exported from the ARGO ERDDAP server.
# (Same rows as ARGO_props[ARGO_props['Row Type'] == "variable"].)
# Collapse them to one row per variable name:
#   Count       -> number of rows carrying that name
#   Values_list -> the DatasetID of each of those rows
ARGO_variables = (
    pd.read_csv("properties/ARGO_ERDDAP_dataset_metadata_variables.csv")
    .groupby('Variable Name')
    .agg(
        Count=pd.NamedAgg(column='DatasetID', aggfunc='size'),
        Values_list=pd.NamedAgg(column='DatasetID', aggfunc=list),
    )
    .reset_index()
)
ARGO_variables

Unnamed: 0,Variable Name,Count,Values_list
0,Access_ordering_of_data,1,[SDC_BLS_DATA_TS_V1]
1,Access_restriction,1,[SDC_BLS_DATA_TS_V1]
2,Alternative_cruise_name,1,[SDC_BLS_DATA_TS_V1]
3,Alternative_station_name,1,[SDC_BLS_DATA_TS_V1]
4,BBP700,1,[OceanGlidersGDACTrajectories]
...,...,...,...
536,vertical_sampling_scheme,1,[ArgoFloats]
537,vn,2,"[drifter_6hour_qc, drifter_hourly_qc]"
538,wcs,1,[allDatasets]
539,wmo_inst_type,2,"[ArgoFloats, ArgoFloats-synthetic-BGC]"


In [28]:
# Variable rows exported from the EMSO ERDDAP server.
# (Same rows as EMSO_props[EMSO_props['Row Type'] == "variable"].)
# Collapse them to one row per variable name:
#   Count       -> number of rows carrying that name
#   Values_list -> the DatasetID of each of those rows
EMSO_variables = (
    pd.read_csv("properties/EMSO_ERDDAP_dataset_metadata_variables.csv")
    .groupby('Variable Name')
    .agg(
        Count=pd.NamedAgg(column='DatasetID', aggfunc='size'),
        Values_list=pd.NamedAgg(column='DatasetID', aggfunc=list),
    )
    .reset_index()
)
EMSO_variables

Unnamed: 0,Variable Name,Count,Values_list
0,AIRT,2,"[EMSO_OBSEA_Besos_Buoy_Airmar_200WX_30min, EMS..."
1,AIRT_QC,2,"[EMSO_OBSEA_Besos_Buoy_Airmar_200WX_30min, EMS..."
2,ALKY,2,"[EMSO_OBSEA_Besos_Buoy_SA8065_30min, EMSO_OBSE..."
3,ALKY_QC,2,"[EMSO_OBSEA_Besos_Buoy_SA8065_30min, EMSO_OBSE..."
4,ATMP,1,[Emso_Ligure_Dyfamed_FCO2TW]
...,...,...,...
413,velocity_upward,2,"[smartbay_obs_adcp, smartbay_obs_hour_mean]"
414,velocity_upward_qc,2,"[smartbay_obs_adcp, smartbay_obs_hour_mean]"
415,wcs,1,[allDatasets]
416,wms,1,[allDatasets]


In [23]:
# Variable names found on both ERDDAP servers: keep the ARGO names that also
# appear in the EMSO table (index labels come from the ARGO table).
common_variables = (
    ARGO_variables
    .loc[ARGO_variables['Variable Name'].isin(EMSO_variables['Variable Name']), 'Variable Name']
    .drop_duplicates()
)
common_variables

19          latitude
20         longitude
52              time
62       institution
275          CSPD_QC
           ...      
820             OSAT
821          OSAT_QC
827             TUR4
828          TUR4_QC
832    platform_name
Name: Variable Name, Length: 67, dtype: object

In [24]:
# Variable names found only on the ARGO ERDDAP server: keep the ARGO names
# that do not appear in the EMSO table.
variables_unique_to_ARGO = (
    ARGO_variables
    .loc[~ARGO_variables['Variable Name'].isin(EMSO_variables['Variable Name']), 'Variable Name']
    .drop_duplicates()
)
variables_unique_to_ARGO

0                             chla
1                          chla_qc
2            config_mission_number
3                     cycle_number
4                      data_center
                  ...             
958    hist_stop_coast_TELUK_BENOA
959                traj_iU_init_t0
960                traj_jV_init_t0
961               traj_lat_init_t0
962               traj_lon_init_t0
Name: Variable Name, Length: 474, dtype: object

In [25]:
# Variable names found only on the EMSO ERDDAP server: keep the EMSO names
# that do not appear in the ARGO table.
variables_unique_to_EMSO = (
    EMSO_variables
    .loc[~EMSO_variables['Variable Name'].isin(ARGO_variables['Variable Name']), 'Variable Name']
    .drop_duplicates()
)
variables_unique_to_EMSO

0                    CurrVelE_ADCP
1                    CurrVelN_ADCP
2                   CurrVelUp_ADCP
3                       CurrVel_QC
4                      ECHO_BEAM_1
                   ...            
3890         sbe16plus_salinity_qc
3891       sbe16plus_soundvelocity
3892    sbe16plus_soundvelocity_qc
3893         sbe16plus_temperature
3894      sbe16plus_temperature_qc
Name: Variable Name, Length: 351, dtype: object

In [64]:
# One-row overview of the variable comparison: the number of entries in each
# of the five tables/series built for the variable properties.
props_var_summ = pd.DataFrame({
    label: [len(entries.index)]
    for label, entries in (
        ("ARGO_var", ARGO_variables),
        ("EMSO_var", EMSO_variables),
        ("common_var", common_variables),
        ("ARGO_unique_var", variables_unique_to_ARGO),
        ("EMSO_unique_var", variables_unique_to_EMSO),
    )
})
props_var_summ

Unnamed: 0,ARGO_var,EMSO_var,common_var,ARGO_unique_var,EMSO_unique_var
0,541,418,67,474,351


Comparing attribute properties

In [30]:
# Attribute rows exported from the ARGO ERDDAP server.
# (Same rows as ARGO_props[ARGO_props['Row Type'] == "attribute"].)
# Collapse them to one row per attribute name:
#   Count       -> number of rows carrying that name
#   Values_list -> the DatasetID of each of those rows (a dataset can repeat)
ARGO_attributes = (
    pd.read_csv("properties/ARGO_ERDDAP_dataset_metadata_attributes.csv")
    .groupby('Attribute Name')
    .agg(
        Count=pd.NamedAgg(column='DatasetID', aggfunc='size'),
        Values_list=pd.NamedAgg(column='DatasetID', aggfunc=list),
    )
    .reset_index()
)
ARGO_attributes

Unnamed: 0,Attribute Name,Count,Values_list
0,Acknowledgements,6,"[SDC_NAT_CLIM_TS_V1_025_m, SDC_NAT_CLIM_TS_V1_..."
1,Author_e_mail,23,"[SDC_BLS_CLIM_TS_V1_m, SDC_BLS_CLIM_TS_V1_s, S..."
2,CDI,4,"[SDC_NAT_CLIM_TS_V1_025_m, SDC_NAT_CLIM_TS_V1_..."
3,CDO,4,"[SDC_NAT_CLIM_TS_V1_025_m, SDC_NAT_CLIM_TS_V1_..."
4,C_format,59,"[ArgoFloats, ArgoFloats, ArgoFloats, ArgoFloat..."
...,...,...,...
309,version,2,"[ArgoFloats-reference, OACP-Argo-Global]"
310,w_surf_option,1,[ariane_trajectories_qualitative]
311,wmo_instrument_type,1,[copernicus-fos]
312,wmo_platform_code,2,"[OS_DYFAMED_1994-2014_D_TSO2, drifter_6hour_qc]"


In [31]:
# Attribute rows exported from the EMSO ERDDAP server.
# (Same rows as EMSO_props[EMSO_props['Row Type'] == "attribute"].)
# Collapse them to one row per attribute name:
#   Count       -> number of rows carrying that name
#   Values_list -> the DatasetID of each of those rows (a dataset can repeat)
EMSO_attributes = (
    pd.read_csv("properties/EMSO_ERDDAP_dataset_metadata_attributes.csv")
    .groupby('Attribute Name')
    .agg(
        Count=pd.NamedAgg(column='DatasetID', aggfunc='size'),
        Values_list=pd.NamedAgg(column='DatasetID', aggfunc=list),
    )
    .reset_index()
)
EMSO_attributes

Unnamed: 0,Attribute Name,Count,Values_list
0,Conventions,220,"[BB_505_ADCP, BB_567_SBE56, BB_584_SBE56, BB_5..."
1,DM_indicator,29,"[Emso_Ligure_Dyfamed_SedimentTrap, Emso_Ligure..."
2,Easternmost_Easting,159,"[BB_505_ADCP, BB_567_SBE56, BB_584_SBE56, BB_5..."
3,Northernmost_Northing,159,"[BB_505_ADCP, BB_567_SBE56, BB_584_SBE56, BB_5..."
4,Processing_level,34,"[Emso_Azores_Chemini_IRON, Emso_Azores_Chemini..."
...,...,...,...
266,valid_min,1646,"[E1M3A_20070528_20080226, E1M3A_20070528_20080..."
267,valid_range,10,[Emso_Western_Ligurian_Bathybot_ADCP_NetCDF_20...
268,wfd_waterbody_name,7,"[smartbay_obs_acoustic, smartbay_obs_adcp, sma..."
269,wfd_waterbody_type,7,"[smartbay_obs_acoustic, smartbay_obs_adcp, sma..."


In [32]:
# Attribute names found on both ERDDAP servers: keep the ARGO names that also
# appear in the EMSO table (index labels come from the ARGO table).
common_attributes = (
    ARGO_attributes
    .loc[ARGO_attributes['Attribute Name'].isin(EMSO_attributes['Attribute Name']), 'Attribute Name']
    .drop_duplicates()
)
common_attributes

6                Conventions
13              DM_indicator
16       Easternmost_Easting
21     Northernmost_Northing
22              QC_procedure
               ...          
304                    units
305          update_interval
307                valid_max
308                valid_min
312        wmo_platform_code
Name: Attribute Name, Length: 135, dtype: object

In [33]:
# Attribute names found only on the ARGO ERDDAP server: keep the ARGO names
# that do not appear in the EMSO table.
attributes_unique_to_ARGO = (
    ARGO_attributes
    .loc[~ARGO_attributes['Attribute Name'].isin(EMSO_attributes['Attribute Name']), 'Attribute Name']
    .drop_duplicates()
)
attributes_unique_to_ARGO

0         Acknowledgements
1            Author_e_mail
2                      CDI
3                      CDO
4                 C_format
              ...         
306    user_manual_version
309                version
310          w_surf_option
311    wmo_instrument_type
313                 zsigma
Name: Attribute Name, Length: 179, dtype: object

In [34]:
# Attribute names found only on the EMSO ERDDAP server: keep the EMSO names
# that do not appear in the ARGO table.
attributes_unique_to_EMSO = (
    EMSO_attributes
    .loc[~EMSO_attributes['Attribute Name'].isin(ARGO_attributes['Attribute Name']), 'Attribute Name']
    .drop_duplicates()
)
attributes_unique_to_EMSO

4        Processing_level
5            QC_indicator
17     ancllary_variables
20      atlantos_EOV_name
21       atlantos_EOV_urn
              ...        
261      type_of_analysis
262           uncertainty
267           valid_range
268    wfd_waterbody_name
269    wfd_waterbody_type
Name: Attribute Name, Length: 136, dtype: object

In [63]:
# One-row overview of the attribute comparison: the number of entries in each
# of the five tables/series built for the attribute properties.
props_attr_summ = pd.DataFrame({
    label: [len(entries.index)]
    for label, entries in (
        ("ARGO_attr", ARGO_attributes),
        ("EMSO_attr", EMSO_attributes),
        ("common_attr", common_attributes),
        ("ARGO_unique_attr", attributes_unique_to_ARGO),
        ("EMSO_unique_attr", attributes_unique_to_EMSO),
    )
})
props_attr_summ

Unnamed: 0,ARGO_attr,EMSO_attr,common_attr,ARGO_unique_attr,EMSO_unique_attr
0,314,271,135,179,136


In [65]:
# Show the variable-property comparison summary again.
props_var_summ

Unnamed: 0,ARGO_var,EMSO_var,common_var,ARGO_unique_var,EMSO_unique_var
0,541,418,67,474,351


In [62]:
# Show the dimension-property comparison summary again.
props_dim_summ

Unnamed: 0,ARGO_dim,EMSO_dim,common_dim,ARGO_unique_dim,EMSO_unique_dim
0,7,0,0,7,0


### Analysis results:

- requires knowledge on ERDDAP servers & netCDF file format  
- ERDDAP server offers 170 datasets  
- mainly NetCDF data
- exploration of 'EMSO_ERDDAP_dataset_metadata.csv' and 'EMSO_ERDDAP_dataset_data_metadata(2).csv', which show information about the properties in each dataset (such as type, example, unique values, ...), shows:
    - in some cases, use of URLs (e.g. for creators or ORCIDs)  
    - use of URLs of known, standard licenses 
    - opportunity for more use of persistent identifiers (e.g. station codes, ROR-id for institutes, ...)

#### Comparison of properties of data between the ARGO and EMSO ERDDAP servers

- the ERDDAP server interface returns data in a semi-standard manner
- 'allDatasets' dataset returns information about the datasets that are available through both servers. The metadata to describe this information is the same in both servers.  
- both ERDDAP servers return netCDF datasets, whose data properties can be divided into three categories: dimensions, variables and attributes 
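- the data properties of the datasets have some degree of overlap (properties present on both servers):
    - 67 variable properties 
    - 135 attribute properties 
- but there is still a large number of properties that occur on only one of the two servers:
    - 7 dimension properties (all on ARGO; no dimension properties were found for EMSO)
    - 825 variable properties (474 on ARGO, 351 on EMSO)
    - 315 attribute properties (179 on ARGO, 136 on EMSO)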
- properties were analysed based on name only; no information was available about the properties' meaning/semantics 

