In [1]:
import pandas as pd
from connec_functions import GDB

### ARGO SPARQL Endpoint

#### Notes/Findings:
- simple / general / large SPARQL queries result in a 502 error
- querying the SPARQL endpoint requires knowledge of the data model
- there is linking to externally defined standard terms = good
    - leveraging use of linked data 
- use of internally defined predicates = good, but less ideal (see the second finding above: querying requires knowledge of the data model)
    - not known by external machines (e.g. dct:title vs ifremer:thisisourname)
    - solution would require community effort to develop standard data model for described entity kinds
- the identifier used for publisher information can be improved, for example 'http://www.argodatamgt.org/Data-Mgt-Team/ADMT-team-and-Executive-Committee'
  - could alternatively use ROR IDs for institutes and ORCID iDs for people
  - currently links to just an HTML page (no TTL or JSON-LD via content negotiation); it could be described as linked data
- https://co.ifremer.fr/co/argo-linked-data/doc/argo-floats.ttl#Datacenter is not defined in the described ontology

- data granularity: file level
  - data files are 'accessible' via dcat:downloadURL within a dcat:distribution
    (data points are not described in RDF - to use the data itself, the file needs to be downloaded and read into memory)

In [2]:
# Define the SPARQL endpoint URL (the ARGO endpoint at sparql.ifremer.fr)
sparql_endpoint_url = "https://sparql.ifremer.fr/argo/query"
# Client object used by the query cells below.
# NOTE(review): the second argument is presumably the folder holding the *.sparql
# query files that execute_to_df() is given by name - confirm in connec_functions.
gdb = GDB(sparql_endpoint_url, "endpoint_queries")

In [None]:
# General exploration: run the query stored as "general.sparql" against the ARGO endpoint.
# NOTE(review): the findings at the top of the notebook report that general/large
# queries end in a 502 error - confirm whether this cell is one of them.
gdb.execute_to_df("general.sparql")

### Catalog

In [None]:
# List the predicates used to describe the catalog (query: "catalog_predicates.sparql").
gdb.execute_to_df("catalog_predicates.sparql")

In [None]:
# List the catalogs available through the SPARQL endpoint, with label, description
# and publisher information. The result is kept because a later cell reads its
# 'publisher' column.
catalog_info = gdb.execute_to_df("catalog-info.sparql")
catalog_info

In [4]:
# List the number of datasets in each of the catalogs.
# `catalog_datasets` is reused by later cells to pick example datasets.
catalog_datasets = gdb.execute_to_df("catalog-datasets.sparql")
# nunique() counts distinct values per remaining column for every catalog; in the
# captured output that is a single 'dataset' column, i.e. distinct datasets per catalog.
catalogs = catalog_datasets.groupby('catalog').nunique()
catalogs

Unnamed: 0_level_0,dataset
catalog,Unnamed: 1_level_1
https://argo.ucsd.edu/data/data-from-gdacs#aoml,8280
https://argo.ucsd.edu/data/data-from-gdacs#bodc,817
https://argo.ucsd.edu/data/data-from-gdacs#coriolis,3498
https://argo.ucsd.edu/data/data-from-gdacs#csio,523
https://argo.ucsd.edu/data/data-from-gdacs#csiro,1086
https://argo.ucsd.edu/data/data-from-gdacs#incois,490
https://argo.ucsd.edu/data/data-from-gdacs#jma,1880
https://argo.ucsd.edu/data/data-from-gdacs#kma,258
https://argo.ucsd.edu/data/data-from-gdacs#kordi,115
https://argo.ucsd.edu/data/data-from-gdacs#meds,648


### Publisher information

In [None]:
# Check whether the publisher URL itself offers information in RDF.
# .iloc[0] picks the first row by position, so this works whatever index labels
# `catalog_info` carries (plain [0] is a label lookup and fails without a label 0).
example_publisher = catalog_info['publisher'].iloc[0]
# Treat the publisher URL as if it were a SPARQL endpoint and run the general query against it.
gdb_publisher = GDB(example_publisher, "endpoint_queries")
gdb_publisher.execute_to_df("general.sparql")
# Finding: not available as RDF

In [None]:
# Ask the ARGO endpoint itself for publisher information (query: "publisher.sparql").
gdb.execute_to_df("publisher.sparql")
# Finding: not available as RDF

### Datasets

In [None]:
# Dataset predicates
# NOTE(review): this call's result is neither stored nor the last expression of the
# cell, so the notebook never displays it.
gdb.execute_to_df("dataset_predicates.sparql") 
# Datasets info (last expression, so this is the frame the cell would display)
gdb.execute_to_df("dataset-info.sparql")

# --> Trying to get an overview of the predicates associated with a dcat:Dataset, or
# listing all datasets with their title, doesn't work: the query takes too long to execute.

In [5]:
# Example datasets: because the endpoint-wide predicate overview is too slow, query
# the predicates of the first few datasets (DataFrame.head() default: 5 rows) one by one.
for dataset in catalog_datasets.head()['dataset']:
    # The dataset IRI is handed to the query through the `dataset` keyword argument.
    example_dataset = gdb.execute_to_df("example_dataset_predicates.sparql", dataset=dataset)
    print(f"predicates of {dataset}")
    # Column 'p' holds one predicate per returned row; repeated predicates are printed repeatedly.
    for predicate in example_dataset['p']:
        print(f"    {predicate}")

predicates of https://fleetmonitoring.euro-argo.eu/float/1900045
    http://www.w3.org/1999/02/22-rdf-syntax-ns#type
    http://www.w3.org/1999/02/22-rdf-syntax-ns#type
    http://www.w3.org/1999/02/22-rdf-syntax-ns#type
    http://purl.org/dc/terms/description
    http://purl.org/dc/terms/title
    http://xmlns.com/foaf/0.1/maker
    https://co.ifremer.fr/co/argo-linked-data/doc/argo-floats.ttl#type
    http://purl.org/dc/terms/identifier
    http://purl.org/dc/terms/modified
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
    http://www.w3.org/ns/dcat#distribution
  

In [6]:
# Print the IRIs of the example datasets (first rows of `catalog_datasets`), one per line.
for dataset in catalog_datasets.head()['dataset']:
    print(dataset)

https://fleetmonitoring.euro-argo.eu/float/1900045
https://fleetmonitoring.euro-argo.eu/float/1900145
https://fleetmonitoring.euro-argo.eu/float/1900185
https://fleetmonitoring.euro-argo.eu/float/1900492
https://fleetmonitoring.euro-argo.eu/float/1900500


In [7]:
# Print every predicate/object pair returned for one example dataset (float 1900045).
example_dataset = gdb.execute_to_df(
    "example_dataset_predicates.sparql",
    dataset="https://fleetmonitoring.euro-argo.eu/float/1900045",
)
# Plain string: the header has no placeholders, so no f-string is needed.
print("predicate --> object")
# Walk the 'p' and 'o' columns in lockstep instead of iterrows(): no unused row
# index and no per-row Series construction.
for predicate, obj in zip(example_dataset['p'], example_dataset['o']):
    print(f"{predicate} --> {obj}")

predicate --> object
http://www.w3.org/1999/02/22-rdf-syntax-ns#type --> http://www.w3.org/ns/dcat#Dataset
http://www.w3.org/1999/02/22-rdf-syntax-ns#type --> http://www.w3.org/ns/sosa/Platform
http://www.w3.org/1999/02/22-rdf-syntax-ns#type --> https://co.ifremer.fr/co/argo-linked-data/doc/argo-floats.ttl#ArgoFloat
http://purl.org/dc/terms/description --> 
                Argo float WMO 1900045 metadata
                
http://purl.org/dc/terms/title --> Argo float 1900045
http://xmlns.com/foaf/0.1/maker --> http://vocab.nerc.ac.uk/collection/R24/current/WRC
https://co.ifremer.fr/co/argo-linked-data/doc/argo-floats.ttl#type --> http://vocab.nerc.ac.uk/collection/R23/current/APEX
http://purl.org/dc/terms/identifier --> 1900045
http://purl.org/dc/terms/modified --> 2016-05-27T00:35:37
http://www.w3.org/ns/dcat#distribution --> https://fleetmonitoring.euro-argo.eu/float/1900045#D1900045_001
http://www.w3.org/ns/dcat#distribution --> https://fleetmonitoring.euro-argo.eu/float/1900045#D190

### Standard terms

**TODO:** rerun the queries from here onwards (they failed with a 502 proxy error, possibly caused by too many requests).

#### Standard terms defined outside the institute

In [13]:
# External (outside-institute) standard terms for the aoml catalog.
# The catalog is passed explicitly, the same way the coriolis cell below does it;
# previously the argument was commented out, so the result named `aoml_result`
# was not restricted to the aoml catalog.
aoml_result = gdb.execute_to_df(
    "external-term.sparql",
    catalog="https://argo.ucsd.edu/data/data-from-gdacs#aoml",
)
aoml_result

HTTPError: HTTP Error 502: Proxy Error

In [None]:
# External (outside-institute) standard terms for the coriolis catalog.
coriolis_result = gdb.execute_to_df(
    "external-term.sparql",
    catalog="https://argo.ucsd.edu/data/data-from-gdacs#coriolis",
)
# For every (predicate, object) pair, count the distinct values in the remaining columns.
coriolis_result.groupby(['p', 'o']).nunique()

#### Standard terms defined inside the institute

In [None]:
# Examples of institute-internal terms:
#   <https://co.ifremer.fr/co/argo-linked-data/doc/argo-floats.ttl#Datacenter>
#   <https://argo.ucsd.edu/data/argo-software-tools#WJO>

### Dataset distributions 
Querying the dcat:Distribution entities directly results in 'HTTP Error 502: Proxy Error'.


In [None]:
# Distributions of datasets in the meds catalog (the meds catalog contains 648 datasets).
# NOTE(review): presumably "distribution.sparql" is scoped to the meds catalog -
# confirm against the query file.
# The result is stored under the name the following cell aggregates; previously it
# was discarded, leaving `meds_distributions` unassigned.
meds_distributions = gdb.execute_to_df("distribution.sparql")

# TODO: this loop only derives each catalog's short name (the IRI fragment after
# '#') and does not use it yet.
for catalog in catalogs.index:
    name = catalog.split('#')[-1]

# Last expression of the cell, so the fetched distributions are displayed.
meds_distributions

    

In [None]:
# For each predicate `p`, count the distinct values in the remaining columns.
# Requires `meds_distributions` to have been assigned by an earlier cell.
meds_distributions.groupby('p').nunique()

In [None]:
# Distributions of datasets in the kordi catalog (the kordi catalog contains 115 datasets).
# The inline query selects every distinct predicate/object pair found on the
# distributions of the datasets listed in the kordi catalog.
sparql_query = """
SELECT DISTINCT 
    ?p ?o
WHERE {
        <https://argo.ucsd.edu/data/data-from-gdacs#kordi> <http://www.w3.org/ns/dcat#dataset> ?dataset . 
        ?dataset <http://www.w3.org/ns/dcat#distribution> ?distribution . 
        ?distribution ?p ?o . 
    }
"""

# NOTE(review): `query` is not brought into scope by the notebook's import cell
# (only `GDB` is imported there) - confirm where it is defined, otherwise this
# line fails with a NameError on a fresh kernel.
kordi_distributions = query(sparql_endpoint_url, sparql_query)
kordi_distributions

In [None]:
# For each predicate `p` on the kordi distributions, count the distinct values in
# the remaining columns.
distribution_predicates = kordi_distributions.groupby('p').nunique()
distribution_predicates

In [None]:
# Define the SPARQL endpoint URL (a second endpoint, at meta.icos-cp.eu, for comparison)
# NOTE(review): this rebinds `sparql_endpoint_url` and `gdb`; any earlier cell that
# is re-run after this one will query this endpoint instead of the ARGO one.
sparql_endpoint_url = "https://meta.icos-cp.eu/sparql"
gdb = GDB(sparql_endpoint_url, "endpoint_queries")
# Same general exploration query as used against the ARGO endpoint.
gdb.execute_to_df("general.sparql") 
# curl https://meta.icos-cp.eu/sparql -X POST --data 'query=PREFIX%20cpmeta%3A%20%3Chttp%3A%2F%2Fmeta.icos-cp.eu%2Fontologies%2Fcpmeta%2F%3E%0APREFIX%20rdfs%3A%20%3Chttp%3A%2F%2Fwww.w3.org%2F2000%2F01%2Frdf-schema%23%3E%0APREFIX%20cpst%3A%20%3Chttp%3A%2F%2Fmeta.icos-cp.eu%2Fontologies%2Fstationentry%2F%3E%0ASELECT%0A%28coalesce%28%3FtcName%2C%20%3FhoName%29%20as%20%3FName%29%0A%3FTheme%0A%28coalesce%28%3FtcClass%2C%20%3FhoClass%29%20as%20%3FClass%29%0A%28coalesce%28%3FtcCountry%2C%20%3FhoCountry%29%20as%20%3FCountry%29%0AFROM%20%3Chttp%3A%2F%2Fmeta.icos-cp.eu%2Fresources%2Fstationentry%2F%3E%0AFROM%20%3Chttp%3A%2F%2Fmeta.icos-cp.eu%2Fontologies%2Fstationentry%2F%3E%0AFROM%20%3Chttp%3A%2F%2Fmeta.icos-cp.eu%2Fresources%2Ficos%2F%3E%0AWHERE%20%7B%0A%09%3Fs%20cpst%3AhasCountry%20%3FhoCountry%20%3B%20cpst%3AhasLongName%20%3FhoName%20%3B%20cpst%3AhasStationClass%20%3FhoClass0%20%3B%20a%2Frdfs%3Alabel%20%3FTheme%20.%0A%09optional%7B%0A%09%09%3Fs%20cpst%3AhasProductionCounterpart%20%3Fprods%20.%0A%09%09bind%28iri%28%3Fprods%29%20as%20%3Fprod%29%20.%0A%09%09optional%7B%3Fprod%20cpmeta%3AhasName%20%3FtcName%20%7D%0A%09%09optional%7B%3Fprod%20cpmeta%3AhasStationClass%20%3FtcClass%7D%0A%09%09optional%7B%3Fprod%20cpmeta%3AcountryCode%20%3FtcCountry%7D%0A%09%7D%0A%09bind%28if%28contains%28%3FhoClass0%2C%20%27Ass%27%29%2C%20%27Associated%27%2C%20%3FhoClass0%29%20as%20%3FhoClass%29%0A%09filter%20exists%20%7B%3Fs%20cpst%3AhasShortName%20%5B%5D%7D%0A%7D%0Aorder%20by%20%3FTheme%20%3FName'
# different compared to ARGO-ifremer SPARQL-endpoint (curl -X POST "https://sparql.ifremer.fr/argo/query" --data-urlencode "query=select ?s ?o ?p where{?s ?o ?p.} limit 10")
# curl -X POST "https://meta.icos-cp.eu/sparql" --data-urlencode "query=select ?s ?o ?p where{?s ?o ?p.} limit 10" --> does work ...