In [1]:
import pandas as pd
from connec_functions import GDB

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# SPARQL endpoint

#### notes/findings:
- simple / general / large SPARQL queries result in a 502 error
- querying the SPARQL endpoint requires knowledge of the data model
- there is linking to externally defined standard terms = good
    - leveraging use of linked data 
- usage of internally defined predicates = good, but less good than external standard terms (see point 2: querying them requires knowledge of the data model)
    - not known by external machines (e.g. dct:title vs ifremer:thisisourname)
    - solution would require community effort to develop standard data model for described entity kinds
- identifier for publisher information can be improved, for example 'http://www.argodatamgt.org/Data-Mgt-Team/ADMT-team-and-Executive-Committee'
  - could alternatively use ROR IDs for institutes and ORCID iDs for people
  - currently links to just an HTML page (no ttl or json-ld via content negotiation); it could be described as linked data
- https://co.ifremer.fr/co/argo-linked-data/doc/argo-floats.ttl#Datacenter not in the described ontology

- data granularity: file level 
  - data files are 'accessible' with dcat:downloadUrl within a dcat:distribution
  (data within file is not machine readable)

In [2]:
# Define the SPARQL endpoint URL (Ifremer Argo query endpoint)
sparql_endpoint_url = "https://sparql.ifremer.fr/argo/query"
# `gdb` is the helper object every later cell uses via gdb.execute_to_df(<file>.sparql).
# NOTE(review): "endpoint_queries" is presumably the folder holding those .sparql files — confirm in connec_functions.
gdb = GDB(sparql_endpoint_url, "endpoint_queries")

In [3]:
# general exploration: the result of general.sparql is shown as a table with
# subject / predicate / object columns (s, p, o)
gdb.execute_to_df("general.sparql")

Unnamed: 0,s,p,o
0,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://www.w3.org/ns/dcat#Catalog
1,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,https://co.ifremer.fr/co/argo-linked-data/doc/...
2,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://www.w3.org/2000/01/rdf-schema#label,aoml
3,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://purl.org/dc/terms/description,\n Catalog of the Argo data...
4,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://purl.org/dc/terms/publisher,http://www.argodatamgt.org/Data-Mgt-Team/ADMT-...
5,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://purl.org/dc/terms/title,aoml Argo DAC metadata
6,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://www.w3.org/ns/dcat#dataset,https://fleetmonitoring.euro-argo.eu/float/190...
7,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://www.w3.org/ns/dcat#dataset,https://fleetmonitoring.euro-argo.eu/float/190...
8,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://www.w3.org/ns/dcat#dataset,https://fleetmonitoring.euro-argo.eu/float/190...
9,https://argo.ucsd.edu/data/data-from-gdacs#aoml,http://www.w3.org/ns/dcat#dataset,https://fleetmonitoring.euro-argo.eu/float/190...


### Catalog

In [None]:
# catalog predicates: display the result of catalog_predicates.sparql
gdb.execute_to_df("catalog_predicates.sparql")

In [None]:
# catalogs: keep the result in `catalog_info`; its 'publisher' column is reused
# in the "Publisher information" section below
catalog_info = gdb.execute_to_df("catalog-info.sparql")
catalog_info

In [None]:
# Datasets per catalog: fetch the catalog/dataset rows, then count the number of
# distinct values in every other column for each catalog.
catalog_datasets = gdb.execute_to_df("catalog-datasets.sparql")
datasets_by_catalog = catalog_datasets.groupby('catalog')
catalogs = datasets_by_catalog.nunique()
catalogs

### Publisher information

In [None]:
# Use the publisher IRI of the first catalog row as an example.
# .iloc[0] selects by position, so this works whatever index `catalog_info`
# carries (a plain [0] is a label lookup and fails when no row is labelled 0).
example_publisher = catalog_info['publisher'].iloc[0]
# Point a second helper at the publisher IRI itself to test whether it can be queried.
gdb_publisher = GDB(example_publisher, "endpoint_queries")
#gdb_publisher.execute_to_df("general.sparql")
# finding: the publisher IRI is not available as RDF, so the call above is left disabled

In [None]:
#gdb.execute_to_df("publisher.sparql")
#not available as RDF

### Datasets

In [None]:
# dataset predicates: display the result of dataset_predicates.sparql
gdb.execute_to_df("dataset_predicates.sparql") 

In [None]:
# datasets: display the result of dataset-info.sparql
gdb.execute_to_df("dataset-info.sparql")

### Standard terms

#### outside-institute-standard terms

In [None]:
# For every catalog, print its short name followed by a per-(predicate, object)
# count of distinct values from that catalog's external-term query.
for catalog in catalogs.index:
    # short name = text after the last '#' of the catalog identifier
    name = catalog.split('#')[-1]
    print(name)
    # each catalog has its own query file named "<short name>-external-term.sparql"
    query_file = f"{name}-external-term.sparql"
    result = gdb.execute_to_df(query_file)
    term_counts = result.groupby(['p', 'o']).nunique()
    print(term_counts)

#### inside-institute standard-terms

In [None]:
#examples: <https://co.ifremer.fr/co/argo-linked-data/doc/argo-floats.ttl#Datacenter> OR <https://argo.ucsd.edu/data/argo-software-tools#WJO>

### Dataset distributions 
'HTTP Error 502: Proxy Error' when querying dcat:Distribution entity itself 


In [None]:
# distributions of datasets in the meds catalog (meds catalog contains 648 datasets)
# Keep the result: the following cell summarises `meds_distributions` per predicate,
# so it must be assigned here for the notebook to survive Restart & Run All.
# NOTE(review): assumes distribution.sparql is the meds-specific query, as the comment above states — confirm.
meds_distributions = gdb.execute_to_df("distribution.sparql")

# TODO: run the distribution query for every catalog in `catalogs.index` (not implemented yet).
    

In [None]:
# for each predicate 'p', count the distinct values in the remaining columns
meds_distributions.groupby('p').nunique()

In [None]:
# distributions of datasets in the kordi catalog (Kordi catalog contains 115 datasets)
# The query walks kordi catalog -> dcat:dataset -> dcat:distribution and selects every
# distinct predicate/object pair (?p ?o) attached to those distributions.
sparql_query = """
SELECT DISTINCT 
    ?p ?o
WHERE {
        <https://argo.ucsd.edu/data/data-from-gdacs#kordi> <http://www.w3.org/ns/dcat#dataset> ?dataset . 
        ?dataset <http://www.w3.org/ns/dcat#distribution> ?distribution . 
        ?distribution ?p ?o . 
    }
"""

# NOTE(review): `query` is neither imported nor defined in the visible part of this
# notebook (only `GDB` is imported) — confirm where it comes from; on a fresh kernel
# this line would raise NameError.
kordi_distributions = query(sparql_endpoint_url, sparql_query)
kordi_distributions

In [None]:
# for each predicate 'p' found on the kordi distributions, count the distinct
# values in the remaining columns
distribution_predicates = kordi_distributions.groupby('p').nunique()
distribution_predicates