In [None]:
# Get all metadata structure from FDP using the API
# V1.2

import requests
import rdflib
from rdflib import Graph, Literal, Namespace, URIRef, BNode
import os 
from datetime import datetime
import re

# Date stamp (YYYY-MM-DD, local time) used in the backup folder name
today_date = datetime.now().strftime('%Y-%m-%d')

# RDF vocabularies used to navigate the FDP metadata
LDP = Namespace('http://www.w3.org/ns/ldp#')
RDF = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
DCT = Namespace('http://purl.org/dc/terms/')
DCAT = Namespace('http://www.w3.org/ns/dcat#')
ADMS = Namespace('http://www.w3.org/ns/adms#')
HEALTHDCATAP = Namespace('http://healthdataportal.eu/ns/health#')

# Modify here the FDP API URL
# fdpURL = "http://ehelse.healthdataportal.eu/"
# fdpURL = "http://fdp1.healthdataportal.eu/"
fdpURL = "http://backup.healthdataportal.eu/"

# Create a folder (inside the current working directory) to save the files
folder_name = f'FDP-Backup-{today_date}'
current_directory = os.getcwd()
folder_path = os.path.join(current_directory, folder_name)
# exist_ok=True makes re-running on the same day safe without a separate
# (race-prone) existence check; it still raises if the path is a regular file.
os.makedirs(folder_path, exist_ok=True)

print(f'Backup process started for {fdpURL}')

# Function to sanitize file and folder names
def sanitize_filename(name):
    """Return a lowercase, filesystem-friendly version of ``name``.

    Processing steps, in order:

    1. Drop the first ``@`` and everything after it on that line.
    2. Remove every character that is not an ASCII letter, digit, hyphen,
       underscore or space.
    3. Trim leading and trailing spaces.
    4. Replace the remaining spaces with underscores and lowercase the result.

    The result may be an empty string (e.g. when ``name`` contains no ASCII
    letters or digits); callers should be prepared to substitute a fallback.
    """
    name = re.sub(r'@.*', '', name)
    # Hyphen is placed last in the class so it is always a literal, never a range.
    name = re.sub(r'[^A-Za-z0-9_ -]', '', name)
    # Trim BEFORE turning spaces into underscores; doing it afterwards would
    # leave stray leading/trailing underscores (e.g. "Name @en" -> "name_").
    return name.strip().replace(' ', '_').lower()


# Get all catalogues: download the FDP root document, save it, and collect the
# resources listed by its LDP DirectContainer(s).

headers = {'Accept': 'text/turtle'}
# A timeout prevents the backup from hanging forever on an unresponsive server.
res = requests.get(url=fdpURL, headers=headers, timeout=60)
# Fail early with a clear HTTP error instead of trying to parse an error page.
res.raise_for_status()
# Turtle is UTF-8 by specification. Without an explicit charset in the
# response, requests decodes text/* bodies as ISO-8859-1, which corrupts
# non-ASCII characters in titles.
res.encoding = 'utf-8'

fdpStore = Graph()
fdpStore.parse(data=res.text, format="turtle")

# Start from an empty list so the script still works (with zero catalogues)
# when the root document has no DirectContainer, and accumulate members from
# every container instead of keeping only the last one.
allCatalogues = []
for container in fdpStore.subjects(RDF.type, LDP.DirectContainer):
    for member in fdpStore.objects(container, LDP.contains):
        if member not in allCatalogues:
            allCatalogues.append(member)

filename = 'FDP.ttl'  # Make sure this is a valid filename
full_path = os.path.join(folder_path, filename)
fdpStore.serialize(full_path, format="turtle")


print(f'Found {len(allCatalogues)} catalogues.')

# Get catalogue: for every catalogue found in the FDP root, save the catalogue
# itself plus one Turtle file per dataset (with its distributions, samples and
# analytics merged in). Everything is also merged into fdpStore.


def _fetch_graph(url):
    """Download ``url`` as Turtle and return it parsed into a new Graph.

    Raises a ``requests`` exception on network/HTTP failure; rdflib parser
    errors propagate unchanged. Callers decide how to report and recover.
    """
    response = requests.get(url=url, headers=headers, timeout=60)
    response.raise_for_status()
    # Turtle is UTF-8 by specification; without an explicit charset requests
    # would decode text/* as ISO-8859-1 and garble non-ASCII titles.
    response.encoding = 'utf-8'
    graph = Graph()
    graph.parse(data=response.text, format="turtle")
    return graph


for index, catalogue in enumerate(allCatalogues):

    # Debug aid: uncomment to process only the first catalogue.
    # if index > 0:
    #     break

    try:
        catalogueStore = _fetch_graph(catalogue)
    except Exception as e:
        print(f"Error processing catalogue {catalogue}: {e}")
        continue  # Skip to next catalogue
    fdpStore += catalogueStore  # merge graph

    # Extracting catalogue details. Reset per catalogue so that a document
    # without a dcat:Catalog subject neither crashes nor reuses the previous
    # catalogue's values.
    catalogueTitles = []
    allDatasets = []
    for catalogueNode in catalogueStore.subjects(RDF.type, DCAT.Catalog):
        catalogueTitles = list(catalogueStore.objects(catalogueNode, DCT.title))
        allDatasets = list(catalogueStore.objects(catalogueNode, DCAT.dataset))
        # Fall back to the resource URI when the catalogue has no title.
        catalogueLabel = catalogueTitles[0] if catalogueTitles else catalogueNode
        print(f"Catalogue: Saving {catalogueLabel}.. with {len(allDatasets)} datasets.")

    # Folder/file name: sanitized title, or the catalogue index when there is
    # no title or the title sanitizes to an empty string.
    catalogue_title = sanitize_filename(str(catalogueTitles[0])) if catalogueTitles else ''
    if not catalogue_title:
        catalogue_title = index

    # Create a subfolder for the catalogue
    catalogue_folder = os.path.join(folder_path, f'catalogue_{catalogue_title}')
    os.makedirs(catalogue_folder, exist_ok=True)

    # Save the catalogue RDF file
    catalogue_file = os.path.join(catalogue_folder, f'catalogue_{catalogue_title}.ttl')
    catalogueStore.serialize(catalogue_file, format="turtle")

    # File names already written for this catalogue, so two datasets with the
    # same (or no) title do not overwrite each other.
    used_dataset_names = set()

    # Iterating datasets within the catalogue
    for dataset_index, dataset in enumerate(allDatasets):
        try:
            datasetStore = _fetch_graph(dataset)
        except Exception as e:
            print(f"Failed to get content for {dataset}: {e}")
            continue  # Skip to next dataset
        fdpStore += datasetStore  # merge graph

        # Reset per dataset for the same reason as the catalogue values above.
        datasetTitles = []
        allDistributions = []
        allSamples = []
        allAnalytics = []
        for datasetClass in datasetStore.subjects(RDF.type, DCAT.Dataset):
            datasetTitles = list(datasetStore.objects(datasetClass, DCT.title))
            allDistributions = list(datasetStore.objects(datasetClass, DCAT.distribution))
            allSamples = list(datasetStore.objects(datasetClass, ADMS.sample))
            allAnalytics = list(datasetStore.objects(datasetClass, HEALTHDCATAP.analytics))

        # Iterating through datasets' components: distributions, samples, analytics
        datasetSubClasses = [('distribution', allDistributions, DCAT.Distribution),
                             ('sample', allSamples, ADMS.Sample),
                             ('analytics', allAnalytics, HEALTHDCATAP.Analytics)]

        for dcatPropertyName, allItems, dcatClass in datasetSubClasses:
            for item in allItems:
                try:
                    tempStore = _fetch_graph(item)
                except Exception as e:
                    print(f"Failed to process {dcatPropertyName} {item}: {e}")
                    continue
                for compClass in tempStore.subjects(RDF.type, dcatClass):
                    compTitles = list(tempStore.objects(compClass, DCT.title))
                    # An untitled component is still backed up; only the
                    # progress message falls back to its URI.
                    compLabel = compTitles[0] if compTitles else compClass
                    print(f"{dcatPropertyName.capitalize()}: Saving: {compLabel}..")
                datasetStore += tempStore  # Merge graph
                fdpStore += tempStore  # merge graph

        # File name: sanitized title, or a catalogue/dataset index pair when
        # the dataset has no usable title.
        dataset_title = sanitize_filename(str(datasetTitles[0])) if datasetTitles else ''
        if not dataset_title:
            dataset_title = f'{index}_{dataset_index}'
        if dataset_title in used_dataset_names:
            dataset_title = f'{dataset_title}_{dataset_index}'
        used_dataset_names.add(dataset_title)

        # Export one RDF Turtle file per dataset, including its associated
        # distributions, samples and analytics
        dataset_file = os.path.join(catalogue_folder, f'dataset_{dataset_title}.ttl')
        datasetStore.serialize(dataset_file, format="turtle")

print('Backup done!')

# Write everything merged into fdpStore as a single Turtle file; the relative
# path places it in the current working directory.

fdpStore.serialize(destination='all_content.ttl', format="turtle")



Backup process started for http://backup.healthdataportal.eu/
Found 12 catalogues.
Catalogue: Saving Associated National HDABies.. with 2 datasets.
Catalogue: Saving Belgian National HDAB.. with 8 datasets.
Distribution: Saving: Antimicrobial Resistance -Test..
Distribution: Saving: Belgian Health Data Agency..
Sample: Saving: Proxy data generating for the EHDS2 Pilot project Sciensano Use Case..
Sample: Saving: ID_TU_STATBEL_POP..
Analytics: Saving: Technical report number of unique study subjects available by environment for project HDBP0250..


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#duration, Converter=<function parse_duration at 0x0000019927874EE0>
Traceback (most recent call last):
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\isodate\isoduration.py", line 104, in parse_duration
    raise ISO8601Error("Unable to parse duration string %r" % datestring)
isodate.isoerror.ISO8601Error: Unable to parse duration string 'NA'


Distribution: Saving: Next Generation Sequencing..
Sample: Saving: Next Generation Sequencing..


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x0000019927874820>
Traceback (most recent call last):
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '6/17/2021'


Sample: Saving: Metadata - Precision central MAB..
Sample: Saving: Metadata - Precision local MAB..


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x0000019927874820>
Traceback (most recent call last):
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '?'


Distribution: Saving: Precision_Treatment..
Sample: Saving: Precision_Treatment..
Catalogue: Saving Cancer Registry examples.. with 2 datasets.
Sample: Saving: Data structure description..
Analytics: Saving: Krebserkrankungen in ÃÂsterrreich 2022..
Analytics: Saving: Dashboard for querying cancer data..
Catalogue: Saving Croatian National HDAB.. with 7 datasets.
Distribution: Saving: Request for access to public health data..
Catalogue: Saving Danish National HDAB.. with 13 datasets.
Catalogue: Saving EU datasets.. with 3 datasets.


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#duration, Converter=<function parse_duration at 0x0000019927874EE0>
Traceback (most recent call last):
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\isodate\isoduration.py", line 104, in parse_duration
    raise ISO8601Error("Unable to parse duration string %r" % datestring)
isodate.isoerror.ISO8601Error: Unable to parse duration string 'year'


Distribution: Saving: Union data access service..
Sample: Saving: AMR metadata..
Analytics: Saving: Surveillance Atlas of Infectious Diseases..
Distribution: Saving: CRC-Cohort..
Sample: Saving: CRC-Cohort..
Catalogue: Saving Finnish National HDAB.. with 11 datasets.
Distribution: Saving: Findata..
Sample: Saving: Variable and code list descriptions in Finnish Data resources catalogue...
Analytics: Saving: Population by age group in Terveys-Hilmo..
Distribution: Saving: Metadata description including variable descriptions in the Finnish Data Resources Catalogue..
Distribution: Saving: Metadata description including variable descriptions in the Finnish Data Resources Catalogue..
Distribution: Saving: Metadata description including variable level description in Finnish Data resources catalogue..
Distribution: Saving: Metadata description including variable level description in the Finnish Data resources catalogue..
Distribution: Saving: Metadata description in the Finnish Data Resources 

Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#dateTime, Converter=<function parse_datetime at 0x0000019927874D30>
Traceback (most recent call last):
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\isodate\isodatetime.py", line 51, in parse_datetime
    datestring, timestring = datetimestring.split('T')
ValueError: not enough values to unpack (expected 2, got 1)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\isodate\isodatetime.py", line 53, in parse_datetime
    raise ISO8601Error("ISO 8601 time designator 'T' missing. Unable to"
isodate.isoerror.ISO8601Error: ISO 8601 time designator 'T' missing. Unable to par

Distribution: Saving: Metadata description including variable level description in the Finnish Data Resources Catalogue..
Analytics: Saving: Data Resource Profile..
Catalogue: Saving French National HDAB.. with 2 datasets.


Failed to convert Literal lexical form to value. Datatype=http://www.w3.org/2001/XMLSchema#date, Converter=<function parse_date at 0x0000019927874820>
Traceback (most recent call last):
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\rdflib\term.py", line 2084, in _castLexicalToPython
    return conv_func(lexical)  # type: ignore[arg-type]
  File "C:\Users\ChVa5406\AppData\Roaming\Python\Python310\site-packages\isodate\isodates.py", line 203, in parse_date
    raise ISO8601Error('Unrecognised ISO 8601 date format: %r' % datestring)
isodate.isoerror.ISO8601Error: Unrecognised ISO 8601 date format: '_g_L46C3063'


Catalogue: Saving Hungarian National HDAB.. with 3 datasets.
Catalogue: Saving Norwegian National HDAB.. with 12 datasets.
Distribution: Saving: Core variables - additional..
Distribution: Saving: Core variables - diagnostics and primary treatment..
Distribution: Saving: National Clinical Registry for Colorectal Cancer..
Distribution: Saving: Medisinsk fÃÂ¸dselsregister..
Analytics: Saving: Statistikkbank MFR..
Distribution: Saving: DÃÂ¸dsÃÂ¥rsaksregisteret..
Analytics: Saving: DÃÂ¸dsÃÂ¥rsaksregisterets statistikkbank..
Distribution: Saving: Norwegian Surveillance System for Communicable Diseases..
Distribution: Saving: Norwegian Patient Registry..
Analytics: Saving: Aktivitet i somatiske sykehus, spesialisthelsetjenesten..
Distribution: Saving: Prescription register..
Sample: Saving: List of variables for the prescription registry..
Distribution: Saving: StatBank Norway..
Distribution: Saving: Troms 1-7..
Catalogue: Saving Slovenian Catalogue.. with 1 datasets.
Catalogue: Saving 

<Graph identifier=N75e250a515194480838b52d4cc2d8d6f (<class 'rdflib.graph.Graph'>)>