# Download ESRF datasets from DOIs


This notebook provides a few functions for downloading datasets from the ESRF ICAT catalogue given their DOIs.

See below for an [example](#Example).

License: [CC-0](https://creativecommons.org/publicdomain/zero/1.0/)

In [1]:
from collections import OrderedDict
import json
import shutil
import urllib.request
import urllib.parse
import zipfile


def get_session_id(user, password):
    """Authenticate against the ESRF ICAT catalogue and return a session ID.

    The credentials are sent form-encoded in the body of a POST request,
    using the ``db`` authentication plugin.

    :param str user: ICAT user name
    :param str password: ICAT password
    :return: The ``sessionId`` field of the JSON response
    :rtype: str
    """
    credentials = {"plugin": "db", "username": user, "password": password}
    # urlencode percent-escapes everything outside ASCII, so the encode is safe.
    payload = urllib.parse.urlencode(credentials).encode('ascii')
    login_request = urllib.request.Request(
        'https://icatplus.esrf.fr/session', data=payload, method='POST')

    with urllib.request.urlopen(login_request) as reply:
        answer = json.load(reply)
    return answer["sessionId"]


def get_datasets(doi, session_id):
    """Returns a mapping of dataset name: dataset ID for a given DOI.

    The DOI is percent-escaped for use in the URL path (``/`` is kept as a
    path separator) and the session ID is escaped as a query parameter, so
    values containing reserved characters still produce a valid URL.

    :param str doi: The DOI for which to get the datasets information
        (unescaped, e.g. ``'10.15151/ESRF-DC-186933507'``).
    :param str session_id: ESRF ICAT session ID.
    :return: Mapping of dataset name (str) to dataset ID (int), in the order
        the datasets appear in the server response.
    :rtype: OrderedDict
    :raises urllib.error.URLError: If the request fails.
    """
    url = 'https://icatplus.esrf.fr/doi/%s/datasets?%s' % (
        urllib.parse.quote(doi),
        urllib.parse.urlencode({'sessionId': session_id}))

    with urllib.request.urlopen(url) as f:
        response = f.read()

    # The response is a JSON list of objects with 'name' and 'id' keys.
    return OrderedDict(
        (dataset['name'], dataset['id']) for dataset in json.loads(response))


def download_dataset(dataset_id, filename, session_id):
    """Download a dataset given its ID and save it to a file.

    The query string is built with ``urllib.parse.urlencode`` so that the
    session ID and dataset ID are properly escaped.

    :param int dataset_id: The ID of the dataset to download.
    :param str filename: The name of the zip file where to save the dataset.
        An existing file with that name is overwritten.
    :param str session_id: ESRF ICAT session ID.
    :raises urllib.error.URLError: If the request fails.
    """
    query = urllib.parse.urlencode(
        {'sessionId': session_id, 'datasetIds': dataset_id})
    url = 'https://ids.esrf.fr/ids/getData?' + query

    # Stream the response to disk instead of loading it all in memory.
    with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)
        

def download_datasets(doi, session_id):
    """Fetch every dataset attached to a DOI, one zip file per dataset.

    Each archive is written to ``<dataset name>.zip`` and a progress line is
    printed before each download starts.

    :param str doi: The DOI for which to get the datasets.
    :param str session_id: ESRF ICAT session ID.
    """
    entries = get_datasets(doi, session_id).items()
    total = len(entries)

    for index, (name, identifier) in enumerate(entries, start=1):
        archive = name + '.zip'
        print('Downloading dataset %s (%d/%d)' % (archive, index, total))
        download_dataset(identifier, archive, session_id)

        
def unzip(filename):
    """Extract every member of a zip archive into the current working directory.

    A progress line is printed for each member before it is extracted.

    :param str filename: zip file to extract.
    """
    with zipfile.ZipFile(filename) as archive:
        members = archive.namelist()
        total = len(members)
        for position, member in enumerate(members, start=1):
            print('Extracting %s (%d/%d)' % (member, position, total))
            archive.extract(member)

## Example

### Get a session ID for ESRF ICAT access

Run the following cell and fill in the form to acquire a session ID to access the data catalogue.

In [2]:
import ipywidgets

# Module-level session ID used by the example cells below; it stays None
# until the form has successfully acquired one.
session_id = None

def session_id_form():
    """Build a small login form that stores an ICAT session ID in ``session_id``.

    The form holds a user name field, a password field, a button and a status
    label. Clicking the button calls ``get_session_id`` with the entered
    credentials and stores the result in the module-level ``session_id``
    variable; on failure ``session_id`` is left as ``None`` and the error is
    shown in the status label.

    :return: The widget container to display.
    :rtype: ipywidgets.VBox
    """
    user_widget = ipywidgets.Text(description='Username:')
    password_widget = ipywidgets.Password(description='Password:')
    button_widget = ipywidgets.Button(description='Get session ID')
    status_widget = ipywidgets.Label()

    def on_button_clicked(button):
        global session_id
        # Drop any previous session ID so a failed attempt never leaves a stale one.
        session_id = None
        try:
            session_id = get_session_id(user_widget.value, password_widget.value)
        except Exception as error:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still propagate, and report the cause to the user.
            status_widget.value = 'Error while getting session ID: %s' % error
        else:
            status_widget.value = 'Successfully acquired a session ID'

    button_widget.on_click(on_button_clicked)
    return ipywidgets.VBox(
        [ipywidgets.Label(value='Get an ESRF ICAT session ID:'),
         user_widget, password_widget, button_widget, status_widget])

# Last expression of the cell: displays the form.
session_id_form()

VBox(children=(Label(value='Get an ESRF ICAT session ID:'), Text(value='', description='Username:'), Password(…

### Retrieve information about all datasets associated with a DOI

`get_datasets(doi, session_id)` returns a mapping of dataset name to dataset ID:

In [8]:
# `session_id` is the global set by the form above; it is None until a session ID has been acquired.
datasets = get_datasets(doi='10.15151/ESRF-DC-186933507', session_id=session_id)
datasets

OrderedDict([('dataset_10', 186930019),
             ('dataset_11', 186930090),
             ('dataset_12', 186930107),
             ('dataset_20', 186930127),
             ('dataset_21', 186932665),
             ('dataset_22', 186932677),
             ('dataset_23', 186932689),
             ('dataset_24', 186932703),
             ('dataset_25', 186932727),
             ('dataset_26', 186932743),
             ('dataset_27', 186932754),
             ('dataset_28', 186932765),
             ('dataset_29', 186932823),
             ('dataset_x', 186932921),
             ('dataset_30', 186932928),
             ('dataset_31', 186933048),
             ('dataset_32', 186933055),
             ('dataset_33', 186933063),
             ('dataset_RAD', 186933094),
             ('dataset_40', 186933105),
             ('dataset_41', 186933443),
             ('dataset_42', 186933450),
             ('dataset_43', 186933459)])

### Download a dataset

`download_dataset(dataset_id, filename, session_id)` downloads all files associated with a dataset as a single zip file:

In [9]:
# Look up the dataset ID by name in the mapping returned by get_datasets, then save it as a zip file.
download_dataset(datasets['dataset_11'], 'dataset_11.zip', session_id=session_id)

### Unzip the downloaded dataset

In [10]:
# Extracts into the current working directory, printing one line per archive member.
unzip('dataset_11.zip')

Extracting HG-62/dataset_11/15092501.dat (1/1)
