In [None]:
import copy
import glob
import itertools
import json
import os
import pprint as pp
import shutil
import sys
import urllib
from datetime import datetime, timezone
from pathlib import Path as Path
from typing import Sequence

import netCDF4 as nc
import numpy as np
import pandas as pd
import simplejson
import xarray as xr
import zarr
from mergedeep import merge

In [None]:
# Set ahead of the `geopandas` import below.
# NOTE(review): presumably selects shapely instead of pygeos as the geometry
# backend of geopandas — confirm against the installed geopandas version.
os.environ["USE_PYGEOS"] = "0"
import aiohttp
import fsspec
import geopandas as gpd

In [None]:
from fastapi import FastAPI

In [None]:
from fastapi import APIRouter, Depends, HTTPException
from fastapi import Path as apiPath
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from requests import HTTPError
from xpublish import Dependencies, Plugin, Rest, hookimpl

# from xpublish.utils.api import JSONResponse, HTMLResponse

In [None]:
# time to live cache, wrapper function
import cachetools.func


def list_to_tuple(function):
    """Decorator that turns ``list`` arguments and results into ``tuple``.

    Every positional or keyword argument that is a ``list`` is converted to a
    ``tuple`` before ``function`` is called, which makes the arguments
    hashable for a caching layer below this decorator. A ``list`` result is
    returned as a ``tuple`` as well; every other value passes through
    unchanged.

    The wrapper keeps the metadata of ``function`` (name, docstring and extra
    attributes such as cache helpers) via ``functools.wraps``.
    """
    import functools

    def _freeze(value):
        # only plain lists (and subclasses) are converted, as before
        return tuple(value) if isinstance(value, list) else value

    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        frozen_args = [_freeze(arg) for arg in args]
        frozen_kwargs = {key: _freeze(val) for key, val in kwargs.items()}
        return _freeze(function(*frozen_args, **frozen_kwargs))

    return wrapper

def dataset_to_tuple(function):
    """Decorator that returns a ``list`` result of ``function`` as a ``tuple``.

    Positional and keyword arguments are forwarded to ``function`` unchanged;
    only the return value is converted, and only when it is a ``list``.
    The wrapper keeps the metadata of ``function`` via ``functools.wraps``.
    """
    import functools

    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        result = function(*args, **kwargs)
        return tuple(result) if isinstance(result, list) else result

    return wrapper

In [None]:
# $ jupyter nbconvert --to script [YOUR_NOTEBOOK].ipynb

In [None]:
def rebase_path(path_base="urbisphere-dm", path_root=None):
    """Return ``path_root`` cut off after its first component named ``path_base``.

    When ``path_root`` is empty or ``None`` the resolved parent of the
    relative path ``"__file__"`` is used, i.e. the current working directory.
    When no component equals ``path_base`` the whole path is returned.
    The result is always a ``str``.
    """
    from pathlib import Path

    if not path_root:
        # "__file__" is a plain file name here, so its parent resolves to the cwd
        path_root = Path("__file__").parent.resolve()

    components = Path(path_root).parts
    if path_base in components:
        stop = components.index(path_base) + 1
    else:
        stop = len(components)

    return str(Path(*components[0:stop]))


# Make the metadb notebooks and sources importable for the imports below; both
# directories are located relative to the path returned by `rebase_path()`.
sys.path.append(os.path.join(rebase_path(), "interfaces/metadb/notebooks/"))
sys.path.append(os.path.join(rebase_path(), "interfaces/metadb/src/"))

from ipynb.fs.defs.metadb_publications import metadb_publication_query
from ipynb.fs.full.metadb_query import metadb_query as metadb_query_subset_table
from metadb_attributes import metadb_combine_globalattrs

In [None]:
class StopExecution(Exception):
    """Exception whose ``_render_traceback_`` hook supplies no traceback.

    NOTE(review): presumably raised to stop a notebook cell without the
    front end printing a traceback — confirm against the call sites.
    """

    def _render_traceback_(self):
        """Return ``None``, i.e. provide no traceback text."""
        return None


def get_dictlist_permutations(input_subset):
    """Expand a dict of (strs|lists) into all possible permutations.

    Each value that is not a ``list`` is treated as a single choice. The
    result is a list with one dict per combination of choices, in the order
    produced by ``itertools.product``.

    An empty ``input_subset`` yields ``[{}]`` (exactly one, empty,
    combination) instead of raising; a key with an empty list yields ``[]``.
    """
    import itertools

    keys = list(input_subset.keys())
    choices = [v if isinstance(v, list) else [v] for v in input_subset.values()]
    return [dict(zip(keys, combo)) for combo in itertools.product(*choices)]


def get_dictlist_flatten(input_subset, joinstr="+"):
    """Flatten dict of (strs|lists) into dict of (strs).

    List values are joined with ``joinstr``; all other values are kept as
    they are. An empty ``input_subset`` returns an empty dict instead of
    raising.
    """
    return {
        key: joinstr.join(value) if isinstance(value, list) else value
        for key, value in input_subset.items()
    }

In [None]:
def get_creation_time(d=None):
    """Return ``d`` as an ISO 8601 string with second precision, labelled UTC.

    Parameters
    ----------
    d : datetime.datetime, optional
        Time stamp to format. Defaults to the current UTC time, evaluated at
        call time (a ``datetime`` default in the signature would be evaluated
        only once, when the function is defined).

    Notes
    -----
    The time zone of ``d`` is overwritten with UTC, not converted, so a naive
    ``d`` is interpreted as UTC.
    """
    if d is None:
        d = datetime.now(timezone.utc)
    return d.replace(microsecond=0, tzinfo=timezone.utc).isoformat()


def get_gattrs(gattrs, sep=";\n"):
    """Build the global-attribute dict from the first entry of ``gattrs``.

    List values of that entry are joined with ``sep``; every value is then
    passed through ``str.format`` with the placeholders ``version_id``,
    ``version_time``, ``version_date``, ``creation_time`` and
    ``creation_date``. Reads the module-level ``version`` dict (keys ``id``
    and ``time``) and calls ``get_creation_time()``.

    NOTE(review): values that are neither ``list`` nor ``str`` have no
    ``format`` method and make the last comprehension raise.
    """
    ga = gattrs[0]
    # NOTE(review): `gs` is never filled, so the loop over `gs.keys()` below
    # never executes and the entries in `gattrs[1:]` are currently ignored.
    # The loop also uses `re`, which is not imported in the cells visible here.
    gs = {}
    # NOTE(review): this is the size of the first entry, not the number of
    # entries — presumably `len(gattrs) > 1` was meant; confirm.
    if len(ga) > 1:
        for g in gattrs[1:]:
            bo = False
            for k in gs.keys():
                if k in g and not bo:
                    bo = any(re.compile(m).match(gs[k]) for m in g[k])
                    if bo:
                        # would overwrite keys of `ga` (the caller's dict) in place
                        for k, v in g.items():
                            if k in ga:
                                ga[k] = v

    ct = get_creation_time()
    # placeholder values available to the attribute templates
    # NOTE(review): `version_date` receives the full `version["time"]`, while
    # `creation_date` is cut to the first 10 characters — confirm intent.
    gd = {
        "version_id": version["id"],
        "version_time": version["time"],
        "version_date": version["time"],
        "creation_time": ct,
        "creation_date": ct[0:10],
    }
    ga = {k: sep.join(v) if isinstance(v, list) else v for k, v in ga.items()}
    ga = {k: v.format(**gd) for k, v in ga.items()}
    return ga


def get_gattrs_pub(gattrs):
    """Collect publication attributes for every entry with a production profile.

    For each dict in ``gattrs`` that has a ``production_profile`` key,
    ``metadb_publication_query`` is called for the ``datasets_api``
    publication. Values equal to the empty string are dropped; results of
    later entries override those of earlier ones.
    """
    merged = {}
    for attrs in gattrs:
        if "production_profile" not in attrs:
            continue
        found = metadb_publication_query(
            attrs["production_profile"],
            publication_name="datasets_api",
        )
        merged.update((key, val) for key, val in found.items() if val != "")
    return merged

In [None]:
def parse_config(ioconf_file, ioconf_name=None, version_dict=None):
    """Read a TOML configuration file and extract the settings for a version.

    Parameters
    ----------
    ioconf_file : str or path-like
        Path of the TOML file.
    ioconf_name : str, optional
        Name of the top-level entry to read; defaults to the stem of
        ``ioconf_file``.
    version_dict : dict, optional
        Mapping with an ``"id"`` key. When given, the items listed under the
        selected entry are filtered to those whose ``version.id`` is a prefix
        of ``version_dict["id"]`` (items without ``version`` use the prefix
        ``"v"``) and deep-merged in file order.

    Returns
    -------
    dict or None
        Without ``version_dict``: the parsed file, or ``None`` when the file
        does not exist. With ``version_dict``: the merged settings, or ``{}``
        when the file, the entry or a matching item is missing.
    """

    def read_config(path):
        if not os.path.exists(path):
            return None
        import toml  # imported lazily; only needed when there is a file to parse

        with open(path) as f:
            return toml.load(f)

    # read TOML config file (use the parameters, not same-named globals)
    ioconf = read_config(ioconf_file)

    if not isinstance(version_dict, dict):
        return ioconf

    # lookup of settings
    group = ioconf_name if ioconf_name else Path(ioconf_file).stem
    if not ioconf or group not in ioconf:
        return {}

    conf_list = [
        d
        for d in ioconf[group]
        if version_dict["id"].startswith(d["version"]["id"] if "version" in d else "v")
    ]
    if not conf_list:
        return {}

    from mergedeep import merge

    # merge into a fresh dict so the parsed configuration is left untouched
    return merge({}, *conf_list)

# Definitions

In [None]:
def get_query_range(query_from, query_period, query_to=None):
    """Return the query start and end as a two-element ``DatetimeIndex``.

    When ``query_to`` is not given and ``query_period`` is set, the end is
    ``query_from`` plus the period offset; for periods ending in ``"M"`` or
    ``"Y"`` one further day is added.
    """
    to_offset = pd.tseries.frequencies.to_offset

    derive_end = (not query_to) and query_period
    if derive_end:
        query_to = pd.to_datetime(query_from) + to_offset(query_period)
        if query_period.endswith(("M", "Y")):
            query_to = query_to + to_offset("1D")

    return pd.to_datetime([query_from, query_to])


def get_time_bounds(query_range):
    tr = query_range.strftime("%Y%m%dT%H%M%S%z").tolist()
    res = tr[0] if tr[0] == tr[1] else "{}_{}".format(*tr)
    return res

In [None]:
def datasets_fsspec_args(filepath):
    """Generate a compounded url for fsspec, if relevant, plus arguments for xarray.

    Returns
    -------
    tuple
        ``(url, args)``. For an archive such as ``name.zarr.zip`` the url is
        ``zip::file://<absolute path>``; otherwise it is the absolute path.
        ``args`` holds ``engine="zarr"`` or ``engine="netcdf4"`` for the
        ``.zarr`` / ``.nc`` suffix and is empty for anything else, including
        paths without a suffix.
    """
    url = ""
    args = {}

    p = Path(filepath).resolve()
    suffixes = p.suffixes

    # is the path an archive (with a data suffix in front of ".zip")?
    if len(suffixes) > 1 and suffixes[-1] == ".zip":
        url += "zip::"

        # define the suffix, in case of a trailing archive suffix
        sufx = suffixes[-2]
    else:
        # a path without any suffix has no engine hint
        sufx = suffixes[-1] if suffixes else ""

    if sufx == ".zarr":
        args["engine"] = "zarr"
    elif sufx == ".nc":
        args["engine"] = "netcdf4"

    # add the file path to the url
    if url != "":
        url += "file://"

    url += f"{p}"  # local files

    return url, args


def datasets_query_time(filepath):
    """Extract time stamp information from the attributes of datasets.

    The attributes ``production_time`` and ``creation_time`` are looked up at
    the root of the store first and then in the campaign root group
    (``input_subset["campaign_location"]``, falling back to ``"FR"``).

    Returns
    -------
    tuple
        ``(query_time, info)``. ``query_time`` is a list of
        ``pandas.Timestamp`` parsed from ``production_time`` (split on
        ``"/"``) or otherwise from ``creation_time``; it is ``None`` when
        neither attribute could be read. ``info`` has the keys ``st_mtime``
        (file modification time in UTC), ``attrs`` (the attributes found) and
        ``fsspec_args``. A path that does not exist returns ``(None, {})``.
    """
    import logging

    # check first: nothing below is meaningful for a missing path
    if not Path(filepath).exists():
        return None, {}

    store, store_args = datasets_fsspec_args(filepath)
    query_time = None
    query_res = {}
    a_list = ["production_time", "creation_time"]

    root_group = (
        input_subset["campaign_location"]
        if "campaign_location" in input_subset
        else "FR"
    )

    for group in [None, root_group]:
        if query_res:
            break
        try:
            if store.endswith("nc"):
                with nc.Dataset(store, mode="r", group=group) as ds:
                    query_res = {
                        a: ds.getncattr(a) for a in ds.ncattrs() if a in a_list
                    }
            elif store.endswith("zarr") or store.endswith("zarr/"):
                with zarr.open(store, mode="r", path=group) as ds:
                    query_res = {
                        a: ds.attrs[a] for a in ds.attrs.keys() if a in a_list
                    }
            elif store.startswith("zip::"):
                with xr.open_dataset(
                    store, **store_args, mode="r", group=group
                ) as ds:
                    query_res = {
                        a: ds.attrs[a] for a in ds.attrs.keys() if a in a_list
                    }
        except Exception:
            # best effort: an unreadable store/group simply yields no attributes,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            logging.debug(
                "could not read attributes of `%s` (group %r)",
                store,
                group,
                exc_info=True,
            )

    if "production_time" in query_res:
        query_time = query_res["production_time"].split("/")
        query_time = [pd.to_datetime(qt, format="ISO8601") for qt in query_time]
    elif "creation_time" in query_res:
        # NOTE(review): the slice skips the first character of the attribute;
        # the stored format is not visible here — confirm it is intended.
        query_time = [query_res["creation_time"][1:21]]
        query_time = [pd.to_datetime(qt, format="ISO8601") for qt in query_time]

    # convert the POSIX mtime directly to UTC (a naive local time relabelled
    # as UTC would be off by the local UTC offset)
    query_mtime = [
        pd.Timestamp(
            datetime.fromtimestamp(Path(filepath).stat().st_mtime, tz=timezone.utc)
        )
    ]

    return query_time, {
        "st_mtime": query_mtime,
        "attrs": query_res,
        "fsspec_args": (store, store_args),
    }

In [None]:
def datastore_cache(input_file_list, output_path):
    """Mirror input data stores as zarr directories below ``output_path``.

    Each item of ``input_file_list`` is a dict with at least a ``"file"``
    key. Existing ``*.zarr.zip`` archives are unpacked and ``*.nc`` files are
    converted; both are only refreshed when ``datasets_update_check`` reports
    ``None`` or ``"not_equal"``. The new store is written to a ``.tmp``
    sibling first and then moved into place.

    Returns
    -------
    list of dict
        One deep copy of the input item per station group found in the cached
        store, with ``catalogue.station_id`` set to the group name and
        ``file`` set to the cached zarr path. Files that do not exist or have
        an unsupported suffix are skipped.
    """

    def datastore_to_zarr(fn, zfn=None, replace=True, root_group="FR"):
        """Write netCDF file ``fn`` to a zarr store; return ``(path, groups)``."""
        p = Path(fn)
        t = p.with_suffix(".zarr") if not zfn else Path(zfn)

        if t.exists() and t.is_dir() and replace:
            logging.warning("Removing directory `%s`", str(t))
            shutil.rmtree(t)

        # the station ids of the root dataset name the netCDF groups to copy
        with xr.open_dataset(p) as ds:
            groups = ds["station_id"].values.tolist()

        if not t.is_dir() or replace:
            with xr.open_dataset(p) as ds:
                ds.to_zarr(t, mode="w", group=root_group, consolidated=True)

            for group in groups:
                with xr.open_dataset(p, group=group) as ds:
                    ds.to_zarr(t, mode="a", group=group, consolidated=True)

        return t, groups

    def refresh_needed(src, dst):
        """True when the cached copy is missing or its time stamps differ."""
        return datasets_update_check(src, dst) in [None, "not_equal"]

    # create output_path
    Path(output_path).mkdir(parents=True, exist_ok=True)

    root_group = (
        input_subset["campaign_location"]
        if "campaign_location" in input_subset
        else "FR"
    )

    # to zarr
    output_files = []
    for input_file_dict in input_file_list:
        input_file = Path(input_file_dict["file"])
        if not input_file.exists():
            continue

        logging.info("input file: `%s`", input_file)
        suffixes = input_file.suffixes
        groups = []

        if suffixes[-2:] == [".zarr", ".zip"]:
            output_zarr = Path(output_path) / input_file.with_suffix("").name
            output_zarr_tmp = output_zarr.with_suffix(output_zarr.suffix + ".tmp")
            if refresh_needed(input_file, output_zarr):
                logging.info("unpacking file: `%s`", str(input_file))
                # drop leftovers of an interrupted earlier run
                shutil.rmtree(output_zarr_tmp, ignore_errors=True)
                shutil.unpack_archive(input_file, output_zarr_tmp)
                shutil.rmtree(output_zarr, ignore_errors=True)
                output_zarr_tmp.rename(output_zarr)
        elif suffixes[-1:] == [".nc"]:
            output_zarr = Path(output_path) / input_file.with_suffix(".zarr").name
            output_zarr_tmp = output_zarr.with_suffix(output_zarr.suffix + ".tmp")
            if refresh_needed(input_file, output_zarr):
                logging.info("unpacking file: `%s`", str(input_file))
                _, groups = datastore_to_zarr(
                    input_file,
                    zfn=output_zarr_tmp,
                    replace=True,
                    root_group=root_group,
                )
                shutil.rmtree(output_zarr, ignore_errors=True)
                output_zarr_tmp.rename(output_zarr)
        else:
            # previously this fell through to an unbound (or stale) cache path
            logging.warning("unsupported input file type, skipped: `%s`", input_file)
            continue

        if not output_zarr.exists():
            logging.warning("no cache store for `%s`, skipped", input_file)
            continue
        cache_file = output_zarr

        if cache_file.suffix == ".zarr":
            # list the groups of the cached store (not of the input file, which
            # may be a netCDF file that zarr cannot open)
            with zarr.open_consolidated(str(cache_file), mode="r") as dz:
                group_keys = list(dz.group_keys())
            groups = [
                n
                for n in group_keys
                if n != root_group and n.isupper() and n.isalpha()
            ]

        # TODO: add missing groups from metadata lookup of stations

        for g in groups:
            output_dict = merge(
                copy.deepcopy(input_file_dict),
                {
                    "catalogue": {"station_id": g},
                    "file": output_zarr,
                },
            )
            output_files.append(copy.deepcopy(output_dict))

        logging.info("cache file: `%s`", output_zarr)

    return output_files


def datastore_cache_index(catalogue_path):
    """Return the ``*.zarr`` entries directly below ``catalogue_path``.

    The paths are logged and returned as a sorted list. ``Path.glob`` yields
    a one-shot generator, so it is materialised before logging; otherwise the
    caller would receive an already exhausted iterator.
    """
    output_files = sorted(Path(catalogue_path).glob("*.zarr"))
    for output_zarr in output_files:
        logging.info("cache file (attach): `%s`", output_zarr)
    return output_files

In [None]:
def datasets_update_check(src_filepath, dst_filepath):
    """Compare the time stamp attributes of two urbisphere data stores.

    Returns ``"equal"`` when both time stamp lists are identical,
    ``"not_equal"`` when both their first and their last entries differ, and
    ``None`` in every other case (time stamps unavailable for either store,
    or lists that differ at one end only).
    """
    src_time, _ = datasets_query_time(src_filepath)
    dst_time, _ = datasets_query_time(dst_filepath)

    both_known = isinstance(src_time, list) and isinstance(dst_time, list)
    if not both_known:
        return None

    if src_time == dst_time:
        # identical lists, no change
        return "equal"

    if src_time[0] != dst_time[0] and src_time[-1] != dst_time[-1]:
        return "not_equal"

    return None

In [None]:
def encode_dataset_id(dataset_dict):
    """URL-encode the string-valued items of ``dataset_dict`` as a query string.

    The ``file`` key and every item whose value is not a ``str`` are left out.
    """
    encodable = {}
    for key, value in dataset_dict.items():
        if key != "file" and isinstance(value, str):
            encodable[key] = value
    return urllib.parse.urlencode(encodable)


def dicts_equal_all(dict1, dict2, excluded):
    """Return True when both dicts agree on every key that is not excluded.

    All keys of both dicts are considered. A key that is present in only one
    dict counts as a mismatch (instead of raising ``KeyError``). Keys listed
    in ``excluded`` are ignored whatever their values are; the former XOR
    test required the values of excluded keys to *differ*.
    """
    missing = object()  # sentinel: distinguishes "absent" from a stored None
    return all(
        key in excluded or dict1.get(key, missing) == dict2.get(key, missing)
        for key in dict1.keys() | dict2.keys()
    )


def dicts_equal_any(dict1, dict2, excluded):
    """Return True when both dicts agree on every shared, non-excluded key.

    Only keys present in both dicts are compared, so dicts without a common
    key are considered equal. Keys listed in ``excluded`` are ignored whatever
    their values are; the former XOR test required the values of excluded
    keys to *differ*.
    """
    return all(
        key in excluded or dict1[key] == dict2[key]
        for key in dict1.keys() & dict2.keys()
    )


def decode_dataset_id(dataset_id, catalogue_lookup=True):
    """Decode a URL-encoded dataset id and optionally resolve it in the catalogue.

    The id is unquoted and parsed as a query string; the resulting items
    override the defaults taken from ``root_dict[0]`` plus ``"id": None``.

    Parameters
    ----------
    dataset_id : str
        URL-encoded ``key=value`` pairs.
    catalogue_lookup : bool, default True
        When False, return the decoded dict only.

    Returns
    -------
    dict
        Without lookup: the decoded dict. With lookup: a copy of the first
        entry of the module-level ``catalogue_list`` whose ``catalogue``
        (ignoring ``subset``, ``type``, ``path`` and ``file``) matches the
        decoded dict on all keys except ``id``, with ``catalogue`` completed
        by the decoded values; ``{}`` when no entry matches.
    """
    # `root_dict[0]` supplies the default key/value pairs (a former inline
    # example listed keys such as `system_group`, `production_name`,
    # `production_level`, `api_name` and `api_token`).
    default = {**copy.deepcopy(root_dict[0]), "id": None}

    dataset_id_unquote = urllib.parse.unquote_plus(dataset_id)
    dataset_dict = {**default, **dict(urllib.parse.parse_qsl(dataset_id_unquote))}
    logging.debug(
        "`dataset_dict` dict:\n# start of item\n%s\n# end of item\n",
        pformat(dataset_dict, sort_dicts=False),
    )
    logging.debug("`dataset_id` str: %s", encode_dataset_id(dataset_dict))

    if not catalogue_lookup:
        return dataset_dict

    exclude_match = ["id"]
    exclude_keys = ["subset", "type", "path", "file"]
    catalogue_match = next(
        (
            entry
            for entry in catalogue_list  # revise: reference to a global() variable
            if dicts_equal_all(
                {k: v for k, v in entry["catalogue"].items() if k not in exclude_keys},
                dataset_dict,
                exclude_match,
            )
        ),
        None,
    )
    if catalogue_match is None:
        return {}

    # Build a new item instead of assigning into the matched entry: the entry
    # belongs to the shared `catalogue_list` and must not change per lookup.
    return {
        **catalogue_match,
        "catalogue": {**dataset_dict, **catalogue_match["catalogue"]},
    }


def encode_catalogue(catalogue):
    """Return the sorted, de-duplicated dataset ids of all catalogue entries.

    Each id is built by ``encode_dataset_id`` from the ``station_id`` item of
    the entry's ``catalogue`` dict only.
    """
    dataset_ids = set()
    for entry in catalogue:  # if entry['file'].exists
        station_only = {
            key: val for key, val in entry["catalogue"].items() if key == "station_id"
        }
        dataset_ids.add(encode_dataset_id(station_only))
    return sorted(dataset_ids)

In [None]:
def decode_output_item(item):
    """Expand an output item into one item per entry of its ``coords`` subset.

    Every entry of ``item["catalogue"]["subset"]["coords"]`` is a URL-encoded
    query string; its key/value pairs are added to the ``catalogue`` of a copy
    of ``item``, and ``coords`` is removed from the copy's ``subset`` (the
    ``subset`` key disappears when nothing else is left in it).

    Returns
    -------
    list of dict
        The expanded deep copies. When the item has no ``coords`` subset, or
        when any entry cannot be decoded, the result is a single deep copy of
        ``item`` — never a mix of expanded items and the original.
    """
    try:
        coords = item["catalogue"]["subset"]["coords"]
    except (KeyError, TypeError):
        # nothing to expand
        return [copy.deepcopy(item)]

    expanded = []
    try:
        for coord in coords:
            coord_id_unquote = urllib.parse.unquote_plus(coord)
            coord_dict = dict(urllib.parse.parse_qsl(coord_id_unquote))

            newitem = {k: v for k, v in item.items() if k != "catalogue"}
            newitem["catalogue"] = {
                **{k: v for k, v in item["catalogue"].items() if k != "subset"},
                **coord_dict,
            }

            newsubs = {
                k: v for k, v in item["catalogue"]["subset"].items() if k != "coords"
            }
            if newsubs:
                newitem["catalogue"]["subset"] = newsubs

            expanded.append(copy.deepcopy(newitem))
    except Exception:
        # best effort, as before: fall back to the unexpanded item, but drop
        # partial expansions and let KeyboardInterrupt/SystemExit through
        return [copy.deepcopy(item)]

    return expanded


def filter_catalogue(catalogue, output_file):
    """Combine catalogue entries with the output items they are compatible with.

    Every entry of ``output_file`` is expanded with ``decode_output_item``.
    For each pair of catalogue entry and expanded item whose ``catalogue``
    dicts agree on all shared keys (``dicts_equal_any``), a deep merge of
    copies of both is added to the result.
    """
    from mergedeep import merge

    return [
        copy.deepcopy(merge(copy.deepcopy(cat_entry), copy.deepcopy(candidate)))
        for cat_entry in catalogue
        for out_entry in output_file
        for candidate in decode_output_item(out_entry)
        if dicts_equal_any(cat_entry["catalogue"], candidate["catalogue"], [])
    ]

In [None]:
def get_input_files(input_subset, default_subset=None, custom_subset=None, debug=False):
    """Build the list of input file templates and the expanded file names.

    Parameters
    ----------
    input_subset : dict
        Dict of (strs|lists); every permutation is used to fill the
        placeholders of the file name templates.
    default_subset : dict, optional
        Mapping with ``path_base``, ``path`` and ``file``. When empty or
        omitted, the module-level ``input_path_base``, ``input_path`` and
        ``input_file`` are used.
    custom_subset : dict, optional
        Extra placeholder values; they override those from ``input_subset``.
    debug : bool
        Currently unused.

    Returns
    -------
    tuple
        ``(input_files, fn_list)``: the templates (``type``, ``path``,
        ``file``, ``catalogue``) and, per template and permutation, a dict
        with ``type``, the formatted ``file`` path and ``catalogue``.
    """
    if not default_subset:
        default_subset = dict(
            path_base=input_path_base,
            path=input_path,
            file=input_file,
        )
    if custom_subset is None:
        # None instead of a shared mutable `{}` default in the signature
        custom_subset = {}

    def force_list(x):
        return x if isinstance(x, list) else [x]

    input_files = [
        {
            "type": n["file"]["type"],
            "path": os.path.join(n["path_base"], n["path"], n["file"]["path"]),
            "file": n["file"]["file"],
            "catalogue": n["file"]["catalogue"],
        }
        for n in get_dictlist_permutations(
            dict(
                path_base=force_list(default_subset["path_base"]),
                path=force_list(default_subset["path"]),
                file=default_subset["file"],
            )
        )
    ]

    # expand input_files
    fn_list = [
        {
            "type": spec["type"],
            "file": os.path.join(spec["path"], spec["file"]).format(
                **{**ss, **custom_subset}
            ),
            "catalogue": spec["catalogue"],
        }
        for spec in input_files
        for ss in get_dictlist_permutations(input_subset)
    ]
    return (input_files, fn_list)

In [None]:
def mask_dataset_zarr(ds, entry):
    """Mask invalid data as listed in the QC test.

    When the catalogue entry names a QC dataset under
    ``entry["catalogue"]["subset"]["qc"]["dataset_id"]``, that dataset is
    opened for the station of ``entry`` and applied with ``mask_datasets_qc``.
    Returns the masked dataset if one was produced and is truthy, otherwise
    ``ds`` itself.
    """

    def _qc_masked():
        """Return the QC-masked dataset, or None when no QC step applies."""
        catalogue = entry["catalogue"]

        # tests qc in subset
        if "subset" not in catalogue.keys():
            return None
        if not (ds and "qc" in catalogue["subset"].keys()):
            return None
        logging.debug(
            "`mask_dataset_zarr`: catalogue lookup `entry['catalogue']['subset']['qc']` OK"
        )

        # test qc dataset_id
        qc_settings = catalogue["subset"]["qc"]
        if "dataset_id" not in qc_settings.keys():
            return None
        dataset_id = qc_settings["dataset_id"]
        logging.debug(
            "`mask_dataset_zarr`: catalogue lookup `entry['catalogue']['subset']['qc']['dataset_id']` OK"
        )
        logging.debug(
            "`dataset_id` dict:\n# start of item\n%s\n# end of item\n",
            pformat(dataset_id, sort_dicts=False),
        )
        logging.debug(
            "`entry` dict:\n# start of item\n%s\n# end of item\n",
            pformat(entry, sort_dicts=False),
        )

        # decode the QC id and point it at the station of this entry
        entry_qc_dict = decode_dataset_id(dataset_id, catalogue_lookup=False)
        entry_qc_dict["station_id"] = entry["catalogue"]["station_id"]
        logging.debug(
            "`entry_qc_dict` dict:\n# start of item\n%s\n# end of item\n",
            pformat(entry_qc_dict, sort_dicts=False),
        )

        dataset_id_qc = encode_dataset_id(entry_qc_dict)
        logging.debug(
            "`dataset_id_qc` dict:\n# start of item\n%s\n# end of item\n",
            pformat(dataset_id_qc, sort_dicts=False),
        )

        entry_qc = decode_dataset_id(dataset_id_qc)
        logging.debug(
            "`entry_qc` dict:\n# start of item\n%s\n# end of item\n",
            pformat(entry_qc, sort_dicts=False),
        )

        # test catalogue entry.
        if not entry_qc:
            return None

        # open qc dataset
        ds_qc = open_dataset_zarr(dataset_id_qc)
        if not ds_qc:
            return None
        logging.debug("ds_qc exists")

        # apply qc dataset as mask
        _, masked = mask_datasets_qc(ds, ds_qc)
        return masked

    ds_mod = _qc_masked()
    if not ds_mod:
        logging.debug("`mask_dataset_zarr`: QC no curation applied.")
        return ds

    logging.info("`mask_dataset_zarr`: QC curation applied.")
    return ds_mod


def mask_datasets_qc(
    ds1,
    ds2,
    snames=None,
    qnames=[
        "vdi3786_min",
        "vdi3786_max",
        "vdi3786_absolute_deviation_600S",
        "vdi3786_absolute_deviation_3600S",
        "vdi3786_stationarity_duration_min",
        "vdi3786_stationarity_duration_max",
        "vdi3786_stationarity_min",
        "vdi3786_stationarity_max",
        "vdi3786_ensemble",
    ],
):
    """Set data of ``ds1`` to NaN wherever the QC dataset ``ds2`` flags it.

    Parameters
    ----------
    ds1 : xarray.Dataset
        Data to be masked; must provide ``station_id``.
    ds2 : xarray.Dataset
        QC dataset; must provide the ``attributes_group``,
        ``attributes_index`` and ``attributes_name`` variables along the
        ``attributes`` dimension and may provide the flag variables named in
        ``qnames``.
    snames : optional
        Ignored: overwritten with the station ids of ``ds1`` below.
    qnames : list of str
        Names of the QC flag variables to apply. The default list is shared
        between calls; it is not modified here.

    Returns
    -------
    tuple
        ``(ds2, ds3)``: the QC dataset after alignment with ``ds1`` and the
        masked copy of ``ds1`` with combined global attributes.

    Reads the module-level ``ioconf`` (``ioconf["qc"]["gattrs"]``).
    """
    import itertools

    # ds1 = xr.open_dataset(path_ds, group=group_id)
    # ds2 = xr.open_dataset(path_qc, group=group_id)
    # alignment
    # ds2, ds1 = xr.align(ds2, ds1)
    ds1, ds2 = xr.align(ds1, ds2)

    # attributes dataframe: everything of ds2 that lives on the "attributes"
    # dimension only, one row per entry along that dimension
    attrs_df = (
        ds2.drop_dims([k for k in list(ds2.dims.keys()) if not k in ["attributes"]])
        .reset_coords()
        .to_dataframe()
    )

    # qc variable lists
    snames = ds1["station_id"].values.tolist()
    vnames = attrs_df["attributes_name"].values.tolist()

    def ok_filter(attr_index, attr_name, qname):
        """Return False for the flag combinations that must not be applied."""
        # some filter combinations lead to unusable results.
        if (
            attr_index in [0]
            and attr_name == "pr_rate"
            and qname in ["vdi3786_stationarity_max"]
        ):
            return False
        elif (
            attr_index in [1]
            and attr_name == "pr_rate"
            and qname in ["vdi3786_stationarity_min"]
        ):
            return False
        else:
            return True

    # apply qc mask
    # shallow copy: masked variables are reassigned on ds3 below
    ds3 = ds1.copy(deep=False)
    # NOTE(review): `sname` is not used inside the loop, so the same masking is
    # repeated once per station id — confirm whether a per-station selection
    # was intended.
    for sname, vname in list(itertools.product(snames, vnames)):
        for i, row in attrs_df.iterrows():
            if vname in ds1.data_vars and vname == row["attributes_name"]:
                logging.debug("%s", vname)

                # subset masking: select the slice of ds2 that belongs to this
                # attributes row (group, index, name)
                va2 = {
                    "attributes": [
                        "attributes_group",
                        "attributes_index",
                        "attributes_name",
                    ]
                }
                # NOTE(review): does not depend on the loop variables and could
                # be computed once before the loops.
                da2 = ds2.set_index(**va2)
                da2 = da2.sel(row.to_dict()).drop_vars(
                    list(va2.values())[0] + list(va2.keys())
                )
                da3 = ds3[vname]
                # keep the variable attributes to restore them after masking
                da3_attrs = copy.deepcopy(ds3[vname].attrs)
                for qname in qnames:
                    if qname in da2.data_vars:
                        if ok_filter(
                            row["attributes_index"], row["attributes_name"], qname
                        ):
                            logging.debug(
                                " (%s,%s) %s",
                                row["attributes_index"],
                                row["attributes_name"],
                                qname,
                            )
                            # NaN where the flag is truthy, previous value elsewhere
                            da3 = xr.where(da2[qname], np.nan, da3)

                            # reassign updated/masked variable
                            ds3[vname] = da3.assign_attrs(da3_attrs)
                        else:
                            logging.debug(
                                " Excluded (%s,%s) %s",
                                row["attributes_index"],
                                row["attributes_name"],
                                qname,
                            )

    # update global attributes (revise logic)
    # combined from: configured QC attributes, the `history` of the QC dataset
    # and the attributes of the input dataset
    gattrs_list = [
        get_gattrs(ioconf["qc"]["gattrs"]),
        {k: v for k, v in ds2.attrs.items() if k in ["history"]},
        ds1.attrs,
    ]
    gattrs_dict = metadb_combine_globalattrs(gattrs_list)

    ds3 = ds3.assign_attrs(gattrs_dict)

    # return (ds1, ds2, ds3, attrs, vnames, snames, qnames)
    return (ds2, ds3)

In [None]:
def filter_dataset_zarr(dataset, entry):
    """Reduce ``dataset`` to the catalogue subset and refresh its global attributes.

    Data variables not listed in ``entry["catalogue"]["subset"]["data_vars"]``
    are dropped (when that list exists). The global attributes are rebuilt
    with ``metadb_combine_globalattrs`` from the dataset attributes — extended
    by publication attributes for production levels ``L2``/``L2R`` — and the
    attributes from ``get_gattrs(gattrs)`` (module-level ``gattrs``).
    """
    dataset_attrs = copy.deepcopy(dataset.attrs)
    config_attrs = get_gattrs(gattrs)
    catalogue = entry["catalogue"]

    if "subset" in catalogue:
        subset = catalogue["subset"]

        if "data_vars" not in subset:
            logging.debug("   dropped: <none>")
        else:
            wanted = subset["data_vars"]
            unwanted = [name for name in list(dataset.data_vars) if name not in wanted]
            logging.debug(" catalogue: %s", ",".join(wanted))
            logging.debug("   dropped: %s", ",".join(unwanted))
            dataset = dataset.drop_vars(unwanted)

        if "coords" not in subset:
            logging.debug("    coords: <none>")
        else:
            logging.debug("    coords: %s", pformat(subset["coords"]))

    # update global attributes
    has_level_and_profile = (
        "production_level" in catalogue and "production_profile" in dataset_attrs
    )
    if has_level_and_profile and catalogue["production_level"] in ["L2", "L2R"]:
        publication_attrs = get_gattrs_pub(
            [{"production_profile": dataset_attrs["production_profile"]}]
        )
        dataset_attrs = {**dataset_attrs, **publication_attrs}

    combined_attrs = metadb_combine_globalattrs([dataset_attrs, config_attrs])
    return dataset.assign_attrs(**combined_attrs)

In [None]:
# ds1, ds2, ds3, attrs, vnames, snames, qnames = dataset_mask(path_ds, path_qc, group_id="FRASHA")

In [None]:
@list_to_tuple
@cachetools.func.ttl_cache(maxsize=50, ttl=5 * 60)
def open_dataset_zarr(dataset_id):
    """Open the zarr group for ``dataset_id`` and prepare it for publishing.

    The id is resolved with ``decode_dataset_id``; the group named by the
    entry's ``station_id`` is opened from the entry's ``file``. The dataset is
    then filtered, QC-masked, loaded into memory and stripped of encoding
    details. Results are cached for five minutes (up to 50 ids), so callers
    share the returned object and must not modify it in place.

    Raises
    ------
    LookupError
        If no catalogue entry matches ``dataset_id``.

    Notes
    -----
    The post-processing is best effort: if any step fails, the error is
    logged and the dataset is returned in the state reached so far.
    """
    entry = decode_dataset_id(dataset_id)
    if not entry:
        # a bare `raise` without an active exception only produced an opaque
        # "No active exception to reraise" RuntimeError
        raise LookupError(f"no catalogue entry matches dataset id {dataset_id!r}")

    fn_zarr = Path(entry["file"])
    ds = xr.open_zarr(fn_zarr, group=entry["catalogue"]["station_id"])

    # report success
    logging.debug("`open_dataset_zarr`: `dataset id` = `%s`", dataset_id)

    try:
        # modify dataset, as configured in catalogue entry
        ds = filter_dataset_zarr(ds, entry)

        # apply qc masks, if configured in catalogue entry
        ds = mask_dataset_zarr(ds, entry)

        # fix a bug
        ds = ds.load()

        # modify dataset properties for xpublish=>0.30
        for var in list(ds.coords):
            if str(ds[var].dtype) == "object":
                ds[var] = ds[var].astype("<U42")
        for var in [*ds.data_vars, *ds.coords]:
            ds[var].encoding.pop("chunks", None)
        ds = ds.unify_chunks()
        # `reset_encoding` was renamed to `drop_encoding` in newer xarray
        ds = ds.drop_encoding() if hasattr(ds, "drop_encoding") else ds.reset_encoding()

    except Exception:
        logging.exception(
            "`open_dataset_zarr`: post-processing of `%s` failed", dataset_id
        )

    return ds

#@dataset_to_tuple
#@cachetools.func.ttl_cache(maxsize=50, ttl=5 * 60)
def get_dataset_json(dataset):
    """Serialise ``dataset`` (including its data) to a JSON string.

    ``time`` is converted to strings formatted as ``%Y-%m-%dT%H:%M:%S.%f%z``,
    the dimensions are ordered ``station``, ``system``, ``time`` (missing
    ones are ignored) and NaN values are written as ``null``.

    The conversion works on a shallow copy, so the dataset passed in keeps
    its datetime ``time`` values. NOTE(review): callers may pass the object
    cached by ``open_dataset_zarr``; overwriting ``time`` on it would break
    the next ``.dt`` access — confirm against the JSON endpoints.
    """
    # shallow copy: replacing a variable on the copy leaves the input intact
    dataset = dataset.copy()

    # convert non-numeric dtypes
    dataset["time"] = dataset.time.dt.strftime("%Y-%m-%dT%H:%M:%S.%f%z")

    # largest dim last
    dataset = dataset.transpose("station", "system", "time", missing_dims="ignore")

    return simplejson.dumps(dataset.to_dict(data=True), ignore_nan=True)

In [None]:
def uvicorn_log_yaml():
    """Write the uvicorn logging configuration as YAML and return its path.

    The configuration below is written next to ``ioconfig_file`` (same name,
    suffix ``.yaml``) with the file handler pointed at the log file built from
    the module-level ``log_path``, ``log_file`` and ``version["id"]``.
    Returns the path as ``str``, or ``None`` if the file does not exist after
    writing.
    """
    import yaml

    template = """
    version: 1
    disable_existing_loggers: False
    formatters:
      default:
        "()": uvicorn.logging.DefaultFormatter
        format: '[%(asctime)s] %(levelname)s %(message)s'
      access:
        "()": uvicorn.logging.AccessFormatter
        format: "[%(asctime)s %(process)d] %(name)s - %(levelname)s - %(message)s"
      logformat:
        format: "[%(asctime)s %(process)d] %(name)s - %(levelname)s - %(message)s"
    handlers:
      file_handler:
        class: logging.FileHandler
        level: INFO
        formatter: logformat
        filename: info.log
        encoding: utf8
        mode: a
      default:
        formatter: default
        class: logging.StreamHandler
        stream: ext://sys.stderr
      access:
        formatter: access
        class: logging.StreamHandler
        stream: ext://sys.stdout
    loggers:
      uvicorn.error:
        level: INFO
        handlers:
          - default
          - file_handler
        propagate: no
      uvicorn.access:
        level: INFO
        handlers:
          - access
          - file_handler
        propagate: no
    """
    log_target = os.path.join(log_path, log_file).format(version_id=version["id"])
    config_path = Path(ioconfig_file).with_suffix(".yaml")

    # replace the placeholder file name of the template with the real log file
    log_config = yaml.safe_load(template)
    log_config["handlers"]["file_handler"]["filename"] = log_target

    with open(config_path, "w", encoding="utf-8") as stream:
        yaml.dump(log_config, stream)

    if not config_path.exists():
        return None
    return str(config_path)

In [None]:
class UrbisphereDatasetPlugin(Plugin):
    """xpublish plugin that serves the datasets of the urbisphere catalogue.

    Dataset ids are the URL-encoded ``station_id`` entries produced by
    ``encode_catalogue`` from the module-level ``catalogue_list``.
    """

    name: str = "dataset_urbisphere"
    version: str = "0.0.2"

    @hookimpl
    def get_datasets(self):
        """Return the list of dataset ids."""
        return encode_catalogue(catalogue_list)

    @hookimpl
    def get_dataset(self, dataset_id: str) -> xr.Dataset | None:
        """Return the dataset for ``dataset_id``, or ``None`` if it cannot be opened."""
        # redirect dataset_id to the correct file and group
        try:
            return open_dataset_zarr(dataset_id)
        except Exception:
            # still "not available" for the caller, but KeyboardInterrupt and
            # SystemExit are no longer swallowed and the reason is recorded
            logging.debug(
                "`get_dataset`: dataset id `%s` not available",
                dataset_id,
                exc_info=True,
            )
            return None

In [None]:
class JSONPlugin(Plugin):
    """Adds JSON endpoints for datasets"""

    name: str = "json"
    version: str = "0.0.2"

    dataset_router_prefix: str = "/json"
    dataset_router_tags: Sequence[str] = ["json"]

    @hookimpl
    def dataset_router(self, deps: Dependencies):
        """Build the per-dataset router that serves the ``/json`` endpoints."""
        router = APIRouter(
            prefix=self.dataset_router_prefix, tags=list(self.dataset_router_tags)
        )

        # Largest time window a single request may cover.
        max_window = pd.to_timedelta("1D")

        def resolve_variables(var, dataset):
            """Return the data variables addressed by `var`, or raise 404.

            '-' and '*' select every data variable. Both endpoints share
            this helper so an unknown variable is reported the same way
            (404) by each of them.
            """
            if var in ["-", "*"]:
                return list(dataset.data_vars)
            if var not in dataset.data_vars:
                raise HTTPException(
                    status_code=404, detail=f"Variable '{var}' not found in dataset"
                )
            return [var]

        @router.get("/info")
        def get_json_info(
            dataset=Depends(deps.dataset),
            cache=Depends(deps.cache),
        ):
            """Return the schema as a dictionary."""
            return JSONResponse(dataset.to_dict(data=False))

        @router.get("/vars")
        def get_variable_list(
            dataset=Depends(deps.dataset),
            cache=Depends(deps.cache),
        ):
            """Return a list of available variables."""
            return JSONResponse(list(dataset.data_vars))

        @router.get("/{var}/isel/{timedelta}/{chunk}")
        def get_variable_chunk(
            var: str = apiPath(
                description="Variable. A single item from the list of {vars} or '-' for all."
            ),
            timedelta: str = apiPath(
                description="Interval duration in timedelta notation, such as '1D', '1H'."
            ),
            chunk: str = apiPath(
                description="Chunk index, such as '0', '1', '2', ..., or from the end, '-2', '-1'."
            ),
            dataset=Depends(deps.dataset),
            cache=Depends(deps.cache),
        ):
            """Return a dataset in a JSON representation. Query {chunk} from {timedelta} sized chunks."""
            var_list = resolve_variables(var, dataset)

            # Only the first dot-separated component of {chunk} is evaluated.
            try:
                cdx = [int(n) for n in chunk.split(".")]
            except ValueError:
                # A malformed index is a client error, not a server fault.
                raise HTTPException(
                    status_code=400, detail=f"Invalid chunk index '{chunk}'"
                ) from None

            # Cumulative number of time steps per {timedelta} bin gives the
            # positional boundaries of the chunks along the time axis.
            n_time = dataset["time"].shape[0]
            idx = np.cumsum(
                dataset["time"].to_pandas().resample(timedelta).count().values
            ).tolist()
            idx = [0] + idx if idx[-1] == n_time else [0] + idx + [n_time]
            idx_pairs = list(itertools.pairwise(idx))

            # Valid positions are -len .. len-1; the former `abs(i) <= len`
            # test let `i == len` through and failed with an IndexError.
            chunk_exists = -len(idx_pairs) <= cdx[0] < len(idx_pairs)
            if chunk_exists and abs(pd.to_timedelta(timedelta)) <= max_window:
                # subset
                subset = dataset.isel(time=slice(*idx_pairs[cdx[0]]))[var_list]
                return JSONResponse(json.loads(get_dataset_json(subset)))

            raise HTTPException(status_code=403, detail="Not available")

        @router.get("/{var}/sel/{timedelta}/{timestamp}")
        def get_variable_slice(
            var: str = apiPath(
                description="Variable. A single item from the list of {vars} or '-' for all."
            ),
            timedelta: str = apiPath(
                description="Interval duration in timedelta notation, such as '1D', '1H', or backwards, '-1D', '-1H'."
            ),
            timestamp: str = apiPath(
                description="ISO8601 date and time notation, such as '20230101T000000'."
            ),
            dataset=Depends(deps.dataset),
            cache=Depends(deps.cache),
        ):
            """Return a dataset in a JSON representation. Query a {timedelta} interval from (or until) a reference {timestamp}."""
            var_list = resolve_variables(var, dataset)

            time_ref = pd.to_datetime(timestamp, format="ISO8601")
            delta = pd.to_timedelta(timedelta)
            zero = pd.Timedelta(0)

            # A negative {timedelta} means "until the reference time", so the
            # window is ordered explicitly before slicing.
            time_range = [
                str(time_ref + min(delta, zero)),
                str(time_ref + max(delta, zero)),
            ]

            # Compare the absolute length: the signed difference used before
            # accepted arbitrarily long backward windows such as '-30D'.
            if abs(delta) <= max_window:
                subset = dataset.sel(time=slice(*time_range))[var_list]
                return JSONResponse(json.loads(get_dataset_json(subset)))

            raise HTTPException(status_code=403, detail="Not available")

        return router

# GeoJSON

In [None]:
def get_dataset_chunk(var, timedelta, chunk, dataset):
    def get_idx_pairs(dataset, timedelta):
        """Convert timedelta to index start and end"""
        idx = np.cumsum(
            dataset["time"].to_pandas().resample(timedelta).count().values
        ).tolist()
        idx = (
            [0] + idx
            if idx[-1] == dataset["time"].shape[0]
            else [0] + idx + [dataset["time"].shape[0]]
        )
        idx_pairs = list(itertools.pairwise(idx))
        return idx_pairs

    # eval inputs
    # not implemented: evaluation of chunks, timedelta
    if var in ["-", "*"]:
        var = list(dataset.data_vars)
    elif var not in dataset.data_vars:
        return None
    var_list = [var] if not isinstance(var, list) else var

    if abs(pd.to_timedelta(timedelta)) > pd.to_timedelta("1D"):
        return None

    cdx = [int(n) for n in chunk.split(".")]

    idx_pairs = get_idx_pairs(dataset, timedelta)

    # return slice
    if abs(cdx[0]) <= len(idx_pairs):
        # subset
        dataset = dataset.isel(time=slice(*idx_pairs[cdx[0]]))[var_list]
        return dataset
    else:
        return None


def get_dataset_slice(var, timedelta, timestamp, dataset):
    """Return the part of `dataset` within `timedelta` of a reference time.

    Parameters
    ----------
    var : str
        Name of a data variable, or '-' / '*' for all data variables.
    timedelta : str
        Window length in pandas timedelta notation. A positive value selects
        the window starting at `timestamp`, a negative value the window
        ending at `timestamp`. At most one day in either direction.
    timestamp : str
        Reference time in ISO8601 notation.
    dataset
        Object offering `data_vars` and `sel` (an xarray Dataset in this
        notebook).

    Returns
    -------
    The selected subset restricted to the requested variables, or ``None``
    when the variable is unknown or the window is longer than one day.

    Raises
    ------
    ValueError
        If `timestamp` or `timedelta` cannot be parsed.
    """
    if var in ["-", "*"]:
        var_list = list(dataset.data_vars)
    elif var in dataset.data_vars:
        var_list = [var]
    else:
        return None

    time_ref = pd.to_datetime(timestamp, format="ISO8601")
    delta = pd.to_timedelta(timedelta)

    # Limit on the absolute length: the signed difference used before let
    # any negative window (e.g. '-30D') pass the one-day limit.
    if abs(delta) > pd.to_timedelta("1D"):
        return None

    # Order the window explicitly so backward intervals slice correctly.
    # pd.Timedelta(0) replaces the deprecated "0S" unit string.
    zero = pd.Timedelta(0)
    time_range = [str(time_ref + min(delta, zero)), str(time_ref + max(delta, zero))]

    return dataset.sel(time=slice(*time_range))[var_list]

In [None]:
def dataset_aggregator(
    data,
    data_vars=None,
    agg_methods=["mean", "minimum", "maximum"],
    agg_periods=["24H", "1H"],
    delimiter=".",
):
    """Aggregate data variables over time and flatten the result to a DataFrame.

    For every requested variable, every period in `agg_periods` and every
    method in `agg_methods`, the variable is resampled along ``time``, the
    method is applied, and only the last value of a trailing 24H window is
    kept. Values, variable attributes, coordinates and dataset attributes
    all end up as columns whose names are joined with `delimiter`, e.g.
    ``data_vars.<var>.values.<method>.<period>``.

    Parameters
    ----------
    data
        Dataset to aggregate.
        NOTE(review): the code relies on a ``time`` dimension, ``station``
        and ``system`` dimensions and a ``station_id`` coordinate — confirm
        against the callers.
    data_vars : list of str, optional
        Variables to aggregate. ``None``, an empty list or ``["-"]`` selects
        every data variable of `data`; names not present in `data` are
        dropped silently.
    agg_methods : list of str
        Any of "mean", "minimum", "maximum". The default list is never
        mutated here.
    agg_periods : list of str
        Resampling periods passed to ``resample(time=...)``.
    delimiter : str
        Separator used to build the flattened column names.

    Returns
    -------
    pandas.DataFrame
        One row per coordinate combination (last row of each group), with
        columns sorted by name and rows sorted by the ``station_id`` level.
    """
    data_agg = []
    data_att = []  # NOTE(review): never used below

    if not data_vars or data_vars == ["-"]:
        data_vars = list(data.data_vars.keys())

    # Single group for now; the group name becomes the column name prefix.
    variables = {
        "data_vars": [n for n in data_vars if n in list(data.data_vars.keys())]
    }

    def func_apply(resampled, method):
        """Expose functions within the ResampleDataset class"""
        # NOTE(review): any other `method` falls through and returns None,
        # which makes the following `.resample` call fail.
        if method == "mean":
            return resampled.mean()
        elif method == "maximum":
            return resampled.max()
        elif method == "minimum":
            return resampled.min()

    # data_vars
    for var_group, var_list in variables.items():
        for var in var_list:
            for ts in agg_periods:
                for agg_method in agg_methods:
                    # resample
                    ds = func_apply(
                        data[[var]].resample(time=ts, label="right", origin="end"),
                        agg_method,
                    )

                    # get last value only
                    ds = ds.resample(time="24H", label="right", origin="end").last()

                    # rename
                    agg_key = tuple([var_group, var, "values", agg_method, ts])
                    ds = ds.rename({var: delimiter.join(agg_key)})

                    # convert attributes to data variables
                    att = []  # NOTE(review): never used below
                    for k, v in data[var].attrs.items():
                        att_key = tuple([var_group, var, "attrs", k])
                        ds = ds.assign({delimiter.join(att_key): (("aggregate"), [v])})

                    # store result in list
                    data_agg.append(ds.copy())

    # One entry along "aggregate" per (variable, period, method) combination.
    dx = xr.concat(data_agg, dim="aggregate")

    # convert coordinates to data_vars, retain coordinate space
    dc = dx.coords.to_dataset()
    dc = dc.rename(
        dict(
            [
                *[
                    (n, delimiter.join(("coords", n)))
                    for n in dc.coords.keys()
                    if not n in ["time"]
                ]
            ]
        )
    )

    # convert attributes to data_vars, retain some coordinate space
    da_dims = ("station", "system")
    da = dx.coords.to_dataset()
    # Each attribute is broadcast to a (station, system) string array.
    # NOTE(review): `len(v)` assumes every attribute value is a string (or
    # at least sized) — confirm.
    da = da.drop_dims([d for d in da.dims if not d in da_dims]).assign(
        {
            k: (
                da_dims,
                np.full(
                    [da.dims[d] for d in da_dims],
                    dtype="<U{}".format(len(v)),
                    fill_value=v,
                ),
            )
            for k, v in da.attrs.items()
        }
    )
    da = da.rename(
        dict(
            [
                *[
                    (n, delimiter.join(("coords", n)))
                    for n in da.coords.keys()
                    if not n in ["time"]
                ]
            ]
        )
    )
    da = da.rename_vars(
        dict([*[(n, delimiter.join(("attrs", n))) for n in da.data_vars.keys()]])
    )

    # Merge
    dx = dx.merge(dc.reset_coords(drop=False)).merge(da.reset_coords(drop=False))

    # Convert to pandas, restructure indices
    df = (
        dx.to_dataframe()
        .reset_index(["time"], drop=False)
        .groupby(list(dx.coords.keys()))
        .last()
    )

    # Final restructuring
    df = df.reindex(sorted(df.columns), axis=1)
    df = df.sort_index(level=["station_id"])

    return df

In [None]:
@list_to_tuple
@cachetools.func.ttl_cache(maxsize=5, ttl=5 * 60)
def get_gjson(dataset_ids, var, timedelta, chunk, mode):
    """Build the aggregated GeoJSON document for a set of datasets.

    Results are cached for five minutes (at most five entries);
    `list_to_tuple` turns list arguments into tuples so they can serve as
    cache keys.

    Parameters
    ----------
    dataset_ids : sequence of str
        Dataset ids to combine; ids that `decode_dataset_id` rejects are
        skipped.
    var : str
        Variable name, or '-' / '*' for all variables.
    timedelta : str
        Window length in pandas timedelta notation.
    chunk : str
        Chunk index (mode "chunk") or reference timestamp (mode "slice").
    mode : str
        "chunk" selects by position via `get_dataset_chunk`, "slice" selects
        by time via `get_dataset_slice`. Any other value uses the whole
        dataset.

    Returns
    -------
    str
        GeoJSON FeatureCollection with one point feature per station.

    Raises
    ------
    HTTPException
        404 when none of the datasets yields data for the selection.
    """
    dx = []
    for dataset_id in dataset_ids:
        if not decode_dataset_id(dataset_id):
            continue
        ds = open_dataset_zarr(dataset_id)
        if mode == "chunk":
            ds = get_dataset_chunk(var, timedelta, chunk, ds)
        elif mode == "slice":
            ds = get_dataset_slice(var, timedelta, chunk, ds)
        if ds:
            dx.append(ds)

    # Without this guard an empty selection surfaced as an opaque ValueError
    # from xr.concat (and an HTTP 500).
    if not dx:
        raise HTTPException(
            status_code=404, detail="No data available for the requested selection"
        )

    # multi-dim workaround
    dx = [ds.isel(system=[0], drop=False) for ds in dx]

    # concat is sensitive to dimension changes, consider merge
    data = xr.concat(dx, dim="station")

    # `var` is a single name here; iterating the string itself would test its
    # individual characters against the whitelist. An empty result makes
    # `dataset_aggregator` fall back to all data variables of `data`.
    requested = [var] if isinstance(var, str) else list(var)

    df_agg = dataset_aggregator(
        data,
        data_vars=[
            v
            for v in requested
            if v in ["hur", "plev", "pr_rate", "pwv", "rsd", "ta", "ws"]
        ],
        agg_periods=(
            ["24H", "1H"] if timedelta.endswith("D") else list(set([timedelta, "1H"]))
        ),
    )

    # pandas dataframe preparation
    df = df_agg.reset_index(
        ["time", "station_id"], drop=False
    )  # reset the index as columns (t.b.d.)
    df["time"] = df["time"].astype(str)  # convert Timestamp to string
    df = df.set_index("station_id", drop=True).rename(columns={"time": "coords.time"})

    # geopandas
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df["coords.station_lon"], df["coords.station_lat"]),
        crs="EPSG:4326",
    )

    logging.info("Generating GeoJSON")

    return gdf.to_json()

In [None]:
class GeoJSONPlugin(Plugin):
    """Adds GeoJSON endpoints for datasets"""

    name: str = "geojson"
    version: str = "0.0.1"

    dataset_router_prefix: str = "/geojson"
    dataset_router_tags: Sequence[str] = ["geojson"]

    @hookimpl
    def dataset_router(self, deps: Dependencies):
        # Router mounted below every dataset; the two query endpoints work on
        # the dataset ids and delegate the heavy lifting to `get_gjson`.
        geojson_router = APIRouter(
            prefix=self.dataset_router_prefix,
            tags=[*self.dataset_router_tags],
        )

        @geojson_router.get("/info")
        def get_json_info(
            dataset=Depends(deps.dataset),
            cache=Depends(deps.cache),
        ):
            """Return the schema as a dictionary."""
            schema = dataset.to_dict(data=False)
            return JSONResponse(schema)

        @geojson_router.get("/vars")
        def get_variable_list(
            dataset=Depends(deps.dataset),
            cache=Depends(deps.cache),
        ):
            """Return a list of available variables."""
            names = list(dataset.data_vars)
            return JSONResponse(names)

        @geojson_router.get("/{var}/isel/{timedelta}/{chunk}")
        def get_variable_chunk(
            var: str = apiPath(
                description="Variable. A single item from the list of {vars} or '-' for all."
            ),
            timedelta: str = apiPath(
                description="Interval duration in timedelta notation, such as '1D', '1H'."
            ),
            chunk: str = apiPath(
                description="Chunk index, such as '0', '1', '2', ..., or from the end, '-2', '-1'."
            ),
            dataset_ids=Depends(deps.dataset_ids),
        ):
            """Return a dataset in a JSON representation. Query {chunk} from {timedelta} sized chunks."""
            # positional selection
            return JSONResponse(
                json.loads(get_gjson(dataset_ids, var, timedelta, chunk, "chunk"))
            )

        @geojson_router.get("/{var}/sel/{timedelta}/{timestamp}")
        def get_variable_slice(
            var: str = apiPath(
                description="Variable. A single item from the list of {vars} or '-' for all."
            ),
            timedelta: str = apiPath(
                description="Interval duration in timedelta notation, such as '1D', '1H', or backwards, '-1D', '-1H'."
            ),
            timestamp: str = apiPath(
                description="ISO8601 date and time notation, such as '20230101T000000'."
            ),
            dataset_ids=Depends(deps.dataset_ids),
        ):
            """Return a dataset in a JSON representation. Query a {timedelta} interval from (or until) a reference {timestamp}."""
            # time based selection; the timestamp travels in the `chunk` slot
            return JSONResponse(
                json.loads(get_gjson(dataset_ids, var, timedelta, timestamp, "slice"))
            )

        return geojson_router

# MapBox

In [None]:
def get_mapbox_html(selected, dataset_ids, var, timedelta, chunk, mode):
    """Render the MapBox HTML page for one GeoJSON query.

    The GeoJSON produced by `get_gjson` is inspected to find the available
    (location, variable, method, period) combinations. The page is rendered
    from the configured Jinja2 template with `selected` preselected in the
    variable drop-down and used for the colour scale.

    Parameters
    ----------
    selected : tuple
        ``(location, variable, method, period)`` to preselect.
    dataset_ids, var, timedelta, chunk, mode
        Passed unchanged to `get_gjson`; `chunk` holds the reference
        timestamp when `mode` is "slice".

    Returns
    -------
    str
        The rendered HTML document.

    Raises
    ------
    HTTPException
        404 when the query yields no feature for the configured location or
        when `selected` is not one of the available combinations.
    """
    import warnings

    from jinja2 import Template
    from lxml import etree

    # geojson feature properties key delimiter
    delimiter = "."

    # settings
    locations = [input_subset["campaign_location"]]

    def get_default_url(mode, host="https://someserver.uni-freiburg.de"):
        """Return the URL template for `mode` and the values shared by all URLs."""
        if mode == "chunk":
            default_url = "{api_host}/{api_path}/datasets/{dataset_id}/mapbox/{var}/isel/{timedelta}/{chunk}/{selection}"
        elif mode == "slice":
            default_url = "{api_host}/{api_path}/datasets/{dataset_id}/mapbox/{var}/sel/{timedelta}/{chunk}/{selection}"
        else:
            default_url = (
                "{api_host}/{api_path}/datasets/{dataset_id}/mapbox/-/isel/1D/-1"
            )
        # Host and API path are the same for every mode.
        default_url_dict = dict(
            api_host=host.strip("/"),
            api_path=output_path.strip("/"),
        )
        return (default_url, default_url_dict)

    def get_default_mapbox_config():
        """Return the map settings used unless the configuration overrides them."""
        return {
            "container": "map",
            "style": "mapbox://styles/christen/clxjemj7y00bp01qngu6gbd4t",
            "center": [-2.58654735763921, 51.4637473335781],
            "zoom": 11.5,
            "customAttribution": 'Data: <a href="http://www.uni-freiburg.de/en/">University of Freiburg</a>, Funding: European Research Council (ERC) Grant: 855005',
        }

    def get_default_template_config():
        """Return the template settings used unless the configuration overrides them."""
        return {
            "input_file": "conf/templates/mapbox_template1.html.jinja2",
        }

    def nested_dict(d, n=None):
        """Turn the key path `d` into nested dicts with `n` as innermost value."""
        if len(d) == 1:
            return {d[0]: n}
        return {d[0]: nested_dict(d[1:], n)}

    def get_features(gjson_dict, locations=("",)):
        """Filter features"""
        if "features" not in gjson_dict:
            return None

        return [
            feature
            for feature in gjson_dict["features"]
            for location in locations
            if "properties" in feature and feature["id"].startswith(location)
        ]

    def select_options(
        item_dict,
        selected=("FR", "ta", "mean", "24H"),
        value_key="option_value",
        href_key="option_url",
        text_key="option_text",
    ):
        """Serialise the options as an HTML ``<select id="variable">`` element."""
        root = etree.Element("select", attrib={"id": "variable"})
        for k, v in item_dict:
            att = {"selected": "selected"} if k == selected else {}
            att["value"] = v[value_key]
            att["href"] = v[href_key]
            el = etree.Element("option", attrib=copy.deepcopy(att))
            el.text = v[text_key]
            root.append(el)

        etree.indent(root, space="    ")

        return etree.tostring(root).decode("utf-8")

    def get_colorscale(var_long_name, indent="                "):
        """Return JavaScript mapping ``DisplayValue`` to a ``ColorScale`` index."""
        src = os.path.join(
            rebase_path(), "common/colormap/data/ColorRampsWetterstation.nc"
        )

        # The context manager closes the file; no explicit close() needed.
        with xr.open_dataset(src) as ds:
            # Variables without their own colour ramp use the precipitation one.
            if var_long_name in ds["name"]:
                df = ds.sel(name=var_long_name)["variable_bounds"].to_pandas()
            else:
                df = ds.sel(name="precipitation")["variable_bounds"].to_pandas()

            javascript = f"// colorscale: {var_long_name}\n"

            for index, bounds in df.iterrows():
                ifelse = "if" if index == 0 else "} else if"
                bounds_from = (
                    str(bounds["From"])
                    if not bounds["From"] == -np.inf
                    else "-Infinity"
                )
                bounds_to = (
                    str(bounds["To"]) if not bounds["To"] == np.inf else "Infinity"
                )
                javascript += f"{indent}{ifelse} ( DisplayValue >= { bounds_from } && DisplayValue < { bounds_to } ) {{ \n{indent}\tColorScale = {index}\n"

            javascript += f"{indent}}} else {{\n{indent}\tColorScale = 0\n"

            javascript += f"{indent}}};\n"

        return javascript

    def get_colormap(var_long_name, indent="                "):
        """Return JavaScript that colours the marker according to ``ColorScale``."""
        src = os.path.join(
            rebase_path(), "common/colormap/data/ColorRampsWetterstation.nc"
        )

        with xr.open_dataset(src) as ds:
            if var_long_name in ds["name"]:
                df = ds.sel(name=var_long_name, bounds=["From"])["HEX"].to_pandas()
            else:
                df = ds.sel(name="precipitation", bounds=["From"])["HEX"].to_pandas()

            javascript = f"// colormap: {var_long_name}\n"
            javascript += f"{indent}" + "switch(ColorScale){\n"

            for index, hex_color in df.iterrows():
                javascript += f"{indent}\tcase {index}: // en\n"
                javascript += (
                    f"{indent}\t\tel.style.backgroundColor = '{ hex_color.iloc[0] }';\n"
                )
                javascript += f"{indent}\t\tbreak;\n"

            javascript += f"{indent}\t}};\n"

        return javascript

    def get_properties_table():
        """Return the HTML table (a JavaScript template literal) for the popup."""
        feature_props = [
            ["coords.time", "coords.time"],
            "coords.station_lat",
            "coords.station_lon",
            "coords.system_group",
            "coords.system_id",
            "coords.system_name",
        ]
        table = ""

        table += '<table align="left">'
        for item in feature_props:
            if isinstance(item, list):
                table += f'<tr><td>"{item[1]}":</td><td>"/${{feature.properties["{item[1]}"]}}"</td></tr>'
            else:
                table += f'<tr><td>"{item}":</td><td>"${{feature.properties["{item}"]}}"</td></tr>'

        table += "</table>"
        return table

    # translations and title formating and urls
    def get_options(selected, options, mode, timedelta, chunk):
        """Describe every selectable option and the colour code of `selected`.

        Returns ``(options_list, mapbox_config, template_config, colorscale,
        colormap)``; the last two are ``None`` when `selected` is not among
        the generated options.
        """
        default_var = options[1][0] if len(options[1]) == 1 else "-"
        default_url, default_url_dict = get_default_url(mode)

        # Mapbox template; a missing config section simply keeps the defaults
        mapbox_config = {**get_default_mapbox_config(), **output_api.get("mapbox", {})}

        # HTML template
        template_config = {
            **get_default_template_config(),
            **output_api.get("template", {}),
        }

        # translations and title formating
        units_dict = {"degree_": "°", "degrees": "°", "degree": "°", "percent": "%"}
        locations_dict = {"FR": "Freiburg, Germany", "BR": "Bristol, UK"}

        # Bound up front so the return statement cannot hit an unbound name.
        colorscale = None
        colormap = None

        options_list = []
        for location, variable, method, period in itertools.product(*options):
            key = (location, variable, method, period)
            var_attrs = feature_properties["data_vars"][variable]["attrs"]

            # city/area
            city = location[:2]
            for k in locations_dict.keys():
                city = city.replace(k, locations_dict[k])

            # variable name
            name = var_attrs["long_name"].title()

            # unit
            units = var_attrs["units"]
            for k in units_dict.keys():
                units = units.replace(k, units_dict[k])

            # title
            option_str = (
                f"{ name } { method.title() } last { period.lower() } ({ units })"
            )
            option_url_selection = f"location={location}&variable={variable}&period={period}&method={method}"
            option_url_dict = {
                **default_url_dict,
                **dict(
                    dataset_id=f"station_id={location}",
                    selection=option_url_selection,
                    timedelta=timedelta,
                    var=default_var,
                    chunk=chunk,
                    mode=mode,
                ),
            }
            option_url = default_url.format(**option_url_dict)
            option_geojson_dict = {
                **default_url_dict,
                **dict(
                    dataset_id=location,
                    var=default_var,
                    timedelta=timedelta,
                    chunk=chunk,
                    mode=mode,
                ),
            }
            # The GeoJSON endpoint mirrors the page URL without the selection.
            option_geojson = (
                default_url.replace("/mapbox/", "/geojson/")
                .replace("/{selection}", "")
                .format(**option_geojson_dict)
            )
            option_key = f"data_vars.{ variable }.values.{ method }.{ period }"

            # colormaps
            if key == selected:
                colorscale = get_colorscale(var_attrs["long_name"])
                colormap = get_colormap(var_attrs["long_name"])

            result = [
                ("city", city),
                ("option_text", option_str),
                ("option_value", option_key),
                ("option_url", option_url),
                ("option_geojson", option_geojson),
            ]
            options_list.append(
                (
                    key,
                    dict(
                        [
                            *zip(["location", "variable", "method", "period"], key),
                            *result,
                        ]
                    ),
                )
            )

        return (options_list, mapbox_config, template_config, colorscale, colormap)

    def mapbox_html(selected, options, centroid, mode, chunk):
        """Jinja2 Template"""
        # options (`timedelta` comes from the enclosing call)
        options_list, mapbox_config, template_config, colorscale, colormap = get_options(
            selected, options, mode, timedelta, chunk
        )
        selected_option = dict(options_list)[selected]

        # Define a macro
        macro_values = {
            "mapbox_config": json.dumps({**mapbox_config, **centroid}),
            "select_variables": select_options(options_list, selected=selected),
            "select_variable": selected_option["option_value"],
            "title": f"Weather Station Network of { selected_option['city']}",
            "colorscale": colorscale,
            "colormap": colormap,
            "properties": get_properties_table(),
            "geojson_url": selected_option["option_geojson"],
        }

        # Load the template from file ("./conf/templates/mapbox_template1.html.jinja2")
        with open(template_config["input_file"]) as file_:
            template = Template(file_.read())

        # render from template and macro
        return template.render(**macro_values)

    # read data
    gjson = get_gjson(dataset_ids, var, timedelta, chunk, mode)

    # conversion
    gjson_dict = json.loads(gjson)

    # features
    features = get_features(gjson_dict, locations)
    if not features:
        # Previously failed further down with a TypeError/IndexError.
        raise HTTPException(
            status_code=404, detail="No features available for the requested selection"
        )

    # centroid
    gdf = gpd.read_file(gjson, driver="GeoJSON")
    gdf = gdf.to_crs("EPSG:4326")

    with warnings.catch_warnings():
        # The centroid of geographic coordinates triggers a warning; it only
        # serves as the initial map centre here.
        warnings.simplefilter("ignore")
        dissolved_centroid = gdf.dissolve().centroid
        centroid = {
            "center": [
                [x, y] for x, y in zip(dissolved_centroid.x, dissolved_centroid.y)
            ][0]
        }

    # single feature/station properties as a dict
    feature_properties = merge(
        *[
            nested_dict(d.split(delimiter), n)
            for feature in [features[0]]
            for d, n in feature["properties"].items()
        ]
    )

    # list of data_var variables
    variables = list(feature_properties["data_vars"].keys())

    # unique list of methods (first occurrence wins, order preserved)
    methods = list(
        dict.fromkeys(
            itertools.chain.from_iterable(
                feature_properties["data_vars"][name]["values"].keys()
                for name in variables
            )
        )
    )

    # unique list of averaging periods
    periods = list(
        dict.fromkeys(
            itertools.chain.from_iterable(
                feature_properties["data_vars"][name]["values"][method].keys()
                for method in methods
                for name in variables
            )
        )
    )

    options = (locations, variables, methods, periods)

    # An unknown selection used to end in an UnboundLocalError while the
    # options were collected; report it to the client instead.
    if selected not in itertools.product(*options):
        raise HTTPException(
            status_code=404, detail=f"Selection {selected!r} is not available"
        )

    return mapbox_html(selected, options, centroid, mode, chunk)

In [None]:
class MapBoxPlugin(Plugin):

    """Adds MapBox endpoints for datasets"""

    name: str = "mapbox"
    version: str = "0.0.1"

    dataset_router_prefix: str = "/mapbox"
    dataset_router_tags: Sequence[str] = ["mapbox"]

    @staticmethod
    def decode_dataset_id(dataset_id, catalogue_lookup=False):
        """Decode the URL-encoded map options into a dict with defaults.

        Declared as a static method: the function takes no instance, and the
        endpoints below have to reach it through the plugin. A bare
        `decode_dataset_id(...)` call inside them resolves to the
        module-level function of the same name, not to this one.

        Parameters
        ----------
        dataset_id : str
            Query-string style options, e.g.
            ``location=..&variable=..&period=..&method=..``.
        catalogue_lookup : bool
            Accepted for signature compatibility; not used here.

        Returns
        -------
        dict
            The defaults below, overridden by the values found in
            `dataset_id`.
        """
        import urllib.parse

        default = {
            "location": input_subset["campaign_location"],
            "variable": "ta",
            "period": "1H",
            "method": "mean",
            "api_name": root_dict[0]["api_name"],
            "api_token": "***",
        }
        dataset_id_unquote = urllib.parse.unquote_plus(dataset_id)
        dataset_dict = {
            **default,
            **dict(urllib.parse.parse_qsl(dataset_id_unquote)),
        }

        # Never write a client supplied token to the log.
        loggable = {**dataset_dict, "api_token": "***"}
        logging.debug(
            "`dataset_dict` dict:\n# start of item\n%s\n# end of item\n",
            pformat(loggable, sort_dicts=False),
        )
        logging.debug("`dataset_id` str: %s", encode_dataset_id(loggable))
        return dataset_dict

    @hookimpl
    def dataset_router(self, deps: Dependencies):
        """Build the per-dataset router that serves the ``/mapbox`` endpoints."""
        router = APIRouter(
            prefix=self.dataset_router_prefix, tags=list(self.dataset_router_tags)
        )

        def selection_from(options):
            """Return the (location, variable, method, period) tuple of `options`."""
            dataset_dict = self.decode_dataset_id(options, catalogue_lookup=False)
            return tuple(
                [dataset_dict[k] for k in ["location", "variable", "method", "period"]]
            )

        @router.get("/info")
        def get_json_info(
            dataset=Depends(deps.dataset),
            cache=Depends(deps.cache),
        ):
            """Return the schema as a dictionary."""
            return JSONResponse(dataset.to_dict(data=False))

        @router.get("/vars")
        def get_variable_list(
            dataset=Depends(deps.dataset),
            cache=Depends(deps.cache),
        ):
            """Return a list of available variables."""
            return JSONResponse(list(dataset.data_vars))

        @router.get("/{var}/isel/{timedelta}/{chunk}/{options}")
        def get_variable_chunk(
            var: str = apiPath(
                description="Variable. A single item from the list of {vars} or '-' for all."
            ),
            timedelta: str = apiPath(
                description="Interval duration in timedelta notation, such as '1D', '1H'."
            ),
            chunk: str = apiPath(
                description="Chunk index, such as '0', '1', '2', ..., or from the end, '-2', '-1'."
            ),
            options: str = apiPath(description="Map options"),
            dataset_ids=Depends(deps.dataset_ids),
        ):
            """Return a dataset in a JSON representation. Query {chunk} from {timedelta} sized chunks."""
            # handle selection and mapbox html
            selected = selection_from(options)

            html_content = get_mapbox_html(
                selected, dataset_ids, var, timedelta, chunk, "chunk"
            )
            return HTMLResponse(content=html_content, status_code=200)

        @router.get("/{var}/sel/{timedelta}/{timestamp}/{options}")
        def get_variable_slice(
            var: str = apiPath(
                description="Variable. A single item from the list of {vars} or '-' for all."
            ),
            timedelta: str = apiPath(
                description="Interval duration in timedelta notation, such as '1D', '1H', or backwards, '-1D', '-1H'."
            ),
            timestamp: str = apiPath(
                description="ISO8601 date and time notation, such as '20230101T000000'."
            ),
            options: str = apiPath(description="Map options"),
            dataset_ids=Depends(deps.dataset_ids),
        ):
            """Return a dataset in a JSON representation. Query a {timedelta} interval from (or until) a reference {timestamp}."""
            # handle selection and mapbox html
            selected = selection_from(options)

            html_content = get_mapbox_html(
                selected, dataset_ids, var, timedelta, timestamp, "slice"
            )
            return HTMLResponse(content=html_content, status_code=200)

        return router

# Configuration

## Static Configuration

In [None]:
# Version history, oldest first. Every assignment replaces the previous one,
# so only the last entry is in effect.
version = {
    "id": "v1.0.0",
    "time": "2023-07-14",
}  # first version.
version = {
    "id": "v1.0.1",
    "time": "2023-09-30",
}  # modified for using a TOML config.
version = {
    "id": "v1.0.2",
    "time": "2023-12-05",
}  # modified for using curation based on QC tests.
version = {
    "id": "v1.0.3",
    "time": "2025-02-27",
}  # modified for using maps and metadb


# Configuration file for input / output files
ioconfig_name = "datasets_api"
try:
    # Inside a notebook the config file is named after the notebook itself.
    import ipynbname

    ioconfig_file = "{}.toml".format(ipynbname.name())
except Exception:
    # ipynbname is missing or the notebook name cannot be determined (e.g.
    # when run as a script): use the shared config file. `except Exception`
    # keeps KeyboardInterrupt/SystemExit from being swallowed here.
    ioconfig_file = "../conf/{}.toml".format(ioconfig_name)

# ----- Papermill injection below this cell -----

In [None]:
# input/output config: parse the configuration file for this notebook, using
# the `ioconfig_file`, `ioconfig_name` and `version` values set earlier.
ioconf = parse_config(ioconfig_file, ioconfig_name, version)

# TODO: validate config (the sections and keys read further down are
# accessed without any prior check)

In [None]:
"""
Note: the approach to set global helper variables should be revised. 
But was/is used in combination with papermill automation.
"""

# set global variables

# -- query: time range and processing options ("end" is optional)
query_from = ioconf["query"]["start"]
query_to = ioconf["query"]["end"] if "end" in ioconf["query"] else None
query_period = ioconf["query"]["period"]
query_index = ioconf["query"]["system_index"]
query_latest = ioconf["query"]["latest"]
query_cache = ioconf["query"]["cache"]
query_tasks = ioconf["query"]["tasks"]

# -- input files
input_path_base = ioconf["input"]["path_base"]
input_path = ioconf["input"]["path"]
input_file = ioconf["input"]["file"]
input_subset = ioconf["input"]["subset"]

# -- cache files
cache_path_base = ioconf["cache"]["path_base"]
cache_path = ioconf["cache"]["path"]
cache_file = ioconf["cache"]["file"]

# -- output files and API settings
output_path_base = ioconf["output"]["path_base"]
output_path = ioconf["output"]["path"]
output_file = ioconf["output"]["file"]
output_api = ioconf["output"]["api"]  # deviation from other notebooks

# -- logging (the file mode is optional and defaults to append)
log_path = ioconf["logging"]["path"]
log_file = ioconf["logging"]["file"]
log_format = ioconf["logging"]["format"]
log_filemode = (
    ioconf["logging"]["filemode"] if "filemode" in ioconf["logging"] else "a"
)

# -- global attributes
gattrs = ioconf["gattrs"]

## Logging configuration

In [None]:
# create logger
import logging
import logging.handlers
from pprint import pformat

# Log to the versioned log file and to stdout.
# `basicConfig(encoding=...)` only takes effect together with `filename=`;
# with explicit `handlers` it is ignored, so the encoding has to be set on
# the FileHandler itself to really get a UTF-8 log file.
logging.basicConfig(
    format=log_format,
    level=logging.INFO,
    # Declare handlers
    handlers=[
        logging.FileHandler(
            os.path.join(log_path, log_file).format(version_id=version["id"]),
            mode=log_filemode,
            encoding="utf-8",
        ),
        logging.StreamHandler(sys.stdout),
    ],
)

## Dynamic Configuration

In [None]:
# summarize
# Write the config file name and the complete parsed configuration to the log.
# `sort_dicts=False` keeps the key order of `ioconf` instead of sorting keys.
logging.info("`ioconf` file: %s", ioconfig_file)
logging.info(
    "`ioconf` dict:\n# start of item\n%s\n# end of item\n",
    pformat(ioconf, sort_dicts=False),
)

In [None]:
# init root dict
# Placeholder: a list holding one empty dict. The MAIN cell rebinds `root_dict`
# (from `query_index`) when the notebook is executed as `__main__`.
root_dict = [{}]

In [None]:
# reset structure
# `dim_list`: presumably the dimension names used by the datasets, in this order.
dim_list = [
    "time",
    "station",
    "system",
    "sensor",
    "channel",
    "cell",
    "attributes",
]
# `enc_conf`: encoding settings keyed by name; only "time" is configured.
enc_conf = dict(
    time=dict(
        units="nanoseconds since 1970-01-01 00:00:00",  # " +0000"
        calendar="proleptic_gregorian",
    ),
)

In [None]:
# Query range built from the configured start, period and end
# (`query_to` is None when the config has no "end" entry).
query_range = get_query_range(query_from, query_period, query_to)

# MAIN

In [None]:
def build_catalogue_path(path_base, path_sub, path_api):
    """Return the catalogue directory below the cache directory.

    Leading slashes are stripped from `path_api` before joining, because
    joining an absolute path would discard the cache prefix. The result is
    always `path_base / path_sub / path_api`, whether or not `path_api`
    starts with "/" (an empty `path_api` yields `path_base / path_sub`).

    Parameters:
        path_base: base directory of the cache.
        path_sub: sub directory below `path_base`.
        path_api: API path, with or without leading "/".

    Returns:
        pathlib.Path of the catalogue directory.
    """
    return Path(path_base) / Path(path_sub) / Path(str(path_api).lstrip("/"))


def order_catalogue_public(entries, roots, matches):
    """Return `entries` reordered so that entries matching a root come first.

    Matching entries are listed first, grouped by the order of `roots` and,
    within a root, in their original order. All remaining entries follow in
    their original order. Every entry is returned exactly once, also when
    several roots are given or an entry matches more than one root.

    Parameters:
        entries: iterable of dicts, each with a "catalogue" key.
        roots: iterable of dicts describing the preferred catalogues.
        matches: callable `(catalogue, root, [])` returning a truthy value
            when the entry's catalogue matches the root.

    Returns:
        A new list containing the same entry objects.
    """
    entries = list(entries)
    picked = []  # positions in `entries` already placed, in output order
    for root in roots:
        for pos, entry in enumerate(entries):
            if pos not in picked and matches(entry["catalogue"], root, []):
                picked.append(pos)
    ordered = [entries[pos] for pos in picked]
    ordered.extend(entry for pos, entry in enumerate(entries) if pos not in picked)
    return ordered


if __name__ == "__main__":
    # catalogue path: always located below the cache directory, independent of
    # whether `output_path` is written with a leading "/".
    catalogue_path = build_catalogue_path(cache_path_base, cache_path, output_path)

    # get input files, build a catalogue
    input_files_dict, input_files = get_input_files(
        input_subset,
        custom_subset={
            "time_bounds": get_time_bounds(
                # get_query_range(pd.Timestamp.now().floor("1D"), "1D") # 1 D
                get_query_range(pd.Timestamp.now().floor("1D"), "2D") - pd.to_timedelta("1D") # 3 D
            )
        },
    )

    # ToDo; limit re-caching. The datastore is currently (re-)cached on every
    # run, whether or not "cache" is listed in `query_tasks`.
    catalogue_file = datastore_cache(
        input_files,
        catalogue_path,
    )

    # filter catalogue list
    catalogue_list = filter_catalogue(catalogue_file, output_file)

    # root defaults
    if isinstance(query_index, list):
        # one root entry per configured dataset id, without the "id" field
        root_dict = [
            {
                k: v
                for k, v in decode_dataset_id(qi, catalogue_lookup=False).items()
                if k not in ["id"]
            }
            for qi in query_index
        ]
    else:
        # single index: `query_index` is used as the API token of a default root
        root_dict = [
            {
                "system_group": "AWS",
                "production_level": "L2R",
                "production_name": "urbisphere",
                "api_name": "latest",
                "api_token": query_index,
            }
        ]
    logging.info(
        "`root_dict` dict:\n# start of item\n%s\n# end of item\n",
        pformat(root_dict, sort_dicts=False),
    )

    # the public catalogues: entries matching `root_dict` first, followed by
    # all other entries of the list; each entry is listed once.
    # revise: `catalogue_list` / `catalogue_public` are module-level variables.
    catalogue_public = order_catalogue_public(
        catalogue_list, root_dict, dicts_equal_any
    )

    logging.debug(
        "`catalogue_public` dict:\n# start of item\n%s\n# end of item\n",
        pformat(catalogue_public, sort_dicts=False),
    )

    # catalogue_root = decode_dataset_id(encode_dataset_id(root_dict))
    # The first public entry is the root (raises IndexError if the catalogue is empty).
    catalogue_root = catalogue_public[0]
    logging.debug(
        "`catalogue_root` dict:\n# start of item\n%s\n# end of item\n",
        pformat(catalogue_root, sort_dicts=False),
    )

    if "serve" in query_tasks:
        # extract app meta information from the attributes of the root dataset
        datasets_root = xr.open_zarr(
            catalogue_root["file"], group=catalogue_root["catalogue"]["station_id"]
        )
        datasets_attrs = copy.deepcopy(datasets_root.attrs)
        datasets_name = datasets_attrs["title"]
        # HTML description built from the "keywords" and "references" attributes
        datasets_info = "\n".join(
            [
                f"<h4>{n.title()}</h4><p>\n\n{str(datasets_attrs[n])}\n</p>"
                for n in datasets_attrs
                if n in ["keywords", "references"]
            ]
        )
        datasets_version = datasets_attrs["production_version"]

        # NOTE(review): hard-coded fallback; it does not correspond to any
        # `version` entry declared in this notebook -- confirm the intended value.
        if datasets_version == "":
            datasets_version = "v1.0.5"

        # api server settings
        rest_config = dict(
            app_kws=dict(
                title=datasets_name,
                description=datasets_info,
                version=datasets_version,
                docs_url="/docs",
                redoc_url="/redoc",
                openapi_url="/openapi.json",
            ),
            cache_kws=dict(available_bytes=0),
        )
        serve_log_config = uvicorn_log_yaml()
        serve_config = dict(
            host=output_path_base.split(":")[0],  # e.g., "127.0.0.1",
            port=int(output_path_base.split(":")[1]),  # e.g., 49240,
            log_level="debug",
            log_config=serve_log_config,
            root_path=output_path,  # e.g., "services/x/api/v1/"
            app_dir=output_path,
        )

        # configure REST API
        rest = Rest({}, **rest_config)
        rest.register_plugin(UrbisphereDatasetPlugin())
        rest.register_plugin(JSONPlugin())
        rest.register_plugin(GeoJSONPlugin())
        rest.register_plugin(MapBoxPlugin())

        # serve static assets only if the configured directory exists
        assets_path = output_api["assets"]["input_path"]
        if Path(assets_path).exists():
            rest.app.mount(
                "/assets", StaticFiles(directory=assets_path), name="static"
            )

        # Start the server
        rest.serve(**serve_config)