# Introduction

Created on Tue May 5 15:33:36 2022

Modified on Tue Aug 3 12:59:23 2022

Modified on Mon Dec 12 16:48:02 2022

Modified on Mon Sep 25 14:01:34 2023

Modified on Fri Oct 11 10:59:01 2024

@author: zeeman-m

## Requirements

In [1]:
# FieldClimate connection
from datetime import datetime

import requests
from Crypto.Hash import HMAC, SHA256
from dateutil.tz import tzlocal
from requests.auth import AuthBase

In [2]:
# Other requirements
import glob
import json
import os
import re
import sys
import time
from collections import OrderedDict
from datetime import datetime, timedelta, timezone
from pathlib import Path

import numpy as np
import pandas as pd
import toml
import xarray as xr
from mergedeep import merge
from tqdm.notebook import tqdm

# Definitions

In [None]:
def rebase_path(path_base="urbisphere-dm", path_root=None):
    """Return ``path_root`` cut off after its first component named ``path_base``.

    Parameters
    ----------
    path_base : str
        Name of the directory component at which the path is truncated
        (the component itself is kept).
    path_root : str or path-like, optional
        Path to truncate. When empty/``None``, the resolved parent of the
        relative path ``"__file__"`` is used.

    Returns
    -------
    str
        The truncated path, or the whole path when ``path_base`` is not one
        of its components.
    """
    from pathlib import Path

    if not path_root:
        # NOTE: this is the literal string "__file__", not the module
        # attribute, so the parent resolves against the working directory.
        path_root = Path("__file__").parent.resolve()

    components = Path(path_root).parts
    if path_base in components:
        # keep everything up to and including the first match
        components = components[: components.index(path_base) + 1]

    return str(Path(*components))


# Extend the module search path with two notebook folders located below the
# project root returned by `rebase_path()`; presumably required so that the
# `ipynb.fs` imports below can locate the `metadb_*` notebooks.
sys.path.append(os.path.join(rebase_path(), "interfaces/metadb/notebooks/"))
sys.path.append(os.path.join(rebase_path(), "processing/datasets/pub/notebook/"))
# Notebook-to-module imports via `ipynb.fs` (`defs` / `full` loaders).
from ipynb.fs.defs.metadb_publications import metadb_publication_query
from ipynb.fs.full.metadb_query import (
    inventardb_query_deployment as get_metadata_inventory_deployment,
)
from ipynb.fs.full.metadb_query import metadb_query as metadb_query_subset_table

In [4]:
def get_metadata_metadb_deployment(query_city, query_time):
    """Query the metadata database for deployments of the supported systems.

    Parameters
    ----------
    query_city : str or list of str
        City code(s) used as the ``station.y_code`` filter. An empty value
        disables the filter.
    query_time : array-like of length 2, optional
        Start and end of the query period. Must provide
        ``.astype(str).tolist()`` (e.g. a numpy datetime array or a pandas
        index). Any other length, or ``None``, disables the time filter.

    Returns
    -------
    The result of ``metadb_query_subset_table(subset=...)``.
    """
    subset = {"system": {"i_model": ["LoRAIN", "nMETOS100+"]}}

    if query_city and query_city != "":
        subset["station"] = {
            "y_code": (query_city if isinstance(query_city, list) else [query_city])
        }

    # Use the `query_time` argument; the previous code read an undefined
    # (or accidental module-level) name `query_range` here.
    if query_time is not None and len(query_time) == 2:
        subset["time"] = dict(zip(["start", "end"], query_time.astype(str).tolist()))

    return metadb_query_subset_table(subset=subset)

In [None]:
def parse_config(ioconf_file, ioconf_name=None, version_dict=None, merge_additive=None):
    """Read a TOML configuration file and extract the settings for a version.

    Parameters
    ----------
    ioconf_file : str or path-like
        Path of the TOML file.
    ioconf_name : str, optional
        Name of the top-level group to read. Defaults to the file stem.
    version_dict : dict, optional
        Mapping with an ``"id"`` entry. When it is not a dict, the complete
        parsed file is returned instead of a version-specific selection.
    merge_additive : list of str, optional
        Keys whose list values are combined from all tables whose version id
        equals ``version_dict["id"]`` exactly. Defaults to ``["gattrs"]``.

    Returns
    -------
    dict or None
        * ``version_dict`` is not a dict: the parsed file, or ``None`` when
          the file does not exist.
        * otherwise: the merged configuration of all tables in the group
          whose ``version.id`` is a prefix of ``version_dict["id"]``; an
          empty dict when the file, the group or a matching table is missing.
    """
    import toml
    from mergedeep import merge, Strategy
    from collections import ChainMap

    if merge_additive is None:
        merge_additive = ["gattrs"]

    def read_config(path):
        """Parse the TOML file at ``path``; return None if it does not exist."""
        if os.path.exists(path):
            with open(path) as f:
                return toml.load(f)
        return None

    # read TOML config file
    ioconf = read_config(ioconf_file)

    if not isinstance(version_dict, dict):
        return ioconf

    # lookup of settings
    group = ioconf_name if ioconf_name else Path(ioconf_file).stem

    if ioconf is None or group not in ioconf:
        return {}

    # replace merge: later tables override earlier ones
    # NOTE(review): mergedeep.merge() updates its first argument in place, so
    # the first matching table of `ioconf[group]` is modified here and the
    # exact-id selection below sees the modified table. Kept as-is; confirm
    # that this is intended.
    conf_list = [
        d for d in ioconf[group] if version_dict["id"].startswith(d["version"]["id"])
    ]
    config = merge(*conf_list) if conf_list else {}

    # additive merge: lists of tables with exactly this version id are joined
    deep_conf_list = [
        d for d in ioconf[group] if version_dict["id"] == d["version"]["id"]
    ]
    deep_config = (
        merge(*deep_conf_list, strategy=Strategy.TYPESAFE_ADDITIVE)
        if deep_conf_list
        else {}
    )

    for g in merge_additive:
        if g in config and g in deep_config and isinstance(deep_config[g], list):
            # collapse the list of dicts into one dict; reversing makes later
            # list entries take precedence in the ChainMap lookup
            config[g] = [dict(ChainMap(*reversed(deep_config[g])))]

    return config

In [None]:
def get_gattrs(gattrs, data_subset, sep=";\n"):
    """Build the formatted global attributes from the first ``gattrs`` entry.

    List values are joined with ``sep`` and every value is passed through
    ``str.format`` with the fields ``version_id``, ``version_time``,
    ``version_date``, ``creation_time`` and ``creation_date``. The version
    fields are read from the module-level ``version`` mapping.

    NOTE(review): ``data_subset`` is not used and the selector mapping below
    is always empty, so the override loop never changes anything. It looks
    like the selector was meant to come from ``data_subset`` -- confirm.
    NOTE(review): ``version_date`` is filled with ``version["time"]``, i.e.
    the same value as ``version_time`` -- confirm.
    """
    base = gattrs[0]
    selector = {}
    if len(base) > 1:
        for candidate in gattrs[1:]:
            matched = False
            for key in selector:
                if matched or key not in candidate:
                    continue
                matched = any(
                    re.compile(pattern).match(selector[key])
                    for pattern in candidate[key]
                )
                if matched:
                    # override only attributes that already exist in the base
                    for name, value in candidate.items():
                        if name in base:
                            base[name] = value

    created = get_creation_time()
    fields = {
        "version_id": version["id"],
        "version_time": version["time"],
        "version_date": version["time"],
        "creation_time": created,
        "creation_date": created[0:10],
    }

    formatted = {}
    for name, value in base.items():
        if isinstance(value, list):
            value = sep.join(value)
        formatted[name] = value.format(**fields)
    return formatted


def get_gattrs_pub(gattrs):
    """Collect publication attributes for every entry with a production profile.

    For each mapping in ``gattrs`` that has a ``"production_profile"`` key,
    ``metadb_publication_query`` is called with that profile and the
    publication name ``"datasets_default"``. The results are merged in order,
    later results overriding earlier ones. Returns an empty dict when no
    entry carries a profile.
    """
    merged = {}
    profiles = (
        entry["production_profile"]
        for entry in gattrs
        if "production_profile" in entry
    )
    for profile in profiles:
        merged.update(
            metadb_publication_query(profile, publication_name="datasets_default")
        )
    return merged

In [None]:
def get_creation_time(d=None):
    """Return a UTC timestamp in ISO 8601 format with second precision.

    Parameters
    ----------
    d : datetime.datetime, optional
        Time to format. Naive values are interpreted as UTC; aware values are
        converted to UTC. Defaults to the current time, evaluated on every
        call (a ``datetime.utcnow()`` default would be evaluated only once,
        when the function is defined).

    Returns
    -------
    str
        For example ``"2024-01-02T03:04:05+00:00"``.
    """
    if d is None:
        d = datetime.now(timezone.utc)
    elif d.tzinfo is not None:
        # convert instead of merely relabelling the offset
        d = d.astimezone(timezone.utc)
    return d.replace(microsecond=0, tzinfo=timezone.utc).isoformat()


def get_time_bounds(query_range):
    """Format the first two entries of ``query_range`` as a compact time tag.

    ``query_range`` must offer ``strftime(...).tolist()`` (e.g. a pandas
    ``DatetimeIndex``). Returns ``"<start>_<end>"`` using the pattern
    ``%Y%m%dT%H%M%S%z``, or just ``"<start>"`` when both stamps are equal.
    """
    stamps = query_range.strftime("%Y%m%dT%H%M%S%z").tolist()
    first, second = stamps[0], stamps[1]
    if first == second:
        return first
    return "{}_{}".format(first, second)


def get_global_attributes(data_subset):
    """Combine publication attributes with the configured global attributes.

    Starts from a copy of ``get_gattrs_pub(gattrs)`` and fills in values from
    ``get_gattrs(gattrs, data_subset)`` for keys that are missing or hold an
    empty string in the publication attributes. ``gattrs`` is read from
    module scope.
    """
    from copy import deepcopy

    configured = deepcopy(get_gattrs(gattrs, data_subset))
    merged = deepcopy(get_gattrs_pub(gattrs))

    for key, value in configured.items():
        if key not in merged:
            merged[key] = value
        elif value != "" and merged[key] == "":
            # publication value is blank: fall back to the configured one
            merged[key] = value
    return merged


def get_global_reference():
    """Return the ordered mapping of global reference origins.

    All numeric origins are ``0.0``; ``origin_time`` and ``origin_h`` are
    textual descriptions of the reference.
    """
    reference = [
        ("origin_time", "nanoseconds since 1970-01-01 00:00:00 +0000"),
        ("origin_lon", 0.0),
        ("origin_lat", 0.0),
        ("origin_utm_x", 0.0),
        ("origin_utm_y", 0.0),
        ("origin_x", 0.0),
        ("origin_y", 0.0),
        ("origin_z", 0.0),  # or "station: h"
        # or "meters above Normaal Amsterdams Peil"
        ("origin_h", "meters above mean sea level"),
        ("origin_azimuth", 0.0),
    ]
    return OrderedDict(reference)

## Connection to Fieldclimate

In [None]:
# Modified from source: https://api.fieldclimate.com/v2/docs#authentication-hmac


# Class to perform HMAC encoding
class AuthHmacMetosGet(AuthBase):
    """``requests`` auth hook that signs GET requests for the Metos REST API.

    On every request it sets the ``Date`` header and an ``Authorization``
    header of the form ``hmac <publicKey>:<signature>``, where the signature
    is the hex HMAC-SHA256 (keyed with the private key) of
    ``"GET" + apiRoute + Date + publicKey``.
    """

    def __init__(self, apiRoute, publicKey, privateKey):
        self._apiRoute = apiRoute
        self._method = "GET"
        self._publicKey = publicKey
        self._privateKey = privateKey

    def __call__(self, request):
        stamp = datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S GMT")
        request.headers["Date"] = stamp

        # message to sign: method, route, date and public key, concatenated
        payload = "".join(
            [self._method, self._apiRoute, stamp, self._publicKey]
        ).encode(encoding="utf-8")
        secret = self._privateKey.encode(encoding="utf-8")
        digest = HMAC.new(secret, payload, SHA256).hexdigest()

        request.headers["Authorization"] = f"hmac {self._publicKey}:{digest}"
        return request

In [None]:
def get_fieldclimate_metadata(
    apiConf,
    meta_conf={
        "user": "/user",
        "user_stations": "/user/stations",
        "system_groups": "/system/groups",
        "system_sensors": "/system/sensors",
    },
):
    """Request the selected metadata routes from the Fieldclimate REST API.

    Parameters
    ----------
    apiConf : mapping
        Needs the entries ``"publicKey"``, ``"privateKey"`` and ``"apiURI"``.
    meta_conf : mapping
        Section name -> API route to query.

    Returns
    -------
    OrderedDict
        Section name -> ``requests`` response, in the order of ``meta_conf``.
    """
    results = OrderedDict()
    for section, route in meta_conf.items():
        signer = AuthHmacMetosGet(route, apiConf["publicKey"], apiConf["privateKey"])
        logging.info("Fieldclimate API Route:  %s", route)
        url = apiConf["apiURI"] + route
        results[section] = requests.get(
            url,
            headers={"Accept": "application/json"},
            auth=signer,
            timeout=(3.05, 27),  # connection, read
        )
    return results


def get_fieldclimate_metadata_dataframe(response, section="user_stations"):
    """Turn one section of the metadata responses into a pandas DataFrame.

    ``response[section]`` may be a ``requests`` response (its JSON body is
    used) or already decoded JSON. A JSON list is normalised item by item and
    concatenated; anything else is normalised directly.

    Section-specific post-processing:
    * ``"user_stations"``: rows are sorted by ``name.custom``.
    * ``"system_groups"`` / ``"system_sensors"``: dotted column names are
      split into levels and row ``0`` is reshaped by unstacking the first
      level and transposing.
    """
    payload = response[section]
    if isinstance(payload, requests.models.Response):
        payload = payload.json()

    if isinstance(payload, list):
        df = pd.concat([pd.json_normalize(item) for item in payload])
    else:
        df = pd.json_normalize(payload)

    # filters
    if section == "user_stations":
        df = df.sort_values(["name.custom"])
    elif section in ("system_groups", "system_sensors"):
        df.columns = df.columns.str.split(".", expand=True)
        df = df.loc[0].unstack(level=0).transpose()

    return df

In [None]:
# Placeholder cell: inside this context pandas would display all rows and
# columns with a precision of 3, but the body is empty, so nothing is shown.
with pd.option_context(
    "display.max_rows",
    None,
    "display.max_columns",
    None,
    "display.precision",
    3,
):
    pass

## Fieldclimate Queries

In [None]:
def get_fieldclimate_cell_methods(aggr, cell_dim="time"):
    """Translate an aggregation keyword into cell-method attributes.

    Parameters
    ----------
    aggr : str
        Aggregation keyword, e.g. ``"avg"``, ``"max"``, ``"sum"``, ``"last"``.
    cell_dim : str
        Name of the dimension the method applies to.

    Returns
    -------
    OrderedDict
        With ``cell_methods`` (``"<cell_dim>: <method>"``) and
        ``cell_bounds_domain`` (``{"<cell_dim>_bounds": [lower, upper]}``),
        or an empty ``OrderedDict`` for an unknown keyword.
    """
    interval_methods = (
        "sum",
        "mean",
        "maximum",
        "minimum",
        "mid_range",
        "standard_deviation",
        "variance",
        "mode",
        "median",
    )
    aliases = {"max": "maximum", "min": "minimum", "avg": "mean"}

    if aggr in interval_methods:
        method, bounds = aggr, [0, 1]
    elif aggr in aliases:
        method, bounds = aliases[aggr], [0, 1]
    elif aggr == "point":
        method, bounds = aggr, [0, 0]
    elif aggr == "last":
        # the last sample is reported as a point at the end of the cell
        method, bounds = "point", [1, 1]
    else:
        return OrderedDict()

    return OrderedDict(
        [
            ("cell_methods", "{}: {}".format(cell_dim, method)),
            ("cell_bounds_domain", {"{}_{}".format(cell_dim, "bounds"): bounds}),
        ]
    )


def get_fieldclimate_cell_bounds(domain, cell_bounds_domain):
    """Placeholder without implementation; always returns ``None``."""
    return None

In [None]:
def get_fieldclimate_station(apiConf, apiQueryList):
    """Run the given data queries against the Fieldclimate REST API.

    Parameters
    ----------
    apiConf : mapping
        Needs the entries ``"publicKey"``, ``"privateKey"`` and ``"apiURI"``.
    apiQueryList : iterable of mapping
        Each mapping links a query id to an API route. Progress over the
        list is shown with ``tqdm``.

    Returns
    -------
    OrderedDict
        Query id -> ``requests`` response. A repeated query id overwrites the
        earlier response.
    """
    results = OrderedDict()
    for query in tqdm(apiQueryList):
        for query_id, route in query.items():
            signer = AuthHmacMetosGet(
                route, apiConf["publicKey"], apiConf["privateKey"]
            )
            url = apiConf["apiURI"] + route
            results[query_id] = requests.get(
                url,
                headers={"Accept": "application/json"},
                auth=signer,
                timeout=(3.05, 27),  # connection, read
            )
    return results


def get_fieldclimate_station_queries(
    station_ids,
    time_range,
    time_offset=("0s", "-1s"),
    data_groups=("1",),  # ['4','7','8','5','3','9','6','1','2']
    convert_to_unix_timestamp=True,
):
    """Build the Fieldclimate data routes for stations, groups and a period.

    Parameters
    ----------
    station_ids : str or iterable of str
        One or more station ids.
    time_range : sequence of length 2
        Start and end of the period in ISO 8601 notation; naive or
        timezone-aware.
    time_offset : sequence of length 2
        Timedelta strings added to start and end respectively. The default
        moves the end back by one second.
    data_groups : iterable of str, optional
        Data group ids. An empty value omits the group from the route.
    convert_to_unix_timestamp : bool
        Convert the window limits to integer seconds since 1970-01-01 UTC.
        When False the pandas timestamps are inserted as they are (debug).

    Returns
    -------
    list of dict
        One single-entry dict ``{query_id: route}`` per station, data group
        and time window. Periods of one day or more are split into windows of
        at most one day.
    """
    query_list = []
    query_str = "/data/{STATION_ID}/{DATA_GROUP}/from/{FROM_UNIX_TIMESTAMP}/to/{TO_UNIX_TIMESTAMP}"
    query_id_str = "{STATION_ID}_{FROM_UNIX_TIMESTAMP}_{DATA_GROUP}"
    window_keys = ("FROM_UNIX_TIMESTAMP", "TO_UNIX_TIMESTAMP")

    time_from, time_to = time_range
    time_from_offset, time_to_offset = time_offset

    # station ids
    if isinstance(station_ids, str):
        station_ids = [station_ids]

    if not data_groups:
        data_groups = [""]

    # time variable
    start = pd.to_datetime(time_from, format="ISO8601") + pd.Timedelta(time_from_offset)
    end = pd.to_datetime(time_to, format="ISO8601") + pd.Timedelta(time_to_offset)

    # split the queries by time range (max 1 day)
    if end - start < pd.Timedelta("1D"):
        time_list = [dict(zip(window_keys, (start, end)))]
    else:
        time_days = pd.date_range(start=start, end=end, freq="1D").tolist()
        time_list = [
            dict(zip(window_keys, limits))
            for limits in zip(time_days, time_days[1:] + [end])
        ]
        # drop a zero-length trailing window
        if time_list[-1]["FROM_UNIX_TIMESTAMP"] == time_list[-1]["TO_UNIX_TIMESTAMP"]:
            time_list = time_list[:-1]

    # convert or debug
    if convert_to_unix_timestamp:
        epoch = pd.Timestamp("1970-01-01")
        one_second = pd.Timedelta("1s")

        def to_unix(ts):
            # Aware timestamps cannot be subtracted from the naive epoch;
            # tz_convert(None) yields the naive UTC equivalent.
            if ts.tzinfo is not None:
                ts = ts.tz_convert(None)
            return (ts - epoch) // one_second

        time_list = [
            {key: to_unix(value) for key, value in window.items()}
            for window in time_list
        ]

    # loop
    for sid in station_ids:
        for dg in data_groups:
            for tr in time_list:
                query_dict = {"STATION_ID": sid, "DATA_GROUP": dg, **tr}
                # an empty data group leaves "//" in the route
                query = query_str.format(**query_dict).replace("//", "/")
                query_id = query_id_str.format(**query_dict)
                query_list.append({query_id: query})

    return query_list


def get_fieldclimate_station_dataframe(response_dict):
    """Flatten query responses into one DataFrame indexed by ``name.custom``.

    Parameters
    ----------
    response_dict : dict or iterable of dict
        Mapping (or several mappings) of query id -> response. A response is
        either an object with a ``json()`` method (e.g. a ``requests``
        response) or already decoded JSON.

    Returns
    -------
    pandas.DataFrame
        The normalised responses, sorted and indexed by the ``name.custom``
        column; an empty DataFrame when there are no responses.

    Notes
    -----
    The previous implementation iterated the mapping without ``.items()``,
    so the ``k, v`` unpacking was applied to the key strings.
    """
    groups = [response_dict] if isinstance(response_dict, dict) else response_dict

    frames = []
    for group in groups:
        for response in group.values():
            if callable(getattr(response, "json", None)):
                payload = response.json()
            else:
                payload = response
            frames.append(pd.json_normalize(payload))

    if not frames:
        # pd.concat() raises on an empty list
        return pd.DataFrame()

    return pd.concat(frames).sort_values(["name.custom"]).set_index(["name.custom"])

## Query response handling

In [None]:
def get_fieldclimate_station_datasets_org(
    response_dict, meta_stations, meta_deployment
):
    """Convert Fieldclimate data responses into per-variable xarray datasets.

    Variant that reads the deployment metadata through flat, dotted column
    names (``sensor.sensor_serial``, ``location.location_startdate``, ...).
    NOTE(review): the ``_org`` suffix suggests this is the original version
    kept next to ``get_fieldclimate_station_datasets`` -- confirm whether it
    is still in use.

    Parameters
    ----------
    response_dict : mapping
        Query id ``"<station>_<unix timestamp>_<group>"`` -> response object
        providing ``json()``.
    meta_stations : pandas.DataFrame
        Station metadata; rows are selected by comparing the index with the
        station id.
    meta_deployment : pandas.DataFrame
        Deployment metadata with the flat columns used in ``get_deployment``.

    Returns
    -------
    OrderedDict
        ``{station: {timestamp: [xr.Dataset, ...]}}`` with one dataset per
        variable of the response.

    Notes
    -----
    Reads the module-level names ``output_subset`` and ``output_encoder``.
    """
    import re

    def get_deployment(meta_deployment, station, timestamp):
        """Return the first deployment row of ``station`` active at ``timestamp`` as dict ({} if none)."""
        ts = pd.to_datetime(float(timestamp), unit="s", origin="unix")

        res = (
            {}
        )  # meta_deployment.loc[:,['model.manufacturer_name','model.model_name','model.model_description']].iloc[0].to_dict()

        # rows of this sensor whose period [startdate, enddate) contains ts
        depl = meta_deployment.loc[
            (
                (meta_deployment["sensor.sensor_serial"] == station)
                & (meta_deployment["location.location_startdate"] <= ts)
                & (meta_deployment["location.location_enddate"] > ts)
            ),
            :,
        ]

        if not depl.empty:
            res = {**res, **depl.iloc[0].to_dict()}

        return res

    def evaluate_attributes(d):
        """Flatten a dict of dicts into parallel lists (group, key, type name, JSON value)."""
        import json

        res = {
            "attributes_id": [],
            "attributes_key": [],
            "attributes_type": [],
            "attributes_val": [],
        }
        for i, j in d.items():
            for k, v in j.items():
                res["attributes_id"].append(i)
                res["attributes_key"].append(k)
                res["attributes_type"].append(str(type(v).__name__))
                if isinstance(v, list) or isinstance(v, dict):
                    v = json.dumps(v)
                else:
                    # everything else is stringified first, then JSON-quoted
                    v = str(v)
                    v = json.dumps(v)

                res["attributes_val"].append(v)
        return res

    ds_dict = OrderedDict()

    for ind, response in response_dict.items():
        # print(ind)

        # processing identifiers
        station, timestamp, group = ind.split("_")

        if not station in ds_dict:
            ds_dict[station] = OrderedDict()

        if not timestamp in ds_dict[station]:
            ds_dict[station][timestamp] = []

        res_json = response.json()
        # first metadata row whose index equals the station id
        # (`.loc[0]` raises if the station is unknown)
        res_meta = meta_stations[meta_stations.index == station].reset_index().loc[0]
        # res_locs = (
        #    meta_locations[meta_locations["Serial No."] == station].reset_index().loc[0]
        # )
        res_locs = get_deployment(meta_deployment, station, timestamp)
        res_refs = get_global_reference()
        # NOTE: `output_subset` is a module-level name
        res_glob = get_global_attributes(output_subset)

        for var in res_json["data"]:
            # filters
            # NOTE(review): this first `dattr` is overwritten by the next
            # statement and therefore has no effect.
            dattr = OrderedDict(
                [
                    (k, v)
                    for k, v in var.items()
                    if not isinstance(v, list) and not isinstance(v, dict)
                ]
            )
            # attributes
            dattr = OrderedDict(
                [
                    ("global", OrderedDict(res_refs)),
                    ("station", OrderedDict(res_locs)),
                    ("system", OrderedDict(res_meta)),
                    (
                        "cell",
                        OrderedDict(
                            [(k, v) for k, v in var.items() if k not in ["values"]]
                        ),
                    ),
                ]
            )

            # flatten attributes, convert values to json
            ddattr = evaluate_attributes(dattr)

            # remove meaningless variables? (e.g. Midnight)
            if not "name_original" in list(var.keys()):
                # print('SKIPPED: {}'.format(dattr['name']))
                continue

            if not res_locs:
                # if station is not defined in metadata, skip
                continue

            # extraction of meaningful attributes:
            # the location text is expected to look like "Station <name> (FR...)"
            station_res = re.findall(
                r"(Station.*?)\s?\((FR.*?)\)",
                str(res_locs.copy()["location.location_text"]),
            )

            station_id = station_res[0][1] if station_res else ""
            station_name = (
                str(station_res[0][0]).replace("Station Freiburg ", "")
                if station_res
                else ""
            )

            # one cell method per aggregation key of the variable
            cell_attrs = [
                get_fieldclimate_cell_methods(m) for m in var["values"].keys()
            ]
            # print(cell_attrs)
            cell_methods = [
                n["cell_methods"] if "cell_methods" in n.keys() else ""
                for n in cell_attrs
            ]

            var_name = dattr["cell"]["name_original"].replace(" ", "_").replace(",", "")
            var_long_name = dattr["cell"]["name_original"]
            var_units = dattr["cell"]["unit"]

            # Data variables
            # dims (system, cell, time): one row per aggregation key
            data_vars = OrderedDict(
                [
                    (
                        "{}".format(var_name),
                        (
                            ["system", "cell", "time"],
                            [[v for k, v in var["values"].items()]],
                            {"long_name": var_long_name, "units": var_units},
                        ),
                    )
                ]
            )
            # NOTE(review): `.format("")` has no effect on this literal.
            attributes_values = (
                "attributes_values".format(""),
                (
                    ["station", "system", "channel", "attributes"],
                    [[[ddattr["attributes_val"]] * 1]],
                    {
                        "description": "Attributes combined from meta data and input-file headers."
                    },
                ),
            )

            # Coordinates
            coord_time = pd.to_datetime(res_json["dates"], format="ISO8601")
            coords = {
                "time": (
                    ["time"],
                    coord_time,
                    {
                        "long_name": "time",
                        "standard_name": "time",
                        # "calendar": "proleptic_gregorian",  # xarray error
                        # "units": "microseconds since 1970-01-01 00:00:00 +0000", # xarray error
                    },
                ),
                "station_id": (["station"], [str(station_id)], OrderedDict()),
                "station_name": (["station"], [str(station_name)], {}),
                # "station_group": (["station"], [str("PL")], {}),
                "station_lat": (
                    ["station"],
                    [float(dattr["station"]["station.station_lat"])],
                    {},
                ),
                "station_lon": (
                    ["station"],
                    [float(dattr["station"]["station.station_lon"])],
                    {},
                ),
                "system_id": (["system"], [str(station)], {}),
                "system_name": (["system"], [str(res_meta["info.device_name"])], {}),
                "system_group": (
                    ["system"],
                    [str("AWS")],
                    {},
                ),
                "sensor_id": (["sensor"], [str(dattr["cell"]["group"])], {}),
                "channel_id": (["channel"], [str(dattr["cell"]["ch"])], {}),
                "cell_type": (["cell"], list(cell_methods), {}),
                "attributes_id": (["attributes"], ddattr["attributes_key"], {}),
                "attributes_group": (["attributes"], ddattr["attributes_id"], {}),
                "attributes_type": (["attributes"], ddattr["attributes_type"], {}),
            }

            # coords back to tuple
            coords = [(k, v) for k, v in coords.items()]

            # add attributes_values as coord or data_vars?
            coords.append(attributes_values)

            # Dataset (xarray)
            ds = xr.Dataset(
                data_vars=data_vars,
                coords=dict(coords),
                attrs=res_glob,
            )

            # Dimensions sort order
            ds = ds.transpose(
                "time",
                "station",
                "system",
                "sensor",
                "channel",
                "cell",
                "attributes",
                missing_dims="ignore",
            )

            # Encoding
            # NOTE: `output_encoder` is a module-level list of dicts; each
            # entry names a variable and carries its encoding settings.
            encoder_list = [
                n for n in output_encoder if n["name"] in list(ds.variables)
            ]
            for encoder in encoder_list:
                n = encoder["name"]
                enc = {k: v for k, v in encoder.items() if k not in ["name"]}
                ds[n].encoding.update(**enc)
                logging.debug("Encoding updated for `%s`: `%s`", n, str(enc))

            ds_dict[station][timestamp].append(ds)

    return ds_dict

In [7]:
def get_fieldclimate_station_datasets(response_dict, meta_stations, meta_deployment):
    """Convert Fieldclimate data responses into per-variable xarray datasets.

    Parameters
    ----------
    response_dict : mapping
        Query id ``"<system_id>_<unix timestamp>_<group>"`` -> response,
        either a ``requests`` response or the already decoded JSON dict.
    meta_stations : pandas.DataFrame
        Fieldclimate station metadata; rows are selected by comparing the
        index with the system id. Must contain ``info.device_name``.
    meta_deployment : pandas.DataFrame
        Deployment metadata with two-level columns (``("id", ...)`` and
        ``("configuration", ...)``).

    Returns
    -------
    OrderedDict
        ``{system_id: {timestamp: [xr.Dataset, ...]}}`` with one dataset per
        variable. Responses without ``"data"``, systems without an active
        deployment and variables without ``"name_original"`` yield no dataset.

    Notes
    -----
    Reads the module-level names ``output_subset`` and ``output_encoder``.
    """

    def get_deployment(meta_deployment, system_id, timestamp):
        """Return id/configuration metadata of the deployment active at ``timestamp`` ({} if none)."""
        ts = pd.to_datetime(float(timestamp), unit="s", origin="unix")

        # For reference only, mapping of the <v1.0.4 keys:
        #   sensor.sensor_serial        -> ("id", "system_id")
        #   location.location_startdate -> ("configuration", "d_start_date")
        #   location.location_enddate   -> ("configuration", "d_end_date")
        #   location.location_text      -> ("id", "station_name")
        #   station.station_lat         -> ("id", "system_lat")
        #   station.station_lon         -> ("id", "system_lon")

        # rows of this system whose period [start, end) contains ts
        active = (
            (meta_deployment[("id", "system_id")] == system_id)
            & (meta_deployment[("configuration", "d_start_date")] <= ts)
            & (meta_deployment[("configuration", "d_end_date")] > ts)
        )
        dep = meta_deployment.loc[active, :]

        if dep.empty:
            return {}

        first = dep.iloc[[0]]
        dep_subset = pd.concat(
            [
                first.loc(axis=1)["id", :],
                first.loc(axis=1)[
                    "configuration", ["d_id", "c_id", "i_id", "m_id", "s_id"]
                ],
            ],
            axis=1,
        ).droplevel(axis=1, level=0)
        # ALT: dep_subset.columns = dep_subset.columns.to_flat_index().str.join('_')

        return dep_subset.iloc[0].to_dict()

    def evaluate_attributes(d):
        """Flatten a dict of dicts into parallel lists (group, key, type name, JSON value)."""
        import json

        res = {
            "attributes_id": [],
            "attributes_key": [],
            "attributes_type": [],
            "attributes_val": [],
        }
        for i, j in d.items():
            for k, v in j.items():
                res["attributes_id"].append(i)
                res["attributes_key"].append(k)
                res["attributes_type"].append(str(type(v).__name__))
                if isinstance(v, (list, dict)):
                    v = json.dumps(v)
                else:
                    # everything else is stringified first, then JSON-quoted
                    v = json.dumps(str(v))
                res["attributes_val"].append(v)
        return res

    ds_dict = OrderedDict()
    # systems for which the name mismatch has already been reported
    warned_systems = set()

    for ind, response in response_dict.items():
        # processing identifiers
        system_id, timestamp, group = ind.split("_")

        if system_id not in ds_dict:
            ds_dict[system_id] = OrderedDict()

        if timestamp not in ds_dict[system_id]:
            ds_dict[system_id][timestamp] = []

        if isinstance(response, requests.models.Response):
            res_dict = response.json()
        elif isinstance(response, dict):
            res_dict = response
        else:
            # Previously this case reused the `res_dict` of the preceding
            # response (or raised NameError for the first one).
            logging.warning(
                "Unsupported response type for `%s`: `%s`",
                ind,
                type(response).__name__,
            )
            continue

        if "data" not in res_dict:
            if "message" in res_dict:
                logging.warning(
                    "No 'data' in response: `%s`: `%s`", ind, str(res_dict["message"])
                )
            continue

        # first metadata row whose index equals the system id
        # (`.loc[0]` raises if the system is unknown)
        res_meta = meta_stations[meta_stations.index == system_id].reset_index().loc[0]
        res_locs = get_deployment(meta_deployment, system_id, timestamp)

        if not res_locs:
            # if station is not defined in metadata, skip
            continue

        res_refs = get_global_reference()
        res_glob = get_global_attributes(output_subset)

        for var in res_dict["data"]:
            # remove meaningless variables? (e.g. Midnight)
            if "name_original" not in var:
                continue

            # attributes
            dattr = OrderedDict(
                [
                    ("global", OrderedDict(res_refs)),
                    ("station", OrderedDict(res_locs)),
                    ("system", OrderedDict(res_meta)),
                    (
                        "cell",
                        OrderedDict(
                            [(k, v) for k, v in var.items() if k not in ["values"]]
                        ),
                    ),
                ]
            )

            # flatten attributes, convert values to json
            ddattr = evaluate_attributes(dattr)

            # extraction of meaningful attributes:
            system_name = str(dattr["station"]["system_name"])
            system_name_alt = str(dattr["system"]["info.device_name"])

            # Report a mismatch once per system. The former condition
            # `not ds_dict[system_id]` was never true because the timestamp
            # entry is created above.
            if system_name != system_name_alt and system_id not in warned_systems:
                warned_systems.add(system_id)
                logging.warning(
                    "Name mismatch (system_id,system_name,device_name): (`%s`,`%s`,`%s`)",
                    system_id,
                    system_name,
                    system_name_alt,
                )

            # one cell method per aggregation key of the variable
            cell_attrs = [
                get_fieldclimate_cell_methods(m) for m in var["values"].keys()
            ]
            cell_methods = [
                n["cell_methods"] if "cell_methods" in n.keys() else ""
                for n in cell_attrs
            ]

            var_name = dattr["cell"]["name_original"].replace(" ", "_").replace(",", "")
            var_long_name = dattr["cell"]["name_original"]
            var_units = dattr["cell"]["unit"]

            # Data variables
            # dims (system, cell, time): one row per aggregation key
            data_vars = OrderedDict(
                [
                    (
                        "{}".format(var_name),
                        (
                            ["system", "cell", "time"],
                            [[v for k, v in var["values"].items()]],
                            {"long_name": var_long_name, "units": var_units},
                        ),
                    )
                ]
            )

            # Coordinates
            coord_time = pd.to_datetime(res_dict["dates"], format="ISO8601")
            coords = {
                "time": (
                    ["time"],
                    coord_time,
                    {
                        "long_name": "time",
                        "standard_name": "time",
                        # "calendar": "proleptic_gregorian",  # xarray error
                        # "units": "microseconds since 1970-01-01 00:00:00 +0000", # xarray error
                    },
                ),
                "station_id": (
                    ["station"],
                    [str(dattr["station"]["station_id"])],
                    {},
                ),
                "station_name": (
                    ["station"],
                    [str(dattr["station"]["station_name"])],
                    {},
                ),
                "station_lat": (
                    ["station"],
                    [float(dattr["station"]["station_lat"])],
                    {},
                ),
                "station_lon": (
                    ["station"],
                    [float(dattr["station"]["station_lon"])],
                    {},
                ),
                "station_height": (
                    ["station"],
                    [float(dattr["station"]["station_height"])],
                    {},
                ),
                "system_id": (
                    ["system"],
                    [str(dattr["station"]["system_id"])],  ### attention
                    {},
                ),
                "system_name": (
                    ["system"],
                    [str(dattr["station"]["system_name"])],  ### attention
                    {},
                ),
                "system_group": (
                    ["system"],
                    [str("AWS")],
                    {},
                ),
                "sensor_id": (
                    ["sensor"],
                    [str(dattr["cell"]["group"])],
                    {},
                ),
                "channel_id": (
                    ["channel"],
                    [str(dattr["cell"]["ch"])],
                    {},
                ),
                "cell_type": (
                    ["cell"],
                    list(cell_methods),  ### attention
                    {},
                ),
                "attributes_id": (
                    ["attributes"],
                    ddattr["attributes_key"],
                    {},
                ),
                "attributes_group": (
                    ["attributes"],
                    ddattr["attributes_id"],
                    {},
                ),
                "attributes_type": (
                    ["attributes"],
                    ddattr["attributes_type"],
                    {},
                ),
                # add attributes_values as coord or data_vars?
                "attributes_values": (
                    ["station", "system", "channel", "attributes"],
                    [[[ddattr["attributes_val"]]]],
                    {
                        "description": "Attributes combined from meta data and input-file headers."
                    },
                ),
            }

            # Dataset (xarray)
            ds = xr.Dataset(
                data_vars=data_vars,
                coords=coords,
                attrs=res_glob,
            )

            # Dimensions sort order
            ds = ds.transpose(
                "time",
                "station",
                "system",
                "sensor",
                "channel",
                "cell",
                "attributes",
                missing_dims="ignore",
            )

            # Encoding
            # `output_encoder` is a module-level list of dicts; each entry
            # names a variable and carries its encoding settings.
            encoder_list = [
                n for n in output_encoder if n["name"] in list(ds.variables)
            ]
            for encoder in encoder_list:
                n = encoder["name"]
                enc = {k: v for k, v in encoder.items() if k not in ["name"]}
                ds[n].encoding.update(**enc)
                logging.debug("Encoding updated for `%s`: `%s`", n, str(enc))

            ds_dict[system_id][timestamp].append(ds)

    return ds_dict

## Further Data management 

In [8]:
def datasets_to_zarr_zip(dx, output_file, zipstore_args={}, **zarr_args):
    """Write one dataset, or a dict of datasets, into a single `.zarr.zip` file.

    Args:
        dx: an `xr.Dataset`, or a dict mapping zarr group names to objects
            providing `to_zarr`. A bare dataset is stored under group `None`.
        output_file: target path; its last suffix is replaced by `.zarr.zip`
            (or the suffix is appended when the path has none).
        zipstore_args: keyword overrides for `zarr.storage.ZipStore`
            (defaults: `mode="w"`, `allowZip64=True`). The dict is not modified.
        **zarr_args: passed through to `to_zarr`.
    """
    from pathlib import Path

    import zarr

    # on the safe side, always start with an empty zarr.zip file.
    kw = {"mode": "w", "allowZip64": True, **zipstore_args}

    if isinstance(dx, xr.Dataset):
        dx = {None: dx}

    fn = Path(output_file).with_suffix(".zarr.zip")
    for group, ds in dx.items():
        store = zarr.storage.ZipStore(fn, **kw)
        try:
            ds.to_zarr(store, group=group, **zarr_args)
        finally:
            # Always release the zip handle, also when the write fails.
            store.close()

        # Any further group is to be updated/appended, by default
        kw["mode"] = "a"


def export_fieldclimate_query_RAW(
    export_dict,
    output_path="/tmp/Fieldclimate/RAW/",
    output_file="PESSL_FieldClimate_QueryResponses_%Y%m%dT%H%M%S+0000",
    output_dict={},
):
    """Store every query response as a JSON member of one gzipped tar archive.

    The archive name is `output_path` joined with `output_file`, with
    `str.format` placeholders filled from `output_dict`. Each item of
    `export_dict` becomes a member named `<key>.json`; `requests` responses
    are converted with `.json()` first, anything else is serialised as given.

    Returns True when the archive exists as a file after writing.
    """
    import io
    import json
    import os
    import tarfile
    import time
    from pathlib import Path

    def as_payload(item):
        # UTF-8 encoded JSON text for one response
        if isinstance(item, requests.models.Response):
            item = item.json()
        return json.dumps(item).encode("utf8")

    # archive name, and one shared modification time for all members
    archive_name = os.path.join(output_path, output_file).format(**output_dict)
    stamp = time.time()

    logging.info("Query RAW file `%s`", archive_name)

    # make sure the target directory exists
    archive_path = Path(archive_name)
    archive_path.parent.mkdir(parents=True, exist_ok=True)

    with tarfile.open(archive_name, "w:gz") as archive:
        for key, response in export_dict.items():
            payload = as_payload(response)
            member = tarfile.TarInfo(name="{}.json".format(key))
            member.size = len(payload)
            member.mtime = stamp
            archive.addfile(member, io.BytesIO(payload))

    return archive_path.is_file()


def import_fieldclimate_query_RAW(
    input_path="/tmp/Fieldclimate/RAW/",
    input_file="PESSL_FieldClimate_QueryResponses_%Y%m%dT%H%M%S+0000",
    input_dict={},
    include_all=False,
):
    """Import cached query responses from RAW archive file(s).

    The glob pattern is `input_path` joined with `input_file`, with
    `str.format` placeholders filled from `input_dict`. Matches are visited in
    reverse sorted order (assuming a date folder sorting key, i.e. newest
    first).

    Args:
        input_path: directory (pattern) of the archives.
        input_file: file name (pattern) of the archives.
        input_dict: values for the placeholders in the joined pattern.
        include_all: read every matching archive instead of stopping after the
            first one that could be read.

    Returns:
        list of `(filename, meta_response, response_dict, meta_stations)`
        tuples; empty when no archive was found.
    """
    import json
    import os
    import tarfile
    from pathlib import Path

    # archive members holding account metadata rather than station responses
    meta_response_keys = (
        "user",
        "user_stations",
        "system_groups",
        "system_sensors",
    )

    # archive
    fa_pattern = os.path.join(input_path, input_file).format(**input_dict)
    fa_list = sorted(glob.glob(fa_pattern, recursive=True), reverse=True)

    if fa_list:
        logging.info("Found RAW files, N=%s", len(fa_list))
    else:
        logging.info("No RAW files found, at `%s`", fa_pattern)

    query_list = []
    for fa in fa_list:
        # the glob may also match directories
        if not Path(fa).is_file():
            logging.info("File not found: `%s`", fa)
            continue

        logging.info("Reading RAW file `%s`", fa)
        meta_response = {}
        response_dict = {}

        # extract query response as files in archive
        with tarfile.open(fa, "r:gz") as tar:
            for member in tar.getmembers():
                # `extractfile` returns None for directories and other
                # non-regular members.
                if not member.isfile():
                    continue
                with tar.extractfile(member) as f:
                    response = json.loads(f.read().decode("utf8"))
                ind = str(Path(member.name).with_suffix(""))
                if ind in meta_response_keys:
                    meta_response[ind] = response
                else:
                    response_dict[ind] = response

        meta_stations = get_fieldclimate_metadata_dataframe(
            meta_response, section="user_stations"
        )

        # remove the test station.
        meta_stations = meta_stations[
            ~meta_stations["name.original"].str.startswith("03A0D07B")
        ]

        # sort by unique identifier (station_id)
        meta_stations = meta_stations.reset_index().set_index("name.original")

        query_list.append((fa, meta_response, response_dict, meta_stations))

        if not include_all:
            break

    return query_list


def export_fieldclimate_station_datasets_L0(
    dx,
    output_path="/tmp/Fieldclimate/L0/",
):
    """Write every dataset of a nested query result to its own NetCDF file.

    `dx` maps a first-level key to a dict of second-level keys, each holding
    a list. Every `xr.Dataset` in such a list is written to
    `<output_path>/<first key>/<first key>_<second key>_<list position>.nc`;
    other list items are ignored. A failed write is logged and skipped.
    """
    import os
    from pathlib import Path

    logging.info("Query L0 files `%s`", output_path)

    for outer_key, groups in dx.items():
        for inner_key, datasets in groups.items():
            for position, dataset in enumerate(datasets):
                if not isinstance(dataset, xr.Dataset):
                    continue

                # one folder per first-level key
                folder = os.path.join(output_path, outer_key)
                Path(folder).mkdir(parents=True, exist_ok=True)
                target = os.path.join(
                    folder, "{}_{}_{}.nc".format(outer_key, inner_key, position)
                )
                try:
                    dataset.to_netcdf(target)
                except (TypeError, OSError):
                    logging.info("Query L0 file `%s` could not be written.", target)


def export_fieldclimate_station_datasets_L0_merged(
    ds,
    output_path="/tmp/Fieldclimate/L0/",
    output_file="urbisphere_set({global_location},{system_group},{time_bounds})_version({version}).nc",
    output_dict={},
    output_extension=".zarr.zip",
):
    """Write a merged dataset to a `.zarr.zip` (default) or NetCDF file.

    The file name is `output_path` joined with `output_file`, with
    `str.format` placeholders filled from `output_dict`, extended here with:

    - `time_bounds` (only when not supplied): taken from the global
      `query_range` when the global `query_latest` is set, otherwise from the
      minimum and maximum of `ds.time`;
    - `station_group`, `system_group`, `system_type`: first value of the
      coordinate of that name with (non-breaking) spaces removed, or an empty
      string when `ds` has no such coordinate.

    Write failures (`TypeError`, `OSError`) are logged, not raised.

    Args:
        ds: dataset to write.
        output_path: directory (pattern) of the output file.
        output_file: file name (pattern) of the output file.
        output_dict: values for the placeholders; not modified.
        output_extension: `".zarr.zip"` to write a zipped zarr store; any
            other value writes NetCDF to the formatted file name.
    """
    from pathlib import Path

    # Work on a copy: neither the caller's dict nor the shared default may
    # accumulate the keys added below.
    output_dict = dict(output_dict)

    if "time_bounds" not in output_dict:
        if query_latest:
            tr = [x.strftime("%Y%m%dT%H%M%S") for x in list(query_range)]
        else:
            tr = [
                pd.to_datetime(str(edge.values), format="ISO8601").strftime(
                    "%Y%m%dT%H%M%S"
                )
                for edge in (ds.time.min(), ds.time.max())
            ]
        output_dict["time_bounds"] = tr[0] if tr[0] == tr[1] else "{}_{}".format(*tr)

    for k in ["station_group", "system_group", "system_type"]:
        if k in ds.coords:
            output_dict[k] = str(ds[k].values[0]).replace("\xa0", " ").replace(" ", "")
        else:
            output_dict[k] = ""

    fn = os.path.join(output_path, output_file).format(**output_dict)

    Path(fn).parent.mkdir(parents=True, exist_ok=True)
    try:
        if output_extension == ".zarr.zip":
            logging.info("L0 file `%s`", Path(fn).with_suffix(output_extension))
            datasets_to_zarr_zip(ds, fn)

            ## Temporary duplicate exports for UniWeather (2024-11-27)
            parent_parts = Path(fn).parent.parts
            if (
                "scratch" in parent_parts
                and "L0" in parent_parts
                and version["id"] == "v1.0.5"
            ):
                fn_latest = Path(fn.replace("v1.0.5", "latest"))
                fn_latest.parent.mkdir(parents=True, exist_ok=True)
                logging.info("+++ L0 file `%s`", fn_latest)
                # The duplicate belongs at the `latest` path that was just
                # prepared and logged, not at `fn` again.
                ds.to_netcdf(fn_latest)
        else:
            logging.info("L0 file `%s`", fn)
            ds.to_netcdf(fn)
    except (TypeError, OSError):
        logging.info("L0 file `%s` could not be written.", fn)

In [None]:
def filter_fieldclimate_station_datasets(dx, filters=[]):
    """Drop empty query results and, optionally, results matching a filter.

    Args:
        dx: dict mapping a station key to a dict of retrieval key -> list of
            items. Modified in place.
        filters: list of single-entry dicts `{key: value}`. A retrieval is
            kept when at least one of its items satisfies
            `item[key] != value`; filters are applied one after another.

    Returns:
        `dx`, without empty retrievals and without stations that have no
        retrieval left.
    """
    for station in list(dx.keys()):
        # filter retrieval
        kept = {k: v for k, v in dx[station].items() if not v == []}

        # drop empty stations
        if not kept:
            logging.info("Station skipped: %s", station)
            dx.pop(station, None)
            continue

        for f in filters:
            fk, fv = list(f.items())[0]
            kept = {
                k: v for k, v in kept.items() if any(vv[fk] != fv for vv in v)
            }
            if not kept:
                # Nothing left for this station: stop applying filters, so
                # later filters never look up a station that is removed below.
                break

        if kept:
            dx[station] = kept
        else:
            logging.info("Station dropped: %s", station)
            dx.pop(station, None)

    return dx


def merge_fieldclimate_station_datasets(dx):
    """Merge a dict of Xarray datasets into one.

    `dx` maps a station key to a dict of retrieval key -> list of datasets.
    Processing steps:

    1. every dataset is reduced to the first entry of its `cell` dimension
       (to be revised); when a `cell_type` coordinate exists, its first value
       is copied to the `cell_methods` attribute of every data variable whose
       name does not end in `_attributes`;
    2. the datasets of one retrieval are merged (`compat="override"`) and
       reduced to unique, sorted time stamps;
    3. the retrievals of one station are merged;
    4. the stations are concatenated along `station`.

    The `sensor` dimension and the `cell_type` coordinate are dropped from the
    result, dimensions are ordered, and the encodings configured in the global
    `output_encoder` are applied.

    The containers in `dx` are left unchanged; the `cell_methods` attributes
    are set on the input datasets' variables.

    Returns:
        the merged `xr.Dataset`.
    """
    # cell, currently only pick the first column (to be revised)
    cell_index = 0

    merged_stations = []
    for retrievals in dx.values():
        merged_retrievals = []
        for datasets in retrievals.values():
            reduced = []
            for ds in datasets:
                # the cell_type coordinate is later removed, here we copy the
                # info to the cell_methods attribute for the selected variable.
                if "cell_type" in ds.coords:
                    cell_method = ds.coords["cell_type"].values[cell_index]
                    for name in list(ds.data_vars):
                        if not name.endswith("_attributes"):
                            ds[name].attrs["cell_methods"] = cell_method
                reduced.append(ds.isel(cell=cell_index))

            # merge on channels
            merged = xr.merge(reduced, compat="override")

            # Drop duplicates in the time coordinate. A result of inclusive
            # boundaries for the retrieval by time. `np.unique` returns the
            # position of the first occurrence of each (sorted) time stamp.
            _, first_index = np.unique(merged.time, return_index=True)
            merged_retrievals.append(merged.isel(time=first_index))

        # merge on time
        merged_stations.append(xr.merge(merged_retrievals))

    # concat on station
    ds_all = xr.concat(merged_stations, "station")

    # remove unusable coordinates and dimensions
    ds_all = ds_all.drop_dims(["sensor"]).drop_vars("cell_type")

    # Dimensions sort order
    ds_all = ds_all.transpose(
        "time",
        "station",
        "system",
        "sensor",
        "channel",
        "cell",
        "attributes",
        missing_dims="ignore",
    )

    # Encoding
    for encoder in output_encoder:
        name = encoder["name"]
        if name not in ds_all.variables:
            continue
        enc = {k: v for k, v in encoder.items() if k not in ["name"]}
        ds_all[name].encoding.update(**enc)
        logging.info("Encoding updated for `%s`: `%s`", name, str(enc))

    return ds_all

In [None]:
def fieldclimate_raw(apiConf, debug=False):
    """Query metadata and station data, and archive all raw responses.

    Reads the globals `query_range`, `query_cache`, `cache_file`,
    `cache_path_base`, `cache_path` and `cache_subset`.

    Args:
        apiConf: connection settings handed to the query helpers.
        debug: also parse the `system_groups` / `system_sensors` sections and
            display the prepared query list.

    Returns:
        tuple `(meta_response, response_dict, meta_stations)`.
    """
    ## fieldclimate metadata
    meta_response = get_fieldclimate_metadata(apiConf)

    if debug:
        # parsed in debug mode only; the frames are not used any further
        meta_groups = get_fieldclimate_metadata_dataframe(
            meta_response, section="system_groups"
        )
        meta_sensors = get_fieldclimate_metadata_dataframe(
            meta_response, section="system_sensors"
        )

    # stations listing as dataframe, without the test station, indexed by the
    # unique identifier (station_id)
    stations = get_fieldclimate_metadata_dataframe(
        meta_response, section="user_stations"
    )
    is_test_station = stations["name.original"].str.startswith("03A0D07B")
    meta_stations = stations[~is_test_station].reset_index().set_index("name.original")

    ## fieldclimate data: one REST query per station
    query_list = get_fieldclimate_station_queries(
        meta_stations.index.to_list(), query_range, convert_to_unix_timestamp=True
    )

    if debug:
        display(query_list)

    # get REST responses
    response_dict = get_fieldclimate_station(apiConf, query_list)

    # NOTE(review): placeholder, `query_cache` does not change anything here.
    if query_cache:
        pass

    # export temporary files (json): metadata first, then station responses
    archive_items = list(meta_response.items())
    archive_items.extend(response_dict.items())
    export_fieldclimate_query_RAW(
        OrderedDict(archive_items),
        output_file=cache_file,
        output_path=os.path.join(cache_path_base, cache_path),
        output_dict=cache_subset,
    )

    return (meta_response, response_dict, meta_stations)




def fieldclimate_raw_subset(meta_response, response_dict, meta_deployment):
    """Redact private metadata fields and keep responses of deployed systems.

    Args:
        meta_response: dict of metadata responses (sections such as `user`
            and `user_stations`). Not modified; a deep copy is redacted.
        response_dict: mapping of `<system_id>_<timestamp>_<group>` keys to
            responses, with `<timestamp>` in unix seconds.
        meta_deployment: DataFrame with two-level columns, providing at least
            `("id", "system_id")` and, under `"configuration"`,
            `d_start_date`, `d_end_date`, `d_id`, `c_id`, `i_id`, `m_id` and
            `s_id`. Only accessed when `response_dict` is not empty.

    Returns:
        tuple `(meta_response_subset, response_dict_subset)`: the redacted
        metadata copy, and an `OrderedDict` with the responses whose system
        has a deployment covering the response timestamp.

    Raises:
        ValueError: when a key of `response_dict` does not consist of exactly
            three `_`-separated parts.
    """
    import operator
    from copy import deepcopy
    from functools import reduce

    # Paths of values to redact; "*" stands for every key / list position.
    redacted_paths = [
        ("user", "username"),
        ("user", "info", "*"),
        ("user", "api_access", "hmac", "*"),
        ("user", "api_access", "oauth2", "*"),
        ("user_stations", "*", "networking", "simid"),
        ("user_stations", "*", "networking", "imei"),
        ("user_stations", "*", "networking", "imsi"),
    ]
    redacted_value = "***"
    # Raised when a path does not exist in (part of) the metadata.
    missing_path_errors = (KeyError, IndexError, TypeError)

    def get_deployment(system_id, timestamp):
        """Return the identifiers of the deployment active at `timestamp`."""
        ts = pd.to_datetime(float(timestamp), unit="s", origin="unix")

        dep = meta_deployment.loc[
            (
                (meta_deployment[("id", "system_id")] == system_id)
                & (meta_deployment[("configuration", "d_start_date")] <= ts)
                & (meta_deployment[("configuration", "d_end_date")] > ts)
            ),
            :,
        ]
        if dep.empty:
            return {}

        # first matching deployment, reduced to its identifier columns
        first = dep.iloc[[0]]
        dep_subset = pd.concat(
            [
                first.loc(axis=1)["id", :],
                first.loc(axis=1)[
                    "configuration", ["d_id", "c_id", "i_id", "m_id", "s_id"]
                ],
            ],
            axis=1,
        ).droplevel(axis=1, level=0)

        return dep_subset.iloc[0].to_dict()

    def get_by_path(root, items):
        """Access a nested object in root by item sequence."""
        return reduce(operator.getitem, items, root)

    def set_by_path(root, items, value):
        """Set a value in a nested object in root by item sequence."""
        get_by_path(root, items[:-1])[items[-1]] = value

    def expand_path(root, path):
        """Expand the "*" of `path` into the concrete paths present in `root`."""
        if path[-1] == "*":
            parent = list(path[:-1])
            try:
                node = get_by_path(root, parent)
            except missing_path_errors:
                return []
            if isinstance(node, dict):
                return [tuple(parent + [key]) for key in node]
            if isinstance(node, list):
                # no need to replace anything in an empty array
                return [tuple(parent + [pos]) for pos in range(len(node))]
            return []
        if len(path) > 1 and path[1] == "*":
            section = root.get(path[0])
            if isinstance(section, list):
                return [(path[0], pos, *path[2:]) for pos in range(len(section))]
            return []
        return [path]

    # Redact a copy. All paths are expanded before the first replacement.
    meta_response_subset = deepcopy(meta_response)
    concrete_paths = [
        concrete
        for pattern in redacted_paths
        for concrete in expand_path(meta_response_subset, pattern)
    ]
    for concrete in concrete_paths:
        try:
            set_by_path(meta_response_subset, concrete, redacted_value)
        except missing_path_errors:
            # this entry has no such field: nothing to redact
            pass

    response_dict_subset = OrderedDict()
    for ind, response in response_dict.items():
        # processing identifiers
        system_id, timestamp, _group = ind.split("_")

        # if station is defined in metadata, keep
        if get_deployment(system_id, timestamp):
            response_dict_subset[ind] = response

    return (meta_response_subset, response_dict_subset)


def fieldclimate_raw_filtered(
    input_filename, meta_response, response_dict, meta_deployment
):
    """Export the redacted, deployment-filtered copy of one RAW archive.

    The responses are reduced with `fieldclimate_raw_subset` and written with
    `export_fieldclimate_query_RAW` below the configured output path. The
    output file name is `input_filename` relative to its `FieldclimateAPI`
    base folder, with the cache `global_location` replaced by the output one.

    Returns the result of `export_fieldclimate_query_RAW`.
    """
    meta_subset, responses_subset = fieldclimate_raw_subset(
        meta_response, response_dict, meta_deployment
    )

    # cache placeholders, with the output location where one is configured
    placeholders = dict(cache_subset)
    if "global_location" in output_subset:
        placeholders["global_location"] = output_subset["global_location"]

    # archive name relative to the `FieldclimateAPI` folder of the input file
    source = Path(input_filename)
    base_folder = rebase_path(path_base="FieldclimateAPI", path_root=source.parent)
    relative_name = str(source.relative_to(base_folder))
    target_name = relative_name.replace(
        cache_subset["global_location"], output_subset["global_location"]
    )

    # metadata sections first, then the station responses
    archive_content = OrderedDict()
    archive_content.update(meta_subset)
    archive_content.update(responses_subset)

    return export_fieldclimate_query_RAW(
        archive_content,
        output_file=target_name,
        output_path=os.path.join(output_path_base, output_path),
        output_dict=placeholders,
    )


In [None]:
def fieldclimate_l0(response_dict, meta_stations, meta_deployment):
    """Convert query responses to datasets and drop the empty ones.

    Unless the global `query_latest` is set, the datasets are also exported as
    L0 files below the configured cache path.

    Returns the filtered dict of datasets.
    """
    # convert responses to DataSets, then filter for empty responses
    datasets = filter_fieldclimate_station_datasets(
        get_fieldclimate_station_datasets(
            response_dict, meta_stations, meta_deployment
        )
    )

    if query_latest:
        return datasets

    # placeholders for the cache path of the L0 export
    placeholders = dict(output_subset)
    placeholders.update(
        {
            "version_id": version["id"],
            "production_level": "L0",
            "global_location": "anywhere",
            "time_query": "",
        }
    )

    # export L0 files (netcdf)
    l0_path = os.path.join(cache_path_base, cache_path).format(**placeholders)
    export_fieldclimate_station_datasets_L0(datasets, output_path=l0_path)

    return datasets


def fieldclimate_l0_combine(dx):
    """Merge all station datasets into one and export it whole and per system.

    Retrievals whose `station_name` is empty for every dataset are filtered
    out first (only deployed systems). Unless the global `query_latest` is
    set, the merged dataset is exported below the cache path. One export per
    station/system group below the output path always follows.

    Returns the merged dataset.
    """
    # filter: only deployed systems
    deployed = filter_fieldclimate_station_datasets(
        dx.copy(), filters=[{"station_name": ""}]
    )

    # merge DataSets into one
    logging.info(
        "Align and merge coords and data_vars for all stations into a new xr.Dataset"
    )
    ds = merge_fieldclimate_station_datasets(deployed)

    # export
    if not query_latest:
        merged_subset = dict(output_subset)
        merged_subset["version_id"] = version["id"]
        merged_subset["production_level"] = "L0"
        merged_subset["time_query"] = ""
        export_fieldclimate_station_datasets_L0_merged(
            ds,
            output_path=os.path.join(cache_path_base, cache_path),
            output_file=cache_file,
            output_dict=merged_subset,
        )

    # split station and system
    logging.info("Extract single stations from the merged xr.Dataset.")
    for station_key, station_ds in ds.groupby("station", squeeze=False):
        for system_key, system_ds in station_ds.groupby("system", squeeze=False):
            logging.info("Station `%s` System `%s`", station_key, system_key)

            output_dict = dict(output_subset)
            output_dict["version_id"] = version["id"]
            output_dict["production_level"] = "L0"
            output_dict["station_id"] = system_ds["station_id"].values[0]
            system_id = system_ds["system_id"].values[0][0]
            output_dict["system_id"] = system_id
            # attention: the sensor id is filled with the system id
            output_dict["sensor_id"] = system_id

            export_fieldclimate_station_datasets_L0_merged(
                system_ds,
                output_path=os.path.join(output_path_base, output_path),
                output_file=output_file,
                output_dict=output_dict,
            )

    return ds

# Configuration

## Static configuration

In [None]:
# Version
# History (only the last entry is active):
#   v1.0.0, 2022-05-05: first version.
#   v1.0.1, 2022-08-03: update version.
#   v1.0.2, 2022-12-12: updated version, configuration files
#   v1.0.3, 2023-02-16: updated version, configuration files
#   v1.0.4, 2023-09-25: updated version, configuration files
version = {
    "id": "v1.0.5",
    "time": "2024-08-24",
}  # updated version, configuration files


# Configuration file for input / output files
ioconfig_name = "fieldclimate_metadata"
# Use `../conf/` when the file exists there (relative to the working
# directory), otherwise fall back to `conf/`.
ioconfig_file = "../conf/{}.toml".format(ioconfig_name)
try:
    if not Path(ioconfig_file).exists():
        ioconfig_file = "conf/{}.toml".format(ioconfig_name)
except OSError:
    # the first location could not be inspected: treat it as missing
    ioconfig_file = "conf/{}.toml".format(ioconfig_name)
# ----- Papermill injection below this cell -----

In [None]:
# input/output config: built by `parse_config` from the configuration file
# path, the configuration name and the version dict.
# NOTE(review): `parse_config` is defined outside this section; how it uses
# `ioconfig_name` and `version` is not visible here, confirm there.
ioconf = parse_config(ioconfig_file, ioconfig_name, version)

# validate config (to do)

In [None]:
"""
Note: the approach to set global helper variables should be revised. 
But was/is used in combination with papermill automation.
"""

# set global variables

# query section; `query_to` is derived from start + period later on
query_from = ioconf["query"]["start"]
query_to = None
query_period = ioconf["query"]["period"]
query_index = ioconf["query"]["system_index"]
query_cache = ioconf["query"]["cache"]
query_latest = ioconf["query"]["latest"]
query_tasks = ioconf["query"]["tasks"]
query_key_file = ioconf["query"]["key_file"]
# optional entry
query_city = ioconf["query"]["city"] if "city" in ioconf["query"] else None

# input section; the path is optional
input_path_base = ioconf["input"]["path_base"]
input_path = ioconf["input"]["path"] if "path" in ioconf["input"] else None
# input_file = ioconf["input"]["file"]
# input_subset = ioconf["input"]["subset"]

# cache section
cache_path_base = ioconf["cache"]["path_base"]
cache_path = ioconf["cache"]["path"]
cache_file = ioconf["cache"]["file"]

# output section
output_path_base = ioconf["output"]["path_base"]
output_path = ioconf["output"]["path"]
output_file = ioconf["output"]["file"]
output_subset = ioconf["output"]["subset"]
output_encoder = ioconf["output"]["encoder"]

# logging section; the file mode defaults to append
log_path = ioconf["logging"]["path"]
log_file = ioconf["logging"]["file"]
log_format = ioconf["logging"]["format"]
log_filemode = (
    ioconf["logging"]["filemode"] if "filemode" in ioconf["logging"] else "a"
)

# global attributes
gattrs = ioconf["gattrs"]

## Logging Configuration

In [None]:
# create logger
import logging
import logging.handlers
from pprint import pformat

# Log to a file and to stdout.
logging.basicConfig(
    format=log_format,
    level=logging.INFO,
    # Declare handlers
    handlers=[
        # `basicConfig(encoding=...)` is only used for the handler it creates
        # itself from `filename`; with explicit handlers the encoding has to be
        # set on the FileHandler.
        logging.FileHandler(
            os.path.join(log_path, log_file).format(version_id=version["id"]),
            mode=log_filemode,
            encoding="utf-8",
        ),
        logging.StreamHandler(sys.stdout),
    ],
)

## Dynamic Configuration

In [None]:
# time related helper variables: without an explicit end, the query ends one
# `query_period` after `query_from`
if not query_to and query_period:
    query_to = (
        pd.to_datetime(query_from, format="ISO8601") + pd.to_timedelta(query_period)
    ).strftime("%Y-%m-%d %H:%M:%S")
query_range = pd.to_datetime([query_from, query_to], format="ISO8601")

# meta data query
# meta_deployment = get_metadata_inventory_deployment()
meta_deployment = get_metadata_metadb_deployment(query_city, query_range)


# cache settings
cache_subset = {
    **output_subset,
    **{
        "version_id": version["id"],
        "production_level": "RAW",
        "global_location": "anywhere",
        # "station_id": "Tier2/Fieldclimate",
        "system_name": "LoRAIN",
        "time_bounds": get_time_bounds(query_range),
        # Current UTC date, laid out per version. `datetime.utcnow()` is
        # deprecated; the timezone-aware call formats to the same string.
        # NOTE(review): only v1.0.4 and v1.0.5 are mapped, any other
        # `version["id"]` raises a KeyError here.
        "time_query": {
            "v1.0.4": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
            "v1.0.5": datetime.now(timezone.utc).strftime(
                "dupes/by-upload-date/%Y-%m-%d"
            ),
        }[version["id"]],
        "extension": "tar.gz",
    },
}

# pattern for reading cached archives: any query date, any v1.0.x version
input_cache_subset = {**cache_subset, **{"time_query": "**", "version_id": "v1.0.?"}}

# Fieldclimate Connection Settings
with open(query_key_file, "r", encoding="utf-8") as keyfile:
    apiKeys = json.load(keyfile)

apiConf = {
    **dict(apiURI=input_path_base),  # Endpoint of the API
    **apiKeys,
}

# Main

In [None]:
if __name__ == "__main__":
    logging.info("Configuration context: `query_range`: %s", query_range.tolist())
    logging.info("Configuration context: `query_latest`: %s", query_latest)
    logging.info("Configuration context: `query_tasks`: %s", query_tasks)
    logging.info("Configuration context: `query_city`: %s", query_city)
    logging.info("Configuration context: `query_cache`: %s", query_cache)
    logging.info("Configuration context: `input_path_base`: %s", input_path_base)
    logging.info("Configuration context: `input_path`: %s", input_path)
    logging.info("Configuration context: `cache_path_base`: %s", cache_path_base)
    logging.info("Configuration context: `cache_path`: %s", cache_path)

    # Result of the `convert` task. Defined up front, so that `combine`
    # without `convert` reports "no data" instead of failing on an undefined
    # name.
    dx = None

    # query
    if "query" in query_tasks:
        logging.info("Task group `query`")
        meta_response, response_dict, meta_stations = fieldclimate_raw(apiConf)
        # NOTE(review): the tasks below re-read the archive written by the
        # query instead of using this list (which lacks the file name).
        query_list = [tuple([meta_response, response_dict, meta_stations])]
    elif query_latest:
        # delay the processing, until a schedule alignment solution has been found
        time.sleep(60)

    # cleanup data protection strings from raw responses
    if "clean" in query_tasks:
        logging.info("Task group `clean`")

        query_list = import_fieldclimate_query_RAW(
            input_file=cache_file,
            input_path=os.path.join(
                cache_path_base, input_path if input_path else cache_path
            ),
            input_dict=input_cache_subset,
        )

        # export the first archive for which the export succeeds
        dc = None
        for cache_filename, meta_response, response_dict, meta_stations in query_list:
            if not dc:
                dc = fieldclimate_raw_filtered(
                    cache_filename, meta_response, response_dict, meta_deployment
                )

    # convert
    if "convert" in query_tasks:
        logging.info("Task group `convert`")

        query_list = import_fieldclimate_query_RAW(
            input_file=cache_file,
            input_path=os.path.join(
                cache_path_base, input_path if input_path else cache_path
            ),
            input_dict=input_cache_subset,
        )

        # convert the first archive that yields any dataset
        for cache_filename, meta_response, response_dict, meta_stations in query_list:
            if not dx:
                dx = fieldclimate_l0(response_dict, meta_stations, meta_deployment)

    # combine
    if "combine" in query_tasks:
        logging.info("Task group `combine`")
        if dx:
            ddx = fieldclimate_l0_combine(dx)
        else:
            logging.warning("Warning: No data to process.")

    logging.info("End.")