# Introduction

Created on Sat Jul 23 15:11:15 2022

Modified on Fri Nov 25 13:37:01 2022
v1.0.2 - code clean-up

@author: zeeman-m

In [3]:
# Other requirements
import glob
import json
import os
import re
import shutil
import sys
import time
from collections import OrderedDict
from datetime import datetime, timedelta
from pathlib import Path

import diskcache
import numpy as np
import pandas as pd
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import toml
import xarray as xr
from plotly.subplots import make_subplots
from tqdm.notebook import tqdm

# Global xarray option: keep attributes (`attrs`) on the results of operations.
xr.set_options(keep_attrs=True)

<xarray.core.options.set_options at 0x10e25e4d0>

In [2]:
def rebase_path(path_base="urbisphere-dm", path_root=None):
    """Return `path_root` cut off at its first component named `path_base`.

    Parameters
    ----------
    path_base : str
        Directory name at which the path is truncated (the component itself
        is kept). If it does not occur in the path, the path is returned whole.
    path_root : str or path-like, optional
        Path to truncate. When empty/None, the resolved parent of the relative
        path ``"__file__"`` is used, i.e. the current working directory.

    Returns
    -------
    str
        The truncated path.
    """
    from pathlib import Path

    if not path_root:
        # "__file__" is a plain string here (not the module attribute), so its
        # parent is "." and resolve() yields the current working directory.
        path_root = Path("__file__").parent.resolve()

    components = Path(path_root).parts
    if path_base in components:
        components = components[: components.index(path_base) + 1]

    return str(Path(*components))


# Extend the module search path with the metadb notebooks directory located
# below the path returned by `rebase_path()`.
sys.path.append(os.path.join(rebase_path(), "interfaces/metadb/notebooks/"))
# Import query helpers from the `metadb_query` notebook via `ipynb.fs.full`
# (presumably the `ipynb` package's notebook importer; confirm it is installed).
from ipynb.fs.full.metadb_query import metadb_query as metadb_query_subset_table
from ipynb.fs.full.metadb_query import metadb_sql_query as metadb_sql_query
from ipynb.fs.full.metadb_query import metadb_sql_response as metadb_sql_response

In [None]:
class StopExecution(Exception):
    """Exception whose `_render_traceback_` hook produces nothing.

    NOTE(review): presumably raised to stop a notebook cell without a
    traceback being displayed; it is not raised anywhere in the visible code.
    """

    def _render_traceback_(self):
        """Return None, i.e. provide no traceback text."""
        return None


def get_dictlist_permutations(input_subset):
    """Expand a dict of (strs|lists) into all possible permutations.

    Every value that is not a list is treated as a single-element list; the
    result contains one dict per element of the Cartesian product of all
    value lists, in `itertools.product` order.

    Parameters
    ----------
    input_subset : dict
        Mapping of keys to either a single value or a list of values.

    Returns
    -------
    list of dict
        One dict per combination. An empty `input_subset` yields ``[{}]``
        (a single combination without any keys) instead of raising.
    """
    import itertools

    keys = list(input_subset)
    value_lists = [v if isinstance(v, list) else [v] for v in input_subset.values()]
    return [dict(zip(keys, combo)) for combo in itertools.product(*value_lists)]


def get_dictlist_flatten(input_subset, joinstr="+"):
    """Flatten dict of (strs|lists) into dict of (strs).

    Parameters
    ----------
    input_subset : dict
        Mapping whose values are either single values or lists of strings.
    joinstr : str
        Separator used to join list values.

    Returns
    -------
    dict
        Same keys; list values are joined with `joinstr`, all other values are
        passed through unchanged. An empty input yields an empty dict.
    """
    return {
        key: joinstr.join(value) if isinstance(value, list) else value
        for key, value in input_subset.items()
    }

In [None]:
def parse_config(ioconf_file, ioconf_name=None, version_dict=None):
    """Read configuration file and extract dict that matches version['id'].

    Parameters
    ----------
    ioconf_file : str or path-like
        Path of the TOML configuration file.
    ioconf_name : str, optional
        Name of the top-level group to look up. Defaults to the stem of
        `ioconf_file`.
    version_dict : dict, optional
        Dict with an ``"id"`` entry. When given, the entries of the selected
        group whose ``version.id`` is a prefix of ``version_dict["id"]`` are
        merged (entries without a ``version`` table match ids starting with
        ``"v"``).

    Returns
    -------
    dict or None
        Without `version_dict`: the parsed file content, or None when the file
        does not exist. With `version_dict`: the merged settings, or an empty
        dict when the file, the group or a matching entry is missing.
    """

    def read_config(path):
        """Parse the TOML file at `path`; return None when it does not exist."""
        if not os.path.exists(path):
            return None
        import toml

        with open(path) as f:
            return toml.load(f)

    # read TOML config file (use the function arguments, not same-named globals)
    ioconf = read_config(ioconf_file)

    if not isinstance(version_dict, dict):
        return ioconf

    # lookup of settings
    group = ioconf_name if ioconf_name else Path(ioconf_file).stem
    if not ioconf or group not in ioconf:
        return {}

    from mergedeep import merge

    conf_list = [
        d
        for d in ioconf[group]
        if version_dict["id"].startswith(d["version"]["id"] if "version" in d else "v")
    ]
    # Merge into a fresh dict: works for an empty list and leaves the parsed
    # configuration entries unmodified.
    return merge({}, *conf_list)

In [None]:
def get_gattrs(gattrs, data_subset):
    """Build the global attributes for one station/system.

    Parameters
    ----------
    gattrs : list of dict
        ``gattrs[0]`` holds the default attributes. Every further entry is an
        override: it may carry ``station_id`` and/or ``system_id`` lists of
        regular expressions; when any pattern matches the current id (tested
        with `re.match`), the entry's values replace the defaults for keys
        that exist in the defaults.
    data_subset : dict
        Must provide ``["station"]["station_id"]`` and
        ``["system"]["system_id"]``.

    Returns
    -------
    dict
        Attributes with list values joined by ``"; "`` and the placeholders
        ``{version_id}``, ``{version_date}`` and ``{creation_time}`` filled in
        for string values. Reads the module-level `version` dict.
    """
    # Work on a copy: overrides matched for one station must not be written
    # back into the shared defaults and leak into subsequent calls.
    ga = dict(gattrs[0])
    ids = {
        "station_id": data_subset["station"]["station_id"],
        "system_id": data_subset["system"]["system_id"],
    }

    for override in gattrs[1:]:
        matched = any(
            key in override
            and any(re.compile(pattern).match(ids[key]) for pattern in override[key])
            for key in ids
        )
        if matched:
            for name, value in override.items():
                if name in ga:
                    ga[name] = value

    placeholders = {
        "version_id": version["id"],
        "version_date": version["time"],
        "creation_time": get_creation_time(),
    }
    ga = {k: "; ".join(v) if isinstance(v, list) else v for k, v in ga.items()}
    # Only strings can carry placeholders; other value types pass through.
    return {
        k: v.format(**placeholders) if isinstance(v, str) else v
        for k, v in ga.items()
    }

In [None]:
def input_args(query_index):
    """Query the meta data DB and select the systems to process.

    Reads the module-level `meta_data_subset` and prints the available and the
    selected entries as markdown tables.

    Parameters
    ----------
    query_index : str, int, list of int or None
        A ``system_id`` string (all rows with that id are selected), a
        positional row index, a list of positional row indices, or None to
        keep all rows.

    Returns
    -------
    tuple
        ``(query_index, input_stations, metadb)`` where `query_index` is the
        list of selected row positions (or None), `input_stations` is a list
        of dicts with ``system_name``, ``system_id`` and ``sensor_id``, and
        `metadb` is the selected part of the meta data table.

    Raises
    ------
    SystemExit
        When `query_index` does not match any row of the table.
    """

    def id_table(df):
        """Render the `id` columns of `df` as a markdown table keyed by station."""
        return (
            df.loc[:, ("id", slice(None))]
            .loc[:, "id"]
            .sort_values(["station_id", "sensor_id"])
            .set_index("station_id")
            .to_markdown()
        )

    metadb = metadb_query_subset_table(meta_data_subset)
    print("Available:")
    print(id_table(metadb))

    # evaluate index
    metadb.index = pd.Index(metadb.groupby(("id", "system_id")).ngroup())

    if isinstance(query_index, str):
        # translate a system_id into the positions of all matching rows
        query_index = (
            metadb.reset_index()
            .index[metadb["id"]["system_id"] == query_index]
            .tolist()
        )
        if not query_index:
            print("`query_index` outside scope of Meta Data DB table")
            sys.exit()
    elif isinstance(query_index, int):
        # valid positions are -n .. n-1 (negative values count from the end)
        n_rows = metadb.shape[0]
        if not -n_rows <= query_index < n_rows:
            print("`query_index` outside scope of Meta Data DB table")
            sys.exit()

    # subset
    if query_index is not None:
        if not isinstance(query_index, list):
            query_index = [query_index]
        # `query_index` is already a flat list of row positions here
        metadb = metadb.iloc[query_index, :]

    print("\nSelection:")
    print(id_table(metadb))

    # input stations
    id_columns = [
        ("id", "system_name"),
        ("id", "system_id"),
        ("id", "sensor_id"),
    ]
    input_stations = [
        {
            "system_name": record[("id", "system_name")],
            "system_id": record[("id", "system_id")],
            "sensor_id": record[("id", "sensor_id")],
        }
        for record in metadb.loc[:, id_columns].to_dict(orient="records")
    ]

    return query_index, input_stations, metadb

In [None]:
def get_input_files(input_subset, data_subset, debug=False):
    """Collect the existing input files for one system, grouped by file type.

    File name templates are built from the module-level `input_path_base`,
    `input_path` and `input_file` settings, formatted with every permutation
    of `input_subset` plus ``data_subset["system"]``, and expanded with
    `glob`. Only templates whose type is listed in the module-level
    `query_tasks` are used.

    A file is kept when its path contains ``/dupes/by-upload-date/``, or when
    the date found in its name (``_YYYY-MM-DD.ext``) or, failing that, in its
    path (``/YYYY/DDD/``) lies within ``data_subset["time_range"]``.

    Parameters
    ----------
    input_subset : dict
        Dict of (strs|lists) used to fill the path templates.
    data_subset : dict
        Must provide ``"system"`` (dict of template values) and
        ``"time_range"`` (slice with `start` and `stop`).
    debug : bool
        When True, also return the intermediate lists.

    Returns
    -------
    dict or tuple
        ``{type: sorted list of files}``; with `debug` the tuple
        ``(input_files, fn_list, fns_list)``.
    """
    input_files = [
        {
            "type": n["file"]["type"],
            "path": os.path.join(n["path_base"], n["path"], n["file"]["path"]),
            "file": n["file"]["file"],
        }
        for n in get_dictlist_permutations(
            dict(
                path_base=(
                    input_path_base
                    if isinstance(input_path_base, list)
                    else [input_path_base]
                ),
                path=(input_path if isinstance(input_path, list) else [input_path]),
                file=input_file,
            )
        )
    ]

    # expand input_files
    fn_list = [
        {
            "type": input_file["type"],
            "file": os.path.join(input_file["path"], input_file["file"]).format(
                **{**ss, **data_subset["system"]}
            ),
        }
        for input_file in input_files
        for ss in get_dictlist_permutations(input_subset)
        if input_file["type"] in query_tasks
    ]

    # Raw strings avoid invalid escape sequences; compiled once, outside the loop.
    dupes_pat = re.compile(r"\/(dupes\/by-upload-date)+\/")
    name_date_pat = re.compile(r".*?_(\d{4}-\d{2}-\d{2})+\.\w{3}$")
    path_date_pat = re.compile(r"\/(\d{4}\/\d{3})+\/")

    def validate_filepath(fi):
        """Return True when `fi` is a dupes path or is dated within the time range."""
        # files below a dupes path are always accepted
        if dupes_pat.findall(fi):
            return True

        # date from the file name first, from the path (year/day-of-year) otherwise
        found = name_date_pat.findall(fi)
        t_fmt = "%Y-%m-%d"
        if not found:
            found = path_date_pat.findall(fi)
            t_fmt = "%Y/%j"
        if not found:
            return False

        fd = pd.to_datetime(found[0], format=t_fmt)
        time_range = data_subset["time_range"]
        return bool(fd >= time_range.start and fd <= time_range.stop)

    fns_list = {}
    for fn in fn_list:
        for fi in glob.glob(fn["file"]):
            try:
                valid = validate_filepath(fi)
            except Exception:
                # best effort: e.g. a date-like token that is not a valid date
                logging.debug("Skipped file %s", fi)
                continue
            if valid:
                fns_list.setdefault(fn["type"], []).append(fi)

    # sort once per type instead of re-sorting on every insert
    for files in fns_list.values():
        files.sort()

    if debug:
        return input_files, fn_list, fns_list
    return fns_list

In [None]:
# get_input_files(input_subset, data_subset, debug=True)

In [None]:
def get_toa5_header(filename, n_header=4):
    """Read the first `n_header` lines of a TOA5 file as lists of fields.

    The lines are parsed with the `csv` module so that commas inside quoted
    fields do not split a field; surrounding whitespace of each line and any
    remaining double quotes around a field are stripped.

    Parameters
    ----------
    filename : str or path-like
        File to read.
    n_header : int
        Number of header lines.

    Returns
    -------
    list of list of str
        One list of fields per header line. A file with fewer than `n_header`
        lines yields correspondingly fewer entries.
    """
    import csv
    from itertools import islice

    with open(filename) as txtfile:
        lines = [line.strip() for line in islice(txtfile, n_header)]

    return [[field.strip('"') for field in row] for row in csv.reader(lines)]


def get_toa5_data(filename, header, n_header=4):
    """Read the data records of a TOA5 (cs logger) file into a DataFrame.

    Column types are derived from the units row ``header[2]``:
    ``"TS"`` columns are parsed as timestamps, ``"RN"`` columns as integers,
    all other columns as numbers (except ``"SYS_CV50_Meta"``, which is left
    as read). Values that cannot be converted become NaT/NaN, and rows with a
    missing value in a ``"TS"`` or ``"RN"`` column are dropped.

    Parameters
    ----------
    filename : str or path-like
        File to read.
    header : list of list of str
        Header as returned by `get_toa5_header`; ``header[1]`` holds the
        column names and ``header[2]`` the units.
    n_header : int
        Number of header lines to skip.

    Returns
    -------
    pandas.DataFrame
    """
    col_names = header[1]
    data = pd.read_csv(
        filename,
        skiprows=n_header,
        engine="python",
        on_bad_lines="skip",  # from pandas 1.3
        names=col_names,
        na_values=["NAN"],
    )

    # dtype coercion; `critical` collects the columns that must be valid
    critical = []
    for n, k in enumerate(col_names):
        unit = header[2][n]
        if unit == "TS":
            # time
            data[k] = pd.to_datetime(
                data[k], origin="unix", errors="coerce", format="ISO8601"
            )
            critical.append(k)
        elif unit == "RN":
            # integer
            data[k] = pd.to_numeric(data[k], downcast="integer", errors="coerce")
            critical.append(k)
        elif k not in ["SYS_CV50_Meta"]:
            # numeric values; TS/RN columns are excluded by the units row so
            # their timestamp/integer conversion is not overwritten
            data[k] = pd.to_numeric(data[k], errors="coerce")

    # filter for valid values on selected (critical) columns
    data = data.dropna(axis=0, subset=critical, how="any")

    return data


def get_toa5_metadata_attributes(header):
    """Derive metadata attributes from the four TOA5 header lines.

    Parameters
    ----------
    header : list of list of str
        ``header[0]`` is the first header line (its 4th field is used as
        ``system_id``, its last field as ``channel_id``); ``header[1]`` to
        ``header[3]`` are stored as cell names, units and methods.

    Returns
    -------
    OrderedDict
        Keys ``system_id``, ``system_desc``, ``channel_id``, ``cell_name``,
        ``cell_units``, ``cell_methods`` and ``data_vars_1`` (one dict with
        ``name``, ``units`` and ``original_name`` per column; both names are
        the whitespace-stripped column name).
    """
    first_line, names, units, methods = header[0], header[1], header[2], header[3]

    data_vars = []
    for raw_name, unit in zip(names, units):
        clean_name = raw_name.strip()
        data_vars.append(
            {"name": clean_name, "units": unit, "original_name": clean_name}
        )

    res = OrderedDict()
    res["system_id"] = first_line[3]
    res["system_desc"] = first_line
    res["channel_id"] = first_line[-1]
    res["cell_name"] = names
    res["cell_units"] = units
    res["cell_methods"] = methods
    res["data_vars_1"] = data_vars
    return res


def get_toa5_cell_methods(aggr, cell_dim="time"):
    """Translate an aggregation label into cell-method attributes.

    Parameters
    ----------
    aggr : str
        Either one of the method names accepted as-is (e.g. ``"mean"``), one
        of the abbreviations in the lookup table below (e.g. ``"Avg"``), or
        ``"point"`` / ``"last"``.
    cell_dim : str
        Dimension name used as prefix.

    Returns
    -------
    OrderedDict
        ``cell_methods`` (``"<cell_dim>: <method>"``) and
        ``cell_bounds_domain`` (``{"<cell_dim>_bounds": [lower, upper]}``);
        empty when `aggr` is not recognised.
    """
    direct_methods = [
        "sum",
        "mean",
        "maximum",
        "minimum",
        "mid_range",
        "standard_deviation",
        "variance",
        "mode",
        "median",
    ]
    abbreviations = {
        "Max": "maximum",
        "Min": "minimum",
        "Avg": "mean",
        "Std": "standard_deviation",
        "Vec": "mean",
        "Tot": "sum",
        "latest": "last",
    }

    if aggr in direct_methods:
        method, bounds = aggr, [0, 1]
    elif aggr in abbreviations.keys():
        method, bounds = abbreviations[aggr], [0, 1]
    elif aggr in ["point"]:
        method, bounds = aggr, [0, 0]
    elif aggr in ["last"]:
        # "last" is expressed as a point value at the upper bound
        method, bounds = "point", [1, 1]
    else:
        return OrderedDict()

    return OrderedDict(
        cell_methods="{}: {}".format(cell_dim, method),
        cell_bounds_domain={"{}_{}".format(cell_dim, "bounds"): bounds},
    )


def get_toa5_query(filename):
    """Read header, derived attributes and data of one TOA5 file.

    The data records are only read when the first header field is ``"TOA5"``
    and the first column is named ``"TIMESTAMP"``; otherwise ``"data"`` is
    None.

    Returns
    -------
    dict
        Keys ``"header"``, ``"attributes"`` and ``"data"``.
    """
    header = get_toa5_header(filename)
    attributes = get_toa5_metadata_attributes(header)

    is_toa5_table = (header[0][0] == "TOA5") and (header[1][0] == "TIMESTAMP")
    data = get_toa5_data(filename, header) if is_toa5_table else None

    return dict(header=header, attributes=attributes, data=data)

In [None]:
def get_toa5_datasets(
    metadb, data, header, attributes
):  # , responses, meta_stations, meta_locations
    """Convert one parsed TOA5 table into an xarray Dataset.

    Parameters
    ----------
    metadb : pandas.DataFrame
        Meta data rows of one station/system with two-level columns; the
        groups ``id``, ``station``, ``system``, ``sensor`` and
        ``configuration`` are read.
    data : pandas.DataFrame
        Table as returned by `get_toa5_data`; must contain ``"TIMESTAMP"``.
    header : list
        Parsed header lines (accepted for call compatibility, not used).
    attributes : dict
        Result of `get_toa5_metadata_attributes`.

    Returns
    -------
    xarray.Dataset
        One data variable per entry of ``attributes["data_vars_1"]`` on the
        dimensions (station, system, channel, time), plus identifier and
        attribute coordinates. Global attributes come from
        `get_global_attributes`.
    """

    def evaluate_attributes(d):
        """Encoding of list and dicts to json."""
        import json

        res = {
            "attributes_id": [],
            "attributes_key": [],
            "attributes_type": [],
            "attributes_val": [],
        }
        for group, items in d.items():
            for key, value in items.items():
                res["attributes_id"].append(group)
                res["attributes_key"].append(key)
                res["attributes_type"].append(str(type(value).__name__))
                if isinstance(value, (list, dict)):
                    # NOTE: iterating a dict yields its keys, so only the keys
                    # of dict values are encoded.
                    value = [
                        n if isinstance(n, (float, int)) else str(n) for n in value
                    ]
                    encoded = json.dumps(value)
                else:
                    encoded = json.dumps(str(value))

                res["attributes_val"].append(encoded)
        return res

    def unexplode(df):
        """Compress columns into single rows"""
        columns = {}
        for col in list(df):
            values = df[col].tolist()
            unique = list(set(values))
            # differing values are kept together as one list-valued cell
            columns[col] = [values] if len(unique) > 1 else unique
        return pd.DataFrame(columns)

    def get_data_vars(k_dv, d_dv, l_dl, a_dv=None, dim_names=("time",)):
        """Build the (name, (dims, values, attrs)) entry of one data variable."""
        var_name = d_dv["name"].replace(" ", "_")

        # Look the column up by name; column names containing spaces differ
        # from `var_name`, so fall back to the column position.
        column = l_dl[var_name] if var_name in l_dl else l_dl.iloc[:, k_dv]
        var = column.values.tolist()

        # stack 1-dimensions
        for _ in range(len(dim_names) - 1):
            var = [var]

        return (
            var_name,
            (
                list(dim_names),
                var,
                {"long_name": d_dv["name"], "units": d_dv["units"], **(a_dv or {})},
            ),
        )

    res_meta = unexplode(metadb).iloc[0]
    res_refs = get_global_reference()
    res_glob = get_global_attributes()

    # attributes
    dattr_system = OrderedDict(
        {
            **res_meta["system"].to_dict(),
            **{k: v for k, v in attributes.items() if not k.startswith("data_var")},
        }
    )
    dattr = OrderedDict(
        [
            ("global", OrderedDict(res_refs)),
            ("station", OrderedDict(res_meta["station"].to_dict())),
            ("system", dattr_system),
            ("sensor", OrderedDict(res_meta["sensor"].to_dict())),
            ("channel", OrderedDict()),
            ("cell", OrderedDict(res_meta["configuration"].to_dict())),
        ]
    )

    # flatten attributes, convert values to json
    ddattr = evaluate_attributes(dattr)

    attributes_values = (
        "attributes_values",
        (
            ["station", "system", "sensor", "channel", "cell", "attributes"],
            # one length-1 axis per leading dimension around the value list
            [[[[[ddattr["attributes_val"]] * 1]]]],
            {
                "description": "Attributes combined from meta data and input-file headers."
            },
        ),
    )

    coord_time = pd.to_datetime(data["TIMESTAMP"])
    coord_cell_methods = [
        n["cell_methods"] if "cell_methods" in n.keys() else ""
        for n in [get_toa5_cell_methods(m) for m in attributes["cell_methods"]]
    ]

    # Data variables
    data_vars = []
    for k_dv, d_dv in enumerate(attributes["data_vars_1"]):
        data_var = get_data_vars(
            k_dv,
            d_dv,
            data,
            {"cell_methods": coord_cell_methods[k_dv]},
            dim_names=["station", "system", "channel", "time"],
        )
        data_vars.append(data_var)

    coords = [
        (
            "time",
            (
                ["time"],
                coord_time,
                {
                    "long_name": "time",
                    "standard_name": "time",
                    # "calendar": "proleptic_gregorian",  # xarray error
                    # "units": "microseconds since 1970-01-01 00:00:00 +0000", # xarray error
                },
            ),
        ),
        (
            "station_id",
            (["station"], [res_meta[("id", "station_id")]], OrderedDict()),
        ),
        ("station_name", (["station"], [res_meta[("id", "station_name")]], {})),
        ("station_lat", (["station"], [float(res_meta[("id", "station_lat")])], {})),
        ("station_lon", (["station"], [float(res_meta[("id", "station_lon")])], {})),
        (
            "station_height",
            (["station"], [float(res_meta[("id", "station_height")])], {}),
        ),
        ("system_id", (["system"], [attributes["system_id"]], {})),
        ("system_name", (["system"], [str(res_meta[("id", "system_name")])], {})),
        ("system_group", (["system"], [str("AWS")], {})),
        ("sensor_id", (["sensor"], [""], {})),
        ("channel_id", (["channel"], [attributes["channel_id"]], {})),
        ("attributes_id", (["attributes"], ddattr["attributes_key"], {})),
        ("attributes_group", (["attributes"], ddattr["attributes_id"], {})),
        ("attributes_type", (["attributes"], ddattr["attributes_type"], {})),
    ]

    # attribute values are stored as a coordinate
    coords.append(attributes_values)

    # Dataset (xarray)
    ds = xr.Dataset(
        data_vars=OrderedDict(data_vars),
        coords=OrderedDict(coords),
        attrs=res_glob,
    )

    # Dimensions sort order (xarray)
    ds = ds.transpose(
        "station",
        "system",
        "sensor",
        "channel",
        "time",
        "cell",
        "attributes",
        missing_dims="ignore",
    )

    return ds

In [None]:
def get_datasets_groups(
    dx, group_keys=["station_id", "system_id", "channel_id", "attributes_id"]
):
    """Summarise datasets and group their positions by identifying values.

    Parameters
    ----------
    dx : list of xarray.Dataset
        Datasets as produced by `get_toa5_datasets`.
    group_keys : list of str
        Summary entries that form the group key.

    Returns
    -------
    tuple
        ``(dsi, dsg)``: `dsi` is a list with one OrderedDict per dataset
        (position, station/system/channel ids, the ``system_desc`` attribute
        value and the variable names); `dsg` maps a tuple of ``(key, value)``
        pairs to the list of dataset positions sharing these values.
    """
    summaries = []
    for position, ds in enumerate(dx):
        entry = OrderedDict()
        entry["index"] = position
        entry["station_id"] = ds["station_id"].values.tolist()
        entry["system_id"] = ds["system_id"].values.tolist()
        entry["channel_id"] = ds["channel_id"].values.tolist()
        # value of the attribute whose id is "system_desc"
        system_desc = ds["attributes_values"].sel(
            attributes=ds["attributes_id"] == ["system_desc"]
        )
        entry["attributes_id"] = [system_desc.squeeze().values.tolist()]
        entry["variables"] = list(ds.keys())
        summaries.append(entry)

    groups = {}
    for entry in summaries:
        group_key = tuple(
            (name, item)
            for name, values in entry.items()
            if name in group_keys
            for item in values
        )
        groups.setdefault(group_key, []).append(entry["index"])

    return (summaries, groups)

In [None]:
def get_creation_time(d=None):
    """Format a timestamp as creation-time string.

    Parameters
    ----------
    d : datetime.datetime, optional
        Time to format; it is written with a literal ``+0000`` / ``GMT``, so
        it is expected to be in UTC. Defaults to the current UTC time,
        evaluated at call time (not once at definition time).

    Returns
    -------
    str
        E.g. ``"[20220723T151115+0000] Sat, 23 Jul 2022 15:11:15 GMT"``.
    """
    if d is None:
        from datetime import timezone

        d = datetime.now(timezone.utc)

    d_pat = "[%Y%m%dT%H%M%S+0000] %a, %d %b %Y %H:%M:%S GMT"
    return d.strftime(d_pat)


def get_time_bounds(query_range):
    tr = query_range.strftime("%Y%m%dT%H%M%S%z").tolist()
    res = tr[0] if tr[0] == tr[1] else "{}_{}".format(*tr)
    return res


def get_global_attributes():
    """Return the global attributes for the current module-level settings.

    Thin wrapper: calls `get_gattrs` with the module-level `gattrs` and
    `data_subset`.
    """
    return get_gattrs(gattrs, data_subset)


def get_global_reference():
    """Return the fixed reference (origin) definitions as an OrderedDict.

    A new dict is created on every call.
    """
    reference_items = [
        ("origin_time", "microseconds since 1970-01-01 00:00:00 +0000"),
        ("origin_lon", 0.0),
        ("origin_lat", 0.0),
        ("origin_utm_x", 0.0),
        ("origin_utm_y", 0.0),
        ("origin_x", 0.0),
        ("origin_y", 0.0),
        ("origin_z", 0.0),  # or "station: h"
        # or "meters above Normaal Amsterdams Peil"
        ("origin_h", "meters above mean sea level"),
        ("origin_azimuth", 0.0),
    ]
    return OrderedDict(reference_items)

In [None]:
def export_datasets(
    dx,
    output_file,
    encoding=None,
    mode="w",
    reduced_size=False,
    verbose=False,
):
    """Write an xarray Dataset to a netCDF file (best effort).

    Nothing is done when `output_file` is empty or `dx` is not an
    `xarray.Dataset`. Missing parent directories are created. A failure to
    write is logged (with traceback) and not raised.

    Parameters
    ----------
    dx : xarray.Dataset
        Dataset to write.
    output_file : str or path-like
        Target file.
    encoding : dict, optional
        Passed to `Dataset.to_netcdf`; defaults to an empty dict.
    mode : str
        Passed to `Dataset.to_netcdf`.
    reduced_size, verbose : bool
        Accepted for call compatibility; currently not used.
    """
    from pathlib import Path

    if not output_file or not isinstance(dx, xr.Dataset):
        return

    fn = Path(output_file)
    try:
        fn.parent.mkdir(parents=True, exist_ok=True)
        dx.to_netcdf(output_file, encoding=encoding or {}, mode=mode)
    except Exception:
        # best effort: keep processing the remaining datasets
        logging.warning("Warning: File '%s' was not written.", fn, exc_info=True)
    else:
        logging.info("File '%s' was written.", fn)

In [None]:
def get_datasets_attrgroups(dsg):
    """Tabulate dataset groups, indexed by the leading system description fields.

    Parameters
    ----------
    dsg : dict
        Group mapping as returned by `get_datasets_groups`: keys are tuples of
        ``(name, value)`` pairs (including a JSON-encoded ``attributes_id``),
        values are lists of dataset positions.

    Returns
    -------
    pandas.DataFrame
        One row per group with the key columns and ``dataset_id``;
        ``attributes_id`` holds the first seven decoded entries, which also
        form the (sorted) MultiIndex.
    """

    def leading_fields(encoded):
        """Decode the JSON list and keep its first seven entries."""
        return json.loads(encoded)[0:7]

    key_frame = pd.DataFrame([dict(group_key) for group_key in dsg.keys()])
    member_frame = pd.DataFrame([{"dataset_id": members} for members in dsg.values()])

    table = pd.concat([key_frame, member_frame], axis=1)
    table["attributes_id"] = table["attributes_id"].map(leading_fields)
    table = table.reset_index()
    table.index = pd.MultiIndex.from_tuples(table["attributes_id"].values.tolist())
    return table.sort_index()


def get_merged_attrgroups(dc, dxg):
    """Combine the datasets of one attributes group into a single dataset.

    For each of ``channel_id``, ``system_id`` and ``station_id`` that takes
    more than one value in `dc`, the id variable is set as index of its
    dimension (name before the first underscore) so that
    `xarray.combine_by_coords` can align the datasets along it; afterwards
    the index is reset and renamed back to the id variable.

    Parameters
    ----------
    dc : pandas.DataFrame
        Rows of the group table with the three id columns.
    dxg : list of xarray.Dataset
        Datasets to combine.

    Returns
    -------
    xarray.Dataset
    """
    id_columns = ["channel_id", "system_id", "station_id"]
    id_values = dc.loc[:, id_columns].reset_index(drop=True).to_dict(orient="list")

    # dimension name -> id variable, for ids that differ within the group
    dim_lut = {}
    for column, values in id_values.items():
        if len(list(set(values))) > 1:
            dim_lut[column.split("_")[0]] = column

    indexed = [dataset.set_index(**dim_lut) for dataset in dxg]
    combined = xr.combine_by_coords(indexed)

    return combined.reset_index(list(dim_lut.keys())).rename_vars(dim_lut)


def export_merged_attrgroups(dxi, nc_mode="w"):
    """Export a merged dataset to netCDF and copy the log file next to it.

    The time bounds used for the output file name are the data's first/last
    time stamp, limited to the module-level `query_range` where the data
    extend beyond it. The log file (module-level `log_path` / `log_file`) is
    copied to the output file name with suffix ``.log``; a failing copy is
    logged and not raised.

    Parameters
    ----------
    dxi : xarray.Dataset
        Dataset with a ``time`` coordinate.
    nc_mode : str
        Write mode passed on to `export_datasets`.
    """
    q_start, q_stop = query_range[0], query_range[1]
    time_bounds = pd.Index(
        [
            q_start
            if bool((dxi["time"] < q_start).any().values)
            else pd.to_datetime(dxi["time"].min().values, format="ISO8601"),
            q_stop
            if bool((dxi["time"] > q_stop).any().values)
            else pd.to_datetime(dxi["time"].max().values, format="ISO8601"),
        ]
    )
    fn, _ = get_output_files(
        dxi,
        custom_subset={"time_bounds": get_time_bounds(time_bounds)},
    )

    export_datasets(
        dxi,
        encoding={},
        mode=nc_mode,
        output_file=fn,
        reduced_size=True,
        verbose=True,
    )

    # best effort: a missing log file must not abort the export
    try:
        fn_log_src = Path(os.path.join(log_path, log_file))
        fn_log_dst = Path(fn).with_suffix(".log")
        shutil.copy(fn_log_src, fn_log_dst)
    except Exception:
        logging.info("Log file not saved.", exc_info=True)
    else:
        logging.info("Log file saved.")
        logging.info("File '%s' was written.", str(fn_log_dst))


def get_output_files(dx, default_subset={}, custom_subset={}):
    """Build the output file name for a dataset.

    The name template (module-level `output_files` ``path`` + ``file``) is
    filled from the flattened module-level `input_subset`, the dataset's
    station/system ids, the module-level `version` id and the queried time
    span of the module-level `data_subset`; `default_subset` and then
    `custom_subset` override these values.

    Parameters
    ----------
    dx : xarray.Dataset
        Dataset providing ``station_id`` and ``system_id``.
    default_subset, custom_subset : dict
        Extra template values (not modified).

    Returns
    -------
    tuple
        ``(output_file, output_subset)``: the formatted path and the values
        used to format it.
    """
    # dataset lookup
    station_ids = list(dx["station_id"].values)
    system_ids = list(dx["system_id"].values)
    time_span = data_subset["time"]

    # modify output subset dict, based on input settings
    output_subset = get_dictlist_flatten(input_subset)
    # the station id is only used when it is unambiguous
    output_subset["station_id"] = "".join(station_ids) if len(station_ids) == 1 else ""
    output_subset["system_id"] = "+".join(system_ids)
    output_subset["version_id"] = version["id"]
    output_subset["time_bounds"] = "_".join(
        stamp.strftime("%Y%m%dT%H%M%S") for stamp in (time_span.start, time_span.stop)
    )
    output_subset = {**output_subset, **default_subset, **custom_subset}

    template = os.path.join(output_files["path"], output_files["file"])
    return template.format(**output_subset), output_subset

In [None]:
def main(
    metadb_df,
    combine_dims=("time",),
    export=True,
    verbose=True,
):
    """Convert the TOA5 files of every station/system to netCDF datasets.

    For each (station_id, system_id) group of `metadb_df` the matching input
    files are read, converted to datasets, grouped by their attributes,
    concatenated along time, limited to the module-level `query_range`,
    merged per group and exported. Files that cannot be read or converted are
    logged and skipped; a group without any usable file is skipped.

    Sets the module-level `data_subset` and updates ``ioconf["gattrs"]`` for
    each group.

    Parameters
    ----------
    metadb_df : pandas.DataFrame
        Meta data table with two-level columns (``id`` group required).
    combine_dims : sequence
        ``"time"`` enables the merged export; ``None`` additionally exports
        every single file's dataset (debugging).
    export : bool
        Write netCDF files.
    verbose : bool
        Passed to `export_datasets` for the per-file export.
    """
    global data_subset

    for gb_ind, metadb in metadb_df.groupby(
        [("id", "station_id"), ("id", "system_id")]
    ):
        logging.info("MetaDataDB `index`: %s", gb_ind)
        logging.info("MetaDataDB `station_id`: %s", metadb["id"]["station_id"].iloc[0])
        logging.info("MetaDataDB `system_id`: %s", metadb["id"]["system_id"].iloc[0])

        # data related helper dict:
        data_subset = {
            "station": {"station_id": metadb["id"]["station_id"].iloc[0]},
            "system": {
                k: metadb["id"][k].iloc[0]
                for k in ["system_name", "system_id", "sensor_id"]
            },
            "time": slice(*query_range),
            # one day of margin on both sides for the file selection
            "time_range": slice(*(query_range + pd.to_timedelta(["-1D", "1D"]))),
        }

        # update ioconfig
        ioconf["gattrs"] = get_gattrs(gattrs, data_subset)

        # read source file list
        p_list = get_input_files(input_subset, data_subset)
        if not p_list:
            continue

        # read files
        dx = []
        for ind, fns in p_list.items():
            for filename in tqdm(fns):
                filename = os.path.expanduser(filename)

                # read information from file
                try:
                    logging.info("Read input file '%s'.", filename)
                    data = get_toa5_query(filename)
                except Exception:
                    logging.warning(
                        "Warning: File '%s' could not be read.",
                        filename,
                        exc_info=True,
                    )
                    continue

                # convert information to xarray dataset (netcdf)
                try:
                    logging.info("Convert data to dataset.")
                    logging.info("Dataset `channel_id` '%s'.", ind)
                    ds = get_toa5_datasets(metadb, **data)
                except Exception:
                    logging.warning(
                        "Warning: File '%s' could not be converted to a dataset.",
                        filename,
                        exc_info=True,
                    )
                    continue

                # export, for debugging
                if export and (None in combine_dims):
                    # name the file after the dataset's own time span
                    fp, _ = get_output_files(
                        ds,
                        custom_subset={
                            "time_bounds": get_time_bounds(
                                pd.Index(
                                    [
                                        pd.to_datetime(ds["time"].min().values),
                                        pd.to_datetime(ds["time"].max().values),
                                    ]
                                )
                            )
                        },
                    )
                    export_datasets(
                        ds,
                        encoding={},
                        reduced_size=True,
                        output_file=fp,
                        verbose=verbose,
                    )

                dx.append(ds)
                logging.info("Dataset `dataset_id` '%s'.", str(len(dx)))

        if not dx:
            # nothing usable for this system; continue with the next one
            logging.info("Skipping empty datasets.")
            continue

        # define cache (not implemented)
        dxcache = dx

        # continue with groups of data, in case multiple tables were read
        dsi, dsg = get_datasets_groups(dxcache)
        dsc = get_datasets_attrgroups(dsg)

        if "time" not in combine_dims:
            continue

        for ic, dc in dsc.groupby(dsc.index):
            logging.info("Attributes group: %s", " ; ".join(ic))
            idc = dc["dataset_id"].values.tolist()
            logging.info("Attributes group `dataset_id`: %s", json.dumps(idc))

            dxg = []
            for idx in idc:
                # select variables (those of the first dataset of the group)
                vn = list(dx[idx[0]].keys())
                dxi = xr.concat(
                    [
                        dxcache[n]
                        .copy()
                        .drop_vars([x for x in dxcache[n].keys() if x not in vn])
                        for n in idx
                    ],
                    dim="time",
                )

                # evaluate and filter merge results (dim: time)
                dxi = dxi.sel(time=dxi["time"].notnull())  # NaT
                dxi = dxi.drop_duplicates("time", keep="first")

                if dxi["time"].shape[0] == 0:
                    logging.info("Skipping empty datasets.")
                    continue

                # slice to match query, in case of dupes
                dxi = dxi.sortby(["time"])
                dxi = dxi.sel(time=slice(query_range[0], query_range[1]))

                if dxi["time"].shape[0] == 0:
                    logging.info("Skipping empty datasets.")
                    continue

                dxg.append(dxi.copy())
                logging.info("Add variables: %s", " ; ".join(vn))

            if dxg:
                # merge group (e.g, by channel)
                dxm = get_merged_attrgroups(dc, dxg)

                # export xarray dataset to netcdf
                if export:
                    export_merged_attrgroups(dxm)

# Configuration
## Static Configuration

In [None]:
# history
# - "000_prerelease" (2022-07-25): prerelease.
# - "v1.0.0" (2022-10-10): first version.
# - "v1.0.1" (2022-10-11): current version (only this assignment is effective)
#   - [x] update system group, CR -> AWS
#   - [x] use of TOML configuration
version = {
    "id": "v1.0.1",
    "time": "2022-10-11",
}

# Configuration file for input / output files
ioconfig_name = "cslogger_metadata"
try:
    import ipynbname

    # name the configuration file after the running notebook
    ioconfig_file = "{}.toml".format(ipynbname.name())
except Exception:
    # ipynbname missing or notebook name not available: use the default name
    ioconfig_file = "{}.toml".format(ioconfig_name)

# ----- Papermill injection below this cell -----

In [None]:
# input/output config: settings of group `ioconfig_name` in `ioconfig_file`
# that match the current `version` (see `parse_config`)
ioconf = parse_config(ioconfig_file, ioconfig_name, version)

# validate config (to do)

In [None]:
# ioconf

In [None]:
"""
Note: the approach to set global helper variables should be revised. 
But was/is used in combination with papermill automation.
"""

# set global variables (read as module-level names by the functions above)
# -- query: time span, system selection and tasks
query_from = ioconf["query"]["start"]
query_to = None  # derived from `query_from` + `query_period` further below
query_period = ioconf["query"]["period"]
# an empty string in the configuration means "no selection" (None)
query_index = (
    ioconf["query"]["system_index"] if ioconf["query"]["system_index"] != "" else None
)
query_city = ioconf["query"]["city"]
query_tasks = ioconf["query"]["tasks"]
query_cache = ioconf["query"]["cache"]
# -- input: template values, paths and file definitions
input_subset = ioconf["input"]["subset"]
input_path_base = ioconf["input"]["path_base"]
input_path = ioconf["input"]["path"]
input_file = ioconf["input"]["file"]
# -- output: path and file name template
output_path_base = ioconf["output"]["path_base"]
output_path = ioconf["output"]["path"]
output_file = ioconf["output"]["file"]

# -- logging: log file location and record format
log_path = ioconf["logging"]["path"]
log_file = ioconf["logging"]["file"]
log_format = ioconf["logging"]["format"]

# -- global attributes: list of defaults and overrides (see `get_gattrs`)
gattrs = ioconf["gattrs"]

In [None]:
# Notebook cell output: echo the configured `query_index` for inspection.
query_index

## Logging Configuration

In [None]:
# create logger: records go to the log file and to stdout, level INFO and up
import logging
import logging.handlers
from pprint import pformat

logging.basicConfig(
    # encoding="utf-8",
    format=log_format,
    level=logging.INFO,
    # Declare handlers
    handlers=[
        logging.FileHandler(
            os.path.join(log_path, log_file), "w"
        ),  # mode 'w': overwrite the log file instead of appending
        logging.StreamHandler(sys.stdout),
    ],
)

## Dynamic Configuration

In [None]:
# time related helper variables
# Without an explicit end, the end of the query window is start + period,
# stored as a "YYYY-MM-DD" string.
if query_period and not query_to:
    query_to = (
        pd.to_datetime(query_from, format="ISO8601")
        + (
            # a trailing "M" means calendar months; any other period string
            # is handed to pd.to_timedelta
            pd.DateOffset(months=int(query_period[:-1]))
            if query_period.endswith("M")
            else pd.to_timedelta(query_period)
        )
    ).strftime("%Y-%m-%d")
query_range = pd.to_datetime([query_from, query_to], format="ISO8601")

# metadb (online)
# NOTE(review): `meta_data_subset` is not passed to `input_args` explicitly;
# presumably it is read there as a global - confirm.
meta_data_subset = dict(
    station={"y_code": [query_city]},
    system={"i_description": ["CR logger", "Data logger"]},
    time={"start": query_from, "end": query_to},
)
query_index, metadb_systems, metadb = input_args(query_index)

# output files
output_files = dict(
    path=os.path.join(output_path_base, output_path),
    file=output_file,
)

In [None]:
# summarize
# Record which configuration file was selected and the full parsed
# configuration in the log, so a run can be traced back to its settings.
logging.info("`ioconf` file: %s", ioconfig_file)
logging.info(
    "`ioconf` dict:\n# start of item\n%s\n# end of item\n",
    # sort_dicts=False keeps the keys in the order they appear in `ioconf`
    pformat(ioconf, sort_dicts=False),
)

# MAIN

In [None]:
# Entry point of the notebook: run `main` (defined earlier in this file) on
# the `metadb` object returned by `input_args`, with export enabled and
# verbose output disabled.
# NOTE(review): confirm what `main` returns before relying on `dx`.
dx = main(metadb, export=True, verbose=False)

# DEV snippets

In [None]:
if False:
    # quick evaluation of TOA5 files
    # (development snippet: the `if False` guard keeps this helper from being
    # defined during a normal run)
    def read_cs_toa5(input_stations, input_files):
        """Read Campbell Scientific TOA5 data table files.

        Parameters
        ----------
        input_stations : iterable of dict
            One mapping per station. Must contain ``"system_id"``; the whole
            mapping is also passed to ``str.format`` to fill in the templates
            of `input_files`.
        input_files : iterable of dict
            One mapping per file group with the format templates
            ``"file_path"``, ``"file_name"`` (used as a glob pattern) and
            ``"file_type"``.

        Returns
        -------
        tuple of (dict, dict)
            ``(d_list, h_dict)``, both keyed by system id and then by file
            type. ``d_list`` holds one concatenated data table per file type,
            with the ``RECORD`` column renamed to ``<file_type>_RECORD``.
            ``h_dict`` holds the list of per-file header frames (for
            debugging). A file type for which no file produced a table is
            left out of ``d_list``.
        """
        # date stamp expected at the end of the file name (..._YYYY-MM-DD.csv);
        # raw string so that the regex escapes are not treated as (invalid)
        # string escapes
        t_pat = r".*?_(\d{4}-\d{2}-\d{2})+\.csv$"

        h_dict = {}
        d_dict = {}
        d_list = {}
        for station in input_stations:
            s_id = station["system_id"]
            h_dict[s_id] = {}
            d_dict[s_id] = {}
            d_list[s_id] = {}

            # read tables
            for filegroup in input_files:
                # local definitions
                fp = filegroup["file_path"].format(**station)
                fn = filegroup["file_name"].format(**station)
                ft = filegroup["file_type"].format(**station)

                # list files
                fi_list = sorted(glob.glob(os.path.join(fp, fn)))
                if not fi_list:
                    continue

                # read TOA5 files
                h_dict[s_id][ft] = []  # initialize list, header info, for DEBUG only
                d_dict[s_id][ft] = []  # initialize list, data table
                for fi in tqdm(fi_list):
                    # retrieve timestamp from the filename
                    # (not used yet; kept for the filter hook below)
                    t_str = re.findall(t_pat, fi)
                    fd = pd.to_datetime(t_str, format="%Y-%m-%d")

                    # MZ add: if needed, filter input files by timestamp here

                    # read TOA5 header information: the second line of the
                    # file becomes the column index, the next two lines become
                    # rows 0 and 1 (presumably units and processing type)
                    h_df = pd.read_csv(fi, skiprows=1, nrows=2)

                    # skip files without header information
                    if h_df.columns[0] != "TIMESTAMP":
                        continue

                    d_df = pd.read_csv(
                        fi,
                        skiprows=4,
                        engine="python",
                        # warn_bad_lines=False, # until pandas 1.3
                        # error_bad_lines=False, # until pandas 1.3
                        on_bad_lines="skip",  # from pandas 1.3
                        names=h_df.columns.to_list(),
                        na_values=["NAN"],
                    )

                    # MZ add: dtype coercion here
                    # Columns are replaced with `d_df[k] = ...` rather than
                    # `d_df.loc[:, k] = ...`: the latter writes into the
                    # existing column in place (pandas >= 2.0), so an object
                    # column would keep its object dtype.
                    d_k = []  # critical columns that must hold valid values
                    for k in h_df.columns:
                        # time
                        if h_df.loc[0, k] == "TS":
                            d_df[k] = pd.to_datetime(
                                d_df[k],
                                origin="unix",
                                errors="coerce",
                                format="ISO8601",
                            )
                            d_k.append(k)
                        # integer
                        if h_df.loc[0, k] == "RN":
                            d_df[k] = pd.to_numeric(
                                d_df[k], downcast="integer", errors="coerce"
                            )
                            d_k.append(k)
                        # numeric values
                        if h_df.loc[1, k] in ["Smp", "Tot"] and k != "SYS_CV50_Meta":
                            d_df[k] = pd.to_numeric(d_df[k], errors="coerce")
                            # not added to d_k: NaN is tolerated in these columns

                    # filter for valid values on selected (critical) columns
                    d_df = d_df.dropna(axis=0, subset=d_k, how="any")

                    # append header and data table to the lists
                    h_dict[s_id][ft].append(h_df)
                    d_dict[s_id][ft].append(d_df)

            # aggregate tables, per file type
            for ft, df_list in d_dict[s_id].items():
                if not df_list:
                    # every file of this type was skipped above; pd.concat
                    # raises on an empty list
                    continue
                d_list[s_id][ft] = pd.concat(df_list, axis=0).rename(
                    columns={"RECORD": f"{ft}_RECORD"}
                )

        return (d_list, h_dict)