In [1]:
from datetime import datetime
import json
import logging
from pathlib import Path
import re
import time
from typing import List, Tuple
from tempfile import NamedTemporaryFile

import dask.array as da
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import Client, as_completed
import dask_geopandas as dgd 
import hydra
import geopandas as gpd
import numpy as np
from omegaconf import DictConfig, OmegaConf
import pandas as pd
from pyproj import CRS
from tqdm.notebook import tqdm
import xarray as xr
import zarr

log = logging.getLogger(__name__)

In [2]:
# Start a local Dask cluster and connect a client to it, requesting the
# diagnostics dashboard on port 8989. The captured output below shows Dask
# falling back to another port when 8989 is already in use.
client = Client(dashboard_address=':8989')
# Bare expression so the notebook renders the client/cluster summary.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 36651 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:36651/status,

0,1
Dashboard: http://127.0.0.1:36651/status,Workers: 12
Total threads: 144,Total memory: 503.74 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41399,Workers: 12
Dashboard: http://127.0.0.1:36651/status,Total threads: 144
Started: Just now,Total memory: 503.74 GiB

0,1
Comm: tcp://127.0.0.1:43237,Total threads: 12
Dashboard: http://127.0.0.1:45541/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:38887,
Local directory: /tmp/dask-scratch-space/worker-6jtrsmdq,Local directory: /tmp/dask-scratch-space/worker-6jtrsmdq

0,1
Comm: tcp://127.0.0.1:44791,Total threads: 12
Dashboard: http://127.0.0.1:33431/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:40181,
Local directory: /tmp/dask-scratch-space/worker-fi2143o5,Local directory: /tmp/dask-scratch-space/worker-fi2143o5

0,1
Comm: tcp://127.0.0.1:37509,Total threads: 12
Dashboard: http://127.0.0.1:34085/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:38591,
Local directory: /tmp/dask-scratch-space/worker-twyrgx1t,Local directory: /tmp/dask-scratch-space/worker-twyrgx1t

0,1
Comm: tcp://127.0.0.1:45685,Total threads: 12
Dashboard: http://127.0.0.1:33221/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:46089,
Local directory: /tmp/dask-scratch-space/worker-tuhpohkc,Local directory: /tmp/dask-scratch-space/worker-tuhpohkc

0,1
Comm: tcp://127.0.0.1:45497,Total threads: 12
Dashboard: http://127.0.0.1:45899/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:38703,
Local directory: /tmp/dask-scratch-space/worker-5oyapjy_,Local directory: /tmp/dask-scratch-space/worker-5oyapjy_

0,1
Comm: tcp://127.0.0.1:40707,Total threads: 12
Dashboard: http://127.0.0.1:44721/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:36727,
Local directory: /tmp/dask-scratch-space/worker-2_c80frn,Local directory: /tmp/dask-scratch-space/worker-2_c80frn

0,1
Comm: tcp://127.0.0.1:39591,Total threads: 12
Dashboard: http://127.0.0.1:36201/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:35689,
Local directory: /tmp/dask-scratch-space/worker-utglnktm,Local directory: /tmp/dask-scratch-space/worker-utglnktm

0,1
Comm: tcp://127.0.0.1:35013,Total threads: 12
Dashboard: http://127.0.0.1:33809/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:38871,
Local directory: /tmp/dask-scratch-space/worker-ym64fpk6,Local directory: /tmp/dask-scratch-space/worker-ym64fpk6

0,1
Comm: tcp://127.0.0.1:45035,Total threads: 12
Dashboard: http://127.0.0.1:36519/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:34063,
Local directory: /tmp/dask-scratch-space/worker-spsi3eaf,Local directory: /tmp/dask-scratch-space/worker-spsi3eaf

0,1
Comm: tcp://127.0.0.1:36751,Total threads: 12
Dashboard: http://127.0.0.1:41149/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:45201,
Local directory: /tmp/dask-scratch-space/worker-6oa_i34l,Local directory: /tmp/dask-scratch-space/worker-6oa_i34l

0,1
Comm: tcp://127.0.0.1:35437,Total threads: 12
Dashboard: http://127.0.0.1:42979/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:41233,
Local directory: /tmp/dask-scratch-space/worker-63lxoyet,Local directory: /tmp/dask-scratch-space/worker-63lxoyet

0,1
Comm: tcp://127.0.0.1:33713,Total threads: 12
Dashboard: http://127.0.0.1:37271/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:43593,
Local directory: /tmp/dask-scratch-space/worker-5eru8ae9,Local directory: /tmp/dask-scratch-space/worker-5eru8ae9


In [34]:
# Notebook configuration, written as a JSON document. The `${...}` tokens are
# OmegaConf interpolations that refer to other keys of this same config.
# NOTE(review): the "zarr" section defines no "streamflow" key, yet
# `convert_streamflow` reads `cfg.zarr.streamflow` — confirm the intended path.
# NOTE(review): "netcdf.streamflow" ends with "/" (looks like a directory) but
# is passed directly to `to_netcdf` later in this notebook — confirm.
json_data = '''
{
  "name": "MERIT",
  "data_path": "/data/tkb5476/projects/marquette/data/",
  "dx": 2000,
  "buffer": 0.3334,
  "units": "mm/day",
  "date_codes": "${data_path}/date_codes.json",
  "crs": {
    "wgs": "epsg:4326",
    "utm18": "epsg:32618"
  },
  "is_streamflow_split": true,
  "start_date": "01-01-1980",
  "end_date": "12-31-2019",
  "num_cores": 20,
  "continent": 7,
  "area": 3,
  "num_partitions": 64,
  "save_name": "${name}_${continent}${area}",
  "save_paths": {
    "attributes": "${data_path}/${name}/streamflow/attributes.csv",
    "flow_lines": "${data_path}/${name}/raw/flowlines",
    "basins": "${data_path}/${name}/raw/basins/cat_pfaf_${continent}${area}_MERIT_Hydro_v07_Basins_v01_bugfix1.shp",
    "huc10": "${data_path}/HUC/huc_10_CONUS.shp",
    "streamflow_files": "${data_path}/${name}/streamflow/dpl_v2/dHBV"
  },
  "zarr": {
    "edges": "${data_path}/${name}/zarr/graph/${save_name}_edges/",
    "sorted_edges_keys": "${data_path}/${name}/zarr/graph/${save_name}_edge_keys/",
    "HUC_TM": "${data_path}/${name}/zarr/TMs/PFAF_${continent}${area}",
    "MERIT_TM": "${data_path}/${name}/zarr/TMs/MERIT_FLOWLINES_${continent}${area}"
  },
  "netcdf": {
    "streamflow": "${data_path}/${name}/netcdf/dpl_v2/"
  }
}
'''

# Parse the JSON into a plain dict, then wrap it in an OmegaConf config so the
# rest of the notebook can use attribute access (`cfg.save_paths.attributes`).
data_dict = json.loads(json_data)
cfg = OmegaConf.create(data_dict)

In [4]:
def extract_numbers(filename: str) -> Tuple[int, int]:
    """
    Extracts numerical values from a filename and returns them as a tuple of integers.

    The function first searches the whole string for two groups of digits separated by
    an underscore ('xxxx_yyyy') and returns both numbers. If no such pair exists, it falls
    back to the first group of digits in the final path component and pairs it with 0.
    If no digits are found at all, it returns (0, 0). This function is typically used as a
    sort key for filenames that embed numerical ranges in their names.

    Parameters:
    filename (str or Path-like): The filename or path from which to extract the numbers. The
                                 filename is expected to contain numbers in the format 'xxxx_yyyy'.

    Returns:
    tuple: A tuple of two integers representing the extracted numerical values. If only one
           group of digits is found, the second element is 0. If none is found, returns (0, 0).

    Example:
    --------
    >>> extract_numbers("Qr_12000_13000")
    (12000, 13000)

    >>> extract_numbers("file_123.txt")
    (123, 0)

    >>> extract_numbers("no_numbers_here")
    (0, 0)

    Notes:
    ------
    - The function uses regular expressions to find the numbers.
    - The single-number fallback only inspects the last path component, so digits that
      appear in parent directory names cannot become the sort key.
    """
    text = str(filename)

    # Preferred form: two digit groups joined by an underscore, anywhere in the string.
    pair = re.search(r"(\d+)_(\d+)", text)
    if pair:
        return tuple(map(int, pair.groups()))

    # Fallback: a lone digit group in the file name itself.
    single = re.search(r"\d+", Path(text).name)
    if single:
        return (int(single.group()), 0)

    return (0, 0)

In [5]:
def convert_streamflow(cfg: DictConfig) -> None:
    """
    Convert streamflow data from CSV files to a Zarr group format.

    This function reads streamflow data from multiple CSV files located in a specified
    directory, converts each file to a float32 NumPy array, and stores each array as a
    dataset in a Zarr group. Each dataset within the Zarr group is named after the
    corresponding file. If the output path already exists, nothing is written.

    Parameters:
    cfg (DictConfig): A Hydra DictConfig configuration object. The configuration 
                      should contain the following keys:
                      - zarr.streamflow: The path where the Zarr group will be created.
                      - save_paths.streamflow_files: The directory containing the CSV 
                                                     files with streamflow data.

    Returns:
    None: This function does not return anything. It writes the converted data to 
          disk in Zarr group format.

    Error handling:
    FileNotFoundError and IOError are not propagated to the caller; they are caught
    and logged at error level. A missing input directory aborts the conversion. An
    I/O failure on a single file is logged and the remaining files are still processed.

    Example usage:
    ```
    cfg = DictConfig({'zarr': {'streamflow': '/path/to/zarr/output'},
                      'save_paths': {'streamflow_files': '/path/to/csv/files'}})
    convert_streamflow(cfg)
    ```
    """
    try:
        streamflow_output = Path(cfg.zarr.streamflow)
        if streamflow_output.exists():
            log.info(f"Zarr group already exists: {streamflow_output}")
            return

        folder = Path(cfg.save_paths.streamflow_files)
        if not folder.exists():
            raise FileNotFoundError(f"Specified directory does not exist: {folder}")

        # Order files by the numeric range embedded in their names.
        file_paths = sorted(
            (path for path in folder.glob("*") if path.is_file()),
            key=extract_numbers,
        )
        zarr_group = zarr.open_group(streamflow_output, mode='w')
        for file in file_paths:
            try:
                array = pd.read_csv(file, dtype=np.float32, header=None).to_numpy()
                zarr_group.create_dataset(file.name, data=array)
                log.info(f"Wrote {file.name} to disk")
            except IOError as e:
                # A failed file is a real error, so report it at error level
                # and keep converting the remaining files.
                log.error(f"Error processing file {file}: {e}")

    except FileNotFoundError as e:
        log.error(f"File not found: {e}")
    except IOError as e:
        log.error(f"I/O error occurred: {e}")

In [6]:
def _sort_into_bins(ids: np.ndarray, bins: List[np.ndarray]):
    """
    :param ids: a list of HUC10 IDS
    :return:
    """

    def find_list_of_str(target: int, sorted_lists: List[np.ndarray]):
        left, right = 0, len(sorted_lists) - 1
        while left <= right:
            mid = (left + right) // 2
            mid_list = sorted_lists[mid]
            if mid_list.size > 0:
                first_element = int(mid_list[0])
                last_element = int(mid_list[-1])
                if target < first_element:
                    right = mid - 1
                elif target > last_element:
                    left = mid + 1
                else:
                    return mid
            else:
                left += 1
        return None

    keys = list(range(0, 16, 1))
    grouped_values = {key: [] for key in keys}
    for idx, value in enumerate(ids):
        id = int(ids[idx])
        _key = find_list_of_str(id, bins)
        grouped_values[_key].append({id: idx})

    return grouped_values

In [8]:
# Assemble one streamflow series per basin ID, in the order of the IDs stored in
# the HUC -> MERIT mapping. Results accumulate in `streamflow_data` (series) and
# `columns` (matching IDs), which later cells turn into a dataset.
attrs_df = pd.read_csv(cfg.save_paths.attributes)
huc10_ids = attrs_df["gage_ID"].values.astype("str")
huc_to_merit_TM = zarr.open(Path(cfg.zarr.HUC_TM), mode='r')
huc_10_list = huc_to_merit_TM.HUC10[:]

# Attribute rows are grouped into consecutive chunks of `bins_size`; chunk i is
# read from the i-th streamflow file below.
# NOTE(review): assumes each file holds exactly one chunk, in order — confirm.
bins_size = 1000
bins = [huc10_ids[i: i + bins_size] for i in range(0, len(huc10_ids), bins_size)]
basin_hucs = huc_10_list
basin_indexes = _sort_into_bins(basin_hucs, bins)

streamflow_data = []
columns = []
folder = Path(cfg.save_paths.streamflow_files)
file_paths = [file for file in folder.glob("*") if file.is_file()]
file_paths.sort(key=extract_numbers)

convert_to_m3s = cfg.units.lower() == "mm/day"
pbar = tqdm(basin_indexes.keys(), desc="Processing Qr files")
for i, key in enumerate(pbar):
    values = basin_indexes[key]
    if not values:
        continue
    file = file_paths[i]
    df = pd.read_csv(file, dtype=np.float32, header=None)
    # Every row of `df` is one series, so its column count is the series length.
    series_length = df.shape[1]
    for val in values:
        gage_id = list(val.keys())[0]
        columns.append(gage_id)
        row = attrs_df[attrs_df["gage_ID"] == gage_id]
        if row.empty:
            log.warning(
                f"HUC10 {gage_id} is missing from the attributes file."
            )
            streamflow_data.append(np.zeros([series_length]))
            continue
        # Position of this ID inside its chunk (offset from the chunk start).
        row_idx = row.index[0] - key * bins_size
        try:
            _streamflow = df.iloc[row_idx].values
        except IndexError:
            log.warning(
                f"Row {row_idx} for HUC10 {gage_id} is out of range in {file.name}; "
                "filling with zeros."
            )
            streamflow_data.append(np.zeros([series_length]))
            continue
        if convert_to_m3s:
            # converting from mm/day to m3/s
            # NOTE(review): the factor 1000 / 86400 presumes `area` is in km^2 — confirm.
            area = row["area"].values[0]
            _streamflow = _streamflow * area * 1000 / 86400
        streamflow_data.append(_streamflow)

  0%|          | 0/16 [00:00<?, ?it/s]

In [9]:
# `streamflow_data` holds one series per location; transposing the stacked
# array puts time on the first axis and location on the second.
column_keys = np.asarray(columns)
array = np.asarray(streamflow_data).transpose()

In [10]:
# Daily timestamps spanning the configured start and end dates (inclusive).
date_range = pd.date_range(cfg.start_date, cfg.end_date, freq='D')
date_range

DatetimeIndex(['1980-01-01', '1980-01-02', '1980-01-03', '1980-01-04',
               '1980-01-05', '1980-01-06', '1980-01-07', '1980-01-08',
               '1980-01-09', '1980-01-10',
               ...
               '2019-12-22', '2019-12-23', '2019-12-24', '2019-12-25',
               '2019-12-26', '2019-12-27', '2019-12-28', '2019-12-29',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', length=14610, freq='D')

In [11]:
# Wrap the streamflow matrix in a labelled dataset: rows are indexed by the
# daily timestamps, columns by the location IDs.
ds = xr.Dataset(
    data_vars={'streamflow': (('time', 'location'), array)},
    coords={'time': date_range, 'location': column_keys},
)
ds

In [12]:
# Resample the daily series onto an hourly time axis by linear interpolation
# between consecutive daily values.
hourly_times = pd.date_range(start=cfg.start_date, end=cfg.end_date, freq='H')
ds_interpolated = ds.interp(time=hourly_times, method='linear')

In [33]:
# Persist the hourly dataset as NetCDF at the configured location.
# NOTE(review): `cfg.netcdf.streamflow` ends with "/" in the config, i.e. it
# looks like a directory, while `to_netcdf` is given it as the output path —
# confirm the intended target file name.
ds_interpolated.to_netcdf(Path(cfg.netcdf.streamflow))