In [17]:
from datetime import datetime
import json
import logging
from pathlib import Path
import re
import time
from typing import List, Tuple
from tempfile import NamedTemporaryFile

import dask.array as da
from dask.diagnostics import ProgressBar
from dask.distributed import Client, as_completed
import dask_geopandas as dgd 
import hydra
import geopandas as gpd
import numpy as np
from omegaconf import DictConfig, OmegaConf
import pandas as pd
from pyproj import CRS
from tqdm.notebook import tqdm
import xarray as xr
import zarr

# Logger keyed on this module's ``__name__``; nothing below in this view uses it yet.
log = logging.getLogger(__name__)

In [18]:
# Start a dask.distributed client with no scheduler address, asking for the
# dashboard on port 8989. The captured output shows this resolved to a
# LocalCluster and that the dashboard fell back to another port.
client = Client(dashboard_address=':8989')
# Bare last expression: renders the client/cluster summary in the notebook.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37403 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:37403/status,

0,1
Dashboard: http://127.0.0.1:37403/status,Workers: 8
Total threads: 32,Total memory: 251.53 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:38479,Workers: 8
Dashboard: http://127.0.0.1:37403/status,Total threads: 32
Started: Just now,Total memory: 251.53 GiB

0,1
Comm: tcp://127.0.0.1:40535,Total threads: 4
Dashboard: http://127.0.0.1:38591/status,Memory: 31.44 GiB
Nanny: tcp://127.0.0.1:42333,
Local directory: /tmp/dask-scratch-space/worker-lqsz4vcy,Local directory: /tmp/dask-scratch-space/worker-lqsz4vcy

0,1
Comm: tcp://127.0.0.1:43687,Total threads: 4
Dashboard: http://127.0.0.1:45161/status,Memory: 31.44 GiB
Nanny: tcp://127.0.0.1:44771,
Local directory: /tmp/dask-scratch-space/worker-4cotdhc4,Local directory: /tmp/dask-scratch-space/worker-4cotdhc4

0,1
Comm: tcp://127.0.0.1:36737,Total threads: 4
Dashboard: http://127.0.0.1:37209/status,Memory: 31.44 GiB
Nanny: tcp://127.0.0.1:46623,
Local directory: /tmp/dask-scratch-space/worker-ztj_4yip,Local directory: /tmp/dask-scratch-space/worker-ztj_4yip

0,1
Comm: tcp://127.0.0.1:36305,Total threads: 4
Dashboard: http://127.0.0.1:37397/status,Memory: 31.44 GiB
Nanny: tcp://127.0.0.1:40797,
Local directory: /tmp/dask-scratch-space/worker-dm_l5prk,Local directory: /tmp/dask-scratch-space/worker-dm_l5prk

0,1
Comm: tcp://127.0.0.1:34245,Total threads: 4
Dashboard: http://127.0.0.1:39635/status,Memory: 31.44 GiB
Nanny: tcp://127.0.0.1:44863,
Local directory: /tmp/dask-scratch-space/worker-eftinl0l,Local directory: /tmp/dask-scratch-space/worker-eftinl0l

0,1
Comm: tcp://127.0.0.1:32909,Total threads: 4
Dashboard: http://127.0.0.1:41953/status,Memory: 31.44 GiB
Nanny: tcp://127.0.0.1:39241,
Local directory: /tmp/dask-scratch-space/worker-qae22jzl,Local directory: /tmp/dask-scratch-space/worker-qae22jzl

0,1
Comm: tcp://127.0.0.1:41997,Total threads: 4
Dashboard: http://127.0.0.1:37461/status,Memory: 31.44 GiB
Nanny: tcp://127.0.0.1:41591,
Local directory: /tmp/dask-scratch-space/worker-gnerxu7w,Local directory: /tmp/dask-scratch-space/worker-gnerxu7w

0,1
Comm: tcp://127.0.0.1:37045,Total threads: 4
Dashboard: http://127.0.0.1:34179/status,Memory: 31.44 GiB
Nanny: tcp://127.0.0.1:33393,
Local directory: /tmp/dask-scratch-space/worker-9u98l4m0,Local directory: /tmp/dask-scratch-space/worker-9u98l4m0


In [31]:
# Inline run configuration, written as JSON text. It is parsed with json.loads
# and then wrapped in an OmegaConf config, so the ``${...}`` references between
# keys (e.g. ``${data_path}``, ``${zone}``) are stored as interpolations.
# NOTE(review): "data_path" ends with "/" and every template below adds another
# "/" after ``${data_path}``, so the resolved paths contain "//".
# NOTE(review): "zarr.streamflow" is the only zarr entry without a ``${name}``
# path segment — confirm that is intentional.
json_data = '''
{
  "name": "MERIT",
  "data_path": "/data/tkb5476/projects/marquette/data/",
  "dx": 2000,
  "buffer": 0.3334,
  "units": "mm/day",
  "date_codes": "${data_path}/date_codes.json",
  "crs": {
    "wgs": "epsg:4326",
    "utm18": "epsg:32618"
  },
  "is_streamflow_split": true,
  "start_date": "01-01-1980",
  "end_date": "12-31-2019",
  "num_cores": 20,
  "continent": 7,
  "area": 3,
  "zone": "${continent}${area}",
  "save_paths": {
    "attributes": "${data_path}/${name}/streamflow/attributes.csv",
    "flow_lines": "${data_path}/${name}/raw/flowlines",
    "basins": "${data_path}/${name}/raw/basins/cat_pfaf_${zone}_MERIT_Hydro_v07_Basins_v01_bugfix1.shp",
    "huc10": "${data_path}/HUC/huc_10_CONUS.shp",
    "streamflow_files": "${data_path}/${name}/streamflow/dpl_v2/dHBV"
  },
  "zarr": {
    "edges": "${data_path}/${name}/zarr/dpl_v2/${zone}_edges/",
    "sorted_edges_keys": "${data_path}/${name}/zarr/dpl_v2/${zone}_edge_keys/",
    "HUC_TM": "${data_path}/${name}/zarr/TMs/PFAF_${continent}${area}",
    "MERIT_TM": "${data_path}/${name}/zarr/TMs/MERIT_FLOWLINES_${continent}${area}",
    "streamflow": "${data_path}/streamflow/zarr/dpl_v2/${zone}",
    "q_prime": "${data_path}/${name}/zarr/q_prime/${zone}/"
    }
}'''

# Plain dict first, then the OmegaConf wrapper used by every later cell.
data_dict = json.loads(json_data)
cfg = OmegaConf.create(data_dict)

In [20]:
# Open the three zarr stores named in the config: the streamflow predictions
# and the two stores holding a "TM" array (HUC_TM and MERIT_TM).
# Every store is opened read-only. zarr.open defaults to mode='a', which
# permits writes and creates an empty store when the path does not exist, so
# the streamflow store is now opened with mode='r' like the other two.
streamflow_prediction = zarr.open(Path(cfg.zarr.streamflow), mode='r')
huc_to_merit_TM = zarr.open(Path(cfg.zarr.HUC_TM), mode='r')
merit_to_edge_TM = zarr.open(Path(cfg.zarr.MERIT_TM), mode='r')

In [21]:
# Print the streamflow store's hierarchy: array names, shapes and dtypes.
print(streamflow_prediction.tree())


/
 ├── HUC10 (2437,) int64
 ├── streamflow (350617, 2437) float64
 └── time (350617,) int64


In [22]:
# Print the HUC_TM store's hierarchy: array names, shapes and dtypes.
print(huc_to_merit_TM.tree())


/
 ├── COMID (23294,) int64
 ├── HUC10 (2437,) object
 └── TM (2437, 23294) float64


In [23]:
# Print the MERIT_TM store's hierarchy: array names, shapes and dtypes.
print(merit_to_edge_TM.tree())


/
 ├── COMID (23294,) int64
 ├── EDGEID (96353,) <U11
 └── TM (23294, 96353) float64


In [24]:
# Re-open the same three zarr stores through xarray so the arrays carry named
# dimensions. Order matters: streamflow, then HUC_TM, then MERIT_TM.
# Note that ``xr_streamflow_to_huc_TM`` is opened from the streamflow store.
xr_streamflow_to_huc_TM, xr_huc_to_merit_TM, xr_merit_to_edge_TM = (
    xr.open_zarr(Path(store_location))
    for store_location in (
        cfg.zarr.streamflow,
        cfg.zarr.HUC_TM,
        cfg.zarr.MERIT_TM,
    )
)

In [25]:
# Show the dimension names of the three arrays, one tuple per line, to check
# that each array's trailing dimension matches the next array's leading one.
print(
    xr_streamflow_to_huc_TM["streamflow"].dims,
    xr_huc_to_merit_TM["TM"].dims,
    xr_merit_to_edge_TM["TM"].dims,
    sep="\n",
)

('time', 'HUC10')
('HUC10', 'COMID')
('COMID', 'EDGEID')


In [26]:
# Chain the two "TM" arrays onto the streamflow array. Per the dims printed
# above this is (time, HUC10) . (HUC10, COMID), then . (COMID, EDGEID).
# No contraction dimension is passed to xr.dot, so it relies on xarray's
# default choice; the displayed result shape (350617, 96353) matches
# (time, EDGEID).
# NOTE(review): the HUC10 array is int64 in the streamflow store but object in
# the HUC_TM store (see the tree printouts). If xarray aligns on these labels
# before contracting, mismatched labels could silently drop HUC10 entries —
# confirm both stores carry identical HUC10 labels in the same order.
streamflow_merit = xr.dot(xr_streamflow_to_huc_TM['streamflow'], xr_huc_to_merit_TM['TM'])
streamflow_edges = xr.dot(streamflow_merit, xr_merit_to_edge_TM['TM'])

  result = blockwise(
  result = blockwise(


In [41]:
# Bare expression: renders the array summary (bytes, shape, chunking, dtype).
streamflow_edges

Unnamed: 0,Array,Chunk
Bytes,251.70 GiB,62.95 MiB
Shape,"(350617, 96353)","(5479, 1506)"
Dask graph,4096 chunks in 17 graph layers,4096 chunks in 17 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 251.70 GiB 62.95 MiB Shape (350617, 96353) (5479, 1506) Dask graph 4096 chunks in 17 graph layers Data type float64 numpy.ndarray",96353  350617,

Unnamed: 0,Array,Chunk
Bytes,251.70 GiB,62.95 MiB
Shape,"(350617, 96353)","(5479, 1506)"
Dask graph,4096 chunks in 17 graph layers,4096 chunks in 17 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [None]:
# Write streamflow_edges to disk one column at a time, as one NetCDF file per
# column index under the configured q_prime directory.
# Fixes two syntax errors in the original cell: a stray "}" in the Path(...)
# call, and a to_netcdf(...) argument that was missing its left operand and
# the closing quote of the f-string.
output_path = Path(cfg.zarr.q_prime)
# to_netcdf does not create missing parent directories, so create them first.
output_path.mkdir(parents=True, exist_ok=True)
for col_idx in tqdm(range(streamflow_edges.shape[1]), desc="Processing Columns"):
    # Slice every row of a single column; the file is named after its index.
    data = streamflow_edges[:, col_idx]
    data.to_netcdf(output_path / f"{col_idx}.nc")