In [1]:
from datetime import datetime
import json
import logging
from pathlib import Path
import re
import time
from typing import List, Tuple
from tempfile import NamedTemporaryFile

import dask.array as da
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from dask.distributed import Client, as_completed
import dask_geopandas as dgd 
import hydra
import geopandas as gpd
import numpy as np
from omegaconf import DictConfig, OmegaConf
import pandas as pd
from pyproj import CRS
from tqdm.notebook import tqdm
import xarray as xr
import zarr

# Module-level logger, named after the current module via `__name__`.
log = logging.getLogger(__name__)

In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


In [2]:
# Load the project configuration from disk, then display the configured
# edges path as the cell output.
config_path = "/projects/mhpi/tbindas/marquette/marquette/conf/config.yaml"
cfg = OmegaConf.load(config_path)
cfg.create_edges.edges

'/projects/mhpi/data/MERIT/zarr/graph/edges/'

In [3]:
# Resolve both input locations from the configuration.
obs_csv = Path(cfg.create_N.obs_dataset_output)
gage_coo_store = Path(cfg.create_N.gage_coo_indices)

# Read the gage table and open the gage index group read-only.
df = pd.read_csv(obs_csv)
gage_indices = zarr.open_group(gage_coo_store, mode="r")

# Preview the first rows of the table.
df.head(5)

Unnamed: 0,STAID,STANAME,HUC02,DRAIN_SQKM,LAT_GAGE,LNG_GAGE,COMID,edge_intersection,zone_edge_id,zone_edge_uparea,zone_edge_vs_gage_area_difference,drainage_area_percent_error
0,1011000,"Allagash River near Allagash, Maine",1,3186.844,47.069611,-69.079544,72040788,72040788_1,237656,3186.327133,0.516867,0.000162
1,1013500,"Fish River near Fort Kent, Maine",1,2252.696,47.237394,-68.582642,72040863,72040863_4,232647,2249.676525,3.019475,0.00134
2,1015800,"Aroostook River near Masardis, Maine",1,2313.755,46.523003,-68.371764,72040358,72040358_1,233192,2315.255194,1.500194,0.000648
3,1017000,"Aroostook River at Washburn, Maine",1,4278.907,46.777294,-68.157194,72040306,72040306_0,241126,4278.016696,0.890304,0.000208
4,1019000,"Grand Lake Stream at Grand Lake Stream, Maine",1,620.6238,45.172522,-67.7688,73000706,73000706_8,75631,602.716851,17.906949,0.028853


In [13]:
# For each zone, report how many distinct MERIT basins the edge store holds
# and how many COMIDs the streamflow predictions contain.
zones = [71, 72, 73, 74, 75, 77, 78]
streamflow_root = Path("/projects/mhpi/data/MERIT/streamflow/zarr/merit_conus_v1.0/")
for _zone in zones:
    zone = f"{_zone}"
    print(f"===== zone: {zone} =====")

    # Edge store for this zone, opened read-only.
    edge_path = Path(cfg.create_edges.edges)
    edges = zarr.open_group(edge_path.joinpath(zone), mode="r")
    unique_basins = np.unique(edges.merit_basin[:])
    print(f"Unique Merit basins in this array: {unique_basins.size}")

    # Streamflow predictions for the same zone.
    _streamflow_data = xr.open_zarr(streamflow_root.joinpath(zone), chunks="auto")
    comid_count = len(_streamflow_data.COMID)
    print(f"Number of COMIDs in the streamflow predictions: {comid_count}")
    print("\n")

===== zone: 71 =====
Unique Merit basins in this array: 48951
Number of COMIDs in the streamflow predictions: 6596


===== zone: 72 =====
Unique Merit basins in this array: 59790
Number of COMIDs in the streamflow predictions: 15653


===== zone: 73 =====
Unique Merit basins in this array: 23294
Number of COMIDs in the streamflow predictions: 22102


===== zone: 74 =====
Unique Merit basins in this array: 72659
Number of COMIDs in the streamflow predictions: 68906


===== zone: 75 =====
Unique Merit basins in this array: 50614
Number of COMIDs in the streamflow predictions: 23448


===== zone: 77 =====
Unique Merit basins in this array: 58125
Number of COMIDs in the streamflow predictions: 33740


===== zone: 78 =====
Unique Merit basins in this array: 28489
Number of COMIDs in the streamflow predictions: 4837




In [19]:
# Open the streamflow prediction store for a single zone and display it.
# NOTE(review): the saved output of this cell is a FileNotFoundError for this
# path, and its execution count (19) is later than the cells that follow it —
# re-run on a fresh kernel to confirm whether the store still exists.
zone = "74"
streamflow_store = Path("/projects/mhpi/data/MERIT/streamflow/zarr/merit_conus_v1.0/").joinpath(zone)
_streamflow_data = xr.open_zarr(streamflow_store, chunks="auto")
_streamflow_data

FileNotFoundError: No such file or directory: '/projects/mhpi/data/MERIT/streamflow/zarr/merit_conus_v1.0/74'

In [14]:
# Collect the ids of gages whose pair list references at least one MERIT basin
# that is absent from the zone's streamflow predictions.
#
# Results:
#   bad_gages -- list of zero-padded (8 character) gage id strings, one entry
#                per flagged gage.
bad_gages = []
edge_path = Path(cfg.create_edges.edges)
streamflow_root = Path("/projects/mhpi/data/MERIT/streamflow/zarr/merit_conus_v1.0/")
# Not used in this cell; assigned once here instead of on every zone iteration.
flowline_path = Path("/projects/mhpi/data/MERIT/raw/flowlines")

# Gages are grouped by the first two characters of their COMID, which is the
# key used for the per-zone streamflow and edge stores below.
df_grouped = df.groupby(df['COMID'].astype(str).str[:2])
for zone, group_data in df_grouped:
    _streamflow_data = xr.open_zarr(streamflow_root / zone, chunks="auto")
    edges = zarr.open_group(edge_path / f"{zone}", mode="r")
    comids = _streamflow_data.COMID

    # Loop-invariant work, done once per zone:
    #  * a set gives constant-time membership tests instead of scanning the
    #    whole COMID coordinate for every pair;
    #  * reading merit_basin in one go avoids one store read per pair.
    comid_set = set(np.asarray(comids.values).ravel().tolist())
    merit_basin = edges.merit_basin[:]

    # Iterating the STAID column (which has a length) lets tqdm show a total;
    # the previous `iterrows()` generator rendered as "0it" with no progress.
    for staid in tqdm(group_data["STAID"], desc=f"iterating through pairs in zone: {zone}"):
        # STAID values are rendered as 8-character zero-padded strings to
        # match the keys used in gage_indices.
        _id = str(staid).zfill(8)
        try:
            pairs = gage_indices[str(zone)][_id].pairs[:]
        except KeyError:
            # the gage id isn't in gage_indices
            continue

        # pair[0] is used as an index into merit_basin; any() stops at the
        # first basin that has no streamflow prediction, so each gage is
        # reported and recorded at most once.
        if any(merit_basin[int(pair[0])] not in comid_set for pair in pairs):
            print(f"gage id {_id} not in comids")
            bad_gages.append(_id)

iterating through pairs in zone: 71: 0it [00:00, ?it/s]

gage id 05046000 not in comids
gage id 05050000 not in comids
gage id 05051522 not in comids
gage id 05053000 not in comids
gage id 05054000 not in comids
gage id 05056000 not in comids
gage id 05057200 not in comids
gage id 05058000 not in comids
gage id 05058700 not in comids
gage id 05059000 not in comids
gage id 05059500 not in comids
gage id 05059700 not in comids
gage id 05061500 not in comids
gage id 05062000 not in comids
gage id 05069000 not in comids
gage id 05076000 not in comids
gage id 05079000 not in comids
gage id 05085000 not in comids
gage id 05116500 not in comids
gage id 05127000 not in comids
gage id 05127500 not in comids
gage id 05129115 not in comids
gage id 05131500 not in comids


iterating through pairs in zone: 72: 0it [00:00, ?it/s]

gage id 01011000 not in comids
gage id 01013500 not in comids
gage id 01015800 not in comids
gage id 01017000 not in comids
gage id 04024000 not in comids
gage id 04024430 not in comids
gage id 04025500 not in comids
gage id 04027000 not in comids
gage id 04027500 not in comids
gage id 04029990 not in comids
gage id 04040000 not in comids
gage id 04045500 not in comids
gage id 04056500 not in comids
gage id 04059000 not in comids
gage id 04059500 not in comids
gage id 04060993 not in comids
gage id 04062000 not in comids
gage id 04062500 not in comids
gage id 04063000 not in comids
gage id 04065722 not in comids
gage id 04066003 not in comids
gage id 04069500 not in comids
gage id 04071000 not in comids
gage id 04073500 not in comids
gage id 04074950 not in comids
gage id 04079000 not in comids
gage id 04084445 not in comids
gage id 04084500 not in comids
gage id 04086600 not in comids
gage id 04087000 not in comids
gage id 04097540 not in comids
gage id 04099000 not in comids
gage id 

iterating through pairs in zone: 73: 0it [00:00, ?it/s]

gage id 01030500 not in comids
gage id 01031500 not in comids
gage id 01034000 not in comids
gage id 01034500 not in comids
gage id 01038000 not in comids
gage id 01042500 not in comids
gage id 01046500 not in comids
gage id 01048000 not in comids
gage id 01049000 not in comids
gage id 01049500 not in comids
gage id 01052500 not in comids
gage id 01053500 not in comids
gage id 01054000 not in comids
gage id 01054500 not in comids
gage id 01059000 not in comids
gage id 01064500 not in comids
gage id 01066000 not in comids
gage id 01076500 not in comids
gage id 01081000 not in comids
gage id 01092000 not in comids
gage id 01094400 not in comids
gage id 01094500 not in comids
gage id 01096500 not in comids
gage id 01099500 not in comids
gage id 01100000 not in comids
gage id 01103500 not in comids
gage id 01104200 not in comids
gage id 01104500 not in comids
gage id 01112500 not in comids
gage id 01122500 not in comids
gage id 01127000 not in comids
gage id 01135500 not in comids
gage id 

iterating through pairs in zone: 74: 0it [00:00, ?it/s]

gage id 02481880 not in comids
gage id 02482000 not in comids
gage id 02482550 not in comids
gage id 02488500 not in comids
gage id 02492000 not in comids
gage id 03007800 not in comids
gage id 03010500 not in comids
gage id 03011020 not in comids
gage id 03015000 not in comids
gage id 03015500 not in comids
gage id 03023100 not in comids
gage id 03024000 not in comids
gage id 03025500 not in comids
gage id 03029500 not in comids
gage id 03030500 not in comids
gage id 03031500 not in comids
gage id 03032500 not in comids
gage id 03034000 not in comids
gage id 03036500 not in comids
gage id 03041500 not in comids
gage id 03045000 not in comids
gage id 03048500 not in comids
gage id 03049500 not in comids
gage id 03050000 not in comids
gage id 03051000 not in comids
gage id 03054500 not in comids
gage id 03061000 not in comids
gage id 03072000 not in comids
gage id 03072655 not in comids
gage id 03074500 not in comids
gage id 03075070 not in comids
gage id 03075500 not in comids
gage id 

iterating through pairs in zone: 75: 0it [00:00, ?it/s]

gage id 08022040 not in comids
gage id 08026000 not in comids
gage id 08033500 not in comids
gage id 08041000 not in comids
gage id 08068000 not in comids
gage id 08068500 not in comids
gage id 08073600 not in comids
gage id 08079600 not in comids
gage id 08101000 not in comids
gage id 08110430 not in comids
gage id 08110500 not in comids
gage id 08162600 not in comids
gage id 08178880 not in comids
gage id 08188500 not in comids
gage id 08190000 not in comids
gage id 08220000 not in comids
gage id 08276300 not in comids
gage id 08279000 not in comids
gage id 08279500 not in comids
gage id 08284100 not in comids
gage id 08285500 not in comids
gage id 08286500 not in comids
gage id 08287000 not in comids
gage id 08289000 not in comids
gage id 08290000 not in comids
gage id 08319000 not in comids
gage id 08324000 not in comids
gage id 08330000 not in comids
gage id 08334000 not in comids
gage id 08379500 not in comids
gage id 08382650 not in comids
gage id 08382830 not in comids
gage id 

iterating through pairs in zone: 77: 0it [00:00, ?it/s]

gage id 09010500 not in comids
gage id 09034250 not in comids
gage id 09037500 not in comids
gage id 09038500 not in comids
gage id 09050700 not in comids
gage id 09057500 not in comids
gage id 09058000 not in comids
gage id 09064600 not in comids
gage id 09070000 not in comids
gage id 09070500 not in comids
gage id 09080400 not in comids
gage id 09085000 not in comids
gage id 09085100 not in comids
gage id 09095500 not in comids
gage id 09105000 not in comids
gage id 09109000 not in comids
gage id 09114500 not in comids
gage id 09118450 not in comids
gage id 09119000 not in comids
gage id 09128000 not in comids
gage id 09132500 not in comids
gage id 09147500 not in comids
gage id 09149500 not in comids
gage id 09152500 not in comids
gage id 09166500 not in comids
gage id 09169500 not in comids
gage id 09171100 not in comids
gage id 09180000 not in comids
gage id 09205000 not in comids
gage id 09209400 not in comids
gage id 09211200 not in comids
gage id 09217000 not in comids
gage id 

iterating through pairs in zone: 78: 0it [00:00, ?it/s]

gage id 12010000 not in comids
gage id 12013500 not in comids
gage id 12020000 not in comids
gage id 12025000 not in comids
gage id 12026400 not in comids
gage id 12027500 not in comids
gage id 12031000 not in comids
gage id 12036000 not in comids
gage id 12039500 not in comids
gage id 12040500 not in comids
gage id 12041200 not in comids
gage id 12043000 not in comids
gage id 12045500 not in comids
gage id 12048000 not in comids
gage id 12082500 not in comids
gage id 12086500 not in comids
gage id 12089500 not in comids
gage id 12092000 not in comids
gage id 12093500 not in comids
gage id 12095000 not in comids
gage id 12101500 not in comids
gage id 12105900 not in comids
gage id 12106700 not in comids
gage id 12112600 not in comids
gage id 12113000 not in comids
gage id 12114500 not in comids
gage id 12115000 not in comids
gage id 12116500 not in comids
gage id 12117500 not in comids
gage id 12119000 not in comids
gage id 12121600 not in comids
gage id 12134500 not in comids
gage id 

In [15]:
# Report how many gages were flagged, drop them from the table, and save the
# remaining rows to CSV.
print(len(bad_gages))

# The flagged ids are zero-padded strings; convert them to integers before
# matching them against the STAID column.
bad_gage_int = list(map(int, bad_gages))
is_bad = df["STAID"].isin(bad_gage_int)
filtered_df = df.loc[~is_bad]

# Number of rows removed by the filter.
print(len(df) - len(filtered_df))

output_csv = (
    Path(cfg.data_path)
    / "gage_information/formatted_gage_csvs/merit_conus_v1.0_gages_3000.csv"
)
filtered_df.to_csv(output_csv, index=False)


1599
1599


In [16]:
# Row count of the unfiltered table.
len(df)

2808