In [2]:
import json
import logging
import multiprocessing
from pathlib import Path
from typing import Any, Dict, List, Tuple

from dask.callbacks import Callback
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
from omegaconf import DictConfig, OmegaConf
import pandas as pd
from scipy.sparse import csr_matrix
from shapely.geometry import LineString, MultiLineString, Point
from tqdm.notebook import tqdm
import utm
import xarray as xr
import zarr

import sys
sys.path.append("..")

from marquette.merit._graph import _find_flowlines

# Module-level logger; the helper functions defined later in this notebook log through it.
log = logging.getLogger(__name__)

from dask.distributed import Client

# Start a Dask distributed client, requesting port 8989 for the dashboard.
# Ending the cell with the bare `client` expression displays its summary as the cell output.
client = Client(dashboard_address=':8989')
client
### TODO:
# - Preprocessing functions for catchment area of MERIT basins
# - Preprocessing function for flowline geometry (using custom UTM zones)
# - Preprocess function for the number of edges (the DX/buffer)
# - Create Edges

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38717 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:38717/status,

0,1
Dashboard: http://127.0.0.1:38717/status,Workers: 12
Total threads: 144,Total memory: 503.74 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:36215,Workers: 12
Dashboard: http://127.0.0.1:38717/status,Total threads: 144
Started: Just now,Total memory: 503.74 GiB

0,1
Comm: tcp://127.0.0.1:38275,Total threads: 12
Dashboard: http://127.0.0.1:42885/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:41847,
Local directory: /tmp/dask-scratch-space/worker-vmq4hfju,Local directory: /tmp/dask-scratch-space/worker-vmq4hfju

0,1
Comm: tcp://127.0.0.1:44619,Total threads: 12
Dashboard: http://127.0.0.1:40821/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:33807,
Local directory: /tmp/dask-scratch-space/worker-4yj6g6li,Local directory: /tmp/dask-scratch-space/worker-4yj6g6li

0,1
Comm: tcp://127.0.0.1:46143,Total threads: 12
Dashboard: http://127.0.0.1:37143/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:33875,
Local directory: /tmp/dask-scratch-space/worker-0hssmd7s,Local directory: /tmp/dask-scratch-space/worker-0hssmd7s

0,1
Comm: tcp://127.0.0.1:45491,Total threads: 12
Dashboard: http://127.0.0.1:41447/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:37605,
Local directory: /tmp/dask-scratch-space/worker-66fky1l0,Local directory: /tmp/dask-scratch-space/worker-66fky1l0

0,1
Comm: tcp://127.0.0.1:45481,Total threads: 12
Dashboard: http://127.0.0.1:34867/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:43939,
Local directory: /tmp/dask-scratch-space/worker-ynchzcjp,Local directory: /tmp/dask-scratch-space/worker-ynchzcjp

0,1
Comm: tcp://127.0.0.1:35497,Total threads: 12
Dashboard: http://127.0.0.1:41275/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:45385,
Local directory: /tmp/dask-scratch-space/worker-3d5j02uo,Local directory: /tmp/dask-scratch-space/worker-3d5j02uo

0,1
Comm: tcp://127.0.0.1:43301,Total threads: 12
Dashboard: http://127.0.0.1:46865/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:33599,
Local directory: /tmp/dask-scratch-space/worker-6dp_8s6a,Local directory: /tmp/dask-scratch-space/worker-6dp_8s6a

0,1
Comm: tcp://127.0.0.1:45999,Total threads: 12
Dashboard: http://127.0.0.1:35631/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:43305,
Local directory: /tmp/dask-scratch-space/worker-nw5cl7ia,Local directory: /tmp/dask-scratch-space/worker-nw5cl7ia

0,1
Comm: tcp://127.0.0.1:34259,Total threads: 12
Dashboard: http://127.0.0.1:42263/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:34157,
Local directory: /tmp/dask-scratch-space/worker-4drgwswm,Local directory: /tmp/dask-scratch-space/worker-4drgwswm

0,1
Comm: tcp://127.0.0.1:34453,Total threads: 12
Dashboard: http://127.0.0.1:33521/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:43653,
Local directory: /tmp/dask-scratch-space/worker-0l6kb0u6,Local directory: /tmp/dask-scratch-space/worker-0l6kb0u6

0,1
Comm: tcp://127.0.0.1:39115,Total threads: 12
Dashboard: http://127.0.0.1:33853/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:42975,
Local directory: /tmp/dask-scratch-space/worker-bubfsjud,Local directory: /tmp/dask-scratch-space/worker-bubfsjud

0,1
Comm: tcp://127.0.0.1:37267,Total threads: 12
Dashboard: http://127.0.0.1:43701/status,Memory: 41.98 GiB
Nanny: tcp://127.0.0.1:39217,
Local directory: /tmp/dask-scratch-space/worker-k6len2v8,Local directory: /tmp/dask-scratch-space/worker-k6len2v8


In [3]:
# Notebook configuration, written as a JSON document and loaded into an OmegaConf config.
# The `${...}` entries are interpolations that refer to other keys of this same config.
# `dx` is used later as the target edge length and `buffer` as a fraction of `dx`.
# NOTE(review): `data_path` is an absolute, machine-specific path; adjust it before running elsewhere.
json_data = '''{
  "name": "MERIT",
  "data_path": "/data/tkb5476/projects/marquette/data/",
  "dx": 2000,
  "buffer": 0.3334,
  "continent": 7,
  "area": 8,
  "save_name": "${name}_${continent}${area}",
  "save_paths": {
    "flow_lines": "${data_path}/${name}/raw/flowlines"
  },
  "zarr": {
    "edges": "${data_path}/${name}/zarr/dpl_v2/${save_name}_edges/",
    "sorted_edges_keys": "${data_path}/${name}/zarr/dpl_v2/${save_name}_edge_keys/"
  }
}'''

# Parse the JSON text into a plain dict, then wrap it so keys are reachable as attributes (cfg.dx, cfg.zarr.edges, ...).
data_dict = json.loads(json_data)
cfg = OmegaConf.create(data_dict)


In [4]:
def _plot_gdf(gdf: gpd.GeoDataFrame) -> None:
    """
    Plot a GeoDataFrame on a single 10x10 inch matplotlib figure and show it.

    The axes are titled "Polyline Plot" and labelled Longitude / Latitude.

    Parameters
    ----------
    gdf : gpd.GeoDataFrame
        The geodataframe you want to plot

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``; nothing is returned.
    """
    # Imported lazily so matplotlib is only required when a plot is requested.
    import matplotlib.pyplot as plt

    _, axes = plt.subplots(figsize=(10, 10))
    gdf.plot(ax=axes)
    axes.set(title="Polyline Plot", xlabel="Longitude", ylabel="Latitude")
    plt.show()

In [5]:
def _find_flowlines(cfg: DictConfig) -> Path:
    """
    A function to find the correct flowline of all MERIT basins using glob

    Parameters
    ----------
    cfg : DictConfig
        The cfg object

    Returns
    -------
    Path
        The file that we're going to create flowline connectivity for

    Raises
    ------
    IndexError
        Raised if no flowlines are found with your MERIT region code
    """
    flowline_path = Path(cfg.save_paths.flow_lines)
    region_id = f"_{cfg.continent}{cfg.area}_"
    matching_file = flowline_path.glob(f"*{region_id}*.shp")
    try:
        found_file = [file for file in matching_file][0]
        return found_file
    except IndexError as e:
        raise IndexError(f"No flowlines found using: *{region_id}*.shp")

# Segments

In [6]:
def create_segment_dict(row: pd.Series, segment_coords: List[Tuple[float, float]], crs: Any, dx: int, buffer: float) -> Dict[str, Any]:
    """
    Build the attribute dictionary that describes one river segment.

    Values are copied from the flowline ``row``; the two length columns are
    converted from kilometres to metres. ``segment_coords`` and ``crs`` are
    stored unchanged under ``'coords'`` and ``'crs'``.

    Parameters
    ----------
    row : pandas.Series
        A row of segment data. The columns read are COMID, order, lengthkm,
        lengthdir, NextDownID, up1..up4, maxup, slope, sinuosity, strmDrop_t
        and uparea.
    segment_coords : List[Tuple[float, float]]
        Coordinates (or geometry) of the segment, stored as given.
    crs : Any
        Coordinate reference system of the segment.
    dx : int
        Desired edge length. Accepted for interface compatibility; not used here.
    buffer : float
        Edge length tolerance. Accepted for interface compatibility; not used here.

    Returns
    -------
    Dict[str, Any]
        Segment attributes keyed by: id, order, len, len_dir, ds, up, slope,
        sinuosity, stream_drop, uparea, coords, crs (in that order).
    """
    attributes = {
        'id': row["COMID"],
        'order': row["order"],
        'len': row["lengthkm"] * 1000,  # km -> m
        'len_dir': row["lengthdir"] * 1000,  # km -> m
        'ds': row["NextDownID"],
    }

    # Upstream IDs are only collected when the row reports at least one
    # upstream link; a value of 0 in an upN column marks an unused slot.
    upstream = []
    if row["maxup"] > 0:
        for column in ("up1", "up2", "up3", "up4"):
            if row[column] != 0:
                upstream.append(row[column])
    attributes['up'] = upstream

    attributes['slope'] = row["slope"]
    attributes['sinuosity'] = row["sinuosity"]
    attributes['stream_drop'] = row["strmDrop_t"]
    attributes['uparea'] = row["uparea"]
    attributes['coords'] = segment_coords
    attributes['crs'] = crs
    return attributes

In [7]:
def create_segment(row: pd.Series, crs: Any, dx: int, buffer: float) -> Dict[str, Any]:
    """
    Build a segment dictionary from a single flowline row.

    Thin wrapper around ``create_segment_dict`` that supplies the row's own
    ``geometry`` attribute as the segment coordinates.

    Parameters
    ----------
    row : pandas.Series
        A row of segment data; must expose a ``geometry`` attribute.
    crs : Any
        Coordinate reference system of the segment.
    dx : int
        Desired length of each edge in the segment (forwarded unchanged).
    buffer : float
        Buffer tolerance for edge length calculation (forwarded unchanged).

    Returns
    -------
    dict
        Dictionary containing segment attributes.
    """
    geometry = row.geometry
    return create_segment_dict(row, geometry, crs, dx, buffer)

In [8]:
def calculate_num_edges(length: float, dx: float, buffer: float) -> Tuple:
    """
    Calculate the number of edges and the length of each edge for a given segment.

    The segment is split into equally long edges whose length should deviate
    from ``dx`` by at most ``buffer``. Two candidate counts are considered:
    ``floor(length / dx)`` (edges at least ``dx`` long) and that count plus
    one (edges shorter than ``dx``). Adding further edges can only move the
    edge length further below ``dx``, so no other count can do better.

    The first candidate that is within ``buffer`` is used. If neither is, the
    candidate with the smaller deviation is returned as a best effort (the
    previous implementation looped forever in that situation).

    Segments shorter than ``dx`` always produce a single edge: its length is
    the segment length when the shortfall is below ``buffer``, otherwise ``dx``.

    Parameters
    ----------
    length : float
        The length of the segment for which to calculate the number of edges.
    dx : float
        The desired length of each edge. Must be non-zero.
    buffer : float
        The acceptable deviation from the desired edge length (dx).

    Returns
    -------
    tuple
        A tuple containing two elements:
            - The first element is an integer representing the number of edges.
            - The second element is a float representing the actual length of each edge.

    Examples
    --------
    >>> calculate_num_edges(100, 30, 5)
    (3, 33.333333333333336)

    >>> calculate_num_edges(100, 25, 2)
    (4, 25.0)

    >>> calculate_num_edges(150, 100, 10)  # no count is within the buffer
    (2, 75.0)
    """
    base_count = length // dx
    if base_count == 0:
        # Segment shorter than dx: keep it as one edge.
        edge_len = length if dx - length < buffer else dx
        return (1, edge_len)

    def _deviation(count):
        """Absolute difference between the resulting edge length and dx."""
        return abs(length / count - dx)

    fewer, more = base_count, base_count + 1
    if _deviation(fewer) <= buffer:
        num_edges = fewer
    elif _deviation(more) <= buffer:
        num_edges = more
    else:
        # Neither candidate satisfies the tolerance; take the closer one
        # instead of searching further (which cannot succeed).
        num_edges = fewer if _deviation(fewer) <= _deviation(more) else more
    return (int(num_edges), length / num_edges)

# Edges

In [9]:
def create_edge_json(segment_row: pd.Series, up=None, ds=None, edge_id=None) -> Dict[str, Any]:
    """
    Assemble the dictionary that describes a single edge of a segment.

    Most values are copied from ``segment_row``; the edge's own ID and its
    upstream/downstream links are supplied by the caller.

    Parameters
    ----------
    segment_row : pandas.Series
        Segment record providing 'id', 'index', 'order', 'len', 'len_dir',
        'up', 'slope', 'sinuosity', 'stream_drop', 'uparea', 'coords', 'crs'.
    up : list, optional
        Upstream edge IDs, stored under 'up'.
    ds : str, optional
        Downstream edge ID, stored under 'ds'.
    edge_id : str, optional
        Unique identifier for the edge, stored under 'id'.

    Returns
    -------
    dict
        Edge attributes. The segment's own ID is kept under 'merit_basin',
        its 'index' under 'segment_sorting_index' and its upstream segment
        IDs under 'up_merit'.
    """
    record = dict(
        id=edge_id,
        merit_basin=segment_row['id'],
        segment_sorting_index=segment_row['index'],
        order=segment_row['order'],
        len=segment_row['len'],
        len_dir=segment_row['len_dir'],
        ds=ds,
        up=up,
        up_merit=segment_row['up'],
    )
    # Remaining attributes are carried over under their original names.
    for field in ('slope', 'sinuosity', 'stream_drop', 'uparea', 'coords', 'crs'):
        record[field] = segment_row[field]
    return record

def calculate_drainage_area(edge: Dict[str, Any], idx: int, segment_das: Dict[str, float], num_edges: int) -> None:
    """
    Assign an interpolated drainage area to one edge of a segment.

    The area gained along the segment (the edge's current 'uparea' minus the
    summed drainage area of its upstream segments) is divided evenly over
    ``num_edges``; the edge at position ``idx`` receives the upstream total
    plus ``idx + 1`` shares.

    Parameters
    ----------
    edge : dict
        Edge record; 'up', 'up_merit' and 'uparea' are read.
    idx : int
        Zero-based position of the edge within its segment.
    segment_das : dict
        Drainage area per segment ID.
    num_edges: int
        Number of edges the segment is divided into.

    Returns
    -------
    None
        ``edge['uparea']`` is updated in place. Edges whose 'up' entry is
        empty are left untouched.
    """
    if not edge['up']:
        return

    upstream_total = 0
    try:
        # Summed in one expression so that a missing ID leaves the total at 0.
        upstream_total = sum([segment_das[seg] for seg in edge['up_merit']])
    except KeyError:
        log.info("Missing upstream branch. Treating as head node")

    gained_area = edge['uparea'] - upstream_total
    share = gained_area / num_edges
    edge["uparea"] = upstream_total + share * (idx + 1)


def calculate_drainage_area_for_all_edges(edges, segment_das):
    """
    Distribute a segment's drainage area over all of its edges.

    Every edge in ``edges`` is expected to carry the segment's total drainage
    area under 'uparea'. The area gained along the segment is spread evenly,
    so edge ``i`` ends up with the upstream total plus ``i + 1`` equal shares.

    Whether the segment has upstream neighbours is decided from the first
    edge's 'up' entry. Without upstream neighbours the upstream total is 0
    and the first edge's 'uparea' is used as the segment total.

    Parameters
    ----------
    edges : list of dict
        Edge records of one segment, ordered from upstream to downstream.
        'up', 'up_merit' and 'uparea' are read.
    segment_das : dict
        Drainage area per segment ID.

    Returns
    -------
    list of dict
        The same list object, with each edge's 'uparea' updated in place.
        An empty list is returned unchanged.

    Notes
    -----
    If an upstream segment ID is missing from ``segment_das`` the upstream
    total falls back to 0 (head-node treatment) and the event is logged,
    matching ``calculate_drainage_area``.
    """
    num_edges = len(edges)
    if num_edges == 0:
        # Nothing to distribute; avoids indexing into an empty list below.
        return edges

    if edges[0]['up']:
        missing_logged = False
        for idx, edge in enumerate(edges):
            try:
                prev_up_area = sum(segment_das[seg] for seg in edge['up_merit'])
            except KeyError:
                prev_up_area = 0
                if not missing_logged:
                    # Log once per segment rather than once per edge.
                    logging.getLogger(__name__).info(
                        "Missing upstream branch. Treating as head node"
                    )
                    missing_logged = True
            area_difference = edge['uparea'] - prev_up_area
            even_distribution = area_difference / num_edges
            edge["uparea"] = prev_up_area + even_distribution * (idx + 1)
    else:
        total_uparea = edges[0]['uparea']
        even_distribution = total_uparea / num_edges
        for idx, edge in enumerate(edges):
            edge["uparea"] = even_distribution * (idx + 1)
    return edges


def get_upstream_ids(row: pd.Series, edge_counts: int, edge_info: Dict[Any, Any] = None):
    """
    Generate the upstream edge IDs for a segment.

    Each upstream segment is referenced through its last edge, i.e.
    ``"{segment_id}_{n - 1}"`` where ``n`` is that upstream segment's number
    of edges.

    Parameters
    ----------
    row : pandas.Series
        A series representing a row from the segment DataFrame. 'up' holds
        the upstream segment IDs (or None).
    edge_counts : int
        Fallback number of edges, used for any upstream segment that is not
        found in ``edge_info``. The callers in this notebook pass the edge
        count of the *current* segment, which gives a wrong suffix whenever
        an upstream segment was split into a different number of edges.
    edge_info : dict, optional
        Mapping of segment ID to a ``(num_edges, edge_len)`` tuple. When
        supplied, the upstream segment's own edge count is looked up here so
        the generated ID points at its real last edge.

    Returns
    -------
    list
        List of upstream edge IDs; empty when the row has no upstream entry.
    """
    upstream_segments = row['up']
    if upstream_segments is None:
        return []

    up_ids = []
    for up in upstream_segments:
        if edge_info is not None and up in edge_info:
            last_edge = edge_info[up][0] - 1
        else:
            # Backward-compatible behaviour when no per-segment counts are known.
            last_edge = edge_counts - 1
        up_ids.append(f"{up}_{last_edge}")
    return up_ids

In [10]:
def singular_segment_to_edge_partition(df: pd.DataFrame, edge_info: Dict[str, Any], segment_das: Dict[str, float]) -> pd.DataFrame:
    """
    Convert every segment of a partition into exactly one edge.

    For each row an edge named ``"{id}_0"`` is created whose downstream link
    is ``"{ds}_0"``. The edge length is taken from ``edge_info`` and the
    direct length is that length divided by the segment's sinuosity.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame partition containing segment data.
    edge_info : dict
        Maps a segment ID to a sequence whose second element is the edge length.
    segment_das : dict
        Drainage area per segment. Accepted for signature parity with
        ``many_segment_to_edge_partition``; not used here.

    Returns
    -------
    pandas.DataFrame
        One row per segment in the partition.

    Notes
    -----
    NOTE(review): upstream IDs are requested with an edge count of 1, so they
    always end in ``_0``. That looks wrong for upstream segments that were
    split into several edges; confirm against the intended graph topology.
    """
    single_edge_count = 1
    records = []
    for _row_label, seg in df.iterrows():
        seg_id = seg['id']
        edge_length = edge_info[seg_id][1]
        record = create_edge_json(
            seg,
            up=get_upstream_ids(seg, single_edge_count),
            ds=f"{seg['ds']}_0",
            edge_id=f"{seg_id}_0",
        )
        # Replace the segment-level lengths with the per-edge values.
        record["len"] = edge_length
        record["len_dir"] = edge_length / seg["sinuosity"]
        records.append(record)
    return pd.DataFrame(records)

def many_segment_to_edge_partition(df: pd.DataFrame, edge_info: Dict[str, Any], segment_das: Dict[str, float]) -> pd.DataFrame:
    """
    Split every segment of a partition into a chain of several edges.

    A segment with ``n`` edges yields edges ``"{id}_0"`` ... ``"{id}_{n-1}"``.
    The first edge links upstream to the IDs from ``get_upstream_ids``; each
    later edge links upstream to its predecessor. Every edge links downstream
    to its successor, except the last one, which links to ``"{ds}_0"``. Once
    the chain is built, drainage areas are spread over it by
    ``calculate_drainage_area_for_all_edges``.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame partition containing segment data.
    edge_info : dict
        Maps a segment ID to ``(num_edges, edge_len)``.
    segment_das : dict
        Dictionary containing drainage area data for each segment.

    Returns
    -------
    pandas.DataFrame
        One row per generated edge, segments in partition order.

    Notes
    -----
    NOTE(review): upstream IDs for the first edge are derived from this
    segment's own edge count, not the upstream segment's; confirm that this
    is intended.
    """
    records = []
    for _row_label, seg in df.iterrows():
        seg_id = seg['id']
        edge_count, edge_length = edge_info[seg_id]
        head_upstream = get_upstream_ids(seg, edge_count)
        final_position = edge_count - 1

        chain = []
        for position in range(edge_count):
            if position == 0:
                upstream = head_upstream
                downstream = f"{seg_id}_{position + 1}"
            elif position < final_position:
                upstream = [f"{seg_id}_{position - 1}"]
                downstream = f"{seg_id}_{position + 1}"
            else:
                # Last edge of the chain hands over to the next segment.
                upstream = [f"{seg_id}_{position - 1}"]
                downstream = f"{seg['ds']}_0"

            edge = create_edge_json(
                seg,
                up=upstream,
                ds=downstream,
                edge_id=f"{seg_id}_{position}",
            )
            edge["len"] = edge_length
            edge["len_dir"] = edge_length / seg["sinuosity"]
            chain.append(edge)

        records.extend(calculate_drainage_area_for_all_edges(chain, segment_das))
    return pd.DataFrame(records)

In [11]:
def sort_based_on_keys(array_to_sort, keys, segment_sorted_index):
    """
    Reorder 'array_to_sort' so that its entries follow the order of 'keys'.

    For every key, the positions where 'segment_sorted_index' equals that key
    are looked up and the corresponding entries of 'array_to_sort' are
    appended to the result. Entries sharing a key keep their original
    relative order, because the positions are visited in ascending order.

    Args:
    array_to_sort: The array to be sorted.
    keys: The array of keys to sort by.
    segment_sorted_index: The index array to match keys against.

    Returns:
    A new numpy array built from the gathered entries.
    """
    gathered = []
    for key in tqdm(keys):
        # np.where already reports matching positions in ascending order.
        positions = np.where(segment_sorted_index == key)[0]
        gathered.extend(array_to_sort[positions])
    return np.array(gathered)

def sort_xarray_dataarray(da, keys, segment_sorted_index):
    """
    Return a copy of 'da' whose values are reordered by 'sort_based_on_keys'.

    Args:
    da: The xarray DataArray to reorder.
    keys: The array of keys defining the target order.
    segment_sorted_index: The index array to match keys against.

    Returns:
    A new DataArray holding the reordered values with the dims and coords of
    'da'. Note that the coords are reused as they are, not reordered.
    """
    reordered_values = sort_based_on_keys(da.values, keys, segment_sorted_index)
    return xr.DataArray(reordered_values, coords=da.coords, dims=da.dims)

# The functions
### Read in the polylines and convert to dask dataframe

In [12]:
# Locate and load the flowline shapefile for the region configured in `cfg`.
flowline_file: Path = _find_flowlines(cfg)
polyline_gdf: gpd.GeoDataFrame = gpd.read_file(flowline_file)
dx: int = cfg.dx  # Unit: Meters
# cfg.buffer is a fraction of dx; multiplying turns it into an absolute tolerance.
buffer: float = cfg.buffer * dx  # Unit: Meters
# Cast the ID, topology and order columns to integers before they are used as keys.
for col in [
    "COMID",
    "NextDownID",
    "up1",
    "up2",
    "up3",
    "up4",
    "maxup",
    "order",
]:
    polyline_gdf[col] = polyline_gdf[col].astype(int)
crs: Any = polyline_gdf.crs
# Split the GeoDataFrame into 48 Dask partitions for the parallel steps below.
dask_gdf: dg.GeoDataFrame = dg.from_geopandas(polyline_gdf, npartitions=48) 

### Create segments and find the ordering of the segments by drainage area

In [13]:
# Each partition yields one segment dictionary per row, so the result is an object Series.
meta = pd.Series([], dtype=object)
with ProgressBar():
    # Use the small `crs` object defined in the previous cell rather than
    # `polyline_gdf.crs`: referencing `polyline_gdf` inside the lambda makes the
    # whole GeoDataFrame part of the function that is pickled and sent with the tasks.
    computed_series: pd.Series = dask_gdf.map_partitions(
        lambda df: df.apply(create_segment, args=(crs, dx, buffer), axis=1),
        meta=meta
    ).compute()

# Keys of segments_dict are the row labels of the flowline GeoDataFrame.
segments_dict = computed_series.to_dict()
# Row labels ordered by ascending upstream drainage area.
sorted_keys = sorted(segments_dict, key=lambda key: segments_dict[key]['uparea'])
# Lookup of drainage area by segment (COMID) ID.
segment_das = {segment['id']: segment['uparea'] for segment in segments_dict.values()}

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [14]:
# (number of edges, edge length) for every segment, keyed by segment ID.
num_edges_dict = {
    segment_["id"]: calculate_num_edges(segment_["len"], dx, buffer)
    for segment_ in tqdm(segments_dict.values(), desc="Processing Number of Edges")
}
# Segments that stay a single edge.
one_edge_segment = {
    segment_key: info
    for segment_key, info in tqdm(num_edges_dict.items(), desc="Filtering Segments == 1")
    if info[0] == 1
}
# Segments that are split into several edges.
many_edge_segment = {
    segment_key: info
    for segment_key, info in tqdm(num_edges_dict.items(), desc="Filtering Segments > 1")
    if info[0] > 1
}

Processing Number of Edges:   0%|          | 0/28489 [00:00<?, ?it/s]

Filtering Segments == 1:   0%|          | 0/28489 [00:00<?, ?it/s]

Filtering Segments > 1:   0%|          | 0/28489 [00:00<?, ?it/s]

In [15]:
# Split the segment records into two groups according to how many edges each one produces.
segments_with_more_than_one_edge = {}
segments_with_one_edge = {}

for i, segment in segments_dict.items():
    segment_id = segment["id"]
    # Store the segment's key from segments_dict on the record itself (this mutates
    # the dicts inside segments_dict); create_edge_json later reads it as 'index'.
    segment["index"] = i
    
    if segment_id in many_edge_segment:
        segments_with_more_than_one_edge[segment_id] = segment
    elif segment_id in one_edge_segment:
        segments_with_one_edge[segment_id] = segment
    else:
        # Reached only when the segment's edge count is neither 1 nor greater than 1.
        print(f"MISSING ID: {segment_id}")

# One row per segment, indexed by segment ID, plus 48-partition Dask versions of each frame.
df_one = pd.DataFrame.from_dict(segments_with_one_edge, orient='index')
df_many = pd.DataFrame.from_dict(segments_with_more_than_one_edge, orient='index')
ddf_one = dd.from_pandas(df_one, npartitions=48)
ddf_many = dd.from_pandas(df_many, npartitions=48)

In [16]:
# Eager (non-Dask) run over the full multi-edge frame, used to inspect the result.
many = many_segment_to_edge_partition(df_many, many_edge_segment, segment_das)
many.head()

Unnamed: 0,id,merit_basin,segment_sorting_index,order,len,len_dir,ds,up,up_merit,slope,sinuosity,stream_drop,uparea,coords,crs
0,78000001_0,78000001,0,3,2560.284421,1742.96467,78000001_1,"[78000002_2, 78000003_2]","[78000002, 78000003]",0.005052,1.468925,38.9,1230.510157,LINESTRING (-133.15583333333333 59.00666666666...,EPSG:4326
1,78000001_1,78000001,0,3,2560.284421,1742.96467,78000001_2,[78000001_0],"[78000002, 78000003]",0.005052,1.468925,38.9,1242.40003,LINESTRING (-133.15583333333333 59.00666666666...,EPSG:4326
2,78000001_2,78000001,0,3,2560.284421,1742.96467,78000369_0,[78000001_1],"[78000002, 78000003]",0.005052,1.468925,38.9,1254.289902,LINESTRING (-133.15583333333333 59.00666666666...,EPSG:4326
3,78000002_0,78000002,1,3,2043.085156,1511.372348,78000002_1,"[78000399_9, 78000627_9]","[78000399, 78000627]",0.007258,1.351808,148.7,402.761285,"LINESTRING (-133.215 59.042500000000004, -133....",EPSG:4326
4,78000002_1,78000002,1,3,2043.085156,1511.372348,78000002_2,[78000002_0],"[78000399, 78000627]",0.007258,1.351808,148.7,413.181798,"LINESTRING (-133.215 59.042500000000004, -133....",EPSG:4326


In [17]:
# Eager (non-Dask) run over the full single-edge frame, used to inspect the result.
single = singular_segment_to_edge_partition(df_one, one_edge_segment, segment_das)
single.head()

Unnamed: 0,id,merit_basin,segment_sorting_index,order,len,len_dir,ds,up,up_merit,slope,sinuosity,stream_drop,uparea,coords,crs
0,78000007_0,78000007,6,2,2000.0,1439.360328,0_0,"[78000008_0, 78000156_0]","[78000008, 78000156]",0.000405,1.389506,0.3,461.834246,"LINESTRING (-137.06083333333333 59.06, -137.06...",EPSG:4326
1,78000013_0,78000013,12,3,2000.0,1445.897609,78000369_0,"[78000022_0, 78000026_0]","[78000022, 78000026]",0.005194,1.383224,5.2,2572.571564,LINESTRING (-133.15583333333333 59.00666666666...,EPSG:4326
2,78000029_0,78000029,28,4,1998.134652,1642.548769,78000027_0,"[78000030_0, 78000049_0]","[78000030, 78000049]",0.0001,1.216484,0.2,2813.171325,LINESTRING (-135.90166666666667 59.39583333333...,EPSG:4326
3,78000036_0,78000036,35,3,2000.0,1706.314375,78000035_0,"[78000038_0, 78000196_0]","[78000038, 78000196]",0.032713,1.172117,34.2,1352.852344,"LINESTRING (-132.805 59.19833333333333, -132.8...",EPSG:4326
4,78000040_0,78000040,39,2,2000.0,1778.619475,78000039_0,"[78000053_0, 78000238_0]","[78000053, 78000238]",0.002144,1.124468,2.6,458.297012,"LINESTRING (-135.94333333333333 59.3725, -135....",EPSG:4326


In [18]:
# Spot check for basin 78019710. Only the last expression of a cell is displayed,
# so the `.loc` selection on the first line is evaluated but its result is not shown.
many.loc[many["merit_basin"] == 78019710]
segment_das[78019710]

172291.48137913668

In [19]:
# Spot check: show every edge generated for basin 78013704.
many.loc[many["merit_basin"] == 78013704]
# Alternative lookups kept from earlier debugging (disabled):
# single.loc[single["merit_basin"] == 78007263]
# single.loc[single["merit_basin"] == 78007261]
# single.loc[single["merit_basin"] == 78007272]
# segment_das[78013704]

Unnamed: 0,id,merit_basin,segment_sorting_index,order,len,len_dir,ds,up,up_merit,slope,sinuosity,stream_drop,uparea,coords,crs
48639,78013704_0,78013704,13703,7,2092.133859,1816.797027,78013704_1,"[78013705_8, 78014260_8]","[78013705, 78014260]",0.001222,1.151551,0.0,172264.849579,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326
48640,78013704_1,78013704,13703,7,2092.133859,1816.797027,78013704_2,[78013704_0],"[78013705, 78014260]",0.001222,1.151551,0.0,172280.429622,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326
48641,78013704_2,78013704,13703,7,2092.133859,1816.797027,78013704_3,[78013704_1],"[78013705, 78014260]",0.001222,1.151551,0.0,172296.009665,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326
48642,78013704_3,78013704,13703,7,2092.133859,1816.797027,78013704_4,[78013704_2],"[78013705, 78014260]",0.001222,1.151551,0.0,172311.589707,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326
48643,78013704_4,78013704,13703,7,2092.133859,1816.797027,78013704_5,[78013704_3],"[78013705, 78014260]",0.001222,1.151551,0.0,172327.16975,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326
48644,78013704_5,78013704,13703,7,2092.133859,1816.797027,78013704_6,[78013704_4],"[78013705, 78014260]",0.001222,1.151551,0.0,172342.749793,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326
48645,78013704_6,78013704,13703,7,2092.133859,1816.797027,78013704_7,[78013704_5],"[78013705, 78014260]",0.001222,1.151551,0.0,172358.329835,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326
48646,78013704_7,78013704,13703,7,2092.133859,1816.797027,78013704_8,[78013704_6],"[78013705, 78014260]",0.001222,1.151551,0.0,172373.909878,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326
48647,78013704_8,78013704,13703,7,2092.133859,1816.797027,78013703_0,[78013704_7],"[78013705, 78014260]",0.001222,1.151551,0.0,172389.489921,LINESTRING (-118.19333333333333 48.15666666666...,EPSG:4326


### Processing flowline segments to river graph edges

In [20]:
# Empty frame describing the columns and dtypes that each partition function returns;
# passed to Dask as `meta` for the two map_partitions calls below.
meta = pd.DataFrame({
    'id': pd.Series(dtype='str'),
    'merit_basin': pd.Series(dtype='int'),
    'segment_sorting_index': pd.Series(dtype='int'),
    'order': pd.Series(dtype='int'),
    'len': pd.Series(dtype='float'),
    'len_dir': pd.Series(dtype='float'),
    'ds': pd.Series(dtype='str'),
    'up': pd.Series(dtype='object'),  # List or array
    'up_merit': pd.Series(dtype='object'),
    'slope': pd.Series(dtype='float'),
    'sinuosity': pd.Series(dtype='float'),
    'stream_drop': pd.Series(dtype='float'),
    'uparea': pd.Series(dtype='float'),
    'coords': gpd.GeoSeries(dtype='geometry'),  # Assuming this is a geometry column
    'crs': pd.Series(dtype='object'),  # CRS object
})


# Lazy graphs only: nothing is computed until .compute() is called in the next cell.
edges_results_one = ddf_one.map_partitions(
    singular_segment_to_edge_partition,
    edge_info=one_edge_segment, 
    segment_das=segment_das,
    meta=meta
)
edges_results_many = ddf_many.map_partitions(
    many_segment_to_edge_partition,
    edge_info=many_edge_segment, 
    segment_das=segment_das,
    meta=meta
)

In [21]:
# Materialise both lazy edge tables as pandas DataFrames (one compute call each).
# NOTE(review): dask.diagnostics.ProgressBar may not report progress while a
# distributed Client is active; confirm whether dask.distributed.progress is needed.
with ProgressBar():
    edges_results_one_df = edges_results_one.compute()
    edges_results_many_df = edges_results_many.compute()

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [22]:
# Check that the computed frame exposes the columns declared in `meta`.
edges_results_many_df.columns

Index(['id', 'merit_basin', 'segment_sorting_index', 'order', 'len', 'len_dir',
       'ds', 'up', 'up_merit', 'slope', 'sinuosity', 'stream_drop', 'uparea',
       'coords', 'crs'],
      dtype='object')

In [23]:
# Stack the single-edge and multi-edge tables. ignore_index=True gives every edge a
# unique row label; without it the row labels of the individual frames are kept and
# repeat, which becomes a non-unique `index` coordinate once converted to xarray.
merged_df = pd.concat([edges_results_one_df, edges_results_many_df], ignore_index=True)
# Convert the ID, list, geometry and CRS columns to plain strings.
for col in ["id", "ds", "up", "up_merit", "coords", "crs"]:
    merged_df[col] = merged_df[col].astype(str)
print(merged_df.dtypes)

id                        object
merit_basin                int64
segment_sorting_index      int64
order                      int64
len                      float64
len_dir                  float64
ds                        object
up                        object
up_merit                  object
slope                    float64
sinuosity                float64
stream_drop              float64
uparea                   float64
coords                    object
crs                       object
dtype: object


In [24]:
# Every DataFrame column becomes a data variable along the frame's index dimension.
xr_dataset = xr.Dataset.from_dataframe(merged_df)
# Segment row labels ordered by ascending drainage area (see `sorted_keys`).
sorted_keys_array = np.array(sorted_keys)

In [25]:
# Display the ordering keys for a quick visual check.
sorted_keys_array

array([23388, 26283, 21238, ..., 17389, 17388, 17387])

In [32]:
# The lookup array is the same for every variable, so fetch it once.
segment_positions = xr_dataset["segment_sorting_index"].values

# Reorder each data variable so edges follow the drainage-area ordering of their segment.
sorted_edges = xr.Dataset()
for var_name in xr_dataset.data_vars:
    sorted_edges[var_name] = sort_xarray_dataarray(
        xr_dataset[var_name],
        sorted_keys_array,
        segment_positions,
    )

# Persist the reordered edges, replacing any existing store at that path.
sorted_edges.to_zarr(Path(cfg.zarr.edges), mode='w')

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

  0%|          | 0/28489 [00:00<?, ?it/s]

<xarray.backends.zarr.ZarrStore at 0x7f52d4512840>

In [33]:
# Re-open the written store read-only and display its layout as a sanity check.
edges = zarr.open_group(Path(cfg.zarr.edges), mode="r")
# edges.create_dataset('edge_keys', data=sorted_keys_array)
edges.tree()

Tree(nodes=(Node(disabled=True, name='/', nodes=(Node(disabled=True, icon='table', name='coords (108117,) <U30…

In [93]:
# Write the sorted edges again (the same call already ran in the sorting cell above;
# mode='w' overwrites the store) and save the ordering keys to their own zarr store.
# NOTE(review): this cell's execution count is far out of sequence with the rest of
# the notebook; confirm it still reproduces after Restart & Run All.
sorted_edges.to_zarr(Path(cfg.zarr.edges), mode='w')
zarr.save(cfg.zarr.sorted_edges_keys, sorted_keys_array)