In [1]:
from datetime import datetime
import json
import logging
from pathlib import Path
import re
import time
from typing import List, Tuple
from tempfile import NamedTemporaryFile

import dask.array as da
from dask.diagnostics import ProgressBar
from dask.distributed import Client, as_completed
import dask_geopandas as dgd 
import hydra
import geopandas as gpd
import numpy as np
from omegaconf import DictConfig, OmegaConf
import pandas as pd
from pyproj import CRS
from tqdm.notebook import tqdm
import xarray as xr
import zarr

In [2]:
def split_gdf_into_pieces(gdf, n_pieces):
    """
    Split a GeoDataFrame into at most ``n_pieces`` contiguous, nearly equal pieces.

    Rows keep their original order, so concatenating the returned pieces
    reproduces ``gdf``. Piece sizes differ by at most one row: the first
    ``len(gdf) % n_pieces`` pieces each receive one extra row.

    Parameters
    ----------
    gdf : GeoDataFrame
        The GeoDataFrame to split. Only ``len()`` and ``.iloc`` slicing are
        used on it.
    n_pieces : int
        The maximum number of pieces to split the gdf into. Must be >= 1.

    Returns
    -------
    list of GeoDataFrame
        ``min(n_pieces, len(gdf))`` non-empty pieces; an empty list when
        ``gdf`` has no rows.

    Raises
    ------
    ValueError
        If ``n_pieces`` is less than 1.
    """
    if n_pieces < 1:
        raise ValueError(f"n_pieces must be a positive integer, got {n_pieces!r}")

    n_rows = len(gdf)
    # Never produce empty pieces: with fewer rows than requested pieces a
    # floor-divided chunk size would be 0, which cannot drive the slicing.
    n_chunks = min(n_pieces, n_rows)
    if n_chunks == 0:
        return []

    # Spread the remainder one row at a time over the leading pieces instead
    # of piling all of it onto the last piece.
    base_size, n_larger = divmod(n_rows, n_chunks)

    gdf_splits = []
    start = 0
    for piece_idx in range(n_chunks):
        stop = start + base_size + (1 if piece_idx < n_larger else 0)
        gdf_splits.append(gdf.iloc[start:stop])
        start = stop

    return gdf_splits

In [None]:
zones = [71, 72, 73, 74, 75, 76, 77, 78]
# Source and destination roots are the same for every zone, so build them once.
buffered_dir = Path("/data/tkb5476/projects/marquette/data/HUC/buffered_flowlines/")
split_root = Path("/data/tkb5476/projects/marquette/data/HUC/split_buffered_flowlines/")

for zone in tqdm(zones, desc="writing to zones"):
    # Read this zone's buffered flowlines, then reproject to EPSG:4326.
    river_path = buffered_dir / f"riv_pfaf_{zone}_ME_PairwiseBuffe2.shp"
    gdf = gpd.read_file(river_path)
    gdf = gdf.to_crs(epsg=4326)
    gdf_pieces = split_gdf_into_pieces(gdf, 50)

    # One sub-directory per zone; created on demand, reused if already present.
    output_dir = split_root / f"riv_pfaf_{zone}_buff_split"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Pieces are numbered from 1: split_1.shp, split_2.shp, ...
    for idx, gdf_piece in enumerate(gdf_pieces, start=1):
        output_path = output_dir / f"split_{idx}.shp"
        gdf_piece.to_file(output_path)

    print(f"Finished splitting and saving zone {zone} shapefiles into {len(gdf_pieces)} pieces.")

writing to zones:   0%|          | 0/8 [00:00<?, ?it/s]

Finished splitting and saving zone 71 shapefiles into 50 pieces.
Finished splitting and saving zone 72 shapefiles into 50 pieces.
Finished splitting and saving zone 73 shapefiles into 50 pieces.
Finished splitting and saving zone 74 shapefiles into 50 pieces.
Finished splitting and saving zone 75 shapefiles into 50 pieces.
Finished splitting and saving zone 76 shapefiles into 50 pieces.
Finished splitting and saving zone 77 shapefiles into 50 pieces.


In [None]:
# Testing: show the last rows of the first piece written for zone 71.
split_path = Path("/data/tkb5476/projects/marquette/data/HUC/split_buffered_flowlines/riv_pfaf_71_buff_split/")
first_piece = gpd.read_file(split_path / "split_1.shp").to_crs(epsg=4326)
first_piece.tail()

In [None]:
# Show the first rows of the second piece from the same split directory.
second_piece = gpd.read_file(split_path / "split_2.shp").to_crs(epsg=4326)
second_piece.head()