In [None]:
from pathlib import Path
import geopandas as gpd
from shapely.geometry import LineString
import numpy as np
from rasterio import features
import rasterio as rio
from tqdm.auto import tqdm
from shapely.geometry import shape, box
import pandas as pd

import sys

if sys.platform == "darwin":
    from multiprocess import Pool
else:
    from multiprocessing import Pool


from shapely.geometry import box
from functools import partial

In [None]:
# GeoPackage holding the S-2 coverage layer; it is read later to filter the coastline.
# The trailing expression displays whether the file is present on disk.
s2_coverage_path = Path("data/S-2 coverage area.gpkg")
s2_coverage_path.exists()

In [None]:
# Model identifier; it is embedded in the output GeoPackage file name.
model_name = "regnety_002_v1.29_model"

In [None]:
# Path to the folder containing the inference rasters.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
input_rasters = Path("/Users/nick/Desktop/CL test/Aus working v8")

In [None]:
# Collect every prediction raster (file names ending in "pred.tif") in the input
# folder and display how many were found.
rasters = list(input_rasters.glob("*pred.tif"))
len(rasters)

In [None]:
# Set the output path: a GeoPackage placed next to the input folder and named
# "<input folder name>_<model name>.gpkg".
output_vector = input_rasters.parent / f"{input_rasters.name}_{model_name}.gpkg"
output_vector

In [None]:
# Simplify the geometries so the vectorised raster outlines look less blocky.
def simplify_geometries(gdf: gpd.GeoDataFrame, tolerance: float) -> gpd.GeoDataFrame:
    """Return a copy of ``gdf`` whose "geometry" column has been simplified.

    Args:
        gdf: Input frame; it is not modified.
        tolerance: Simplification tolerance, in the units of the frame's CRS.

    Returns:
        A new GeoDataFrame with the same rows and columns and simplified
        geometries. Topology is not preserved, so results may be invalid.
    """
    simplified = gdf["geometry"].simplify(tolerance, preserve_topology=False)
    result = gdf.copy()
    result["geometry"] = simplified
    return gpd.GeoDataFrame(result)

In [None]:
def get_raster_bounds(raster):
    """Return a raster's bounding box as a shapely polygon in EPSG:3857.

    Args:
        raster: Path (or anything ``rasterio.open`` accepts) of the raster.

    Returns:
        The dataset's bounding rectangle, reprojected to EPSG:3857.

    Raises:
        ValueError: If the raster has no CRS, so its bounds cannot be reprojected.
    """
    with rio.open(raster) as src:
        # Fail with a message that names the file instead of a generic CRS error.
        if src.crs is None:
            raise ValueError(f"No CRS found for raster: {raster}")
        footprint = gpd.GeoSeries([box(*src.bounds)], crs=src.crs)
    return footprint.to_crs(3857).iloc[0]

In [None]:
# Footprint of every raster (already in EPSG:3857), merged into a single geometry.
raster_bounds = [get_raster_bounds(raster) for raster in tqdm(rasters)]
bounds_gdf = gpd.GeoDataFrame(geometry=raster_bounds, crs="EPSG:3857").dissolve()

In [None]:
# Visual check of the dissolved raster footprint.
bounds_gdf.plot()

In [None]:
# extract the polygons from the rasters and reproject them to 3857
def extract_polygons(chunk, px_size):
    """Vectorise the pixels equal to 1 in one raster into EPSG:3857 polygons.

    Args:
        chunk: Path of the prediction raster to vectorise.
        px_size: Simplification tolerance, in the raster's CRS units
            (presumably the pixel size in metres — TODO confirm).

    Returns:
        GeoDataFrame in EPSG:3857 with one polygon per 4-connected region of
        band-1 pixels equal to 1 (presumably the water class — verify against
        the model output), simplified and repaired with a zero-width buffer.
    """
    with rio.open(chunk) as src:
        # Read everything needed from the dataset while it is still open.
        raster_crs = src.crs
        transform = src.transform
        water_array = src.read(1).astype("uint8")

    water_mask = water_array == 1
    geoms = [
        shape(geom_dict)
        for geom_dict, _ in features.shapes(
            water_array, mask=water_mask, transform=transform, connectivity=4
        )
    ]
    # Release the pixel data before the heavier vector processing below.
    del water_array, water_mask

    # Pass the CRS object itself so rasters without an EPSG code still work.
    water_gdf = gpd.GeoDataFrame({"geometry": geoms}, crs=raster_crs)
    water_gdf = simplify_geometries(water_gdf, px_size)

    water_gdf_3857 = water_gdf.to_crs(3857)

    # Simplifying without topology preservation can yield invalid polygons;
    # a zero-width buffer rebuilds them.
    water_gdf_3857["geometry"] = water_gdf_3857.buffer(0)

    return water_gdf_3857

In [None]:
# Vectorise all rasters in parallel worker processes, showing a progress bar.
extract_polygons_partial = partial(extract_polygons, px_size=10)
with Pool() as pool:
    results_iter = pool.imap(extract_polygons_partial, rasters)
    water_polygons = list(tqdm(results_iter, total=len(rasters)))

In [None]:
# Join all per-raster chunks into one GeoDataFrame, then dissolve every polygon
# into a single (multi-part) geometry so overlapping pieces from neighbouring
# rasters are merged.
joined_water_gdf = pd.concat(water_polygons, ignore_index=True)
joined_water_gdf_dis = joined_water_gdf.dissolve()
joined_water_gdf_dis

In [None]:
# Visual check of the water polygons before dissolving.
joined_water_gdf.plot()

In [None]:
# Split the dissolved multi-part polygon into single parts, measure each part,
# and keep only the largest one; smaller parts (lakes, rivers) are discarded.
single_part_gdf = (
    joined_water_gdf_dis.explode(index_parts=False)
    .assign(area=lambda parts: parts.area)
    .sort_values("area", ascending=False)
    .iloc[[0]]
)
single_part_gdf

In [None]:
# Convert the polygon to lines (its boundary) to get the coastline.
# NOTE(review): this overwrites the geometry in place, so re-running the cell
# would take the boundary of the boundary instead of the polygon's outline.
single_part_gdf.geometry = single_part_gdf.boundary
single_part_gdf

In [None]:
# Visual check of the coastline lines.
single_part_gdf.plot()

In [None]:
# Clip the coastline to the raster footprint, inset by 10 units of EPSG:3857,
# to remove the artificial lines that run along the raster edges.
# The inset is built on a separate frame so `bounds_gdf` is left untouched and
# re-running this cell does not shrink the footprint a further 10 each time.
bounds_inset_gdf = bounds_gdf.to_crs(3857)
bounds_inset_gdf.geometry = bounds_inset_gdf.buffer(-10)
bounds_inset_gdf = bounds_inset_gdf.to_crs(bounds_gdf.crs)

clipped_gdf = gpd.clip(single_part_gdf, bounds_inset_gdf).explode(index_parts=False)
clipped_gdf.plot()

In [None]:
# Un-clipped coastline again, presumably for comparison with the clipped plot.
single_part_gdf.plot()

In [None]:
# Display the clipped coastline table.
clipped_gdf

In [None]:
# Load the S-2 coverage layer and grow it outward by 0.5 (in the file's CRS units).
# NOTE(review): `coverage` is re-read from disk and re-buffered with -0.5 in the
# next cell before it is first used, so this cell does not affect the output —
# candidate for removal.
coverage = gpd.read_file(s2_coverage_path)
coverage.geometry = coverage.buffer(0.5)

In [None]:
# Keep only the coastline segments that intersect the S-2 coverage area.
coverage = gpd.read_file(s2_coverage_path)
# NOTE(review): the -0.5 buffer is applied in the coverage file's native CRS
# units, before reprojection — confirm that this unit is the intended one.
coverage.geometry = coverage.buffer(-0.5)
coverage.to_crs(4326, inplace=True)

clipped_gdf.to_crs(4326, inplace=True)

# `predicate` replaces the `op` keyword, which geopandas deprecated in 0.10 and
# removed in 1.0.
# NOTE(review): a segment that intersects several coverage polygons is returned
# once per match — confirm duplicates are acceptable.
clipped_gdf = gpd.sjoin(clipped_gdf, coverage, how="inner", predicate="intersects")

clipped_gdf.plot()

In [None]:
def chaikin_corner_cutting_optimized(
    points: np.ndarray, num_iterations: int = 1
) -> np.ndarray:
    """Smooth a polyline or ring with Chaikin's corner-cutting scheme.

    Each iteration replaces every segment (p0, p1) with the two points
    0.75*p0 + 0.25*p1 and 0.25*p0 + 0.75*p1.

    Args:
        points: Array of shape (n_points, n_dims). The input is treated as a
            closed ring when its first and last rows are equal.
        num_iterations: Number of corner-cutting passes to apply.

    Returns:
        A float array of smoothed coordinates. Open lines keep both of their
        original endpoints; closed rings stay closed (first row == last row)
        without duplicated vertices. Inputs with fewer than three points have
        no corner to cut and are returned unchanged.
    """
    points = np.asarray(points, dtype=float)
    if points.ndim != 2 or len(points) < 3:
        return points

    # Open/closed is a property of the input and does not change between passes.
    is_closed = np.array_equal(points[0], points[-1])

    for _ in range(num_iterations):
        if is_closed:
            # Drop the repeated closing vertex and wrap around instead, so the
            # corner at the ring's start/end is cut like every other corner.
            ring = points[:-1]
            seg_start, seg_end = ring, np.roll(ring, -1, axis=0)
        else:
            seg_start, seg_end = points[:-1], points[1:]

        cut = np.empty((2 * len(seg_start), points.shape[1]))
        cut[0::2] = 0.75 * seg_start + 0.25 * seg_end
        cut[1::2] = 0.25 * seg_start + 0.75 * seg_end

        if is_closed:
            # Re-close the ring with a single copy of its first vertex.
            points = np.vstack([cut, cut[:1]])
        else:
            # Pin both endpoints so the line does not shrink from either end.
            points = np.vstack([points[:1], cut, points[-1:]])

    return points


def smooth_geodataframe_optimized(
    gdf: gpd.GeoDataFrame, num_iterations: int = 1
) -> gpd.GeoDataFrame:
    """Return a copy of ``gdf`` with its line geometries Chaikin-smoothed.

    Args:
        gdf: Frame with a "geometry" column; it is not modified.
        num_iterations: Number of corner-cutting passes per line.

    Returns:
        A new GeoDataFrame. LineString / LinearRing geometries are replaced by
        smoothed LineStrings and MultiLineStrings are smoothed part by part.
        Missing, empty and non-line geometries are passed through unchanged.
    """
    from shapely.geometry import MultiLineString

    def _smooth(geom):
        # Nothing to smooth for missing or empty geometries.
        if geom is None or geom.is_empty:
            return geom
        if geom.geom_type in ("LineString", "LinearRing"):
            return LineString(
                chaikin_corner_cutting_optimized(
                    np.array(geom.coords), num_iterations=num_iterations
                )
            )
        if geom.geom_type == "MultiLineString":
            return MultiLineString([_smooth(part) for part in geom.geoms])
        # Points, polygons, collections: no single coordinate sequence to smooth.
        return geom

    # Work on a copy so the caller's frame keeps its original geometries.
    smoothed = gdf.copy()
    smoothed["geometry"] = smoothed["geometry"].apply(_smooth)

    return smoothed

In [None]:
# Smooth all the coastline lines with two passes of Chaikin corner cutting,
# then display the result.
lines_gpd = smooth_geodataframe_optimized(clipped_gdf, num_iterations=2)
lines_gpd

In [None]:
# Write the smoothed coastline to the output path (a ".gpkg" file).
lines_gpd.to_file(output_vector)

In [None]:
# Show where the result was written.
output_vector