In [9]:
# ============================================================
# 0) Config
# ============================================================

import os
from pathlib import Path

OSN_ENDPOINT_URL = "https://nyu1.osn.mghpcc.org"
OSN_BUCKET = "leap-pangeo-manual"
HACKATHON_PREFIX = "hackathon-2026/"
OSN_ROOT = f"s3://{OSN_BUCKET}/{HACKATHON_PREFIX}"

HRRR_PREFIX = f"{OSN_ROOT}hrrr/"

# Where to write derived outputs (writable scratch)
SCRATCH_BUCKET = os.environ.get("SCRATCH_BUCKET", "gs://leap-scratch/renriviera")
OUT_PREFIX = f"{SCRATCH_BUCKET}/sfincs_soundview_preproc"
WIND_OUT_PREFIX = f"{OUT_PREFIX}/forcing/wind_hrrr"

print("OSN_ROOT:", OSN_ROOT)
print("HRRR_PREFIX:", HRRR_PREFIX)
print("WIND_OUT_PREFIX:", WIND_OUT_PREFIX)


OSN_ROOT: s3://leap-pangeo-manual/hackathon-2026/
HRRR_PREFIX: s3://leap-pangeo-manual/hackathon-2026/hrrr/
WIND_OUT_PREFIX: gs://leap-scratch/renriviera/sfincs_soundview_preproc/forcing/wind_hrrr


In [10]:
# ============================================================
# 1) Dask cluster
# ============================================================

from dask.distributed import Client

client = None

try:
    from dask_gateway import Gateway
    gw = Gateway()
    cluster = gw.new_cluster()
    cluster.scale(4)  # adjust: 2‚Äì8 workers typical
    client = cluster.get_client()
    print("‚úÖ Using Dask Gateway cluster")
    print(client)
except Exception as e:
    print("Gateway not available (or failed). Falling back to LocalCluster.")
    from dask.distributed import LocalCluster
    cluster = LocalCluster(
        n_workers=2,
        threads_per_worker=2,
        memory_limit="3GB",
        dashboard_address=":8787",
    )
    client = Client(cluster)
    print("‚úÖ Using LocalCluster")
    print(client)


‚úÖ Using Dask Gateway cluster
<Client: 'tls://10.0.34.165:8786' processes=0 threads=0, memory=0 B>


2026-01-16 22:54:08,130 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


In [11]:
# ============================================================
# 2) OSN S3 filesystem (anonymous)
# ============================================================

import s3fs

fs = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={"endpoint_url": OSN_ENDPOINT_URL},
)

# quick sanity check
print("‚úÖ Connected to OSN:", OSN_ENDPOINT_URL)
print("Listing HRRR prefix:")
print(fs.ls(HRRR_PREFIX)[:20])


‚úÖ Connected to OSN: https://nyu1.osn.mghpcc.org
Listing HRRR prefix:
['leap-pangeo-manual/hackathon-2026/hrrr/refc', 'leap-pangeo-manual/hackathon-2026/hrrr/temp2m', 'leap-pangeo-manual/hackathon-2026/hrrr/tp', 'leap-pangeo-manual/hackathon-2026/hrrr/u10m', 'leap-pangeo-manual/hackathon-2026/hrrr/v10m']


In [12]:
# ============================================================
# 3) Open HRRR u10m/v10m Zarr stores for YEAR=2025 only (OSN S3)
# ============================================================

import xarray as xr
import fsspec

YEAR = 2025

U10M_STORE = f"s3://{OSN_BUCKET}/{HACKATHON_PREFIX}hrrr/u10m/hrrru10m{YEAR}.zarr"
V10M_STORE = f"s3://{OSN_BUCKET}/{HACKATHON_PREFIX}hrrr/v10m/hrrrv10m{YEAR}.zarr"

print("‚úÖ Using YEAR:", YEAR)
print("U10M_STORE:", U10M_STORE)
print("V10M_STORE:", V10M_STORE)

# ------------------------------------------------------------
# IMPORTANT: OSN is S3-compatible, not GCS.
# We MUST use an S3 filesystem pointed at the OSN endpoint.
# ------------------------------------------------------------
fs_s3 = fsspec.filesystem(
    "s3",
    anon=True,
    client_kwargs={"endpoint_url": OSN_ENDPOINT_URL},
)

def exists_zarr_s3(store_path: str) -> bool:
    # Zarr v2 marker: .zgroup
    # Zarr v3 marker: zarr.json
    return (
        fs_s3.exists(store_path)
        or fs_s3.exists(store_path.rstrip("/") + "/.zgroup")
        or fs_s3.exists(store_path.rstrip("/") + "/zarr.json")
    )

# ---- Existence check
if not exists_zarr_s3(U10M_STORE):
    raise FileNotFoundError(f"‚ùå u10m store not found: {U10M_STORE}")

if not exists_zarr_s3(V10M_STORE):
    raise FileNotFoundError(f"‚ùå v10m store not found: {V10M_STORE}")

print("‚úÖ Confirmed both 2025 Zarr stores exist on OSN.")

# ---- Open with consolidated fallback
def open_zarr_safely_s3(store_path: str):
    mapper = fs_s3.get_mapper(store_path)
    try:
        return xr.open_zarr(mapper, consolidated=True)
    except Exception:
        return xr.open_zarr(mapper, consolidated=False)

ds_u = open_zarr_safely_s3(U10M_STORE)
ds_v = open_zarr_safely_s3(V10M_STORE)

print("\n--- ds_u ---")
print(ds_u)
print("u vars:", list(ds_u.data_vars)[:30])

print("\n--- ds_v ---")
print(ds_v)
print("v vars:", list(ds_v.data_vars)[:30])

# ---- Quick time coverage sanity check
def time_range(ds):
    if "time" not in ds.coords:
        return None
    t0 = ds["time"].values[0]
    t1 = ds["time"].values[-1]
    n = ds.sizes.get("time", None)
    return t0, t1, n

u_t = time_range(ds_u)
v_t = time_range(ds_v)

print("\n--- Time coverage ---")
print("u10m time:", u_t)
print("v10m time:", v_t)

if u_t and v_t:
    if (u_t[0] != v_t[0]) or (u_t[1] != v_t[1]) or (u_t[2] != v_t[2]):
        print("‚ö†Ô∏è WARNING: u/v time ranges differ (we will intersect later).")
    else:
        print("‚úÖ u/v time ranges match.")





‚úÖ Using YEAR: 2025
U10M_STORE: s3://leap-pangeo-manual/hackathon-2026/hrrr/u10m/hrrru10m2025.zarr
V10M_STORE: s3://leap-pangeo-manual/hackathon-2026/hrrr/v10m/hrrrv10m2025.zarr
‚úÖ Confirmed both 2025 Zarr stores exist on OSN.

--- ds_u ---
<xarray.Dataset> Size: 59GB
Dimensions:              (time: 7800, y: 1059, x: 1799)
Coordinates:
  * time                 (time) datetime64[ns] 62kB 2025-01-01 ... 2025-11-21...
    gribfile_projection  float64 8B ...
    heightAboveGround    float64 8B ...
    latitude             (y, x) float64 15MB dask.array<chunksize=(1059, 1799), meta=np.ndarray>
    longitude            (y, x) float64 15MB dask.array<chunksize=(1059, 1799), meta=np.ndarray>
    step                 timedelta64[ns] 8B ...
    valid_time           (time) datetime64[ns] 62kB dask.array<chunksize=(24,), meta=np.ndarray>
Dimensions without coordinates: y, x
Data variables:
    u10                  (time, y, x) float32 59GB dask.array<chunksize=(24, 1059, 1799), meta=np.ndarray>


In [13]:
# ============================================================
# 4) Extract u10m and v10m variables + standardize names
# ============================================================

import xarray as xr
import numpy as np

def pick_main_var(ds, prefer_substrings=("u10", "u10m", "v10", "v10m", "wind")):
    if len(ds.data_vars) == 0:
        raise RuntimeError("Dataset has no data variables.")
    # Prefer variables that look like wind
    for key in ds.data_vars:
        lk = key.lower()
        if any(s in lk for s in prefer_substrings):
            return key
    return list(ds.data_vars)[0]

u_var = pick_main_var(ds_u, prefer_substrings=("u10", "u10m", "ugrd", "wind"))
v_var = pick_main_var(ds_v, prefer_substrings=("v10", "v10m", "vgrd", "wind"))

print("Selected u var:", u_var)
print("Selected v var:", v_var)

u10 = ds_u[u_var].rename("wind10_u")
v10 = ds_v[v_var].rename("wind10_v")

# Force consistent coordinates (time/x/y)
u10, v10 = xr.align(u10, v10, join="inner")

wind = xr.Dataset({"wind10_u": u10, "wind10_v": v10})
wind["wind10_speed"] = np.sqrt(wind["wind10_u"]**2 + wind["wind10_v"]**2)

print("\n‚úÖ wind dataset:")
print(wind)



Selected u var: u10
Selected v var: v10


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.



‚úÖ wind dataset:
<xarray.Dataset> Size: 178GB
Dimensions:              (y: 1059, x: 1799, time: 7800)
Coordinates:
  * time                 (time) datetime64[ns] 62kB 2025-01-01 ... 2025-11-21...
    gribfile_projection  float64 8B nan
    heightAboveGround    float64 8B 10.0
    latitude             (y, x) float64 15MB 21.14 21.15 21.15 ... 47.85 47.84
    longitude            (y, x) float64 15MB 237.3 237.3 237.3 ... 299.0 299.1
    step                 timedelta64[ns] 8B 00:00:00
    valid_time           (time) datetime64[ns] 62kB 2025-01-01 ... 2025-11-21...
Dimensions without coordinates: y, x
Data variables:
    wind10_u             (time, y, x) float32 59GB dask.array<chunksize=(24, 1059, 1799), meta=np.ndarray>
    wind10_v             (time, y, x) float32 59GB dask.array<chunksize=(24, 1059, 1799), meta=np.ndarray>
    wind10_speed         (time, y, x) float32 59GB dask.array<chunksize=(24, 1059, 1799), meta=np.ndarray>


In [14]:
# ============================================================
# 5) Define ROI (Soundview) + subset HRRR winds using 2D lat/lon
#     Works even when latitude/longitude are (y,x) 2D arrays.
# ============================================================

import numpy as np
import xarray as xr

print("Wind dims:", wind.dims)
print("Wind coords:", list(wind.coords))

# ------------------------------------------------------------
# (A) Define your ROI in WGS84 (lon/lat)
#     Soundview Bronx (approx bbox). You can adjust these later.
# ------------------------------------------------------------
ROI_MIN_LON = -73.882
ROI_MAX_LON = -73.842
ROI_MIN_LAT = 40.807
ROI_MAX_LAT = 40.836

print("\nROI WGS84 bbox:")
print("  lon:", (ROI_MIN_LON, ROI_MAX_LON))
print("  lat:", (ROI_MIN_LAT, ROI_MAX_LAT))

# ------------------------------------------------------------
# (B) Grab 2D lat/lon from dataset
# ------------------------------------------------------------
if "latitude" not in wind.coords or "longitude" not in wind.coords:
    raise RuntimeError("Expected wind coords 'latitude' and 'longitude' but did not find them.")

lat2d = wind["latitude"]
lon2d = wind["longitude"]

# Ensure they are (y,x)
print("\nLatitude shape:", lat2d.shape, "Longitude shape:", lon2d.shape)

# ------------------------------------------------------------
# (C) Normalize longitudes if stored as 0..360
# ------------------------------------------------------------
lon_vals = lon2d.values
if np.nanmax(lon_vals) > 180:
    print("Detected 0..360 longitude convention -> converting to -180..180")
    lon2d_fixed = ((lon2d + 180) % 360) - 180
else:
    lon2d_fixed = lon2d

# ------------------------------------------------------------
# (D) Build ROI mask on the 2D grid
# ------------------------------------------------------------
mask = (
    (lat2d >= ROI_MIN_LAT) & (lat2d <= ROI_MAX_LAT) &
    (lon2d_fixed >= ROI_MIN_LON) & (lon2d_fixed <= ROI_MAX_LON)
)

mask_count = int(mask.sum().values) if hasattr(mask.sum().values, "item") else int(mask.sum().values)
print("\nMask pixels inside ROI:", mask_count)

if mask_count == 0:
    # Helpful debugging: print dataset geographic extent
    lat_min = float(np.nanmin(lat2d.values))
    lat_max = float(np.nanmax(lat2d.values))
    lon_min = float(np.nanmin(lon2d_fixed.values))
    lon_max = float(np.nanmax(lon2d_fixed.values))
    raise RuntimeError(
        "ROI mask returned 0 pixels.\n"
        f"Wind lat range: {lat_min:.4f} .. {lat_max:.4f}\n"
        f"Wind lon range: {lon_min:.4f} .. {lon_max:.4f}\n"
        "Your ROI bbox is outside the dataset coverage OR lon convention mismatch."
    )

# ------------------------------------------------------------
# (E) Convert mask -> bounding box indices (y_min..y_max, x_min..x_max)
# ------------------------------------------------------------
yy, xx = np.where(mask.values)

y0, y1 = int(yy.min()), int(yy.max())
x0, x1 = int(xx.min()), int(xx.max())

# Add a small pad so we don't clip tightly
PAD = 4
y0 = max(0, y0 - PAD)
x0 = max(0, x0 - PAD)
y1 = min(wind.sizes["y"] - 1, y1 + PAD)
x1 = min(wind.sizes["x"] - 1, x1 + PAD)

print("\nSubset index window:")
print("  y:", (y0, y1), "=> height:", (y1 - y0 + 1))
print("  x:", (x0, x1), "=> width :", (x1 - x0 + 1))

# ------------------------------------------------------------
# (F) Subset wind by y/x index window
# ------------------------------------------------------------
wind_roi = wind.isel(y=slice(y0, y1 + 1), x=slice(x0, x1 + 1))

print("\n‚úÖ Wind subset done.")
print("Subset dims:", wind_roi.dims)

# Replace wind with ROI subset for the rest of notebook
wind = wind_roi

# ============================================================
# 5.9) FORCE OPEN HRRR u10m/v10m from YEAR=2025 stores (NO STALE DS)
# ============================================================

import xarray as xr
import numpy as np
import s3fs

YEAR = 2025

# --- These MUST be correct ---
# Example OSN_ROOT: "s3://leap-pangeo-manual/hackathon-2026/"
# (Make sure your OSN_ROOT ends with "/")
if "OSN_ROOT" not in globals():
    raise RuntimeError("‚ùå OSN_ROOT not defined. It should be like: s3://leap-pangeo-manual/hackathon-2026/")

U10M_STORE = f"{OSN_ROOT}hrrr/u10m/hrrru10m{YEAR}.zarr"
V10M_STORE = f"{OSN_ROOT}hrrr/v10m/hrrrv10m{YEAR}.zarr"

print("‚úÖ Using YEAR:", YEAR)
print("U10M_STORE:", U10M_STORE)
print("V10M_STORE:", V10M_STORE)

# ------------------------------------------------------------
# 0) DELETE stale variables if they exist
# ------------------------------------------------------------
for stale in ["ds_u", "ds_v", "ds_u_sub", "ds_v_sub", "ds_u_roi", "ds_v_roi", "u", "v", "wind_fews"]:
    if stale in globals():
        del globals()[stale]
        print("üßπ deleted stale:", stale)

# ------------------------------------------------------------
# 1) Open via s3fs (important: DO NOT use gcsfs here)
# ------------------------------------------------------------
fs_s3 = s3fs.S3FileSystem(anon=True)

def open_zarr_s3(store: str) -> xr.Dataset:
    mapper = fs_s3.get_mapper(store)
    try:
        return xr.open_zarr(mapper, consolidated=True)
    except Exception:
        return xr.open_zarr(mapper, consolidated=False)

ds_u = open_zarr_s3(U10M_STORE)
ds_v = open_zarr_s3(V10M_STORE)

print("\n‚úÖ ds_u opened:", list(ds_u.data_vars))
print("‚úÖ ds_v opened:", list(ds_v.data_vars))

# ------------------------------------------------------------
# 2) Verify time really belongs to 2025
# ------------------------------------------------------------
t0_u = ds_u["time"].values[0]
t1_u = ds_u["time"].values[-1]
t0_v = ds_v["time"].values[0]
t1_v = ds_v["time"].values[-1]

print("\n--- TIME CHECK ---")
print("u10 time:", t0_u, "->", t1_u)
print("v10 time:", t0_v, "->", t1_v)

ymin = int(np.min(ds_u["time"].dt.year.values))
ymax = int(np.max(ds_u["time"].dt.year.values))
print("u10 year span:", ymin, "->", ymax)

if ymin != YEAR and ymax != YEAR:
    raise RuntimeError(
        f"‚ùå ds_u is NOT from {YEAR}.\n"
        f"Got year span: {ymin}->{ymax}\n"
        "This indicates ds_u is not truly using the 2025 store."
    )

print("\n‚úÖ SUCCESS: u10m/v10m are truly YEAR=2025.")




Wind coords: ['gribfile_projection', 'heightAboveGround', 'latitude', 'longitude', 'step', 'time', 'valid_time']

ROI WGS84 bbox:
  lon: (-73.882, -73.842)
  lat: (40.807, 40.836)

Latitude shape: (1059, 1799) Longitude shape: (1059, 1799)
Detected 0..360 longitude convention -> converting to -180..180

Mask pixels inside ROI: 2

Subset index window:
  y: (696, 704) => height: 9
  x: (1551, 1560) => width : 10

‚úÖ Wind subset done.
‚úÖ Using YEAR: 2025
U10M_STORE: s3://leap-pangeo-manual/hackathon-2026/hrrr/u10m/hrrru10m2025.zarr
V10M_STORE: s3://leap-pangeo-manual/hackathon-2026/hrrr/v10m/hrrrv10m2025.zarr
üßπ deleted stale: ds_u
üßπ deleted stale: ds_v
üßπ deleted stale: u
üßπ deleted stale: v
üßπ deleted stale: wind_fews


GroupNotFoundError: No group found in store <fsspec.mapping.FSMap object at 0x7dbc7f36f410> at path ''

In [None]:
# ============================================================
# Cell 6 (FIXED): Force matching u10m/v10m to a specific YEAR
# ============================================================

import s3fs
import xarray as xr
import numpy as np

YEAR = 2025  # <-- force this year only

OSN_ENDPOINT_URL = "https://nyu1.osn.mghpcc.org"
OSN_BUCKET = "leap-pangeo-manual"
HACKATHON_PREFIX = "hackathon-2026/"

U10M_STORE = f"s3://{OSN_BUCKET}/{HACKATHON_PREFIX}hrrr/u10m/hrrru10m{YEAR}.zarr"
V10M_STORE = f"s3://{OSN_BUCKET}/{HACKATHON_PREFIX}hrrr/v10m/hrrrv10m{YEAR}.zarr"

fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": OSN_ENDPOINT_URL}, anon=True)

def open_zarr_store(store_path):
    mapper = fs.get_mapper(store_path)
    return xr.open_zarr(mapper, consolidated=False)

print("‚úÖ Using YEAR:", YEAR)
print("U10M_STORE:", U10M_STORE)
print("V10M_STORE:", V10M_STORE)

ds_u = open_zarr_store(U10M_STORE)
ds_v = open_zarr_store(V10M_STORE)

print("\n--- ds_u ---")
print(ds_u)
print("u vars:", list(ds_u.data_vars))

print("\n--- ds_v ---")
print(ds_v)
print("v vars:", list(ds_v.data_vars))

print("\n‚úÖ Time ranges:")
print("u:", ds_u.time.values[0], "->", ds_u.time.values[-1])
print("v:", ds_v.time.values[0], "->", ds_v.time.values[-1])



In [None]:
# ============================================================
# 6) Subset u10/v10 to ROI + Build FEWS wind forcing dataset
#    ‚úÖ FORCE using ds_u/ds_v opened for YEAR=2025 (no stale globals)
# ============================================================

import numpy as np
import xarray as xr
from pyproj import Transformer

FEWS_TIME_UNITS = "minutes since 1970-01-01 00:00:00.0 +0000"
WIND_NODATA = -9999.0
TARGET_CRS = "EPSG:26918"   # UTM18N (matches your DEM)

YEAR = 2025

# ---- ROI bbox (WGS84) ----
ROI_LON_MIN, ROI_LON_MAX = -73.882, -73.842
ROI_LAT_MIN, ROI_LAT_MAX =  40.807,  40.836

# ------------------------------------------------------------
# 0) FORCE pick u/v from ds_u / ds_v ONLY
# ------------------------------------------------------------
if "ds_u" not in globals() or "ds_v" not in globals():
    raise RuntimeError("‚ùå ds_u / ds_v not found. Run the HRRR open-zarr cell first.")

def pick_var(ds: xr.Dataset, preferred=("u10", "v10")) -> xr.DataArray:
    for vn in preferred:
        if vn in ds.data_vars:
            return ds[vn]
    return ds[list(ds.data_vars)[0]]

u = pick_var(ds_u, preferred=("u10",))
v = pick_var(ds_v, preferred=("v10",))

print("‚úÖ picked u:", u.name, "| v:", v.name)
print("Raw u dims:", u.dims, "shape:", u.shape)
print("Raw v dims:", v.dims, "shape:", v.shape)

# ------------------------------------------------------------
# 1) Drop nondim coords that break merges (valid_time/step/etc.)
# ------------------------------------------------------------
def drop_nondim_coords(da: xr.DataArray) -> xr.DataArray:
    drop = [c for c in da.coords if c not in da.dims]
    return da.drop_vars(drop, errors="ignore")

u = drop_nondim_coords(u)
v = drop_nondim_coords(v)

# ------------------------------------------------------------
# 2) Force dim order to (time, y, x)
# ------------------------------------------------------------
def to_time_y_x(da: xr.DataArray) -> xr.DataArray:
    needed = ("time", "y", "x")
    if not set(needed).issubset(set(da.dims)):
        raise RuntimeError(f"Expected dims {needed}, got {da.dims}")
    return da.transpose("time", "y", "x")

u = to_time_y_x(u)
v = to_time_y_x(v)

# ------------------------------------------------------------
# 3) Align time by intersection
# ------------------------------------------------------------
t_u = np.asarray(u["time"].values)
t_v = np.asarray(v["time"].values)
t_common = np.intersect1d(t_u, t_v)

if t_common.size == 0:
    raise RuntimeError("‚ùå No overlapping timestamps between u and v after cleanup.")

u = u.sel(time=t_common)
v = v.sel(time=t_common)

print("‚úÖ common time len:", u.sizes["time"])
print("‚úÖ common time first/last:", u.time.values[0], "->", u.time.values[-1])

# ------------------------------------------------------------
# 4) HARD ASSERT: must be YEAR=2025
# ------------------------------------------------------------
year_min = int(np.min(u["time"].dt.year.values))
year_max = int(np.max(u["time"].dt.year.values))
print(f"‚úÖ Wind time span years: {year_min} -> {year_max}")

if year_max < YEAR or year_min > YEAR:
    raise RuntimeError(
        f"‚ùå Wind data is NOT in {YEAR}.\n"
        f"Found years: {year_min} -> {year_max}\n"
        "This means you are NOT actually using the 2025 HRRR store."
    )

# ------------------------------------------------------------
# 5) ROI mask via 2D latitude/longitude
# ------------------------------------------------------------
lat2d = None
lon2d = None

# HRRR stores usually have 2D latitude/longitude coords
for cand_ds in [ds_u, ds_v]:
    if ("latitude" in cand_ds.coords) and ("longitude" in cand_ds.coords):
        lat2d = np.asarray(cand_ds["latitude"].values)
        lon2d = np.asarray(cand_ds["longitude"].values)
        break

if lat2d is None or lon2d is None:
    raise RuntimeError("‚ùå Could not find latitude/longitude coords in ds_u or ds_v.")

# Convert 0..360 -> -180..180 if needed
if np.nanmax(lon2d) > 180:
    lon2d = ((lon2d + 180) % 360) - 180

inside = (
    (lon2d >= ROI_LON_MIN) & (lon2d <= ROI_LON_MAX) &
    (lat2d >= ROI_LAT_MIN) & (lat2d <= ROI_LAT_MAX)
)

n_inside = int(np.sum(inside))
print("Mask pixels inside ROI:", n_inside)
if n_inside == 0:
    raise RuntimeError("‚ùå ROI mask selected 0 pixels. Check bbox/coords.")

ys, xs = np.where(inside)
y0, y1 = int(ys.min()), int(ys.max())
x0, x1 = int(xs.min()), int(xs.max())

print("Subset index window:")
print(f"  y: ({y0}, {y1}) -> height: {y1-y0+1}")
print(f"  x: ({x0}, {x1}) -> width : {x1-x0+1}")

u = u.isel(y=slice(y0, y1 + 1), x=slice(x0, x1 + 1))
v = v.isel(y=slice(y0, y1 + 1), x=slice(x0, x1 + 1))

lat_roi = lat2d[y0:y1 + 1, x0:x1 + 1]
lon_roi = lon2d[y0:y1 + 1, x0:x1 + 1]

print("‚úÖ u subset shape:", u.shape, "| v subset shape:", v.shape)

# ------------------------------------------------------------
# 6) Project ROI lat/lon -> UTM x/y centers
# ------------------------------------------------------------
transformer = Transformer.from_crs("EPSG:4326", TARGET_CRS, always_xy=True)
x2d, y2d = transformer.transform(lon_roi, lat_roi)

x2d = np.asarray(x2d, dtype="float64")
y2d = np.asarray(y2d, dtype="float64")

x_1d = np.nanmean(x2d, axis=0)
y_1d = np.nanmean(y2d, axis=1)

# Ensure x/y monotonic increasing
if np.any(np.diff(x_1d) < 0):
    x_1d = x_1d[::-1]
    u = u.isel(x=slice(None, None, -1))
    v = v.isel(x=slice(None, None, -1))

if np.any(np.diff(y_1d) < 0):
    y_1d = y_1d[::-1]
    u = u.isel(y=slice(None, None, -1))
    v = v.isel(y=slice(None, None, -1))

print("‚úÖ final x/y sizes:", len(x_1d), len(y_1d))

# ------------------------------------------------------------
# 7) Build FEWS dataset (amu/amv) with datetime time axis
# ------------------------------------------------------------
amu = u.astype("float32").where(np.isfinite(u), WIND_NODATA)
amv = v.astype("float32").where(np.isfinite(v), WIND_NODATA)

wind_fews = xr.Dataset(
    data_vars={
        "amu": (("time", "y", "x"), amu.values),
        "amv": (("time", "y", "x"), amv.values),
    },
    coords={
        "time": ("time", u["time"].values),   # ‚úÖ keep as datetime64
        "x": ("x", x_1d.astype("float64")),
        "y": ("y", y_1d.astype("float64")),
    },
)

wind_fews["amu"].attrs.update({"long_name": "x_wind", "units": "m s-1", "_FillValue": WIND_NODATA})
wind_fews["amv"].attrs.update({"long_name": "y_wind", "units": "m s-1", "_FillValue": WIND_NODATA})
wind_fews.attrs["crs"] = TARGET_CRS

print("\n‚úÖ FEWS wind dataset built:")
print(wind_fews)
print("Time first/last:", wind_fews.time.values[0], "->", wind_fews.time.values[-1])


In [None]:
# ============================================================
# 7) Write FEWS netamuamvfile.nc + upload to SCRATCH_BUCKET via gcsfs
# ============================================================

import os
import tempfile
from pathlib import Path

import gcsfs

# ---- Use your preferred scratch bucket location
SCRATCH_BUCKET = os.environ.get("SCRATCH_BUCKET", "gs://leap-scratch/renriviera")
print("‚úÖ Using SCRATCH_BUCKET:", SCRATCH_BUCKET)

# ---- Where inside scratch you want the file
# We'll keep your OUT_PREFIX structure but rooted at SCRATCH_BUCKET
# Example final path:
# gs://leap-scratch/renriviera/sfincs_soundview_preproc/forcing/wind/...
out_prefix = f"{OUT_PREFIX}/forcing/wind"
out_gcs = f"{out_prefix}/sfincs_netamuamv_hrrr_u10v10_soundview_2025.nc"

# If OUT_PREFIX is already absolute gs://..., override to use SCRATCH_BUCKET explicitly:
if out_gcs.startswith("gs://"):
    # Make path relative after your scratch root if needed
    # If your OUT_PREFIX already starts with SCRATCH_BUCKET, keep it
    if not out_gcs.startswith(SCRATCH_BUCKET):
        # fallback: store under SCRATCH_BUCKET/forcing/wind/
        out_gcs = f"{SCRATCH_BUCKET}/forcing/wind/sfincs_netamuamv_hrrr_u10v10_soundview_2025.nc"

print("üìå Target scratch path:", out_gcs)

# ---- Write locally
local_dir = Path(tempfile.mkdtemp(prefix="sfincs_wind_fews_"))
out_local = local_dir / "netamuamvfile.nc"

print("Writing local:", out_local)

# IMPORTANT: remove _FillValue from attrs if you also set it in encoding
for var in ["amu", "amv"]:
    if "_FillValue" in wind_fews[var].attrs:
        wind_fews[var].attrs.pop("_FillValue", None)

# NetCDF encoding
WIND_NODATA = float(WIND_NODATA)
encoding = {
    "amu": {"dtype": "float32", "zlib": True, "complevel": 4, "_FillValue": WIND_NODATA},
    "amv": {"dtype": "float32", "zlib": True, "complevel": 4, "_FillValue": WIND_NODATA},
}

f = wind_fews.to_netcdf(out_local, encoding=encoding)
print("@@@@@@", f)
print("‚úÖ Local netcdf written:", out_local)

# ---- Upload with gcsfs (no gcloud/gsutil)
fs_gcs = gcsfs.GCSFileSystem(token="cloud")

# Convert gs://bucket/path -> bucket/path for gcsfs
assert out_gcs.startswith("gs://")
gcs_path_no_scheme = out_gcs.replace("gs://", "", 1)

print("Uploading via gcsfs ->", out_gcs)
fs_gcs.put(str(out_local), gcs_path_no_scheme)
print("‚úÖ Uploaded netamuamvfile to:", out_gcs)

# ---- Quick existence check
print("Exists on GCS:", fs_gcs.exists(gcs_path_no_scheme))


In [None]:
# ============================================================
# 8) Validate uploaded FEWS netamuamvfile.nc for HydroMT-SFINCS
#    Checks: existence, vars, dims, dtype, nodata, monotonic time,
#            CRS attrs, finite values, and basic range sanity.
# ============================================================

import os
import numpy as np
import xarray as xr
import gcsfs

SCRATCH_BUCKET = os.environ.get("SCRATCH_BUCKET", "gs://leap-scratch/renriviera")
print("‚úÖ Using SCRATCH_BUCKET:", SCRATCH_BUCKET)

# ---- Point to your uploaded file
# If you already have out_gcs from the previous cell, this will use it.
# Otherwise set it explicitly here:
try:
    OUT_NETCDF = out_gcs
except NameError:
    OUT_NETCDF = f"{SCRATCH_BUCKET}/forcing/wind/sfincs_netamuamv_hrrr_u10v10_soundview_2025.nc"

print("üìå Validating:", OUT_NETCDF)

# ---- Existence check
fs = gcsfs.GCSFileSystem(token="cloud")
gcs_path_no_scheme = OUT_NETCDF.replace("gs://", "", 1)
if not fs.exists(gcs_path_no_scheme):
    raise FileNotFoundError(f"‚ùå Not found on GCS: {OUT_NETCDF}")

print("‚úÖ File exists on GCS")

# ---- Open (gcsfs -> fsspec) without downloading
# (Works well for NetCDF4; if it fails, we fallback to caching locally.)
try:
    ds = xr.open_dataset(OUT_NETCDF, engine="netcdf4")
    print("‚úÖ Opened remotely with netcdf4 engine")
except Exception as e:
    print("‚ö†Ô∏è Remote open failed, caching locally. Reason:", type(e).__name__, "-", str(e)[:200])
    import tempfile
    from pathlib import Path

    local_dir = Path(tempfile.mkdtemp(prefix="sfincs_wind_validate_"))
    local_nc = local_dir / "netamuamvfile.nc"
    fs.get(gcs_path_no_scheme, str(local_nc))
    print("‚úÖ Downloaded to:", local_nc)
    ds = xr.open_dataset(local_nc, engine="netcdf4")
    print("‚úÖ Opened locally with netcdf4 engine")

print("\n--- Dataset summary ---")
print(ds)

# ============================================================
# Required structure for SFINCS FEWS wind forcing
# ============================================================

REQUIRED_VARS = ["amu", "amv"]
REQUIRED_DIMS = ("time", "y", "x")
EXPECTED_FILL = -9999.0

# ---- 1) Required variables exist
missing_vars = [v for v in REQUIRED_VARS if v not in ds.data_vars]
if missing_vars:
    raise AssertionError(f"‚ùå Missing required vars: {missing_vars}")
print("‚úÖ Required vars present:", REQUIRED_VARS)

# ---- 2) Each variable has dims (time,y,x)
for v in REQUIRED_VARS:
    if tuple(ds[v].dims) != REQUIRED_DIMS:
        raise AssertionError(f"‚ùå {v} dims {ds[v].dims} != {REQUIRED_DIMS}")
print("‚úÖ Variable dimensions are correct:", REQUIRED_DIMS)

# ---- 3) Dtypes are numeric + float-ish
for v in REQUIRED_VARS:
    if not np.issubdtype(ds[v].dtype, np.floating):
        raise AssertionError(f"‚ùå {v} dtype {ds[v].dtype} is not float")
print("‚úÖ Variable dtypes are float")

# ---- 4) Time coordinate validity
if "time" not in ds.coords:
    raise AssertionError("‚ùå Missing time coordinate")
if ds["time"].size < 2:
    raise AssertionError("‚ùå time coord too short")
if not np.issubdtype(ds["time"].dtype, np.datetime64):
    raise AssertionError(f"‚ùå time dtype should be datetime64, got {ds['time'].dtype}")

t = ds["time"].values
if not np.all(np.diff(t).astype("timedelta64[s]") > np.timedelta64(0, "s")):
    raise AssertionError("‚ùå time is not strictly increasing")
print("‚úÖ time is datetime64 and strictly increasing")

# ---- 5) Spatial coordinates exist and are 1D
for coord in ["x", "y"]:
    if coord not in ds.coords:
        raise AssertionError(f"‚ùå Missing coord: {coord}")
    if ds[coord].ndim != 1:
        raise AssertionError(f"‚ùå {coord} must be 1D, got ndim={ds[coord].ndim}")
print("‚úÖ x/y are present and 1D")

# ---- 6) FillValue / missing data check
def get_fillvalue(da):
    # Prefer encoding _FillValue, fallback to attrs
    fv = da.encoding.get("_FillValue", None)
    if fv is None:
        fv = da.attrs.get("_FillValue", None)
    return fv

for v in REQUIRED_VARS:
    fv = get_fillvalue(ds[v])
    if fv is None:
        print(f"‚ö†Ô∏è {v}: no _FillValue found in encoding/attrs (not always fatal)")
    else:
        if not np.isclose(float(fv), float(EXPECTED_FILL)):
            raise AssertionError(f"‚ùå {v}: _FillValue={fv} != expected {EXPECTED_FILL}")
        print(f"‚úÖ {v}: _FillValue OK ({fv})")

# ---- 7) Check for NaNs (should typically be filled, not NaN)
for v in REQUIRED_VARS:
    # sample a small slice to avoid loading everything
    sample = ds[v].isel(time=slice(0, min(48, ds.dims["time"]))).values
    if np.isnan(sample).any():
        raise AssertionError(f"‚ùå {v}: contains NaNs (should be filled to nodata={EXPECTED_FILL})")
print("‚úÖ No NaNs detected in early time sample")

# ---- 8) Basic range sanity (wind in m/s, very broad allowed)
# HRRR winds can spike but this catches unit mistakes like km/h or knots
for v in REQUIRED_VARS:
    sample = ds[v].isel(time=slice(0, min(168, ds.dims["time"]))).values  # 1 week
    # ignore fillvalues
    sample = sample[np.isfinite(sample)]
    sample = sample[sample != EXPECTED_FILL]
    if sample.size == 0:
        raise AssertionError(f"‚ùå {v}: no valid data found after removing fillvalues")
    p99 = float(np.quantile(sample, 0.99))
    p01 = float(np.quantile(sample, 0.01))
    print(f"Sanity {v}: p01={p01:.2f}, p99={p99:.2f} (m/s)")
    if abs(p99) > 80 or abs(p01) > 80:
        raise AssertionError(f"‚ùå {v}: suspicious wind magnitude (>80 m/s). Units wrong?")
print("‚úÖ Wind magnitude sanity checks passed")

# ---- 9) Optional CRS attribute (nice to have)
crs_attr = ds.attrs.get("crs", None)
if crs_attr is None:
    print("‚ö†Ô∏è Dataset attribute 'crs' missing (not always fatal).")
else:
    print("‚úÖ Dataset CRS attr:", crs_attr)

# ---- 10) Final verdict
print("\n‚úÖ FEWS netamuamvfile.nc looks HydroMT-SFINCS compatible.")
print("   - Vars: amu/amv")
print("   - Dims: (time,y,x)")
print("   - time is monotonic datetime64")
print("   - x/y are 1D coords")
print("   - FillValue handled")
