In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import xarray as xr
import gcsfs
import s3fs

# ============================================================
# Configuration
# ============================================================

# OSN (HRRR data source)
OSN_ENDPOINT_URL = "https://nyu1.osn.mghpcc.org"
OSN_BUCKET = "leap-pangeo-manual"
HACKATHON_PREFIX = "hackathon-2026/"
OSN_ROOT = f"s3://{OSN_BUCKET}/{HACKATHON_PREFIX}"

# Output paths
SCRATCH_BUCKET = "gs://leap-scratch/renriviera"
OUT_PREFIX = f"{SCRATCH_BUCKET}/sfincs_soundview_preproc"
RAIN_OUT_PREFIX = f"{OUT_PREFIX}/forcing/rain_hrrr"

# Year to process
YEAR = 2025

# Soundview ROI (lat/lon WGS84)
ROI_MIN_LON = -73.882
ROI_MAX_LON = -73.842
ROI_MIN_LAT = 40.807
ROI_MAX_LAT = 40.836

# Target CRS (UTM Zone 18N for NYC)
TARGET_CRS = "EPSG:26918"

# CRITICAL: FEWS time format (same as windspeed notebook!)
FEWS_TIME_UNITS = "minutes since 1970-01-01 00:00:00.0 +0000"
RAIN_NODATA = -9999.0

print("Configuration:")
print(f"  OSN_ROOT: {OSN_ROOT}")
print(f"  SCRATCH: {SCRATCH_BUCKET}")
print(f"  RAIN_OUT: {RAIN_OUT_PREFIX}")
print(f"  YEAR: {YEAR}")
print(f"  ROI (lon): {ROI_MIN_LON} to {ROI_MAX_LON}")
print(f"  ROI (lat): {ROI_MIN_LAT} to {ROI_MAX_LAT}")

Configuration:
  OSN_ROOT: s3://leap-pangeo-manual/hackathon-2026/
  SCRATCH: gs://leap-scratch/renriviera
  RAIN_OUT: gs://leap-scratch/renriviera/sfincs_soundview_preproc/forcing/rain_hrrr
  YEAR: 2025
  ROI (lon): -73.882 to -73.842
  ROI (lat): 40.807 to 40.836


In [2]:
# ============================================================
# OSN S3 Filesystem (anonymous)
# ============================================================

fs_osn = s3fs.S3FileSystem(
    anon=True,
    client_kwargs={"endpoint_url": OSN_ENDPOINT_URL},
)

print("✅ Connected to OSN")
print("HRRR available:")
print(fs_osn.ls(f"{OSN_BUCKET}/{HACKATHON_PREFIX}hrrr/tp")[:5])

✅ Connected to OSN
HRRR available:
['leap-pangeo-manual/hackathon-2026/hrrr/tp/hrrrtp2020.zarr', 'leap-pangeo-manual/hackathon-2026/hrrr/tp/hrrrtp2021.zarr', 'leap-pangeo-manual/hackathon-2026/hrrr/tp/hrrrtp2022.zarr', 'leap-pangeo-manual/hackathon-2026/hrrr/tp/hrrrtp2023.zarr', 'leap-pangeo-manual/hackathon-2026/hrrr/tp/hrrrtp2024.zarr']


In [4]:
# ============================================================
# Open HRRR tp (total precipitation) for specified YEAR
# ============================================================

TP_STORE = f"s3://{OSN_BUCKET}/{HACKATHON_PREFIX}hrrr/tp/hrrrtp{YEAR}.zarr"

print(f"✅ Using YEAR: {YEAR}")
print(f"TP_STORE: {TP_STORE}")

# Open with fsspec
mapper = fs_osn.get_mapper(TP_STORE)

try:
    ds = xr.open_zarr(mapper, consolidated=True)
    print("✅ Opened with consolidated=True")
except Exception:
    ds = xr.open_zarr(mapper, consolidated=False)
    print("✅ Opened with consolidated=False")

print("\n--- Dataset ---")
print(ds)
print("\nVariables:", list(ds.data_vars))
print("\nDimensions:", dict(ds.dims))

# Check time coverage
if "time" in ds.coords:
    print(f"\nTime range:")
    print(f"  Start: {ds.time.values[0]}")
    print(f"  End: {ds.time.values[-1]}")
    print(f"  Steps: {len(ds.time)}")

✅ Using YEAR: 2025
TP_STORE: s3://leap-pangeo-manual/hackathon-2026/hrrr/tp/hrrrtp2025.zarr
✅ Opened with consolidated=True

--- Dataset ---
<xarray.Dataset> Size: 61GB
Dimensions:              (time: 8040, y: 1059, x: 1799)
Coordinates:
  * time                 (time) datetime64[ns] 64kB 2025-01-01 ... 2025-12-01...
    gribfile_projection  float64 8B ...
    latitude             (y, x) float64 15MB dask.array<chunksize=(1059, 1799), meta=np.ndarray>
    longitude            (y, x) float64 15MB dask.array<chunksize=(1059, 1799), meta=np.ndarray>
    step                 timedelta64[ns] 8B ...
    surface              float64 8B ...
    valid_time           (time) datetime64[ns] 64kB dask.array<chunksize=(24,), meta=np.ndarray>
Dimensions without coordinates: y, x
Data variables:
    tp                   (time, y, x) float32 61GB dask.array<chunksize=(24, 1059, 1799), meta=np.ndarray>
Attributes:
    GRIB_edition:            2
    GRIB_centre:             kwbc
    GRIB_centreDescriptio

  print("\nDimensions:", dict(ds.dims))


In [5]:
# ============================================================
# Extract precipitation variable + standardize
# ============================================================

# Pick main variable (should be 'tp')
def pick_main_var(ds, prefer_substrings=("tp", "precip", "precipitation")):
    if len(ds.data_vars) == 0:
        raise RuntimeError("Dataset has no data variables")
    
    # Prefer variables that look like precip
    for key in ds.data_vars:
        lk = key.lower()
        if any(s in lk for s in prefer_substrings):
            return key
    return list(ds.data_vars)[0]

tp_var = pick_main_var(ds)
print(f"Selected variable: {tp_var}")

# Extract as DataArray
tp = ds[tp_var]

print("\nPrecipitation variable:")
print(f"  Dimensions: {tp.dims}")
print(f"  Shape: {tp.shape}")
print(f"  Dtype: {tp.dtype}")

# Check coordinates
print(f"\nCoordinates:")
for coord in tp.coords:
    print(f"  {coord}: {tp.coords[coord].shape}")

Selected variable: tp

Precipitation variable:
  Dimensions: ('time', 'y', 'x')
  Shape: (8040, 1059, 1799)
  Dtype: float32

Coordinates:
  gribfile_projection: ()
  latitude: (1059, 1799)
  longitude: (1059, 1799)
  step: ()
  surface: ()
  time: (8040,)
  valid_time: (8040,)


In [16]:
# ============================================================
# Cell 6 (FIXED): Calculate spatial maximum AND create 2D grid
# ============================================================

print("Calculating spatial maximum...")

# Get subset lat/lon
lat_subset = tp["latitude"].values
lon_subset = tp["longitude"].values

if np.nanmax(lon_subset) > 180:
    lon_subset = ((lon_subset + 180) % 360) - 180

# Create precise mask
precise_mask = (
    (lat_subset >= ROI_MIN_LAT) & (lat_subset <= ROI_MAX_LAT) &
    (lon_subset >= ROI_MIN_LON) & (lon_subset <= ROI_MAX_LON)
)

print(f"Exact cells in ROI: {precise_mask.sum()}")

# Mask out non-ROI cells
tp_masked = tp.where(precise_mask)

# Calculate MAXIMUM across space at each timestep
rain_max = tp_masked.max(dim=["y", "x"], skipna=True)

print("\n✅ Computing spatial maximum...")
rain_values = rain_max.compute()

print(f"\nTime series:")
print(f"  Length: {len(rain_values)}")
print(f"  Precip max: {rain_values.max().values:.2f} mm/hr")

# ============================================================
# CRITICAL FIX: Create proper 2D grid for SFINCS
# Tile the maximum value across a minimal 2x2 grid
# ============================================================

print("\n⚠️ Creating 2D grid for HydroMT compatibility...")

# Get center coordinates of ROI
lat_center = (ROI_MIN_LAT + ROI_MAX_LAT) / 2
lon_center = (ROI_MIN_LON + ROI_MAX_LON) / 2

# Transform to UTM
from pyproj import Transformer
transformer = Transformer.from_crs("EPSG:4326", TARGET_CRS, always_xy=True)
x_center, y_center = transformer.transform(lon_center, lat_center)

# Create a small 2x2 grid around center point
# Grid spacing ~5km (SFINCS will interpolate to model grid anyway)
grid_spacing = 5000  # meters

x_coords = np.array([x_center - grid_spacing, x_center + grid_spacing])
y_coords = np.array([y_center - grid_spacing, y_center + grid_spacing])

print(f"Created 2x2 grid centered at ({x_center:.0f}, {y_center:.0f})")
print(f"  Grid spacing: {grid_spacing/1000:.1f} km")

# Tile the maximum precip value to all grid cells
# Shape: (time, y=2, x=2)
rain_2d = np.tile(
    rain_values.values[:, np.newaxis, np.newaxis],  # (time, 1, 1)
    (1, 2, 2)  # Tile to (time, 2, 2)
)

print(f"✅ Tiled precip grid: {rain_2d.shape}")

Calculating spatial maximum...
Exact cells in ROI: 2

✅ Computing spatial maximum...

Time series:
  Length: 8040
  Precip max: 28.50 mm/hr

⚠️ Creating 2D grid for HydroMT compatibility...
Created 2x2 grid centered at (595966, 4519565)
  Grid spacing: 5.0 km
✅ Tiled precip grid: (8040, 2, 2)


In [17]:
# ============================================================
# Convert time to FEWS format (unchanged)
# ============================================================

print("Converting time to FEWS format...")

t0 = np.datetime64("1970-01-01T00:00:00")
t_minutes = ((rain_values["time"].values - t0) / np.timedelta64(1, "m")).astype("int64")

print(f"\n✅ Time converted to FEWS format:")
print(f"  Units: {FEWS_TIME_UNITS}")
print(f"  Range: {t_minutes.min()} to {t_minutes.max()} minutes")

# Verify
first_datetime = np.datetime64("1970-01-01T00:00:00") + np.timedelta64(int(t_minutes[0]), "m")
last_datetime = np.datetime64("1970-01-01T00:00:00") + np.timedelta64(int(t_minutes[-1]), "m")

print(f"\nVerification:")
print(f"  First time: {first_datetime}")
print(f"  Last time: {last_datetime}")

if first_datetime.astype('datetime64[Y]').astype(int) + 1970 != YEAR:
    raise ValueError(f"Time conversion error! First time is not in {YEAR}")

print("✅ Time conversion verified")

Converting time to FEWS format...

✅ Time converted to FEWS format:
  Units: minutes since 1970-01-01 00:00:00.0 +0000
  Range: 28928160 to 29410500 minutes

Verification:
  First time: 2025-01-01T00:00:00
  Last time: 2025-12-01T23:00:00
✅ Time conversion verified


In [18]:
# ============================================================
# Build FEWS-format NetCDF with proper 2D spatial grid
# ============================================================

print("Building FEWS precipitation dataset...")

# Ensure float32 and replace NaN
rain_2d = np.where(np.isfinite(rain_2d), rain_2d, RAIN_NODATA).astype("float32")

# Build xarray Dataset with proper 2D grid
rain_fews = xr.Dataset(
    data_vars={
        "precip": (("time", "y", "x"), rain_2d),
    },
    coords={
        "time": ("time", t_minutes),
        "x": ("x", x_coords),
        "y": ("y", y_coords),
    },
)

# Set attributes
rain_fews["time"].attrs["units"] = FEWS_TIME_UNITS
rain_fews["precip"].attrs.update({
    "long_name": "precipitation",
    "units": "mm/hr",
    "description": "Spatially-uniform hourly precipitation from HRRR (max across ROI)"
})
rain_fews.attrs["crs"] = TARGET_CRS

print("\n✅ FEWS dataset built:")
print(rain_fews)
print(f"\nDimensions: {dict(rain_fews.sizes)}")
print(f"Spatial grid: {len(rain_fews.y)} x {len(rain_fews.x)}")

# Verify it's a proper 2D grid
if rain_fews.sizes["y"] < 2 or rain_fews.sizes["x"] < 2:
    raise ValueError(
        f"Grid too small! Need at least 2x2, got {rain_fews.sizes['y']}x{rain_fews.sizes['x']}"
    )

print("✅ Grid validation passed (2x2 minimum)")

Building FEWS precipitation dataset...

✅ FEWS dataset built:
<xarray.Dataset> Size: 193kB
Dimensions:  (time: 8040, y: 2, x: 2)
Coordinates:
  * time     (time) int64 64kB 28928160 28928220 28928280 ... 29410440 29410500
  * y        (y) float64 16B 4.515e+06 4.525e+06
  * x        (x) float64 16B 5.91e+05 6.01e+05
Data variables:
    precip   (time, y, x) float32 129kB 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0
Attributes:
    crs:      EPSG:26918

Dimensions: {'time': 8040, 'y': 2, 'x': 2}
Spatial grid: 2 x 2
✅ Grid validation passed (2x2 minimum)


In [19]:
# ============================================================
# Write FEWS NetCDF to local temp file
# ============================================================

import tempfile

print("Writing NetCDF locally...")

local_dir = Path(tempfile.mkdtemp(prefix="sfincs_rain_fews_"))
out_local = local_dir / "sfincs_rain_hrrr_soundview_2025.nc"

# Remove _FillValue from attrs (goes in encoding)
if "_FillValue" in rain_fews["precip"].attrs:
    rain_fews["precip"].attrs.pop("_FillValue")

# NetCDF encoding
encoding = {
    "precip": {
        "dtype": "float32",
        "zlib": True,
        "complevel": 4,
        "_FillValue": RAIN_NODATA,
    },
}

rain_fews.to_netcdf(out_local, encoding=encoding)

print(f"✅ Local NetCDF written: {out_local}")
print(f"  Size: {out_local.stat().st_size / (1024*1024):.2f} MB")

Writing NetCDF locally...
✅ Local NetCDF written: /tmp/sfincs_rain_fews_yggwg_vm/sfincs_rain_hrrr_soundview_2025.nc
  Size: 0.08 MB


In [20]:
# ============================================================
# Upload to GCS Scratch
# ============================================================

print("Uploading to GCS...")

fs_gcs = gcsfs.GCSFileSystem()

out_gcs = f"{RAIN_OUT_PREFIX}/sfincs_rain_hrrr_soundview_{YEAR}.nc"

# Remove gs:// prefix for gcsfs
gcs_path_no_scheme = out_gcs.replace("gs://", "")

print(f"Target: {out_gcs}")

fs_gcs.put(str(out_local), gcs_path_no_scheme)

print("✅ Uploaded to GCS")
print(f"\nValidation:")
print(f"  Exists: {fs_gcs.exists(gcs_path_no_scheme)}")

if fs_gcs.exists(gcs_path_no_scheme):
    size_mb = fs_gcs.size(gcs_path_no_scheme) / (1024*1024)
    print(f"  Size: {size_mb:.2f} MB")

Uploading to GCS...
Target: gs://leap-scratch/renriviera/sfincs_soundview_preproc/forcing/rain_hrrr/sfincs_rain_hrrr_soundview_2025.nc
✅ Uploaded to GCS

Validation:
  Exists: True
  Size: 0.08 MB


In [21]:
# ============================================================
# Validate uploaded file matches FEWS requirements
# ============================================================

print("="*60)
print("VALIDATION")
print("="*60)

# Reopen from GCS
try:
    ds_check = xr.open_dataset(out_gcs, engine="netcdf4")
    print("✅ Opened from GCS with netcdf4")
except Exception as e:
    print(f"⚠️ Remote open failed, downloading for validation")
    import tempfile
    local_check = Path(tempfile.mkdtemp()) / "check.nc"
    fs_gcs.get(gcs_path_no_scheme, str(local_check))
    ds_check = xr.open_dataset(local_check)

print("\nDataset:")
print(ds_check)

# Required checks
REQUIRED_VARS = ["precip"]
REQUIRED_DIMS = ("time", "y", "x")

# 1. Variables
missing = [v for v in REQUIRED_VARS if v not in ds_check.data_vars]
if missing:
    raise AssertionError(f"❌ Missing variables: {missing}")
print("✅ Required variables present")

# 2. Dimensions
for v in REQUIRED_VARS:
    if tuple(ds_check[v].dims) != REQUIRED_DIMS:
        raise AssertionError(f"❌ {v} dims {ds_check[v].dims} != {REQUIRED_DIMS}")
print("✅ Dimensions correct: (time, y, x)")

# 3. Time format
if "time" not in ds_check.coords:
    raise AssertionError("❌ Missing time coordinate")

if not np.issubdtype(ds_check["time"].dtype, np.datetime64):
    raise AssertionError(f"❌ time dtype should be datetime64, got {ds_check['time'].dtype}")

# Check time is monotonic
t_vals = ds_check["time"].values
t_diffs = np.diff(t_vals.astype("datetime64[s]"))
if not np.all(t_diffs > np.timedelta64(0, "s")):
    raise AssertionError("❌ time not monotonic")
print("✅ Time is datetime64 and monotonic")

# 4. Spatial coords
for coord in ["x", "y"]:
    if coord not in ds_check.coords:
        raise AssertionError(f"❌ Missing {coord}")
    if ds_check[coord].ndim != 1:
        raise AssertionError(f"❌ {coord} must be 1D")
print("✅ Spatial coordinates valid")

# 5. Data validity
sample = ds_check["precip"].isel(time=slice(0, min(100, len(ds_check.time)))).values
if np.isnan(sample).any():
    raise AssertionError("❌ Contains NaNs (should be filled)")
print("✅ No NaNs in data")

# 6. Value ranges
valid_data = sample[sample != RAIN_NODATA]
if valid_data.size > 0:
    p99 = float(np.quantile(valid_data, 0.99))
    print(f"\nData range check:")
    print(f"  p99: {p99:.2f} mm/hr")
    if p99 > 500:
        print(f"  ⚠️ Very high precipitation (>{p99:.1f} mm/hr)")
    else:
        print(f"  ✅ Values in reasonable range")

# 7. CRS attribute
crs_attr = ds_check.attrs.get("crs", None)
if crs_attr:
    print(f"✅ CRS attribute: {crs_attr}")
else:
    print("⚠️ CRS attribute missing (not fatal)")

print("\n" + "="*60)
print("VALIDATION COMPLETE")
print("="*60)
print(f"\n✅ Rain forcing file ready:")
print(f"   {out_gcs}")
print(f"\n✅ Compatible with HydroMT-SFINCS FEWS format")
print(f"   - Time: {FEWS_TIME_UNITS}")
print(f"   - Variable: precip (mm/hr)")
print(f"   - Dimensions: (time={len(ds_check.time)}, y={len(ds_check.y)}, x={len(ds_check.x)})")

VALIDATION
⚠️ Remote open failed, downloading for validation

Dataset:
<xarray.Dataset> Size: 193kB
Dimensions:  (time: 8040, y: 2, x: 2)
Coordinates:
  * time     (time) datetime64[ns] 64kB 2025-01-01 ... 2025-12-01T23:00:00
  * y        (y) float64 16B 4.515e+06 4.525e+06
  * x        (x) float64 16B 5.91e+05 6.01e+05
Data variables:
    precip   (time, y, x) float32 129kB ...
Attributes:
    crs:      EPSG:26918
✅ Required variables present
✅ Dimensions correct: (time, y, x)
✅ Time is datetime64 and monotonic
✅ Spatial coordinates valid
✅ No NaNs in data

Data range check:
  p99: 1.24 mm/hr
  ✅ Values in reasonable range
✅ CRS attribute: EPSG:26918

VALIDATION COMPLETE

✅ Rain forcing file ready:
   gs://leap-scratch/renriviera/sfincs_soundview_preproc/forcing/rain_hrrr/sfincs_rain_hrrr_soundview_2025.nc

✅ Compatible with HydroMT-SFINCS FEWS format
   - Time: minutes since 1970-01-01 00:00:00.0 +0000
   - Variable: precip (mm/hr)
   - Dimensions: (time=8040, y=2, x=2)
