This notebook contains the methodology for: 
 1. choosing the best IOC candidates for storm surge validation purposes
 2. extracting STOFS2D data

A later edit of this notebook will also cover: 

 3. the data availability
 4. the data quality 
of the IOC candidates

In [None]:
import geopandas as gp
import pandas as pd
import searvey
from datetime import datetime
import numpy as np
import sklearn.neighbors
import xarray as xr
import hvplot.pandas
import os

We will extract data from the STOFS2D model output, focusing on the point locations exported by the model. 

The files have the following format: `stofs_2d_glo.tCCz.points.{cwl,htp,swl}.nc` for "Six-minute station time series water level (m, MSL) forecast files at verification sites"
 * `swl` is storm surge only
 * `htp` is astronomical tide only
 * `cwl` is combined water level (`swl` + `htp`)

More information can be found on this page: https://noaa-gestofs-pds.s3.amazonaws.com/README.html

In [None]:
# A file available at the following address references all stofs 2D stations:

def get_stofs(
    url: str = "https://polar.ncep.noaa.gov/stofs/data/stofs_2d_glo_elev_stat_v2_1_0",
) -> pd.DataFrame:
    """
    Read the STOFS2D station list and return one row per station.

    The file is split on runs of tabs or on `!` into at most 6 raw columns.
    The first two become `lon` and `lat`; the remaining ones are free text
    that is joined back together and split into:

    - `ID`: the first three whitespace-separated tokens of that text
    - `Info`: whatever text follows those three tokens

    Parameters:
        url: location of the station file (URL or local path). Defaults to the
            v2.1.0 global station list published by NCEP.

    Returns:
        A DataFrame with the columns `lon`, `lat`, `Info`, `ID` (in that order).
    """
    mycols = [str(i) for i in range(6)]  # the file is parsed into at most 6 raw columns
    raw = pd.read_csv(
        url,
        names=mycols,
        sep="\t+|!",
        header=None,
        skiprows=1,
        # A regex separator is only supported by the Python parser; requesting it
        # explicitly avoids the ParserWarning emitted by the silent fallback.
        engine="python",
    )

    # Join the free-text columns, skipping cells that are missing (None/NaN) or empty.
    texts = [
        " ".join(str(value) for value in values if pd.notna(value) and value != "")
        for values in raw[mycols[2:]].itertuples(index=False, name=None)
    ]

    ids = []
    infos = []
    for text in texts:
        # Split off the first three tokens positionally instead of using
        # `str.replace`: this also works when the raw text has repeated spaces
        # and never removes a later repetition of the ID inside the description.
        tokens = text.split(None, 3)
        ids.append(" ".join(tokens[:3]))
        infos.append(tokens[3].strip() if len(tokens) > 3 else "")

    return (
        raw[["0", "1"]]
        .rename(columns={"0": "lon", "1": "lat"})
        .assign(Info=infos, ID=ids)
    )

# Load the STOFS2D station table once; the bare name on the last line displays it as the cell output.
stofs = get_stofs()
stofs

A caveat is that the 1D output files evolve over time: 

In [None]:
# NOTE: these three sample files live in the "stofs2d/" folder that is filled by the
# download cell further down in this notebook, so that cell must have been run once.
stofs1 = xr.open_dataset("stofs2d/20220912_stofs_2d_glo.t18z.points.swl.nc")
stofs2 = xr.open_dataset("stofs2d/20231010_stofs_2d_glo.t18z.points.swl.nc")
stofs3 = xr.open_dataset("stofs2d/20241229_stofs_2d_glo.t12z.points.swl.nc")


def _station_ids(ds):
    """Return, for every station name in `ds`, its first three whitespace-separated tokens.

    The names are stored as bytes, so they are decoded first; the result uses
    the same normalisation as the `ID` column of the `stofs` table.
    """
    return [" ".join(name.decode("utf-8").strip().split()[:3]) for name in ds.station_name.values]


# Rows of the reference station table that are present in each sample file
stofs_2022 = stofs[stofs.ID.isin(_station_ids(stofs1))]
stofs_2023 = stofs[stofs.ID.isin(_station_ids(stofs2))]
stofs_2024 = stofs[stofs.ID.isin(_station_ids(stofs3))]

# Show the three station counts together (only the last expression of a cell is displayed)
len(stofs_2022), len(stofs_2023), len(stofs_2024)

Luckily, the new stations were appended at the end of the file, so it will be easier to concatenate data across all the files.

In [None]:
# Compare the first rows of the three station tables. Both results are returned
# as a tuple because only the last expression of a cell is displayed.
n_common = 557  # number of leading rows compared across the yearly tables
(
    stofs_2022[:n_common].equals(stofs_2023[:n_common]),
    stofs_2022[:n_common].equals(stofs_2024[:n_common]),
)

We need to compare the modelled storm surge with observations. We use the IOC tide stations.

In [32]:
def get_meta() -> pd.DataFrame:
    """
    Build the IOC station metadata table.

    Two sources are combined on `ioc_code`:

    - the table returned by `searvey.get_ioc_stations()`, without its own
      `lon`/`lat` columns
    - the `stationlist` service of ioc-sealevelmonitoring.org, from which only
      `Code` (renamed `ioc_code`) and `Lon`/`Lat` (renamed `lon`/`lat`) are kept;
      the lowercase `lon`/`lat` columns of that service are discarded

    The merge is an inner join, so a station missing from either source is
    dropped. The `geometry` column is removed from the result, which is why
    the function is annotated as returning a plain `pandas.DataFrame`.
    """
    stations_web = searvey.get_ioc_stations().drop(columns=["lon", "lat"])

    stations_api = pd.read_json(
        "http://www.ioc-sealevelmonitoring.org/service.php?query=stationlist&showall=all"
    )
    stations_api = (
        stations_api
        .drop_duplicates()
        .drop(columns=["lon", "lat"])
        .rename(columns={"Code": "ioc_code", "Lon": "lon", "Lat": "lat"})
    )
    # One row per (code, lon, lat) so that the merge does not multiply stations needlessly
    api_coords = stations_api[["ioc_code", "lon", "lat"]].drop_duplicates()

    merged = pd.merge(stations_web, api_coords, on=["ioc_code"])
    return merged.drop(columns=["geometry"])
# IOC station metadata (one row per station code); used below for plotting and station selection
ioc_ = get_meta()

We have already established a database of clean IOC data between 2022 and 2023; we'll use it as a reference: 

In [35]:
IOC_CLEANUP = "ioc_cleanup_2023.csv"
# Load the reference table and align its column names with the ones used by the IOC metadata
ioc_cleanup = pd.read_csv(IOC_CLEANUP, index_col=0).rename(
    columns={
        "longitude": "lon",
        "latitude": "lat",
        "Station_Name": "location",
        "Country": "country",
    }
)

In [None]:
# One scatter layer per data source: three STOFS station tables, all IOC stations,
# and the IOC stations that were already cleaned.
stofs_plot = stofs_2022.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ID",
    s=130,
    c="lightgrey",
    label="STOFS 2022 output stations",
)
stofs_plot1 = stofs_2023.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ID",
    s=150,
    c="grey",
    label="STOFS 2023 output stations",
)
stofs_plot2 = stofs_2024.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ID",
    s=200,
    c="k",
    label="STOFS 2024 output stations",
)
ioc_plot = ioc_.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ioc_code",
    s=30,
    c="y",
    label="all IOC stations",
)
ioc_cleanup_plot = ioc_cleanup.hvplot.scatter(
    x="lon",
    y="lat",
    s=80,
    c="r",
    label="stations cleaned for 2022-2023",
)

# Overlay all layers in a single figure
(
    stofs_plot2
    * stofs_plot1
    * stofs_plot
    * ioc_cleanup_plot
    * ioc_plot
).opts(width=1600, height=800)

We visually identified all the stations that are not already used in `ioc_cleanup` and that correspond to STOFS2D output locations:

In [38]:
# Name fragments of the IOC stations to add. They are matched further down as
# case-sensitive substrings of `ioc_code`, so one entry can select several codes.
station_to_add = ["juan", "sanf", "anto", "ptmo", "valp", "ferg", "ambon", "bitu", "saum", "sho2", "ushu", 
                  "espr", "gamb", "riki", "prud", "vald", "cord", "paak", "dsea", "ketc", "june", "skag", "sewa", "anch", "niki", "seld", "kodi", "alak", 
                  "dshu", "dkod", "nome", "adak", "niko", "dchu", "midx", "fren", "sthl", "ascen", "jask", "chab", "kara", "musc", 
                  "masi", "mais", "kerg", "syow", "ver1", "vern", "wait", "stpa", "sala", "tara", "marsh", "kwaj", "wake", "fong", 
                  "solo", "vanu", "numbo", "numb2", "levu", "wlgt", "jack", "hako", "abas", "ofun", "mera", "toya", "nawi", "brpt", "heeia", 
                  "moku", "mane", "john", "plmy", "xmas", "penr", "hiva", "pape", "raro", "pago", "pagx", "east", "garc", "Male2", "ganm", "male", "hani", 
                  "mini", "coch", "vish", "chtt", "sitt", "moul", "ptbl", "komi", "kota", "lank", "ms001", "sab2", "saba", "vung", "quin", 
                  "quar", "curri", "subi", "mani", "luba", "lega", "tkao", "tkee", "chij", "mins", "saip", "mala", "chuu", "kapi", "deke", "naur", "nauu", 
                  "dumo", "espe", "porl", "hill", "waik", "lemba", "beno", "prgi", "prig", "cili", "cila", "tjls", "chrs", "ffcj", "cocb", "telu", "sibo", 
                  "sib2", "tanjo", "bupo", "padn", "pada", "fpga", "winc", "wbnc", "oinc", "kpva", "leva", "simd", "wsdc", "cbmd", "ocmd", "cmnj", "phap", 
                  "mhpa", "btny", "shnj", "mony", "ptme", "cwme", "epme", "hali", "nain", "nuk1", "nuuk", "qaqo", "reyk", "scor", "rptx", "cctx", "pitx", 
                  "pric", "ftfr", "rose", "barb", "stcr", "lame", "isab", "vieq", "yobu", "yabu", "faja", "sanj", "arac", "maya", "magi", "penu", "mona", 
                  "ptpr", "ptpl", "sama", "bull", "elpo", "limon", "quepo", "sana", "acaj", "acap", "acya", "manz", "mnza", "cabo", "fort", "call", "lobos", 
                  "tala", "lali", "vkfl", "nafl", "fmfl", "spfl", "pnfl", "pbfl", "apfl", "tpfl", "fbfl", "moal", "wlms", "psla", "gila", "pfla", "ncla", 
                  "apla", "eila", "cpla", "sptx", "gptx", "fptx", "bres", "sthm", "casc", "gibr", "ceut", "mars", "TR22", "gvd9", "alex", "palm", "pdas", 
                  "plus", "dakar", "tako", "tkdi", "lagos", "pntn", "sitin", "walvi", "prte", "durb", "pemba", "mtwa", "momb", "lamu", "pmon", "aric", "mata", 
                  "plat", "salv"]

Some stations can appear under several different names (code variants):

In [None]:
all_ioc = ioc_.ioc_code.values
# Every IOC code that contains one of the requested name fragments is kept.
# A code matching several fragments appears several times in the list, which
# is harmless because the list is only used for the `isin` filter below.
possible_stations = [
    station
    for stat in station_to_add
    for station in all_ioc
    if stat in station
]
ioc_to_add = ioc_[ioc_.ioc_code.isin(possible_stations)]
ioc_to_add

In [None]:
# Same layers as in the overview figure, plus the newly selected IOC stations (green)
stofs_plot = stofs_2022.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ID",
    s=130,
    c="lightgrey",
    label="STOFS 2022 output stations",
)
stofs_plot1 = stofs_2023.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ID",
    s=150,
    c="grey",
    label="STOFS 2023 output stations",
)
stofs_plot2 = stofs_2024.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ID",
    s=200,
    c="k",
    label="STOFS 2024 output stations",
)
# `ioc_plot` is rebuilt here but is not part of the overlay below
ioc_plot = ioc_.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ioc_code",
    s=30,
    c="y",
    label="all IOC stations",
)
ioc_cleanup_plot = ioc_cleanup.hvplot.scatter(
    x="lon",
    y="lat",
    s=90,
    c="r",
    label="stations already cleaned for 2022-2023",
)
ioc_to_add_plot = ioc_to_add.hvplot.scatter(
    x="lon",
    y="lat",
    s=90,
    geo=True,
    c="g",
    label="stations to be added",
)

(
    stofs_plot2
    * stofs_plot1
    * stofs_plot
    * ioc_to_add_plot
    * ioc_cleanup_plot
).opts(width=1800, height=800)

The 2024 IOC cleanup database is the union of the red and green points.

In [None]:
# Stack the already-cleaned stations and the newly selected ones, then persist the result
ioc_cleanup_2024 = pd.concat([ioc_cleanup, ioc_to_add])
ioc_cleanup_2024.to_csv("ioc_cleanup_2024.csv")
# Kept as the last expression so that the table is actually rendered as the cell output
ioc_cleanup_2024

In [43]:
def find_nearest_nodes(
    mesh_nodes: pd.DataFrame,
    points: pd.DataFrame,
    metric: str = "haversine",
    earth_radius = 6371000,
    ):
    """
    For every row of `points`, find the closest row of `mesh_nodes`.

    Both arguments must be `pandas.DataFrames` with `lon` and `lat` columns
    expressed in degrees (EPSG:4326). The search is done with a
    `sklearn.neighbors.BallTree` built on the (lat, lon) pairs converted to
    radians; the distances returned by the tree are multiplied by
    `earth_radius`, i.e. they are in meters with the default radius.

    The result is `points` (with its index reset) extended with the columns of
    the matched `mesh_nodes` row:
    - `mesh_index`: the index label of the matched row in `mesh_nodes`
    - `mesh_lon`: the longitude of the matched row
    - `mesh_lat`: the latitude of the matched row
    - any other column of `mesh_nodes`, unchanged
    - `distance`: the distance between the point and the matched row

    Examples:
        >>> mesh_nodes = pd.DataFrame({
        ...     "lon": [0, 10, 20],
        ...     "lat": [0, 5, 0],
        ... })
        >>> points = pd.DataFrame({
        ...     "lon": [1, 11, 21],
        ...     "lat": [1, 4, 1],
        ...     "id": ["a", "b", "c"],
        ... })
        >>> nearest_nodes = find_nearest_nodes(mesh_nodes, points)
        >>> nearest_nodes
           lon  lat id  mesh_index  mesh_lon  mesh_lat       distance
        0    1    1  a           0         0         0  157249.381272
        1   11    4  b           1        10         5  157010.162641
        2   21    1  c           2        20         0  157249.381272
    """
    # The tree expects (lat, lon) pairs in radians
    node_coords = np.radians(mesh_nodes[["lat", "lon"]])
    point_coords = np.radians(points[["lat", "lon"]].values)

    ball_tree = sklearn.neighbors.BallTree(node_coords, metric=metric)
    angular_distances, positions = ball_tree.query(point_coords)

    # One matched mesh row per point, in the same order as `points`
    matched = mesh_nodes.rename(columns={"lon": "mesh_lon", "lat": "mesh_lat"})
    matched = matched.iloc[positions.flatten()]
    matched = matched.assign(distance=(angular_distances.flatten() * earth_radius))
    # The original index labels of `mesh_nodes` become the `mesh_index` column
    matched = matched.reset_index(names=["mesh_index"])

    # Both frames now share a 0..n-1 index, so they can be glued side by side
    return pd.concat((points.reset_index(drop = True), matched), axis="columns")

# 2 - get STOFS
def _match_ioc_to_stofs(stofs_stations, ioc_points):
    """Return (all nearest matches, matches closer than 5000) for one STOFS station table."""
    nearest = find_nearest_nodes(stofs_stations, ioc_points)
    nearest = nearest[~nearest.mesh_index.isna()]
    return nearest, nearest[nearest.distance < 5000]


_ioc_points = ioc_cleanup_2024[["lon","lat","ioc_code","location"]]
nearest_nodes_2022, keep_nodes_2022 = _match_ioc_to_stofs(stofs_2022, _ioc_points)
nearest_nodes_2023, keep_nodes_2023 = _match_ioc_to_stofs(stofs_2023, _ioc_points)
nearest_nodes_2024, keep_nodes_2024 = _match_ioc_to_stofs(stofs_2024, _ioc_points)

# Persist the retained station pairs, one file per STOFS station table
for _csv_name, _kept in (
    ("keep_nodes_2022.csv", keep_nodes_2022),
    ("keep_nodes_2023.csv", keep_nodes_2023),
    ("keep_nodes_2024.csv", keep_nodes_2024),
):
    _kept.to_csv(_csv_name)

The red points are all the STOFS2D points to be extracted.

In [None]:
# STOFS 2022 stations (grey), the full IOC cleanup set (green) and the retained matches (red)
p2 = stofs_2022.hvplot.scatter(
    x="lon",
    y="lat",
    hover_cols="ID",
    s=50,
    c="grey",
    label="STOFS 2022 output stations",
)
ip = ioc_cleanup_2024.hvplot.scatter(
    x="lon",
    y="lat",
    s=10,
    c="g",
    label="IOC_CLEANUP 2022-2024",
)
k2 = keep_nodes_2022.hvplot.scatter(
    x="lon",
    y="lat",
    c="red",
    s=10,
    label="STOFS2D stations to be extracted",
)

(p2 * ip * k2).opts(width=1800, height=800)

Download the STOFS2D files

In [None]:
import pathlib

import pandas as pd
import httpx
import multifutures

base_path = "stofs2d"

pathlib.Path(base_path).mkdir(exist_ok=True)


def _dated_url_names(url_template, start, end, rename=None):
    """
    Map the URL of every forecast cycle between `start` and `end` (both
    included, four cycles per day) to the local file name `<YYYYMMDD>_<remote name>`.

    When `rename` is an `(old, new)` pair, `old` is replaced by `new` in the
    remote file name before the date prefix is added.
    """
    url_names = {}
    for day in pd.date_range(start, end):
        date_str = day.strftime("%Y%m%d")
        for cycle in ("00", "06", "12", "18"):
            url = url_template.format(date_str=date_str, time=cycle)
            remote_name = url.rsplit("/", 1)[1]
            if rename is not None:
                remote_name = remote_name.replace(*rename)
            url_names[url] = f"{date_str}_{remote_name}"
    return url_names


# The URL changed at 2023-01-08
# These are the new urls
base_url = "https://noaa-nos-stofs2d-pds.s3.amazonaws.com/stofs_2d_glo.{date_str}/stofs_2d_glo.t{time}z.points.swl.nc"
new_url_names = _dated_url_names(base_url, "2023-01-08", "2024-12-31")

# These are the old urls; the local names follow the new naming scheme
base_url = "https://noaa-nos-stofs2d-pds.s3.amazonaws.com/estofs.{date_str}/estofs.t{time}z.points.swl.nc"
old_url_names = _dated_url_names(
    base_url, "2022-01-01", "2023-01-07", rename=("estofs", "stofs_2d_glo")
)

def download_file(client, url, name):
    """
    Stream `url` into the local file `name`.

    Parameters:
        client: an HTTP client exposing `stream(method, url)` as a context
            manager (an `httpx.Client` in this notebook).
        url: address of the remote file.
        name: path of the local file to write.

    Nothing is written when the server answers with an error status (4xx/5xx):
    a message is printed and an existing `name` is left untouched, instead of
    saving the error body under a `.nc` name. The data is first written to
    `<name>.part` and renamed once complete, so an interrupted transfer never
    leaves a truncated file under the final name.
    """
    target = pathlib.Path(name)
    partial = target.with_name(target.name + ".part")
    with client.stream("GET", url) as response:
        if response.status_code >= 400:
            # Best effort: report the missing cycle and carry on with the other downloads
            print(f"Skipping {url}: HTTP {response.status_code}")
            return
        try:
            with open(partial, "wb") as fd:
                for data in response.iter_bytes():
                    fd.write(data)
        except Exception:
            # Do not leave a half-written temporary file behind
            if partial.exists():
                partial.unlink()
            raise
    partial.replace(target)

# Fetch the new-layout files first, then the old-layout ones; each batch uses its own HTTP client
for _url_names in (new_url_names, old_url_names):
    with httpx.Client() as client:
        func_kwargs = [
            {"client": client, "url": url, "name": f"{base_path}/{name}"}
            for (url, name) in _url_names.items()
        ]
        multifutures.multithread(func=download_file, func_kwargs=func_kwargs, check=True)


In [1]:
# Folder holding the downloaded STOFS2D point files. The trailing slash matters:
# the value is used as a plain string prefix when building the glob pattern.
STOFS_FOLDER = "stofs2d/"

Extract the stations

In [None]:
import glob

# `to_parquet` does not create missing folders, so make sure the output folder exists
os.makedirs("data", exist_ok=True)

# gather stofs2D data: one list of 6-hour windows per period of the output files
ds1 = []
ds2 = []
ds3 = []
for file in sorted(glob.glob(STOFS_FOLDER + "*.swl.nc")):
    root, filename = os.path.split(file)
    date = datetime.strptime(filename, "%Y%m%d_stofs_2d_glo.t%Hz.points.swl.nc")
    # Load only the first 6 hours of the cycle into memory and close the file
    # straight away, so that thousands of netCDF handles are not kept open.
    with xr.open_dataset(file) as tmp:
        n_stations = len(tmp.station_name)
        window = tmp.sel(time=slice(date, date + pd.Timedelta(hours=6))).load()

    if date > datetime(2024,5,14,11):
        ds3.append(window)
        print(date, n_stations, "ds3")
    elif date > datetime(2023,1,7,23):
        ds2.append(window)
        print(date, n_stations, "ds2")
    else:  # cycles up to and including 2023-01-07 18z
        ds1.append(window)
        print(date, n_stations, "ds1")

if not (ds1 or ds2 or ds3):
    raise FileNotFoundError(f"no '*.swl.nc' file found in {STOFS_FOLDER!r}")

# A period without any file is skipped instead of crashing `xr.concat` on an empty list
ds1 = xr.concat(ds1, dim="time") if ds1 else None
ds2 = xr.concat(ds2, dim="time") if ds2 else None
ds3 = xr.concat(ds3, dim="time") if ds3 else None
periods = [ds for ds in (ds1, ds2, ds3) if ds is not None]

for it, station in keep_nodes_2022.iterrows(): # take 2022 because it has less nodes
    print(f'Station {station.ioc_code} at {station.lon:.2f}, {station.lat:.2f}')
    # `mesh_index` is used as the position of the station along the `station` dimension
    node = int(station.mesh_index)
    ds_subset = xr.concat([ds.isel(station=node) for ds in periods], dim="time")
    df = ds_subset.to_pandas()
    # keep first time: the 6-hour windows are inclusive on both ends, so a
    # timestamp shared by two consecutive cycles would otherwise appear twice
    df = df[~df.index.duplicated(keep="first")]
    df.to_parquet(f'data/{station.ioc_code}.parquet')