This notebook contains the methodology for: 
 1. choosing the best IOC candidates for storm surge validation purposes

an edit of this notebook will be done for: 

 2. the data availability
 3. the data quality 
of IOC candidates

In [None]:
import glob
import os
import re
from datetime import datetime

import geopandas as gp
import hvplot.pandas
import numpy as np
import pandas as pd
import searvey
import sklearn.neighbors
import xarray as xr

In [2]:
# GeoJSON with the world maritime sectors, used as the background layer of the maps.
WORLD_OCEANS_URL = (
    "https://gist.github.com/tomsail/2fa52d9667312b586e7d3baee123b57b/raw/"
    "23929561eaa8aa76376580a7df300c4e3eb2e509/world_maritime_sectors.json"
)
world_oceans = gp.read_file(WORLD_OCEANS_URL)

In [None]:
IOC_CLEANUP = "ioc_cleanup_2023.csv"

# Harmonise the column names with the ones used throughout this notebook.
column_names = {
    "longitude": "lon",
    "latitude": "lat",
    "Station_Name": "location",
    "Country": "country",
}
ioc_cleanup = pd.read_csv(IOC_CLEANUP, index_col=0).rename(columns=column_names)

# Maritime sectors as background, already-cleaned IOC stations on top.
oceans_layer = world_oceans.hvplot(c="ocean", geo=True).opts(cmap="tab20c")
cleaned_layer = ioc_cleanup.hvplot.points(
    x="lon",
    y="lat",
    c="k",
    s=40,
    geo=True,
    coastline=True,
)
(oceans_layer * cleaned_layer).opts(height=450)

Our experience with [skill-panel-demo](https://github.com/seareport/skill-panel-demo) led us to the following conclusion:

The previous set of stations from `ioc_cleanup` is not sufficient. We now need to have: 
 * more stations to compare with models
 * more time coverage

We want as many stations as possible to coincide with STOFS2D output locations, because we want to compare: 
 **IOC observations** vs **our model** vs **STOFS2D** output locations

In [None]:
# The latest STOFS2D output locations (for STOFS2D version 2.1.0) are:
def get_stofs(
    url: str = "https://polar.ncep.noaa.gov/stofs/data/stofs_2d_glo_elev_stat_v2_1_0",
) -> pd.DataFrame:
    """
    Download the STOFS2D station list and return it as a DataFrame.

    The file is split on runs of tabs or on `!` into 6 raw columns. The first
    two become `lon` and `lat`; the remaining four are joined into one text
    field, from which two columns are derived:

    - `ID`: the first three whitespace-separated tokens of the joined text
    - `Info`: the joined text with the `ID` substring removed

    Parameters:
        url: location of the station file (defaults to the v2.1.0 list).

    Returns:
        DataFrame with the columns `lon`, `lat`, `Info` and `ID`.
    """
    mycols = [str(i) for i in range(6)]  # lon, lat + 4 text fields
    stof2d = pd.read_csv(
        url,
        names=mycols,
        sep="\t+|!",
        # A regex separator is only supported by the python engine; being
        # explicit avoids the ParserWarning raised by the silent fallback.
        engine="python",
        header=None,
        skiprows=1,
    )
    # Join the non-empty text fields (everything after lon/lat) into one string.
    stof2d['Info'] = stof2d.apply(lambda row: ' '.join(filter(None, row.iloc[2:])), axis=1)
    stof2d['ID'] = stof2d['Info'].apply(lambda x: ' '.join(x.split()[:3]))
    stof2d['Info'] = stof2d.apply(lambda row: row['Info'].replace(row['ID'], '').strip(), axis=1)
    stof2d = stof2d.drop(columns=["2", "3", "4", "5"])
    stof2d = stof2d.rename(columns={"0": 'lon', "1": 'lat'})
    return stof2d

stofs = get_stofs()
stofs.hvplot.points(x="lon", y="lat", geo=True, coastline=True).opts(height=450)

A caveat is that the 1D output files evolve over time: 

In [None]:
#SEE APPENDIX FOR DOWNLOADING STOFS2D DATA
stofs1 = xr.open_dataset("stofs2d/20220912_stofs_2d_glo.t12z.points.swl.nc")
stofs2 = xr.open_dataset("stofs2d/20231010_stofs_2d_glo.t00z.points.swl.nc")
stofs3 = xr.open_dataset("stofs2d/20241229_stofs_2d_glo.t00z.points.swl.nc")


def _stofs_subset(points_ds: xr.Dataset) -> pd.DataFrame:
    """
    Return the rows of `stofs` whose `ID` appears in `points_ds.station_name`.

    Station names are stored as bytes; they are decoded and reduced to their
    first three whitespace-separated tokens, which is how `stofs.ID` is built.
    """
    station_ids = [
        ' '.join(name.decode("utf-8").strip().split()[:3])
        for name in points_ds.station_name.values
    ]
    return stofs[stofs.ID.isin(station_ids)]


stofs_2022 = _stofs_subset(stofs1)
stofs_2023 = _stofs_subset(stofs2)
stofs_2024 = _stofs_subset(stofs3)

# Show the three station counts together (only the last expression of a cell is displayed).
len(stofs_2022), len(stofs_2023), len(stofs_2024)

Luckily, the new stations were appended at the end of the file, so it will be easier to concatenate the data from all the files.

In [None]:
# Check that the first rows are identical in all three releases. Both
# comparisons are combined because only the last expression of a cell is shown.
# NOTE(review): 557 is presumably len(stofs_2022) - confirm.
N_SHARED_STATIONS = 557
shared_2022 = stofs_2022[:N_SHARED_STATIONS]
(
    shared_2022.equals(stofs_2023[:N_SHARED_STATIONS])
    and shared_2022.equals(stofs_2024[:N_SHARED_STATIONS])
)

We need to compare the modelled storm surge with observations. We use IOC tide stations.

In [7]:
def get_meta() -> gp.GeoDataFrame:
    """
    Build the IOC station metadata table.

    The station table from `searvey.get_ioc_stations()` is merged on
    `ioc_code` with the station list of the IOC web service. The `lon`/`lat`
    columns of both sources are discarded and replaced by the service's
    `Lon`/`Lat` columns (renamed to `lon`/`lat`). The `geometry` column is
    dropped from the result.

    NOTE(review): the function is annotated as returning a GeoDataFrame
    although `geometry` is dropped before returning - confirm the intended
    return type.
    """
    api_url = "http://www.ioc-sealevelmonitoring.org/service.php?query=stationlist&showall=all"

    web_stations = searvey.get_ioc_stations().drop(columns=["lon", "lat"])

    api_stations = pd.read_json(api_url).drop_duplicates()
    api_stations = api_stations.drop(columns=["lon", "lat"])
    api_stations = api_stations.rename(columns={"Code": "ioc_code", "Lon": "lon", "Lat": "lat"})
    api_coords = api_stations[["ioc_code", "lon", "lat"]].drop_duplicates()

    merged = pd.merge(web_stations, api_coords, on=["ioc_code"])
    return merged.drop(columns=["geometry"])
ioc_ = get_meta()

We already have established a database for clean IOC data between 2022 and 2023 (see 1st plot), we'll use it as a reference: 

In [None]:
# One layer per STOFS2D release: the newer the release, the bigger and darker the marker.
stofs_plot = stofs_2022.hvplot.scatter(
    x="lon", y="lat", hover_cols="ID",
    s=130, c="lightgrey", label="STOFS 2022 output stations",
)
stofs_plot1 = stofs_2023.hvplot.scatter(
    x="lon", y="lat", hover_cols="ID",
    s=150, c="grey", label="STOFS 2023 output stations",
)
stofs_plot2 = stofs_2024.hvplot.scatter(
    x="lon", y="lat", hover_cols="ID",
    s=200, c="k", label="STOFS 2024 output stations",
)
# Every IOC station (yellow) and the ones already cleaned (red).
ioc_plot = ioc_.hvplot.scatter(
    x="lon", y="lat", hover_cols="ioc_code",
    s=30, c="y", label="all IOC stations",
)
ioc_cleanup_plot = ioc_cleanup.hvplot.scatter(
    x="lon", y="lat", coastline=True,
    s=80, c="r", label="stations cleaned for 2022-2023",
)

overview = stofs_plot2 * stofs_plot1 * stofs_plot * ioc_cleanup_plot * ioc_plot
overview.opts(width=1300, height=600)

We graphically detected all stations not already used in `ioc_cleanup` and corresponding with STOFS2D output locations

In [9]:
# Station names picked by eye on the map above: close to a STOFS2D output
# location and not yet part of `ioc_cleanup`. The next cell matches them as
# substrings of the IOC codes, so an entry may select several codes.
station_to_add = ["juan", "sanf", "anto", "ptmo", "valp", "ferg", "ambon", "bitu", "saum", "sho2", "ushu", 
                  "espr", "gamb", "riki", "prud", "vald", "cord", "paak", "dsea", "ketc", "june", "skag", "sewa", "anch", "niki", "seld", "kodi", "alak", 
                  "dshu", "dkod", "nome", "adak", "niko", "dchu", "midx", "fren", "sthl", "ascen", "jask", "chab", "kara", "musc", 
                  "masi", "mais", "kerg", "syow", "ver1", "vern", "wait", "stpa", "sala", "tara", "marsh", "kwaj", "wake", "fong", 
                  "solo", "vanu", "numbo", "numb2", "levu", "wlgt", "jack", "hako", "abas", "ofun", "mera", "toya", "nawi", "brpt", "heeia", 
                  "moku", "mane", "john", "plmy", "xmas", "penr", "hiva", "pape", "raro", "pago", "pagx", "east", "garc", "Male2", "ganm", "male", "hani", 
                  "mini", "coch", "vish", "chtt", "sitt", "moul", "ptbl", "komi", "kota", "lank", "ms001", "sab2", "saba", "vung", "quin", 
                  "quar", "curri", "subi", "mani", "luba", "lega", "tkao", "tkee", "chij", "mins", "saip", "mala", "chuu", "kapi", "deke", "naur", "nauu", 
                  "dumo", "espe", "porl", "hill", "waik", "lemba", "beno", "prgi", "prig", "cili", "cila", "tjls", "chrs", "ffcj", "cocb", "telu", "sibo", 
                  "sib2", "tanjo", "bupo", "padn", "pada", "fpga", "winc", "wbnc", "oinc", "kpva", "leva", "simd", "wsdc", "cbmd", "ocmd", "cmnj", "phap", 
                  "mhpa", "btny", "shnj", "mony", "ptme", "cwme", "epme", "hali", "nain", "nuk1", "nuuk", "qaqo", "reyk", "scor", "rptx", "cctx", "pitx", 
                  "pric", "ftfr", "rose", "barb", "stcr", "lame", "isab", "vieq", "yobu", "yabu", "faja", "sanj", "arac", "maya", "magi", "penu", "mona", 
                  "ptpr", "ptpl", "sama", "bull", "elpo", "limon", "quepo", "sana", "acaj", "acap", "acya", "manz", "mnza", "cabo", "fort", "call", "lobos", 
                  "tala", "lali", "vkfl", "nafl", "fmfl", "spfl", "pnfl", "pbfl", "apfl", "tpfl", "fbfl", "moal", "wlms", "psla", "gila", "pfla", "ncla", 
                  "apla", "eila", "cpla", "sptx", "gptx", "fptx", "bres", "sthm", "casc", "gibr", "ceut", "mars", "TR22", "gvd9", "alex", "palm", "pdas", 
                  "plus", "dakar", "tako", "tkdi", "lagos", "pntn", "sitin", "walvi", "prte", "durb", "pemba", "mtwa", "momb", "lamu", "pmon", "aric", "mata", 
                  "plat", "salv"]

Some stations are listed under several code variants, so we keep every IOC code that contains one of the names above.

In [None]:
all_ioc = ioc_.ioc_code.values

# Substring match: keep every IOC code that contains one of the selected names.
# A code is listed once per name it matches, in the order of `station_to_add`.
possible_stations = [
    station
    for stat in station_to_add
    for station in all_ioc
    if stat in station
]

ioc_to_add = ioc_[ioc_.ioc_code.isin(possible_stations)]
ioc_to_add

In [None]:
# STOFS2D releases in the background (grey shades) ...
stofs_plot = stofs_2022.hvplot.scatter(
    x="lon", y="lat", hover_cols="ID",
    s=130, c="lightgrey", label="STOFS 2022 output stations",
)
stofs_plot1 = stofs_2023.hvplot.scatter(
    x="lon", y="lat", hover_cols="ID",
    s=150, c="grey", label="STOFS 2023 output stations",
)
stofs_plot2 = stofs_2024.hvplot.scatter(
    x="lon", y="lat", hover_cols="ID",
    s=200, c="k", label="STOFS 2024 output stations",
)
# ... and the IOC selection on top: already cleaned (red) vs. newly added (green).
ioc_cleanup_plot = ioc_cleanup.hvplot.scatter(
    x="lon", y="lat", hover_cols="ioc_code",
    s=90, c="r", label="stations already cleaned for 2022-2023",
)
ioc_to_add_plot = ioc_to_add.hvplot.scatter(
    x="lon", y="lat", coastline=True,
    s=90, c="g", label="stations to be added",
)

selection = stofs_plot2 * stofs_plot1 * stofs_plot * ioc_to_add_plot * ioc_cleanup_plot
selection.opts(width=1400, height=600)

the 2024 IOC cleanup database is the red + green points 

In [None]:
# Stack the already-cleaned stations and the newly selected ones.
# NOTE(review): rows are not de-duplicated on `ioc_code`; confirm that
# `ioc_to_add` cannot overlap with `ioc_cleanup`.
ioc_cleanup_2024 = pd.concat([ioc_cleanup, ioc_to_add])
ioc_cleanup_2024.to_csv("ioc_cleanup_2024.csv")

# Displayed last: a bare expression in the middle of a cell produces no output.
ioc_cleanup_2024

In [13]:
def find_nearest_nodes(
    mesh_nodes: pd.DataFrame,
    points: pd.DataFrame,
    metric: str = "haversine",
    earth_radius = 6371000,
    ):
    """
    For every row of `points`, find the closest row of `mesh_nodes`.

    Both frames need `lon` and `lat` columns holding EPSG:4326 coordinates in
    degrees. The search uses a `sklearn.neighbors.BallTree` built on the
    `mesh_nodes` coordinates converted to radians.

    The result is `points` (with a fresh 0..n-1 index) extended with the
    columns of the matched `mesh_nodes` row:

    - `mesh_index`: index label of the matched row in `mesh_nodes`
    - `mesh_lon` / `mesh_lat`: coordinates of the matched row
    - `distance`: tree distance multiplied by `earth_radius`, i.e. meters
      with the default metric and radius

    Any other column of `mesh_nodes` is carried over unchanged.

    Examples:
        >>> mesh_nodes = pd.DataFrame({
        ...     "lon": [0, 10, 20],
        ...     "lat": [0, 5, 0],
        ... })
        >>> points = pd.DataFrame({
        ...     "lon": [1, 11, 21],
        ...     "lat": [1, 4, 1],
        ...     "id": ["a", "b", "c"],
        ... })
        >>> nearest_nodes = find_nearest_nodes(mesh_nodes, points)
        >>> nearest_nodes
           lon  lat id  mesh_index  mesh_lon  mesh_lat       distance
        0    1    1  a           0         0         0  157249.381272
        1   11    4  b           1        10         5  157010.162641
        2   21    1  c           2        20         0  157249.381272
    """
    # Coordinates are passed as (lat, lon) pairs in radians.
    mesh_coords = np.radians(mesh_nodes[["lat", "lon"]])
    point_coords = np.radians(points[["lat", "lon"]].values)

    ball_tree = sklearn.neighbors.BallTree(mesh_coords, metric=metric)
    distances, indices = ball_tree.query(point_coords)

    # One nearest neighbour per point: flatten the (n, 1) query results.
    nearest_positions = indices.flatten()
    scaled_distances = distances.flatten() * earth_radius

    renamed_nodes = mesh_nodes.rename(columns={"lon": "mesh_lon", "lat": "mesh_lat"})
    closest_nodes = renamed_nodes.iloc[nearest_positions]
    closest_nodes = closest_nodes.assign(distance=scaled_distances)
    closest_nodes = closest_nodes.reset_index(names=["mesh_index"])

    # Both frames now share a 0..n-1 index, so they can be glued side by side.
    return pd.concat((points.reset_index(drop = True), closest_nodes), axis="columns")

# 2 - get STOFS
# An IOC station is kept only when a STOFS2D point lies closer than this (in meters).
MAX_STOFS_DISTANCE = 5000
ioc_points = ioc_cleanup_2024[["lon", "lat", "ioc_code", "location"]]


def _match_ioc_to_stofs(stofs_points: pd.DataFrame) -> tuple:
    """
    Pair every IOC station with its nearest STOFS2D point.

    Returns `(nearest, kept)`: all pairs with a valid `mesh_index`, and the
    subset of pairs closer than `MAX_STOFS_DISTANCE`.
    """
    nearest = find_nearest_nodes(stofs_points, ioc_points)
    nearest = nearest[~nearest.mesh_index.isna()]
    kept = nearest[nearest.distance < MAX_STOFS_DISTANCE]
    return nearest, kept


nearest_nodes_2022, keep_nodes_2022 = _match_ioc_to_stofs(stofs_2022)
nearest_nodes_2023, keep_nodes_2023 = _match_ioc_to_stofs(stofs_2023)
nearest_nodes_2024, keep_nodes_2024 = _match_ioc_to_stofs(stofs_2024)

keep_nodes_2022.to_csv("keep_nodes_2022.csv")
keep_nodes_2023.to_csv("keep_nodes_2023.csv")
keep_nodes_2024.to_csv("keep_nodes_2024.csv")

red are all the STOFS2D points to be extracted

In [None]:
# STOFS 2022 output points (grey), the 2024 IOC selection (black) and the
# IOC stations that have a STOFS2D point nearby (red, reused in a later cell).
p2 = stofs_2022.hvplot.scatter(
    x="lon", y="lat", hover_cols="ID",
    s=70, c="grey", line_color="lightgrey",
    label="STOFS 2022 output stations",
)
ip = ioc_cleanup_2024.hvplot.scatter(
    x="lon", y="lat", coastline=True,
    s=10, c="k",
    label="IOC_CLEANUP 2022-2024",
)
k2 = keep_nodes_2022.hvplot.scatter(
    x="lon", y="lat", coastline=True,
    s=20, c="red",
    label="STOFS2D stations to be extracted",
)

oceans_background = world_oceans.hvplot(c="ocean", alpha=0.9).opts(cmap="tab20c")
# (world_oceans.hvplot(c='ocean',alpha= 0.9).opts(cmap='tab20c') * p2 * ip * k2 ).opts(width = 1100, height = 800)
(oceans_background * p2 * ip).opts(width=1500, height=900)

download IOC data

In [15]:
# for i_s, station in ioc_cleanup_2024.iterrows():
#     print(station.ioc_code)
#     df = searvey.fetch_ioc_station(station.ioc_code, "2022-01-01", "2024-12-31")
#     df.to_parquet(f"data/{station.ioc_code}.parquet")

check data availability

In [None]:
# Number of sensors per station; stays 0 for stations without downloaded data.
ioc_cleanup_2024['n_sensors'] = 0

# Station codes are the stems of the downloaded files. Using os.path instead
# of a regex on the full path works with any path separator and cannot fail
# on an unexpected file name.
stations = [
    os.path.splitext(os.path.basename(path))[0]
    for path in glob.glob("data/*.parquet")
]
# Printed explicitly: a bare f-string in the middle of a cell is not displayed.
print(f"Total stations: {len(stations)}")

# get the stations with data
keep_stations = []
for station in sorted(stations):
    df = pd.read_parquet(f"data/{station}.parquet")
    if df.empty:
        continue
    keep_stations.append(station)
    # disregard sw1, sw2 and bat
    df = df.drop(columns=["sw1", "sw2", "bat"], errors="ignore")
    print(station, list(df.columns), sep="  ")
    ioc_cleanup_2024.loc[ioc_cleanup_2024.ioc_code == station, "n_sensors"] = len(df.columns)

f"Stations with data: {len(keep_stations)}"

In [None]:
# Bin edges centred on the integer sensor counts 0..5.
sensor_count_bins = [count - 0.5 for count in range(7)]
ioc_cleanup_2024.n_sensors.hvplot.hist(bins=sensor_count_bins)

In [18]:
# Keep only the stations for which a non-empty data file was found.
has_data = ioc_cleanup_2024.ioc_code.isin(keep_stations)
ioc_cleanup_2024_with_data = ioc_cleanup_2024[has_data]

store in separate files

In [19]:
# Write one parquet file per (station, sensor) pair: raw/<ioc_code>_<sensor>.parquet
os.makedirs("raw", exist_ok=True)  # make sure the output folder exists before writing
for ioc_code in ioc_cleanup_2024_with_data.ioc_code:
    df = pd.read_parquet(f"data/{ioc_code}.parquet")
    # disregard sw1, sw2 and bat
    df = df.drop(columns=["sw1", "sw2", "bat"], errors="ignore")
    for sensor in df.columns:
        # Single-column frame: keeps the full time index of the station file.
        df[[sensor]].to_parquet(f"raw/{ioc_code}_{sensor}.parquet")

evaluate data availability

In [None]:
# Each recording is identified by its file stem, "<ioc_code>_<sensor>".
# os.path handles any path separator and never returns None, unlike a
# regex search on the full path.
stations_sensors = [
    os.path.splitext(os.path.basename(path))[0]
    for path in glob.glob("raw/*.parquet")
]
f"Total individual recordings: {len(stations_sensors)}"

In [21]:
import typing as T
# Time window used to evaluate data completeness (DETIDE_END is excluded by the date range built below).
DETIDE_START = pd.Timestamp(year=2022, month=1, day=1)
DETIDE_END = pd.Timestamp(year=2025, month=1, day=1)

def calc_ratio(sr: pd.Series, period: pd.DatetimeIndex) -> float:
    """
    Return the fraction of `period` that is covered by valid samples of `sr`.

    Only samples whose timestamp lies between the first and the last element
    of `period` (both included) are considered, and missing values (NaN) are
    not counted as data. A single-column DataFrame is accepted as well.

    Parameters:
        sr: time-indexed series (or single-column frame) of observations.
        period: the expected timestamps.

    Returns:
        `number of valid samples / len(period)`, or 0.0 for an empty `period`.
    """
    if len(period) == 0:
        # Nothing is expected, so avoid indexing period[0] and dividing by zero.
        return 0.0
    in_period = sr[(period[0] <= sr.index) & (sr.index <= period[-1])]
    # Rows without a value are gaps, not observations.
    return len(in_period.dropna()) / len(period)

# Completeness of every (station, sensor) recording over the detide period.
table = dict()
for station_sensor in sorted(stations_sensors):
    station, sensor = station_sensor.split('_')
    # The per-sensor files keep every timestamp of the station file, so rows
    # that are NaN for this sensor are dropped before measuring availability.
    df = pd.read_parquet(f"raw/{station_sensor}.parquet").dropna()
    item = ioc_cleanup_2024_with_data[ioc_cleanup_2024_with_data.ioc_code == station]

    # The most frequent time step is taken as the sensor's sampling interval.
    interval_value_counts = df.index.to_series().diff().value_counts()
    if interval_value_counts.empty:
        # Fewer than two valid samples: no interval can be derived.
        completeness = 0.0
    else:
        main_interval = T.cast(pd.Timedelta, interval_value_counts.index[0])
        detide_period = pd.date_range(DETIDE_START, DETIDE_END, freq=main_interval, inclusive="left")
        completeness = calc_ratio(df, detide_period)

    table[station_sensor] = {
        "lon": item.lon.values[0],
        "lat": item.lat.values[0],
        "completeness": completeness,
    }

# One row per recording, with the columns lon / lat / completeness.
stations_sensors_availability = pd.DataFrame(table).T

In [None]:
# `display` is needed for the table: only the last expression of a cell is shown automatically.
display(stations_sensors_availability.describe())
stations_sensors_availability.completeness.hvplot.hist()

In [None]:
# Map of every recording coloured by completeness, overlaid with the `k2` layer defined earlier.
availability_points = stations_sensors_availability.hvplot.points(
    x="lon",
    y="lat",
    color="completeness",
    hover_cols=["index", "completeness"],
    geo=True,
    s=200,
)
availability_points.opts(height=800, width=1600, cmap="colorwheel") * k2