## Find IOC stations in STOFS2D/3D output file

In [None]:
import geopandas as gp
import pandas as pd
import searvey
import hvplot.pandas
import hvplot.xarray
import numpy as np
import sklearn.neighbors

Get the STOFS stations

In [None]:
# The station file is parsed into six positional columns labelled "0".."5".
# NOTE(review): an earlier comment mentioned "17 cols max"; only six are named
# here, which is what the rest of this cell relies on — confirm against the file.
mycols = [str(i) for i in range(6)]
stof2d = pd.read_csv(
    "https://polar.ncep.noaa.gov/stofs/data/stofs_2d_glo_elev_stat_v2_1_0",
    names=mycols,
    sep="\t+|!",
    header=None,
    skiprows=1,
    # A regex separator is only supported by the python engine; naming it
    # explicitly avoids the silent fallback and its ParserWarning.
    engine="python",
)
# Glue every non-empty field after the first two columns into one text field.
stof2d['Info'] = stof2d.apply(lambda row: ' '.join(filter(None, row[2:])), axis=1)
# The first three whitespace-separated tokens of that text form the station ID.
stof2d['ID'] = stof2d['Info'].apply(lambda x: ' '.join(x.split()[:3]))
# Whatever remains once the ID has been removed is kept as free-form info.
stof2d['Info'] = stof2d.apply(lambda row: row['Info'].replace(row['ID'], '').strip(), axis=1)
stof2d = stof2d.drop(columns=["2", "3", "4", "5"])
# The first two columns are used as coordinates in the rest of the notebook.
stof2d = stof2d.rename(columns={"0": 'lon', "1": 'lat'})
stof2d

Get the IOC stations

In [None]:
import os

def ioc_subset_from_files_in_folder(
    df: pd.DataFrame, folder: str, ext: str = ".parquet"
) -> pd.DataFrame:
    """Return the rows of `df` whose `ioc_code` has a matching file in `folder`.

    Every entry of `folder` whose name ends with `ext` contributes one code:
    its file name with the trailing `ext` removed.

    Parameters:
        df: frame with an `ioc_code` column.
        folder: directory that is listed (not recursively).
        ext: file-name suffix to look for, e.g. ".parquet" or ".json".

    Returns:
        The subset of `df` whose `ioc_code` is one of the collected codes.
    """
    # Slice off only the trailing suffix: splitting on `ext` would cut the name
    # at its first occurrence (e.g. "a.parquet.old.parquet" -> "a") and would
    # raise on an empty `ext`.
    station_codes = [
        file[: len(file) - len(ext)]
        for file in os.listdir(folder)
        if file.endswith(ext)
    ]
    return df[df.ioc_code.isin(station_codes)]

def get_meta() -> gp.GeoDataFrame:
    """Build the IOC station table from the searvey listing and the IOC web service.

    The searvey station list provides the metadata columns; its own `lon`/`lat`
    columns are discarded. The coordinates come from the IOC `stationlist`
    service instead: its lowercase `lon`/`lat` columns are dropped and its
    `Code`/`Lon`/`Lat` columns are renamed to `ioc_code`/`lon`/`lat`.
    Both tables are joined on `ioc_code` (default `pd.merge` join) and the
    `geometry` column is removed from the result.
    """
    api_url = "http://www.ioc-sealevelmonitoring.org/service.php?query=stationlist&showall=all"

    # Station metadata as exposed by searvey, without its coordinates.
    stations_web = searvey.get_ioc_stations()
    stations_web = stations_web.drop(columns=["lon", "lat"])

    # Station list as returned by the web service, de-duplicated and renamed.
    stations_api = pd.read_json(api_url).drop_duplicates()
    stations_api = stations_api.drop(columns=["lon", "lat"])
    stations_api = stations_api.rename(
        columns={"Code": "ioc_code", "Lon": "lon", "Lat": "lat"}
    )

    # Only the code and its coordinates are taken from the service table.
    coordinates = stations_api[["ioc_code", "lon", "lat"]].drop_duplicates()
    joined = pd.merge(stations_web, coordinates, on=["ioc_code"])
    return joined.drop(columns=["geometry"])

# Full IOC station table.
ioc_ = get_meta()
# Restrict it to the stations that have a file in the local observation folder.
ioc_cleanup = ioc_subset_from_files_in_folder(
    ioc_,
    "/home/tomsail/Documents/work/python/seareport_org/skill-panel/01_obs/surge",
)
# Exclude four station codes explicitly.
drop_index = ioc_cleanup["ioc_code"].isin(["dapi", "datu", "djve", "dkwa"])
ioc_cleanup = ioc_cleanup.loc[~drop_index]

In [None]:
def find_nearest_nodes(
    mesh_nodes: pd.DataFrame,
    points: pd.DataFrame,
    metric: str = "haversine",
    earth_radius: float = 6371000,
) -> pd.DataFrame:
    """
    Calculate the mesh nodes that are nearest to the specified `points`.
    Both `mesh_nodes` and `points` must be `pandas.DataFrames` that have
    columns named `lon` and `lat` and the coords must be in EPSG:4326.
    Returns the `points` DataFrame (its index is preserved) after adding the
    remaining columns of the matched `mesh_nodes` row plus these extra columns:
    - `mesh_index` which is the index label of the nearest row of `mesh_nodes`
    - `mesh_lon` which is the longitude of the nearest mesh node
    - `mesh_lat` which is the latitude of the nearest mesh node
    - `distance` which is the tree distance multiplied by `earth_radius`, i.e.
      meters between the point and the nearest mesh node for the default
      haversine metric and radius
    The match is positional: the i-th row of `points` always receives the
    node found for it, whatever the index of `points` looks like.
    Examples:
        >>> mesh_nodes = pd.DataFrame({
        ...     "lon": [0, 10, 20],
        ...     "lat": [0, 5, 0],
        ... })
        >>> points = pd.DataFrame({
        ...     "lon": [1, 11, 21],
        ...     "lat": [1, 4, 1],
        ...     "id": ["a", "b", "c"],
        ... })
        >>> nearest_nodes = find_nearest_nodes(mesh_nodes, points)
        >>> nearest_nodes
           lon  lat id  mesh_index  mesh_lon  mesh_lat       distance
        0    1    1  a           0         0         0  157249.381272
        1   11    4  b           1        10         5  157010.162641
        2   21    1  c           2        20         0  157249.381272
    """
    # The only requirement is that both `mesh_nodes and `points` have `lon/lat` columns
    # The haversine metric expects (lat, lon) pairs expressed in radians.
    tree = sklearn.neighbors.BallTree(
        np.radians(mesh_nodes[["lat", "lon"]]),
        metric=metric,
    )
    distances, indices = tree.query(np.radians(points[["lat", "lon"]].values))
    closest_nodes = (
        mesh_nodes
        .rename(columns={"lon": "mesh_lon", "lat": "mesh_lat"})
        .iloc[indices.flatten()]
        .assign(distance=(distances.flatten() * earth_radius))
        .reset_index(names=["mesh_index"])
    )
    # `pd.concat(axis="columns")` aligns rows on index labels. `closest_nodes`
    # is ordered like `points` but numbered 0..n-1, so it must take over the
    # index of `points`; otherwise a filtered/non-default index pairs points
    # with the wrong node and pads the result with NaN rows.
    closest_nodes.index = points.index
    return pd.concat((points, closest_nodes), axis="columns")

In [None]:
# Attach the closest STOFS station to every retained IOC station.
nearest_nodes = find_nearest_nodes(stof2d, ioc_cleanup)
# Keep only the rows that carry a matched node (non-null `mesh_index`).
nearest_nodes = nearest_nodes.loc[nearest_nodes["mesh_index"].notna()]
nearest_nodes

In [None]:
# Retain the matches whose `distance` (meters, as computed by
# `find_nearest_nodes`) is strictly below 2000.
keep_nodes = nearest_nodes.loc[nearest_nodes["distance"] < 2000]
# Show the STOFS rows selected positionally through `mesh_index`.
stof2d.iloc[keep_nodes["mesh_index"]]

In [None]:
# Layer 1: every STOFS station.
plot1 = stof2d.hvplot.points(
    x='lon', y='lat',
    s=50,
    geo=True,
    tiles=True,
    hover_cols=["ID"],
    label='STOFS stations'
)
# Layer 2: the STOFS stations selected through `keep_nodes`, i.e. the ones
# paired with an IOC station. The legend labels of layers 2 and 3 used to be
# swapped.
plot2 = stof2d.iloc[keep_nodes.mesh_index].hvplot.points(
    x='lon', y='lat',
    s=50,
    geo=True,
    tiles=True,
    hover_cols=["ID"],
    label='concomitant IOC/STOFS stations'
)
# Layer 3: the IOC stations themselves, drawn with smaller markers on top.
plot3 = ioc_cleanup.hvplot.points(
    x='lon', y='lat',
    s=10,
    geo=True,
    tiles=True,
    hover_cols=["ioc_code"],
    label='IOC stations'
)
(plot1 * plot2 * plot3).opts(
    width=1400, height=800, title='STOFS2D/3D stations and IOC stations'
)

In [None]:
# Summary statistics of the selected STOFS station indices.
keep_nodes["mesh_index"].describe()

In [None]:
import xarray as xr
import glob
import dask

# Split the STOFS2D files into two groups according to the integer that
# precedes the first underscore of the file name (presumably a YYYYMMDD
# date — confirm): values above 20230107 go to the second group.
filenames1 = []
filenames2 = []
for file in sorted(glob.glob('noaa/stofs2d/202*')):
    filename = os.path.basename(file)
    number = int(filename.split("_")[0])
    target = filenames2 if number > 20230107 else filenames1
    target.append(file)

# Each group is opened as a single dataset, concatenated along "time" in
# the sorted file order.
ds1 = xr.open_mfdataset(filenames1, combine='nested', concat_dim="time")
ds2 = xr.open_mfdataset(filenames2, combine='nested', concat_dim="time")

def keep_first_timeindex(ds):
    """Return `ds` with repeated `time` values removed.

    Every data variable is filtered with the same boolean selection, applied
    to its leading axis, and the filtered variables are merged back into one
    dataset.

    NOTE(review): `duplicated(keep="last")` flags every occurrence of a
    repeated time stamp except the last one, so it is the LAST occurrence
    that survives, despite the function name — confirm which one is intended.
    """
    superseded = ds.time.to_pandas().duplicated(keep="last").values
    keep = ~superseded
    # Let dask split the large chunks produced by the boolean slicing.
    with dask.config.set(**{"array.slicing.split_large_chunks": True}):
        filtered_vars = [ds[name][keep] for name in list(ds.data_vars)]
    return xr.merge(filtered_vars)


# Remove the repeated time stamps from both datasets before they are
# sliced per station.
ds1 = keep_first_timeindex(ds1)
ds2 = keep_first_timeindex(ds2)
# Trailing expression: shows `ds1` as the cell result.
ds1

In [None]:
# Export the STOFS2D series of the selected stations, one parquet file per
# IOC code.
# NOTE(review): the trailing `break` stops the loop after the first station —
# confirm whether exporting a single station is intended.
for _, station in keep_nodes.iterrows():
    code = station.ioc_code
    print(f'Station {code} at {station.lon:.2f}, {station.lat:.2f}')

    # Positional index of the matched STOFS station along `station`.
    node = int(station.mesh_index)
    ds1_subset = ds1.isel(station=node)
    ds2_subset = ds2.isel(station=node)
    print(ds1_subset)
    # Chain both periods along time for this station.
    ds_subset = xr.concat([ds1_subset, ds2_subset], dim="time")
    # The plot object is built here but neither stored nor displayed.
    ds_subset.zeta.hvplot(figsize=(10, 5))
    df = ds_subset.to_pandas()
    df.to_parquet(f'01_obs/stofs2d/{code}.parquet')
    break
