This notebook contains the methodology for: 
 1. choosing the best IOC candidates for storm surge validation purposes

an edit of this notebook will be done for: 

 2. the data availability
 3. the data quality 
of IOC candidates

In [1]:
import geopandas as gp
import pandas as pd
import searvey
from datetime import datetime
import numpy as np
import sklearn.neighbors
import xarray as xr
import hvplot.pandas
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
world_oceans = gp.read_file("https://gist.github.com/tomsail/2fa52d9667312b586e7d3baee123b57b/raw/23929561eaa8aa76376580a7df300c4e3eb2e509/world_maritime_sectors.json")

In [3]:
IOC_CLEANUP = "ioc_cleanup_2023.csv"
ioc_cleanup = pd.read_csv(IOC_CLEANUP, index_col=0).rename(columns={"longitude": 'lon', "latitude": 'lat', "Station_Name":"location","Country":"country"})
(   world_oceans.hvplot(c='ocean',geo=True).opts(cmap='tab20c') * 
    ioc_cleanup.hvplot.points(x="lon",y="lat",c='k', s= 40,geo=True,coastline=True)
 ).opts(height=450)



Our experience with [skill-panel-demo](https://github.com/seareport/skill-panel-demo) got us the to following conclusion:

The previous set of stations from `ioc_cleanup` is not sufficient. We now need to have: 
 * more stations to compare with models
 * more time coverage

We want to have a maximum of stations that correspond with STOFS2D output locations. as we  want to compare: 
 **IOC observations** vs **our model** vs **STOFS2D** output locations

In [4]:
# The latest STOFS2D locations output locations (for STOFS2D version 2.1.0) are: 
def get_stofs():
    mycols = [str(i) for i in range(6)] # we expect 17 cols max in that file
    stof2d = pd.read_csv(
        "https://polar.ncep.noaa.gov/stofs/data/stofs_2d_glo_elev_stat_v2_1_0",
        names=mycols, 
        sep="\t+|!", 
        header=None, 
        skiprows=1
    )
    stof2d['Info'] = stof2d.apply(lambda row: ' '.join(filter(None, row[2:])), axis=1)
    stof2d['ID'] = stof2d['Info'].apply(lambda x: ' '.join(x.split()[:3]))
    stof2d['Info'] = stof2d.apply(lambda row: row['Info'].replace(row['ID'], '').strip(), axis=1)
    stof2d = stof2d.drop(columns=["2", "3", "4", "5"])
    stof2d.rename(columns={"0": 'lon', "1": 'lat'}, inplace=True)
    return stof2d

stofs = get_stofs()
stofs.hvplot.points(geo=True,coastline=True).opts(height=450)

  stof2d = pd.read_csv(


A caveat is that the 1D output files evolve over time: 

In [5]:
stofs1 = xr.open_dataset("stofs2d/20220912_stofs_2d_glo.t12z.points.swl.nc")
stofs2 = xr.open_dataset("stofs2d/20231010_stofs_2d_glo.t00z.points.swl.nc")
stofs3 = xr.open_dataset("stofs2d/20241229_stofs_2d_glo.t00z.points.swl.nc")
#SEE APPENDIX FOR DOWNLOADING STOFS2D DATA

stofs_2022 = stofs[stofs.ID.isin([' '.join(s.decode("utf-8").strip().split()[:3]) for s in stofs1.station_name.values])];len(stofs_2022)
stofs_2023 = stofs[stofs.ID.isin([' '.join(s.decode("utf-8").strip().split()[:3]) for s in stofs2.station_name.values])];len(stofs_2023)
stofs_2024 = stofs[stofs.ID.isin([' '.join(s.decode("utf-8").strip().split()[:3]) for s in stofs3.station_name.values])];len(stofs_2024)

562

1687

1688

luckily the new stations were appended at the end of the file. So this will be easier to concatenate data between all the files

In [6]:
stofs_2022[:557].equals(stofs_2023[:557])
stofs_2022[:557].equals(stofs_2024[:557])

True

True

We need to compare model storm surge with observation. We use IOC tide stations

In [7]:
def get_meta() -> gp.GeoDataFrame:
    meta_web = searvey.get_ioc_stations().drop(columns=["lon", "lat"])
    meta_api = (
        pd.read_json(
            "http://www.ioc-sealevelmonitoring.org/service.php?query=stationlist&showall=all"
        )
        .drop_duplicates()
        .drop(columns=["lon", "lat"])
        .rename(columns={"Code": "ioc_code", "Lon": "lon", "Lat": "lat"})
    )
    merged = pd.merge(
        meta_web,
        meta_api[["ioc_code", "lon", "lat"]].drop_duplicates(),
        on=["ioc_code"],
    )
    return merged.drop(columns=["geometry"])
ioc_ = get_meta()

We already have established a database for clean IOC data between 2022 and 2023 (see 1st plot), we'll use it as a reference: 

In [8]:
stofs_plot = stofs_2022.hvplot.scatter(x= "lon", y="lat", hover_cols = "ID", s=130, c='lightgrey', label = 'STOFS 2022 output stations')
stofs_plot1 = stofs_2023.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=150, c='grey', label = 'STOFS 2023 output stations')
stofs_plot2 = stofs_2024.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=200, c='k', label = 'STOFS 2024 output stations')
ioc_plot = ioc_.hvplot.scatter(x="lon", y="lat",hover_cols = "ioc_code", s= 30 , c = 'y', label = 'all IOC stations')
ioc_cleanup_plot = ioc_cleanup.hvplot.scatter(coastline=True,x="lon", y="lat",s = 80, c='r', label = "stations cleaned for 2022-2023")

(stofs_plot2 * stofs_plot1 * stofs_plot * ioc_cleanup_plot* ioc_plot).opts(width = 1300, height = 600)



We graphically detected all stations not already used in `ioc_cleanup` and corresponding with STOFS2D output locations

In [48]:
station_to_add = [
    "juan", "sanf", "anto", "ptmo", "valp", "ferg", "ambon", "bitu", "saum", "sho2", "ushu", 
    "espr", "gamb", "riki", "prud", "vald", "cord", "paak", "dsea", "ketc", "june", "skag", "sewa", "anch", "niki", "seld", "kodi", "alak", 
    "dshu", "dkod", "nome", "adak", "niko", "dchu", "midx", "fren", "sthl", "ascen", "jask", "chab", "kara", "musc", 
    "masi", "mais", "kerg", "syow", "ver1", "vern", "wait", "stpa", "sala", "tara", "marsh", "kwaj", "wake", "fong", 
    "solo", "vanu", "numbo", "numb2", "levu", "wlgt", "jack", "hako", "abas", "ofun", "mera", "toya", "nawi", "brpt", "heeia", 
    "moku", "mane", "john", "plmy", "xmas", "penr", "hiva", "pape", "raro", "pago", "pagx", "east", "garc", "Male2", "ganm", "male", "hani", 
    "mini", "coch", "vish", "chtt", "sitt", "moul", "ptbl", "komi", "kota", "lank", "ms001", "sab2", "saba", "vung", "quin", 
    "quar", "curri", "subi", "mani", "luba", "lega", "tkao", "tkee", "chij", "mins", "saip", "mala", "chuu", "kapi", "deke", "naur", "nauu", 
    "dumo", "espe", "porl", "hill", "waik", "lemba", "beno", "prgi", "prig", "cili", "cila", "tjls", "chrs", "ffcj", "cocb", "telu", "sibo", 
    "sib2", "tanjo", "bupo", "padn", "pada", "fpga", "winc", "wbnc", "oinc", "kpva", "leva", "simd", "wsdc", "cbmd", "ocmd", "cmnj", "phap", 
    "mhpa", "btny", "shnj", "mony", "ptme", "cwme", "epme", "hali", "nain", "nuk1", "nuuk", "qaqo", "reyk", "scor", "rptx", "cctx", "pitx", 
    "pric", "ftfr", "rose", "barb", "stcr", "lame", "isab", "vieq", "yobu", "yabu", "faja", "sanj", "arac", "maya", "magi", "penu", "mona", 
    "ptpr", "ptpl", "sama", "bull", "elpo", "limon", "quepo", "sana", "acaj", "acap", "acya", "manz", "mnza", "cabo", "fort", "call", "lobos", 
    "tala", "lali", "vkfl", "nafl", "fmfl", "spfl", "pnfl", "pbfl", "apfl", "tpfl", "fbfl", "moal", "wlms", "psla", "gila", "pfla", "ncla", 
    "apla", "eila", "cpla", "sptx", "gptx", "fptx", "bres", "sthm", "casc", "gibr", "ceut", "mars", "TR22", "gvd9", "alex", "palm", "pdas", 
    "plus", "dakar", "tako", "tkdi", "lagos", "pntn", "sitin", "walvi", "prte", "durb", "pemba", "mtwa", "momb", "lamu", "pmon", "aric", "mata", 
    "plat", "salv", "blueb", 
    # extra for Europe
    "delf", "bork", "harl", "ters", "denh", "hoek", "kiel", "warn", "euro", "mpcw", "dunk", "boul", "diep", "leha", "ouis", "rosc", "stma", "jers", "leco", "audi", "tudy", 
    "lecy", "conc", "sain", "leso", "larp", "iaix", "port", "arca", "scoa", "bil3", "san2", "gij2", "vil2", "setu", "arri", "sagr", "albu", "huel",
    "bon2", "cadi", "mal3", "motr", "alme", "carb", "murc2", "carg", "alac", "gand", "vale", "sagu", "tarr", "barc", "tst", "ptve", "ptln", "sete", "fosm",
    "toul", "figu", "monc", "cent", "rous", "ajac", "sole", "GE25", "LA38", "LI11", "MC41", "PT17", "CF06", "CA02", "MRTM", "PA07", "usti", "matel",
    "ME13", "RC09", "ST44", "GI20", "PDCR", "PLBR", "SC43", "PE09", "PE21", "PRTP", "CT03", "pant", "PSCA", "ppcp", "CI20", "AZ42", "PO40", "SA16", "GA37", 
    "NA23", "PL14", "CETR", "RCCL", "CR08", "lcst", "TA18", "tara1", "OT15", "MNPL", "BA05", "BRLT", "VI12", "IT45", "OR24", "SB36", "AN15", "RA10", "VE19", 
    "baka", "stari", "vela", "sobr", "corf", "prev", "zkth", "kata", "kypa", "kala", "koro", "kaps", "kast", "pale", "hrak", "iera", "kaso", "aigi", "pano", 
    "peir", "noat", "syro", "myko", "delo", "thes", "smth", "gokc", "bozc", "plom", "ment", "bodr", "kos", "plim", "kalt", "mrms", "feth", "bozy", "tasu", 
    "erdem", "arsu", "iske", "girn", "papho", "leme", "zygi", "larn", "para", "gazi", "batr", "haif", "hade", "ashd", "askl", "psail", "matr", "mang", "csta", 
    "sino", "kaci", "sams", "trab", "elja", "said"
]

some station can be declined in different names

In [49]:
possible_stations = []
all_ioc = ioc_.ioc_code.values
for stat in station_to_add:
    if any(stat in station for station in all_ioc):
        for station in all_ioc:
            if stat in station:
                possible_stations.append(station)
ioc_to_add = ioc_[ioc_.ioc_code.isin(possible_stations)]
ioc_to_add

Unnamed: 0,ioc_code,gloss_id,country,location,connection,contacts,added_to_system,observations_arrived_per_week,observations_expected_per_week,observations_ratio_per_week,...,sample_interval,average_delay_per_day,transmit_interval,dcp_id,last_observation_level,last_observation_time,delay,interval,lon,lat
0,abas,327,Japan,Abashiri,SWJP40,Japan Meteorological Agency ( Japan ),2012-03-21 09:54:59,10040,10080.0,100,...,1',8',10',ABASHIRI,1.66,07:59,28',10',144.290000,44.020000
3,acaj,182,El Salvador,Acajutla SV,SZXX01,Ministerio de Medio Ambiente y Recursos Natura...,2008-06-20 12:17:00,9750,10080.0,97,...,1',7',5',300434064008810,0.56,08:09,18',5',-89.838128,13.573792
4,acap,267,Mexico,Acapulco MX,SEPA40,Centro de Investigación Científica y de Educac...,2008-04-28 12:36:00,-down-,10080.0,0,...,1',,5',3540E15A,8.26,-down-,2799d,5',-99.916600,16.833300
5,acap2,267,Mexico,Acapulco API,SOMX10,Universidad Nacional Autónoma de México ( Mexi...,2014-05-19 10:50:47,10020,10080.0,99,...,1',9',10',0100D7CA,4.37,08:06,21',10',-99.903000,16.837933
9,acya,267,Mexico,Acapulco Club de Yates,ftp,Universidad Nacional Autónoma de México ( Mexi...,2010-08-10 09:24:41,,10080.0,0,...,1',,10',,1.31,2025-03-05 14:59,17h,10',-99.902980,16.837990
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631,yabu,,USA,"Yabucoa Harbor, PR",SXXX03,Puerto Rico Seismic Network ( USA ) +National ...,2010-03-26 14:06:00,9828,10080.0,98,...,1',7',6',3366B5CA,-6.27,08:08,19',6',-65.833000,18.055100
1636,yobu,,Puerto Rico,Yobucou PR,SXXX03,,2006-06-07 04:30:00,,,0,...,',,6',3366B5CA,,,,6',-65.833000,18.050100
1643,zkth,,Greece,"Zakynthos, Ionian Islands",bgan,National Observatory of Athens ( Greece ),2023-09-29 06:31:39,10032,10080.0,100,...,1',1',1',GR-ZKTH-00,0.01,08:25,2',1',20.905200,37.781420
1646,zygi,,Cyprus,Zygi,ftp,Cyprus Oceanography Center ( Cyprus ),2011-09-07 14:55:04,-down-,20160.0,0,...,0.5',,1',,1.91,-down-,3548d,1',33.338375,34.727083


In [45]:
stofs_plot =  stofs_2022.hvplot.scatter(x= "lon", y="lat", hover_cols = "ID", s=130, c='lightgrey', label = 'STOFS 2022 output stations')
stofs_plot1 = stofs_2023.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=150, c='grey', label = 'STOFS 2023 output stations')
stofs_plot2 = stofs_2024.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=200, c='k', label = 'STOFS 2024 output stations')
ioc_cleanup_plot = ioc_cleanup.hvplot.scatter(x="lon", y="lat",hover_cols = "ioc_code",s = 90, c='r',label = 'stations already cleaned for 2022-2023')
ioc_to_add_plot = ioc_to_add.hvplot.scatter(coastline=True,x="lon", y="lat",hover_cols = "ioc_code", s = 90, c = 'g', label = 'stations to be added')

(stofs_plot2 * stofs_plot1 * stofs_plot * ioc_to_add_plot * ioc_cleanup_plot).opts(width = 1400, height = 600)



the 2024 IOC cleanup database is the red + green points 

In [61]:
ioc_cleanup_2024 = pd.concat([ioc_cleanup,ioc_to_add])
ioc_cleanup_2024

Unnamed: 0,location,ioc_code,gloss_id,lat,lon,country,connection,contacts,dcp_id,last_observation_level,...,number_of_years,time_zone_hours,datum_information,instrument,precision,null_value,gauge_type,overall_record_quality,gesla3_id,seaset_id
125,Base O'Higgins,ohig,,-63.321000,-57.901000,Antarctica,SXCH40,Servicio Hidrográfico y Oceanográfico de la Ar...,ADC04BE6,1.75,...,,,,,,,,,,125.0
135,Puerto Deseado,dese,190.0,-47.754000,-65.915000,Argentina,SEPO40,Armada Argentina Servicio de Hidrografía Naval...,33912088,3.69,...,,,,,,,,,,135.0
136,Puerto Madryn,madry,191.0,-42.763000,-65.031000,Argentina,SEPO40,Armada Argentina Servicio de Hidrografía Naval...,335665D2,6.60,...,,,,,,,,,,136.0
139,Battery Point,bapj,,-42.892000,147.338000,Australia,SZAU01,National Tidal Centre/Australian Bureau of Met...,61221,0.91,...,,,,,,,,,,139.0
140,Broome,brom,40.0,-18.001000,122.219000,Australia,SZAU01,National Tidal Centre/Australian Bureau of Met...,62650,7.69,...,32.0,0.0,Unspecified,Unspecified,Unspecified,-99.9999,Coastal,No obvious issues,Broome,140.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1631,"Yabucoa Harbor, PR",yabu,,18.055100,-65.833000,USA,SXXX03,Puerto Rico Seismic Network ( USA ) +National ...,3366B5CA,-6.27,...,,,,,,,,,,
1636,Yobucou PR,yobu,,18.050100,-65.833000,Puerto Rico,SXXX03,,3366B5CA,,...,,,,,,,,,,
1643,"Zakynthos, Ionian Islands",zkth,,37.781420,20.905200,Greece,bgan,National Observatory of Athens ( Greece ),GR-ZKTH-00,0.01,...,,,,,,,,,,
1646,Zygi,zygi,,34.727083,33.338375,Cyprus,ftp,Cyprus Oceanography Center ( Cyprus ),,1.91,...,,,,,,,,,,


In [18]:
def find_nearest_nodes(
    mesh_nodes: pd.DataFrame,
    points: pd.DataFrame,
    metric: str = "haversine",
    earth_radius = 6371000,
    ):
    """
    Calculate the mesh nodes that are nearest to the specified `points`.
    Both `mesh_nodes` and `points` must be `pandas.DataFrames` that have
    columns named `lon` and `lat` and the coords must be in EPSG:4326.
    Returns the `points` DataFrame after adding these extra columns:
    - `mesh_index` which is the index of the node in the `hgrid.gr3` file
    - `mesh_lon` which is the longitude of the nearest mesh node
    - `mesh_lat` which is the latitude of the nearest mesh node
    - `distance` which is the distance in meters between the point and the nearest mesh node
    Examples:
        >>> mesh_nodes = pd.DataFrame({
        ...     "lon": [0, 10, 20],
        ...     "lat": [0, 5, 0],
        ... })
        >>> points = pd.DataFrame({
        ...     "lon": [1, 11, 21],
        ...     "lat": [1, 4, 1],
        ...     "id": ["a", "b", "c"],
        ... })
        >>> nearest_nodes = find_nearest_nodes(mesh_nodes, points)
        >>> nearest_nodes
           lon  lat id  mesh_index  mesh_lon  mesh_lat       distance
        0    1    1  a           0         0         0  157249.381272
        1   11    4  b           1        10         5  157010.162641
        2   21    1  c           2        20         0  157249.381272
    """
    # The only requirement is that both `mesh_nodes and `points` have `lon/lat` columns
    tree = sklearn.neighbors.BallTree(
        np.radians(mesh_nodes[["lat", "lon"]]),
        metric=metric,
    )
    distances, indices = tree.query(np.radians(points[["lat", "lon"]].values))
    closest_nodes = (
        mesh_nodes
        .rename(columns={"lon": "mesh_lon", "lat": "mesh_lat"})
        .iloc[indices.flatten()]
        .assign(distance=(distances.flatten() * earth_radius))
        .reset_index(names=["mesh_index"])
    )

    return pd.concat((points.reset_index(drop = True), closest_nodes), axis="columns")

# 2 - get STOFS
nearest_nodes_2022 = find_nearest_nodes(stofs_2022, ioc_cleanup_2024[["lon","lat","ioc_code","location"]])
nearest_nodes_2023 = find_nearest_nodes(stofs_2023, ioc_cleanup_2024[["lon","lat","ioc_code","location"]])
nearest_nodes_2024 = find_nearest_nodes(stofs_2024, ioc_cleanup_2024[["lon","lat","ioc_code","location"]])
nearest_nodes_2022 = nearest_nodes_2022[~nearest_nodes_2022.mesh_index.isna()]
nearest_nodes_2023 = nearest_nodes_2023[~nearest_nodes_2023.mesh_index.isna()]
nearest_nodes_2024 = nearest_nodes_2024[~nearest_nodes_2024.mesh_index.isna()]
keep_nodes_2022 = nearest_nodes_2022[nearest_nodes_2022.distance < 5000]
keep_nodes_2023 = nearest_nodes_2023[nearest_nodes_2023.distance < 5000]
keep_nodes_2024 = nearest_nodes_2024[nearest_nodes_2024.distance < 5000]

keep_nodes_2022.to_csv("keep_nodes_2022.csv")
keep_nodes_2023.to_csv("keep_nodes_2023.csv")
keep_nodes_2024.to_csv("keep_nodes_2024.csv")

red are all the STOFS2D points to be extracted

In [19]:
p2 = stofs_2022.hvplot.scatter(x="lon", y="lat", hover_cols = "ID", s=70, c='grey',line_color="lightgrey", label = 'STOFS 2022 output stations')
ip = ioc_cleanup_2024.hvplot.scatter(x="lon", y="lat",s = 10, c='k',coastline=True, label = 'IOC_CLEANUP 2022-2024')
k2 = keep_nodes_2022.hvplot.scatter(x="lon", y="lat", c = 'red', s = 20,coastline=True, label = "STOFS2D stations to be extracted")

# (world_oceans.hvplot(c='ocean',alpha= 0.9).opts(cmap='tab20c') * p2 * ip * k2 ).opts(width = 1100, height = 800)
(world_oceans.hvplot(c='ocean',alpha= 0.9).opts(cmap='tab20c') * p2 * ip ).opts(width = 1500, height = 900)



download IOC data

In [51]:
for i_s, station in ioc_cleanup_2024.iterrows():
    if os.path.exists(f"data/{station.ioc_code}.parquet"):
        print(station.ioc_code, "done")
    else: 
        print(station.ioc_code, "downloading..")
        df = searvey.fetch_ioc_station(station.ioc_code, "2022-01-01", "2024-12-31")
        df.to_parquet(f"data/{station.ioc_code}.parquet")

ohig done
dese done
madry done
bapj done
brom done
barn done
djve done
darw done
pkem done
pmur done
ross done
sprg done
thev done
trst done
oste done
bele done
bamf done
prin done
stjo done
vhbc done
greg done
ohig3 done
cald done
coqu done
corr done
pich done
ptal done
pcha done
pwil done
qtro done
quir done
talc done
viti done
cher done
dzao done
herb done
mare done
nuku done
stqy done
stqy2 done
tubua done
cuxh done
helg done
horn done
itea done
LA23 done
abur done
fuka done
hmda done
hana done
ishig done
kusm done
kush done
naga done
naha done
omae done
sado done
saig done
tosa done
waka done
kant done
huat done
chst done
ande done
honn done
malo done
rorv done
treg done
vard done
davo done
dapi done
hie2 done
fue2 done
coru done
arko done
fors done
furu done
gokr done
holm done
kalit done
karl done
klag done
kung done
kungr done
land done
olan done
oska done
oxel done
rata done
simp done
simr done
smog done
spik done
visb done
zanz done
nkfa done
amas done
anta done
igne done
sil

IOC-ST44: No data. Creating a dummy dataframe


stari downloading..
stcr done
stcr2 done
sthl done
sthl2 done
sthm done
stma downloading..
stma2 downloading..
stpa done
subi done
syow done
syro downloading..
TA18 downloading..
tako done
tala done
tanjo done
tara done
tara1 done
tarr downloading..
tasu downloading..
telu done
ters downloading..
thes downloading..
tjls done
tkao done
tkdi done
tkee done
toul downloading..
toul2 downloading..
toya done
tpfl done
TR22 done
trab downloading..
tst1 downloading..
tst2 downloading..


IOC-tst2: Dropped duplicates: 15477 rows
IOC-tst2: Dropped duplicates: 15194 rows
IOC-tst2: Dropped duplicates: 3565 rows
IOC-tst2: Dropped duplicates: 26706 rows
IOC-tst2: Dropped duplicates: 13797 rows
IOC-tst2: Dropped duplicates: 31348 rows
IOC-tst2: Dropped duplicates: 20907 rows
IOC-tst2: Dropped duplicates: 11034 rows


tudy downloading..
ushu done
usti downloading..
vald done
vald2 done
vale downloading..
valp done
valp2 done
valp3 done
vanu done
VE19 downloading..
vela downloading..


IOC-vela: No data. Creating a dummy dataframe


ver1 done
vern done
VI12 downloading..
vieq done
vieq2 done
vil2 downloading..
vish done
vkfl done
vung done
waik done
wait done
wake done
wake2 done
walvi done
warn downloading..
wbnc done
winc done
wlgt done
wlms done
wlms2 done
wsdc done
xmas done
yabu done
yobu done
zkth downloading..
zygi downloading..


IOC-zygi: No data. Creating a dummy dataframe


zygi1 downloading..


check data availability

In [52]:
import re 
import glob

ioc_cleanup_2024['n_sensors'] = 0

pattern = r"data/([\w\d]+)\.parquet"
stations = [re.search(pattern, path).group(1) for path in glob.glob("data/*parquet")]
f"Total stations: {len(stations)}"

# get the stations with data
keep_stations = []
for station in sorted(stations): 
    df = pd.read_parquet(f"data/{station}.parquet")
    if df.empty:
        pass
    else:
        print(station, end="  ")
        keep_stations.append(station)
        df = df.drop(columns=[col for col in ["sw1", "sw2", "bat"] if col in df.columns])
        print(list(df.columns))

        ioc_cleanup_2024.loc[ioc_cleanup_2024.ioc_code == station, "n_sensors"] = len(df.columns)
        # disregard sw1, sw2 and bat
f"Stations with data: {len(keep_stations)}"

'Total stations: 719'

AN15  ['rad']
AZ42  ['rad']
BA05  ['rad']
BRLT  ['rad']
CA02  ['rad']
CETR  ['pr1', 'pr2']
CF06  ['rad']
CI20  ['rad']
CR08  ['rad']
CT03  ['rad']
GA37  ['rad']
GE25  ['rad']
GI20  ['prs']
IT45  ['rad']
LA23  ['rad']
LA38  ['rad']
LI11  ['rad']
MC41  ['rad']
ME13  ['rad']
MNPL  ['rad']
MRTM  ['pr1', 'pr2']
Male2  ['prs', 'ra2', 'rad']
NA23  ['rad']
OR24  ['rad']
OT15  ['rad']
PA07  ['rad']
PDCR  ['pr1', 'pr2']
PE09  ['prs']
PL14  ['rad']
PLBR  ['pr1']
PO40  ['rad']
PRTP  ['pr1', 'pr2']
PSCA  ['pr1', 'pr2']
PT17  ['rad']
RA10  ['rad']
RC09  ['rad']
RCCL  ['pr1', 'pr2']
SA16  ['rad']
SB36  ['rad']
SC43  ['rad']
TA18  ['rad']
TR22  ['rad']
VE19  ['rad']
VI12  ['rad']
abas  ['rad']
abed  ['bub']
abur  ['rad']
acaj  ['atm', 'prs', 'ra2', 'rad']
acap2  ['rad']
acnj  ['wls']
acya  ['flt']
adak  ['wls']
adak2  ['pwl']
aigi  ['rad']
ajac  ['rad']
ajac2  ['rad']
alac1  ['rad']
alac2  ['rad']
alak  ['pwl']
alak2  ['pwl']
alam  ['pwl']
albu  ['rad']
alex1  ['rad']
alex2  ['rad']
alex3  ['pwl']
alm

'Stations with data: 601'

In [53]:
ioc_cleanup_2024.n_sensors.hvplot.hist(bins=[-0.5,0.5,1.5,2.5,3.5,4.5,5.5])

In [62]:
ioc_cleanup_2024_with_data = ioc_cleanup_2024[ioc_cleanup_2024.ioc_code.isin(keep_stations)]
ioc_cleanup_2024_with_data.to_csv("ioc_cleanup_2024.csv")

store in separate files

In [56]:
for i_s, s in ioc_cleanup_2024_with_data.iterrows():
    df = pd.read_parquet(f"data/{s.ioc_code}.parquet")
    df = df.drop(columns=[col for col in ["sw1", "sw2", "bat"] if col in df.columns])
    for sensor in df.columns:
        ts = df[[sensor]]
        ts.to_parquet(f"raw/{s.ioc_code}_{sensor}.parquet")

evaluate data availabilty

In [57]:
pattern = r"raw/([\w\d]+)\.parquet"
stations_sensors = [re.search(pattern, path).group(1) for path in glob.glob("raw/*parquet")]
f"Total individual recordings: {len(stations_sensors)}"

'Total individual recordings: 826'

In [58]:
import typing as T
DETIDE_START = pd.Timestamp(2022,1,1)
DETIDE_END = pd.Timestamp(2025,1,1)

def calc_ratio(sr: pd.Series, period: pd.DatetimeIndex) -> float:
    sr = sr[(period[0] <= sr.index) & (sr.index <= period[-1])]
    return len(sr) / len(period)

table = dict()
for station_sensor in sorted(stations_sensors):
    station, sensor = station_sensor.split('_')
    df = pd.read_parquet(f"raw/{station_sensor}.parquet")
    interval_value_counts = df.index.to_series().diff().value_counts()
    main_interval_occurences = interval_value_counts.iloc[0]
    main_interval = T.cast(pd.Timedelta, interval_value_counts.index[0])
    detide_period = pd.date_range(DETIDE_START, DETIDE_END, freq=main_interval, inclusive="left")
    table[station_sensor] = dict()
    item = ioc_cleanup_2024_with_data[ioc_cleanup_2024_with_data.ioc_code == station]
    table[station_sensor]["lon"] = item.lon.values[0]
    table[station_sensor]["lat"] = item.lat.values[0]
    table[station_sensor]["completeness"] = calc_ratio(df, detide_period)
    # redo per sensor 

stations_sensors_availability = pd.DataFrame(table).T

In [59]:
stations_sensors_availability.describe()
stations_sensors_availability.completeness.hvplot.hist()

Unnamed: 0,lon,lat,completeness
count,826.0,826.0,826.0
mean,-5.504539,19.513416,0.795058
std,89.155516,30.031451,0.275727
min,-177.708,-69.007778,3e-06
25%,-71.627873,-0.95,0.718041
50%,-0.280155,24.3061,0.931198
75%,39.65,42.635556,0.981895
max,179.1949,70.98,1.44877


In [60]:
stations_sensors_availability.hvplot.points(
    x= "lon", y='lat', 
    hover_cols = ['index',"completeness" ],
    color = "completeness", 
    geo=True,
    s = 200
).opts(
    height = 800,
    width = 1600, 
    cmap='colorwheel'
) * k2