# Generate Test Data

In [1]:
from getpass import getuser
from pathlib import Path

import dask
import intake
import numpy as np
import xarray as xr

import marEx
import marEx.helper as hpc

In [2]:
# Per-user storage roots: Lustre scratch space and the directory the test data is saved to.
# Both follow the "<first letter of user name>/<user name>" layout.
username = getuser()
user_prefix = Path(username[0]) / username
scratch_dir = Path("/scratch") / user_prefix
save_dir = Path("/home").joinpath(user_prefix, "opt", "marEx", "tests", "data")

In [3]:
# Start Dask Cluster
# `scratch_dir` tells Dask which temporary scratch directory to use.
cluster_options = dict(
    n_workers=16,
    threads_per_worker=1,
    scratch_dir=scratch_dir / "clients",
)
client = hpc.start_local_cluster(**cluster_options)

# Raw SST Data

In [None]:
# Open the EERIE project's intake catalogue (YAML file hosted on GitHub).
catalog_url = "https://raw.githubusercontent.com/eerie-project/intake_catalogues/main/eerie.yaml"
cat = intake.open_catalog(catalog_url)

In [None]:
# Import 40 years of OSTIA Observations
dat_regrid = cat["dkrz.disk.observations.OSTIA.OSTIA.daily"]

# Lazily open the dataset and take its `sst` variable as float32.
ostia_sst = dat_regrid(chunks={}).to_dask()["sst"].astype(np.float32)

# Average over 5x5 (lat, lon) blocks and rename the result to "to",
# the variable name used for the ICON data in this notebook.
sst_regrid = ostia_sst.coarsen(lat=5, lon=5).mean().rename("to")
sst_regrid

In [None]:
# Import 40 years of Daily ICON data (ref. EERIE project)
icon_entry = "dkrz.disk.model-output.icon-esm-er.eerie-control-1950.v20240618"
dat_native = cat[icon_entry]["ocean"]["native"]

# Lazily open the daily-mean 2D output.
icon_daily = dat_native["2d_daily_mean"](chunks={}).to_dask()

# Take variable `to` at the first depth index, drop the coordinates that are
# not needed, and chunk along time.
icon_surface = icon_daily["to"].isel(depth=0)
sst_native = icon_surface.drop_vars({"depth", "cell_sea_land_mask"}).chunk({"time": 32})
sst_native

In [None]:
# Define subset
# Time windows (label-based) for each data source.
time_slice_ostia = slice("1982-01-01", "2022-01-01")
time_slice_icon = slice("1991-01-01", "2031-01-01")

# Spatial window for the gridded OSTIA data (lat/lon labels).
regrid_slice_ostia = {"lat": slice(35, 40), "lon": slice(-40, -30)}

# Positional window along `ncells` for the unstructured ICON data.
native_slice_icon = slice(0, 1000)

In [None]:
# Cut the gridded SST to the chosen time/lat/lon window, then rechunk:
# 30 time steps per chunk, each chunk spanning the full lat/lon extent.
ostia_window = {"time": time_slice_ostia, **regrid_slice_ostia}
ostia_chunks = {"time": 30, "lat": -1, "lon": -1}
sst_regrid_slice = sst_regrid.sel(ostia_window).chunk(ostia_chunks)
sst_regrid_slice

In [None]:
# Cut the unstructured SST to the chosen time window and the first cells
# (by position along `ncells`), then drop the lat/lon coordinates.
icon_window = sst_native.sel(time=time_slice_icon).isel(ncells=native_slice_icon)
sst_native_slice = icon_window.drop_vars({"lat", "lon"})
sst_native_slice

In [None]:
# Save to Zarr
# mode="w" overwrites any existing store at the target path.
gridded_store = save_dir / "sst_gridded.zarr"
unstructured_store = save_dir / "sst_unstructured.zarr"
sst_regrid_slice.to_zarr(gridded_store, mode="w")
sst_native_slice.to_zarr(unstructured_store, mode="w")

# Pre-processed Data

In [None]:
# Open the pre-processed gridded extremes from scratch and keep only the selected
# time window.
# NOTE(review): this store is read here but not written by any cell visible in
# this notebook, so it presumably has to exist beforehand.
time_slice = slice("2010-01-01", "2010-02-01")
extremes_store = scratch_dir / "mhws" / "extremes_binary_gridded_shifting_hobday.zarr"
extremes_full = xr.open_zarr(extremes_store, chunks={})

# Drop the variables that are not kept in the saved test data.
extremes_ds = extremes_full.sel(time=time_slice).drop_vars(
    {"thresholds", "dat_anomaly", "dayofyear"}
)
extremes_ds

In [None]:
# Coarsen extremes_ds (lat & lon)
# 4x4 blocks, trimming any excess at the boundary; a coarse cell is True if any
# of the cells in its block is True.
coarse_blocks = extremes_ds.coarsen(lat=4, lon=4, boundary="trim")
coarse_chunks = {"time": 2, "lat": -1, "lon": -1}
extremes_ds_coarsen = coarse_blocks.any().chunk(coarse_chunks).persist()
extremes_ds_coarsen

In [None]:
# Clear encoding so we can write to Zarr
# Reset the dataset-level encoding first, then the encoding of every data variable.
extremes_ds_coarsen.encoding = {}
for data_array in extremes_ds_coarsen.data_vars.values():
    data_array.encoding = {}

In [None]:
# Write the coarsened gridded extremes to the test-data directory (overwriting).
extremes_gridded_store = save_dir / "extremes_gridded.zarr"
extremes_ds_coarsen.to_zarr(extremes_gridded_store, mode="w")

# Make Pre-processed Native Data

In [4]:
# Open the EERIE project's intake catalogue (same URL as in the Raw SST Data section).
catalog_url = "https://raw.githubusercontent.com/eerie-project/intake_catalogues/main/eerie.yaml"
cat = intake.open_catalog(catalog_url)

In [5]:
# Import 40 years of Daily ICON data (ref. EERIE project)
icon_entry = "dkrz.disk.model-output.icon-esm-er.eerie-control-1950.v20240618"
dat_native = cat[icon_entry]["ocean"]["native"]

# Lazily open the daily-mean 2D output.
icon_daily = dat_native["2d_daily_mean"](chunks={}).to_dask()

# Take variable `to` at the first depth index, drop the coordinates that are
# not needed, and chunk along time.
icon_surface = icon_daily["to"].isel(depth=0)
sst_native = icon_surface.drop_vars({"depth", "cell_sea_land_mask"}).chunk({"time": 32})
sst_native

In [6]:
# Time window for the pre-processed unstructured data
# (replaces the longer `time_slice_icon` defined in the Raw SST Data section).
icon_start, icon_end = "2000-01-01", "2002-01-01"
time_slice_icon = slice(icon_start, icon_end)

In [7]:
# Restrict to the time window only; every cell along `ncells` is kept here.
sst_native_slice = sst_native.sel({"time": time_slice_icon})
sst_native_slice

In [8]:
# Load the 2D grid description and rename its `cell` dimension to `ncells`
# so that it matches the SST data.
grid2d = dat_native["2d_grid"](chunks={}).to_dask().rename({"cell": "ncells"})

# Neighbour indices and cell areas, with `clat`/`clon` renamed to `lat`/`lon`.
coord_names = {"clat": "lat", "clon": "lon"}
neighbours = grid2d["neighbor_cell_index"].rename(coord_names)
areas = grid2d["cell_area"].rename(coord_names)

In [9]:
# Rechunk along the cell dimension: 100,000 cells per chunk.
sst_native_slice = sst_native_slice.chunk(ncells=100_000)
sst_native_slice

In [10]:
# Process Data using `MarEx Detect` helper functions:

extremes_ds = marEx.preprocess_data(
    sst_native_slice,
    # Use the 95th percentile as the extremes threshold
    threshold_percentile=95,
    # Dask chunks for *output* data (this is much smaller than the input chunks
    # because the Tracking/ID is more memory-intensive)
    dask_chunks={"time": 2},
    # Pass information about neighbours to be used in subsequent processing
    neighbours=neighbours,
    # Pass information about each Unstructured Grid cell's area to be used in
    # subsequent processing (NOTE(review): confirm the units marEx expects)
    cell_areas=areas,
    # Not specifying 'ydim' tells MarEx-Detect that it is an Unstructured Grid
    dimensions={"time": "time", "xdim": "ncells"},
)

# Write the result to the scratch store that the subsetting cell further down
# reads back. Without this, that cell depends on a file no cell in the notebook
# produces, and a fresh Restart & Run All fails.
# NOTE(review): the cells that originally did this (execution counts 11-12)
# appear to have been deleted; confirm the target path.
extremes_ds.to_zarr(
    scratch_dir / "mhws" / "extremes_unstructured_small.zarr", mode="w"
)
extremes_ds

In [13]:
# Number of leading cells (along `ncells`) kept in the unstructured test subset.
N_cells = 1_000

In [14]:
# Keep only the first `N_cells` cells of the pre-processed unstructured extremes.
native_slice_icon = slice(0, N_cells)

# Open the store lazily, subset, and only then load into memory. Loading first
# (as `xr.load_dataset` does) would read the entire store just to discard all
# but `N_cells` cells. The result is still a fully in-memory Dataset.
extremes_subset = (
    xr.open_zarr(scratch_dir / "mhws" / "extremes_unstructured_small.zarr", chunks={})
    .isel(ncells=native_slice_icon)
    .load()
)
extremes_subset

In [15]:
# Set all neighbours > N_cells to be 0
# After subsetting, larger indices point at cells that are no longer in the dataset.
neighbour_ids = extremes_subset["neighbours"]
inside_subset = neighbour_ids <= N_cells
extremes_subset["neighbours"] = neighbour_ids.where(inside_subset, 0)

In [16]:
# Write the unstructured extremes subset to the test-data directory (overwriting).
extremes_unstructured_store = save_dir / "extremes_unstructured.zarr"
extremes_subset.to_zarr(extremes_unstructured_store, mode="w")