# Simple pre-processing of HEALPix output for cyclone tracking with TempestExtremes

By Alex Baker

* This notebook takes approx. 3 hours to run for N2560 data. If you are familiar with batch processing on JASMIN, running it as a batch job will speed up processing; a script is available to do this.

[Edit by Stella] Modified the number of grid points so that the target grid actually represents a 0.2° grid. A higher number of points makes TempestExtremes very slow.

In [1]:
import os, intake, datetime
import xarray as xr
import numpy as np
import easygems.healpix as egh
import healpix as hp
from tqdm import tqdm

  _set_context_ca_bundle_path(ca_bundle_path)


In [2]:
# Tracking configuration: HEALPix zoom level, variables and pressure levels.
zoom = 8

# 2D fields: TempestExtremes tracking is psl-based, with surface wind maxima
# (uas, vas) added to the tracks.
variables_1h_2d = ['psl', 'uas', 'vas']

# 3D field: zg is used by TempestExtremes for warm-core detection and (later,
# optionally) for computing cyclone phase-space parameters (see Stella
# Bourdin's code).
variables_3h_3d = ['zg']

# Pressure levels selected from the 3D field when it is written out.
plevc = np.asarray([925, 500, 250])

In [3]:
# Open the hackathon catalogue and keep only its 'online' sub-catalogue.
catalog_url = 'https://digital-earths-global-hackathon.github.io/catalog/catalog.yaml'
cat = intake.open_catalog(catalog_url)['online']

In [4]:
# List the catalogue entries whose key starts with 'arp'
list(filter(lambda name: name.startswith('arp'), cat))

['arp-gem-1p3km', 'arp-gem-2p6km']

In [5]:
# Select the simulation and create its output directory (this may need to be
# on a group workspace or on scratch, rather than ~/).
#run = 'um_glm_n1280_GAL9'
run = 'arp-gem-2p6km'

working_dir = '/work/scratch-nopw2/sbourdin/'
output_dir = os.path.join(working_dir, 'data_pp/', run)
# exist_ok=True replaces the separate isdir() check: it avoids the
# check-then-create race and makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Open the hourly and the 6-hourly dataset lazily at the chosen zoom level
# (egh.attach_coords is not applied to either of them).
# NOTE: despite its name, ds_3h_3d is requested at 'PT6H' frequency.
ds_1h_2d, ds_3h_3d = (
    cat[run](zoom=zoom, time=freq).to_dask() for freq in ('PT1H', 'PT6H')
)

  'dims': dict(self._ds.dims),
  'dims': dict(self._ds.dims),


In [9]:
# Display the time coordinate of the dataset opened at 'PT6H'.
ds_3h_3d['time']

In [8]:
# Keep the hourly timestamps whose hour is a multiple of 6 and show them as
# 'YYYY-MM-DDTHH' strings (the first 13 characters of the full timestamp).
six_hourly = ds_1h_2d.time.dt.hour % 6 == 0
stamps = ds_1h_2d.time.where(six_hourly, drop = True).values.astype(str)
L = [stamp[:13] for stamp in stamps]
L

['2020-01-01T06',
 '2020-01-01T12',
 '2020-01-01T18',
 '2020-01-02T00',
 '2020-01-02T06',
 '2020-01-02T12',
 '2020-01-02T18',
 '2020-01-03T00',
 '2020-01-03T06',
 '2020-01-03T12',
 '2020-01-03T18',
 '2020-01-04T00',
 '2020-01-04T06',
 '2020-01-04T12',
 '2020-01-04T18',
 '2020-01-05T00',
 '2020-01-05T06',
 '2020-01-05T12',
 '2020-01-05T18',
 '2020-01-06T00',
 '2020-01-06T06',
 '2020-01-06T12',
 '2020-01-06T18',
 '2020-01-07T00',
 '2020-01-07T06',
 '2020-01-07T12',
 '2020-01-07T18',
 '2020-01-08T00',
 '2020-01-08T06',
 '2020-01-08T12',
 '2020-01-08T18',
 '2020-01-09T00',
 '2020-01-09T06',
 '2020-01-09T12',
 '2020-01-09T18',
 '2020-01-10T00',
 '2020-01-10T06',
 '2020-01-10T12',
 '2020-01-10T18',
 '2020-01-11T00',
 '2020-01-11T06',
 '2020-01-11T12',
 '2020-01-11T18',
 '2020-01-12T00',
 '2020-01-12T06',
 '2020-01-12T12',
 '2020-01-12T18',
 '2020-01-13T00',
 '2020-01-13T06',
 '2020-01-13T12',
 '2020-01-13T18',
 '2020-01-14T00',
 '2020-01-14T06',
 '2020-01-14T12',
 '2020-01-14T18',
 '2020-01-

In [None]:
# Set up the target lat-lon grid and look up, for every grid point, the index
# of the HEALPix pixel at that location (nested ordering).
#
# The grid is a regular 0.2 x 0.2 degree grid of cell centres with
# 1800 x 900 points:
#   * np.linspace(0, 360, 1800) would contain both 0 and 360 degrees, i.e.
#     the same meridian twice, and would have a spacing of 360/1799 degrees
#     rather than 0.2 degrees;
#   * cell centres never fall on lon = 90, 180, 270, which need to be
#     avoided, nor on the poles.
lon = np.linspace(0.1, 359.9, 1800)
lat = np.linspace(89.9, -89.9, 900)

# np.meshgrid(lon, lat) yields 2D longitude and latitude arrays of shape
# (lat, lon); they are passed to ang2pix in degrees (lonlat=True).
pix = xr.DataArray(
    hp.ang2pix(ds_1h_2d.crs.healpix_nside, *np.meshgrid(lon, lat), nest=True, lonlat=True),
    coords=(("lat", lat), ("lon", lon)))

In [None]:
# Quick visual check: take one psl timestep, remap it onto the lat-lon grid
# through the pixel lookup, and plot it.
psl_snapshot = ds_1h_2d['psl'].sel(time="2020-01-20 06:00")
psl_snapshot.isel(cell=pix).plot()

In [None]:
%%time

# Loop over both datasets, saving one NetCDF file per variable and per
# 6-hourly timestep (these can be combined later using "cdo cat...", if preferred)
date_start = str(ds_1h_2d.time[0].to_numpy())[:10]
# NOTE(review): the end date is read from the second-to-last timestamp;
# presumably the last timestamp already belongs to the following day - confirm.
date_end = str(ds_1h_2d.time[-2].to_numpy())[:10]
print(f'date start: {date_start}')
print(f'date end: {date_end}')


def _six_hourly_times(ds):
    """Return the timestamps of *ds* whose hour is 00, 06, 12 or 18.

    Selecting on the hour instead of using a fixed stride gives 6-hourly
    output whatever the dataset's own frequency or first timestamp is: a
    stride of 2 on a dataset opened at 'PT6H' would yield 12-hourly data,
    and a stride of 6 on hourly data only works if the first timestamp
    happens to fall on one of those hours.
    """
    on_six_hourly_mark = (ds.time.dt.hour % 6 == 0).values
    return ds.time.values[on_six_hourly_mark]


def _write_timesteps(ds, variables, **extra_sel):
    """Write one remapped NetCDF file per variable and 6-hourly timestep.

    ds        : dataset to read from.
    variables : names of the variables in *ds* to export.
    extra_sel : additional label-based selections passed to .sel()
                (e.g. pressure=plevc for the 3D fields).

    Files are named '<run>_<var>_<YYYY-MM-DDTHH>.nc' and written to
    output_dir; data are remapped to the lat-lon grid through pix.
    """
    times = _six_hourly_times(ds)   # identical for every variable of ds
    for var in variables:
        print(var)
        for t in tqdm(times):
            output_fn = '_'.join([run, var, str(t)[:13]]) + '.nc'
            output_ffp = os.path.join(output_dir, output_fn)
            d = ds[var].sel(time=t, **extra_sel).isel(cell=pix)
            d.to_netcdf(output_ffp)


_write_timesteps(ds_1h_2d, variables_1h_2d)
_write_timesteps(ds_3h_3d, variables_3h_3d, pressure=plevc)