# Pre-process ERA5 pressure level data for CREDIT

This notebook provides key notes on the preprocessing of ERA5 pressure level data:

* ARCO-ERA5 access
* Mass-conserved vertical level subsetting
* Stacking forecast lead time and initialization time for ERA5 forecast variables
* Aggregating hourly quantities to 6 hourly

In [1]:
import os
import sys
import numpy as np
import xarray as xr

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
sys.path.insert(0, os.path.realpath('../libs/'))
import verif_utils as vu

## Get data from Google Cloud ARCO-ERA5

* Source: https://console.cloud.google.com/storage/browser/gcp-public-data-arco-era5
* GitHub: https://github.com/google-research/arco-era5
* Latest hourly: gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3

In [4]:
# Data begins at 1900 with NaNs
# subset to 1940-01-01 or later to get actual values
ERA5_1h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3",
    chunks=None,
    storage_options=dict(token='anon'),)

In [5]:
time_start = '1979-01-01T00'
time_end = '1979-12-31T23'
ERA5_1h_yearly = ERA5_1h.sel(time=slice(time_start, time_end))

## Vertical coordinate subsetting with conserved total column properties

#### 1D example

In [6]:
def integral_conserved_subset_1d(x_column, level_p, ind_select):
    '''
    Subset a single vertical column onto coarser layers while conserving
    its vertical (trapezoidal-rule) integral.

    Args:
        x_column: 1D array of values given on every level of level_p
        level_p: 1D array of pressure levels
        ind_select: indices into level_p that mark the layer boundaries
    Returns:
        1D array with len(ind_select)-1 layer-mean values; multiplying it by
        np.diff(level_p[ind_select]) and summing gives the trapezoidal integral
        between the first and the last selected level
    '''
    # one output value per pair of neighbouring selected indices
    n_layers = len(ind_select) - 1
    layer_mean = np.full(n_layers, np.nan)

    # trapezoid area between each pair of neighbouring full levels
    thickness = np.diff(level_p)
    pair_mean = 0.5 * (x_column[1:] + x_column[:-1])
    trapezoid_area = pair_mean * thickness

    # each output layer = enclosed area divided by the thickness of that layer
    for k, (lower, upper) in enumerate(zip(ind_select[:-1], ind_select[1:])):
        enclosed_area = np.sum(trapezoid_area[lower:upper])
        layer_mean[k] = enclosed_area / (level_p[upper] - level_p[lower])

    return layer_mean

def integral_conserved_subset_4d(x_grid, level_p, ind_select):
    '''
    Vertical-level subsetting of a 4D grid (time, level, latitude, longitude)
    that keeps the vertical (trapezoidal-rule) integral of every column.

    Args:
        x_grid: 4D grid of data with shape (time, level, latitude, longitude)
        level_p: 1D array of pressure levels
        ind_select: np.array of int values that select specific levels
    Returns:
        out_grid: array of shape (time, len(ind_select)-1, latitude, longitude)
                  holding one layer-mean value per pair of neighbouring
                  selected levels
    '''
    # output keeps every dimension except level, which shrinks to the layer count
    n_layers = len(ind_select) - 1
    layer_mean = np.full(
        (x_grid.shape[0], n_layers, x_grid.shape[2], x_grid.shape[3]), np.nan
    )

    # thickness of each full-resolution layer, shaped to broadcast over (lat, lon)
    thickness = np.diff(level_p).reshape(-1, 1, 1)

    # trapezoidal rule: mean of neighbouring levels times the layer thickness
    pair_mean = 0.5 * (x_grid[:, 1:, :, :] + x_grid[:, :-1, :, :])
    trapezoid_area = pair_mean * thickness

    # each output layer = enclosed area divided by the thickness of that layer
    for k, (lower, upper) in enumerate(zip(ind_select[:-1], ind_select[1:])):
        enclosed_area = np.sum(trapezoid_area[:, lower:upper, :, :], axis=1)
        layer_mean[:, k, :, :] = enclosed_area / (level_p[upper] - level_p[lower])

    return layer_mean



In [7]:
# Open the ARCO-ERA5 hourly store with anonymous access
# (same URL as in the "Get data from Google Cloud ARCO-ERA5" section)
ERA5_1h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3",
    chunks=None,
    storage_options=dict(token='anon'),)

# Keep the year 1979 only; note that the name ERA5_1h is rebound to the subset
time_start = '1979-01-01T00'
time_end = '1979-12-31T23'
ERA5_1h = ERA5_1h.sel(time=slice(time_start, time_end))

In [8]:
# get the full pressure level coordinates
level_p = np.array(ERA5_1h['level'])
ind_select = np.array([0, 1, 2, 3, 5, 17, 24, 25, 26, 32, 36])

In [9]:
level_p_select = level_p[ind_select]

In [10]:
# a piece of example data
ds_full_level = ERA5_1h['temperature'].isel(time=slice(0, 2))
test_data = np.array(ds_full_level)
x_column = test_data[1, :, 100, 200]

In [11]:
x_column

array([279.89435, 269.9138 , 260.95508, 238.76361, 226.11641, 213.80423,
       206.43935, 201.34184, 203.07797, 205.13799, 208.43266, 211.47073,
       212.7036 , 214.33925, 215.40028, 215.38864, 214.9771 , 213.86823,
       216.5767 , 220.52747, 224.7109 , 227.27908, 229.39972, 233.72934,
       238.36395, 242.1929 , 245.68329, 247.31508, 248.88345, 250.06119,
       250.15376, 248.66463, 245.14752, 242.35803, 242.7212 , 243.93507,
       245.16734], dtype=float32)

In [12]:
# compute the vertical integral that we need to conserve
# (trapezoidal rule over the full set of pressure levels)
int_original = np.trapz(x_column, level_p)
print('vertical integral of the original data: {}'.format(int_original))

vertical integral of the original data: 228125.6005706787


In [13]:
out_column_a = integral_conserved_subset_1d(x_column, level_p, ind_select)

In [14]:
out_column_a

array([274.90405273, 265.43444824, 249.85934448, 224.95219727,
       211.09089213, 225.47704642, 240.27842712, 243.93809509,
       248.41558584, 243.54293442])

In [15]:
level_p_select = level_p[ind_select]
int_x = np.sum(out_column_a * np.diff(level_p_select))
print('vertical integral after subsetting using conserved area: {}'.format(int_x))

vertical integral after subsetting using conserved area: 228125.6005706787


In [16]:
# bad example: what happens if we simply index the selected levels
# and integrate the remaining values with the trapezoidal rule
x_column_select = x_column[ind_select]
int_x_bad = np.trapz(x_column_select, level_p_select)
print('vertical integral after subsetting using a simple indexing: {}'.format(int_x_bad))

vertical integral after subsetting using a simple indexing: 228863.49476623535


In [17]:
def reverse_subset(out_column_a, level_p, ind_select):
    '''
    Approximate the full-level column from its layer-mean subset by
    1D linear interpolation.

    Each value of out_column_a is placed at the midpoint of the two selected
    pressure levels that bound its layer, and the result is interpolated
    back onto every level of level_p.

    Args:
        out_column_a: The subsetted array produced by integral_conserved_subset_1d
        level_p: The pressure levels array
        ind_select: Indices used for subsetting x_column

    Returns:
        x_column_est: Interpolated x_column, one value per entry of level_p
    '''
    # pressure levels that bound the subsetted layers
    layer_bounds = level_p[ind_select]

    # centre of every layer: mean of its upper and lower bounding level
    layer_centre = (layer_bounds[:-1] + layer_bounds[1:]) * 0.5

    return np.interp(level_p, layer_centre, out_column_a)

In [18]:
x_column_ = reverse_subset(out_column_a, level_p, ind_select)

In [19]:
x_column_[ind_select] 

array([274.90405273, 270.16925049, 260.24274699, 242.74301671,
       224.71725989, 217.6096183 , 238.42825454, 242.10826111,
       245.05746778, 245.49199498, 243.54293442])

In [20]:
x_column[ind_select]

array([279.89435, 269.9138 , 260.95508, 238.76361, 213.80423, 213.86823,
       238.36395, 242.1929 , 245.68329, 245.14752, 245.16734],
      dtype=float32)

#### 4D example

In [21]:
# a piece of example data
ds_full_level = ERA5_1h['temperature'].isel(time=slice(0, 2))
test_data = np.array(ds_full_level)
# NOTE(review): test_data only holds the 2 time steps selected above, so the
# 1:5 slice is clamped and x_grid ends up with a single time step (index 1)
# — confirm that this is intended
x_grid = test_data[1:5, :, :, :]

In [22]:
# kept because the subset-integral cell below loops over grid_shape
grid_shape = x_grid.shape

# Trapezoidal integral along the level axis (axis=1) of every
# (time, latitude, longitude) column at once, then summed over all columns.
# This replaces a Python triple loop that called np.trapz once per column;
# the summation order differs, so the last printed digits may change.
int_original = np.trapz(x_grid, level_p, axis=1).sum()

print('vertical integral of the original data: {}'.format(int_original))

vertical integral of the original data: 256855241191.82944


In [23]:
out_grid_a = integral_conserved_subset_4d(x_grid, level_p, ind_select)

In [24]:
# pressure thickness of every subsetted layer
level_p_diff = np.diff(level_p_select)

# Weight each layer-mean value by its thickness and sum over all
# (time, level, latitude, longitude) entries in one broadcast reduction.
# This replaces a Python triple loop over every column; the summation order
# differs, so the last printed digits may change.
int_x = np.sum(out_grid_a * level_p_diff[np.newaxis, :, np.newaxis, np.newaxis])

print('vertical integral of the subsetted data: {}'.format(int_x))

vertical integral of the subsetted data: 256855241191.82962


## How to combine fcst lead time and init time dimensions

In [5]:
import pandas as pd

In [6]:
# my collection: time_start = '1979-01-01T00'; time_end = '1979-01-01T23'
# (surf_test.zarr is the store written by the "Preprocess examples" section of this notebook)
xr_ARCO = xr.open_zarr('/glade/derecho/scratch/ksha/CREDIT_data/ERA5_plevel_base/test_data/surf_test.zarr')
tp_ARCO = xr_ARCO['total_precipitation']

# RDA forecast files holding the CP and LSP variables
# (presumably convective and large-scale precipitation — confirm against the RDA docs)
base_dir = '/glade/campaign/collections/rda/data/d633000/e5.oper.fc.sfc.accumu/197901/'
xr_RDA_CP = xr.open_dataset(base_dir+'e5.oper.fc.sfc.accumu.128_143_cp.ll025sc.1979010106_1979011606.nc')
xr_RDA_LP = xr.open_dataset(base_dir+'e5.oper.fc.sfc.accumu.128_142_lsp.ll025sc.1979010106_1979011606.nc')

# give both variables the same name 'TP' so that adding the two datasets
# below combines them into a single 'TP' variable
xr_RDA_CP = xr_RDA_CP.drop_vars('utc_date', errors='ignore')
xr_RDA_CP = xr_RDA_CP.rename({'CP': 'TP'})
xr_RDA_LP = xr_RDA_LP.drop_vars('utc_date', errors='ignore')
xr_RDA_LP = xr_RDA_LP.rename({'LSP': 'TP'})

da = xr_RDA_CP + xr_RDA_LP

# valid time = forecast_initial_time + forecast_hour for every (init, lead) pair,
# flattened to one 1D array of time stamps
time_deltas = pd.to_timedelta(da["forecast_hour"].values, unit="h")
new_times = np.add.outer(da["forecast_initial_time"].values, time_deltas)
new_times = new_times.flatten()

# collapse (forecast_initial_time, forecast_hour) into one 'time' dimension,
# drop the stacked index coordinates and attach the valid times instead
# NOTE(review): assumes the stacked order matches the row-major flatten() above — confirm
da_an = da.stack(time=("forecast_initial_time", "forecast_hour"))
da_an = da_an.drop_vars(['forecast_hour', 'forecast_initial_time', 'time'])
da_an = da_an.assign_coords(time=new_times)

# print the total absolute difference between the ARCO and RDA fields for 10 hours
for i_hour in range(10):
    # i + 7 because init_time = 06Z and fcst_lead_time starts from 01 hr
    tp_ARCO_np = np.array(tp_ARCO.isel(time=i_hour+7))
    da_np = np.array(da_an['TP'].isel(time=i_hour))
    print(np.sum(np.abs(tp_ARCO_np - da_np)))

0.1861711
0.18500406
0.18471201
0.18455735
0.1844524
0.18450773
0.25197998
0.2279189
0.27061352
0.26860496


## How to accumulate hourly to 6 hourly

* The time coordinate is "ending-time". 
* To accumulate hourly quantities to 6-hourly, adding the hourly values at indices 0, 1, 2, 3, 4, 5 gives the accumulated result at index 0 of the 6-hourly series.
* Example: the hourly quantities at 1959-01-02T01Z, 02Z, 03Z, 04Z, 05Z, 06Z accumulate to 1959-01-02T06Z

In [10]:
ERA5_1h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2/",
    chunks=None,
    storage_options=dict(token='anon'),)

In [11]:
tp_1h = ERA5_1h['total_precipitation']
tp_1h = tp_1h.isel(time=slice(0, 96))
tp_1h_np = np.array(tp_1h)

In [12]:
tp_6h_accum = np.sum(tp_1h_np[25:31, ...], axis=0)
print(tp_1h['time'][25:31])

<xarray.DataArray 'time' (time: 6)> Size: 48B
array(['1959-01-02T01:00:00.000000000', '1959-01-02T02:00:00.000000000',
       '1959-01-02T03:00:00.000000000', '1959-01-02T04:00:00.000000000',
       '1959-01-02T05:00:00.000000000', '1959-01-02T06:00:00.000000000'],
      dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 48B 1959-01-02T01:00:00 ... 1959-01-02T06:...


In [13]:
ERA5_6h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/1959-2022-6h-1440x721.zarr",
    chunks=None,
    storage_options=dict(token='anon'),)

In [61]:
tp_6h = ERA5_6h['total_precipitation_6hr']
tp_6h = tp_6h.isel(time=slice(0, 16))
tp_6h_np = np.array(tp_6h)

In [62]:
# Sum of absolute differences between the ARCO 6-hourly field and the manual
# accumulation: signed differences could cancel out and give 0.0 even when
# the two fields disagree
print(np.sum(np.abs(tp_6h_np[5, ...] - tp_6h_accum)))
print(tp_6h['time'][5])

0.0
<xarray.DataArray 'time' ()> Size: 8B
array('1959-01-02T06:00:00.000000000', dtype='datetime64[ns]')
Coordinates:
    time     datetime64[ns] 8B 1959-01-02T06:00:00


A difference of 0.0 means the hourly values were accumulated correctly.

In [96]:
# xarray version of the hourly -> 6-hourly accumulation
ERA5_1h_sub = ERA5_1h.isel(time=slice(0, 96))
ERA5_tp = ERA5_1h_sub['total_precipitation']
# move every value one step earlier along time before resampling
# NOTE(review): assumes resample('6h') bins are left-closed and labelled by
# their start, so that the hours 01Z-06Z land in the bin labelled 00Z — confirm
data_shifted = ERA5_tp.shift(time=-1)
data_6hourly = data_shifted.resample(time='6h').sum()
# add 6 hours to the bin labels so that the time coordinate is the ending time
data_6hourly['time'] = data_6hourly['time'] + pd.Timedelta(hours=6)

# total absolute difference against the ARCO 6-hourly product (index 5)
# and the resampled result (index 4)
print(np.sum(np.abs(tp_6h_np[5, ...] - data_6hourly[4, ...])))

# # the basic for loop numpy version
# ERA5_tp_np = np.array(ERA5_tp)

# ERA5_tp_np_6h = np.empty((15, 721, 1440))

# for i_6h, i_1h in enumerate(np.arange(0, 96, 6)[:-1]):
#     ERA5_tp_np_6h[i_6h, ...] = np.sum(ERA5_tp_np[i_1h+1:i_1h+7, ...], axis=0)

<xarray.DataArray 'total_precipitation' ()> Size: 4B
array(0., dtype=float32)
Coordinates:
    time     datetime64[ns] 8B 1959-01-02T06:00:00


`resample(time='6h').sum()` works, but its time coordinate has to be shifted afterwards (here by +6 hours) to follow the ending-time convention.

## Preprocess examples

In [1]:
import os
import sys
import numpy as np
import xarray as xr

sys.path.insert(0, os.path.realpath('../libs/'))
import verif_utils as vu

In [3]:
# a preprocess example

# Data begins at 1900 with NaNs
# subset to 1940-01-01 or later to get actual values
ERA5_1h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3",
    chunks=None,
    storage_options={'token': 'anon'},
)

# one day of hourly data
time_start = '1979-01-01T00'
time_end = '1979-01-01T23'
ERA5_1h_yearly = ERA5_1h.sel(time=slice(time_start, time_end))

# --- variables written to the upper-air test store ---
# every variable maps to None
# (presumably "no level selection" — confirm against vu.ds_subset_everything)
variables_levels = dict.fromkeys([
    'geopotential',
    'u_component_of_wind',
    'v_component_of_wind',
    'temperature',
    'specific_humidity',
    'specific_cloud_ice_water_content',
    'specific_cloud_liquid_water_content',
])

ERA5_1h_var = vu.ds_subset_everything(ERA5_1h_yearly, variables_levels)
ERA5_1h_var.to_zarr('/glade/derecho/scratch/ksha/CREDIT_data/ERA5_plevel_base/test_data/upper_air_test.zarr')

# --- variables written to the surface test store ---
variables_levels = dict.fromkeys([
    'total_precipitation',
    'total_column_water',
    'total_column_water_vapour',
    'evaporation',
    'surface_pressure',
    'geopotential_at_surface',
    'top_net_solar_radiation',
    'top_net_thermal_radiation',
    'surface_net_solar_radiation',
    'surface_net_thermal_radiation',
    'surface_latent_heat_flux',
    'surface_sensible_heat_flux',
])

ERA5_1h_var_surf = vu.ds_subset_everything(ERA5_1h_yearly, variables_levels)
ERA5_1h_var_surf.to_zarr('/glade/derecho/scratch/ksha/CREDIT_data/ERA5_plevel_base/test_data/surf_test.zarr')

<xarray.backends.zarr.ZarrStore at 0x1551b044cbc0>