# Pre-process ERA5 pressure level data for CREDIT

In [1]:
import os
import sys
import numpy as np
import xarray as xr

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
sys.path.insert(0, os.path.realpath('../libs/'))
import verif_utils as vu

## Get data from Google Cloud ARCO-ERA5

* Source: https://console.cloud.google.com/storage/browser/gcp-public-data-arco-era5
* GitHub: https://github.com/google-research/arco-era5
* Latest hourly: gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3

In [4]:
# The time axis of this store begins at 1900, but the early part is filled with NaNs;
# subset to 1940-01-01 or later to get actual values.
# chunks=None opens the store without dask chunking; token='anon' requests
# anonymous access to the public bucket.
ERA5_1h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3",
    chunks=None,
    storage_options=dict(token='anon'),)

In [5]:
# Short test window: 1979-01-01 00Z through 12Z (label-based slices include both ends)
time_start = '1979-01-01T00'
time_end = '1979-01-01T12'

# NOTE(review): despite the "_yearly" name, this selection covers only the window above
ERA5_1h_yearly = ERA5_1h.sel(time=slice(time_start, time_end))

In [6]:
# Upper-air variables to extract. Every variable maps to None; the mapping is
# handed to vu.ds_subset_everything
# (presumably None means "keep all levels" -- confirm in verif_utils).
variables_levels = dict.fromkeys([
    'geopotential',
    'u_component_of_wind',
    'v_component_of_wind',
    'temperature',
    'specific_humidity',
    'specific_cloud_ice_water_content',
    'specific_cloud_liquid_water_content',
])

In [7]:
# Subset the time-sliced dataset with the project helper, using the variable mapping above
# (the helper lives in ../libs/verif_utils; its exact semantics are not visible here)
ERA5_1h_var = vu.ds_subset_everything(ERA5_1h_yearly, variables_levels)

In [10]:
# Write the upper-air test subset to a local zarr store
# NOTE(review): no `mode` is given -- check the behaviour when the store already exists before re-running
ERA5_1h_var.to_zarr('/glade/derecho/scratch/ksha/CREDIT_data/ERA5_pressure_lev/upper_air_test.zarr')

<xarray.backends.zarr.ZarrStore at 0x14fd185a3cc0>

In [10]:
# Display the names of all variables available in the time-sliced dataset
list(ERA5_1h_yearly.keys())

['100m_u_component_of_wind',
 '100m_v_component_of_wind',
 '10m_u_component_of_neutral_wind',
 '10m_u_component_of_wind',
 '10m_v_component_of_neutral_wind',
 '10m_v_component_of_wind',
 '10m_wind_gust_since_previous_post_processing',
 '2m_dewpoint_temperature',
 '2m_temperature',
 'air_density_over_the_oceans',
 'angle_of_sub_gridscale_orography',
 'anisotropy_of_sub_gridscale_orography',
 'benjamin_feir_index',
 'boundary_layer_dissipation',
 'boundary_layer_height',
 'charnock',
 'clear_sky_direct_solar_radiation_at_surface',
 'cloud_base_height',
 'coefficient_of_drag_with_waves',
 'convective_available_potential_energy',
 'convective_inhibition',
 'convective_precipitation',
 'convective_rain_rate',
 'convective_snowfall',
 'convective_snowfall_rate_water_equivalent',
 'downward_uv_radiation_at_the_surface',
 'duct_base_height',
 'eastward_gravity_wave_surface_stress',
 'eastward_turbulent_surface_stress',
 'evaporation',
 'forecast_albedo',
 'forecast_logarithm_of_surface_roughne

In [14]:
# Single-level (surface / column) variables to extract. Every variable maps to
# None; the mapping is handed to vu.ds_subset_everything.
variables_levels = dict.fromkeys([
    'total_precipitation',
    'total_column_water',
    'total_column_water_vapour',
    'mean_surface_latent_heat_flux',
    'evaporation',
    'surface_pressure',
])

In [15]:
# Subset the time-sliced dataset to the single-level variables listed above
ERA5_1h_var_surf = vu.ds_subset_everything(ERA5_1h_yearly, variables_levels)

In [17]:
# Write the single-level test subset to a local zarr store
# NOTE(review): no `mode` is given -- check the behaviour when the store already exists before re-running
ERA5_1h_var_surf.to_zarr('/glade/derecho/scratch/ksha/CREDIT_data/ERA5_pressure_lev/surf_test.zarr')

<xarray.backends.zarr.ZarrStore at 0x14b8d8729140>

In [None]:
# Full pressure-level coordinate of the store as a NumPy array
level_p = np.array(ERA5_1h['level'])
# Indices into level_p of the levels to keep (19 entries, from index 0 to index 36)
ind_select = [0, 8, 10, 12, 14, 16, 17, 19, 21, 23, 25, 26, 28, 30, 32, 33, 34, 35, 36]

In [42]:
# original version
def integral_conserved_subset_1d(x_column, level_p, ind_select):
    '''
    Reduce a single column of pressure-level data to layer means that
    conserve the trapezoidal vertical integral.

    Each pair of consecutive entries in `ind_select` bounds one output layer.
    The value of a layer is the trapezoidal integral of `x_column` over
    `level_p` between the two bounding levels, divided by the layer thickness.
    As a result, `np.sum(out_column_a * np.diff(level_p[ind_select]))` equals
    (up to floating-point rounding) the trapezoidal integral of the full
    column between the first and the last selected level.

    Args:
        x_column: 1-D array-like, a single column of data (one value per level)
        level_p: 1-D array-like, pressure (low to high) as vertical coordinates
        ind_select: 1-D sequence of strictly increasing, non-negative int
            indices into `level_p`; the first and the last level must be
            selected to conserve the integral of the whole column
    Returns:
        out_column_a: float array of length `len(ind_select) - 1` with the
            layer-mean value between consecutive selected levels
    Raises:
        ValueError: if `x_column` and `level_p` are not 1-D arrays of the same
            length, or if `ind_select` has fewer than two entries or is not
            strictly increasing
        IndexError: if `ind_select` points outside of `level_p`
    '''
    x_column = np.asarray(x_column)
    level_p = np.asarray(level_p)
    ind_select = np.asarray(ind_select)

    if x_column.ndim != 1 or x_column.shape != level_p.shape:
        raise ValueError('x_column and level_p must be 1-D arrays of the same length')
    if ind_select.ndim != 1 or ind_select.size < 2:
        raise ValueError('ind_select must be a 1-D sequence with at least two indices')
    # repeated or decreasing indices would give zero or negative layer thickness
    if np.any(np.diff(ind_select) <= 0):
        raise ValueError('ind_select must be strictly increasing')
    # negative indices would silently wrap around in the slices below
    if ind_select[0] < 0 or ind_select[-1] >= level_p.size:
        raise IndexError('ind_select contains indices outside of level_p')

    # area between each pair of neighbouring levels (trapezoidal rule)
    diff_level_p = np.diff(level_p)
    x_column_midpoint = 0.5 * (x_column[1:] + x_column[:-1])
    x_column_area = x_column_midpoint * diff_level_p

    # allocate the output array: one value per layer
    out_column_a = np.empty(ind_select.size - 1)
    out_column_a.fill(np.nan)

    # layer mean = (area between the two bounding levels) / (layer thickness)
    for i_layer, (ind_start, ind_end) in enumerate(zip(ind_select[:-1], ind_select[1:])):
        layer_area = np.sum(x_column_area[ind_start:ind_end])
        out_column_a[i_layer] = layer_area / (level_p[ind_end] - level_p[ind_start])

    return out_column_a

def wrapper_function(x_column):
    '''
    Single-argument adapter around `integral_conserved_subset_1d`, suitable
    for `xr.apply_ufunc`.

    Args:
        x_column: a single column of data along the level dimension
    Returns:
        layer means between the selected levels, `len(ind_select) - 1` values
    '''
    # `level_p` is read from the notebook (module-level) namespace. It must not
    # be assigned inside this function: `level_p = level_p` would make the name
    # local and raise UnboundLocalError on every call.
    ind_select = [0, 8, 10, 12, 14, 16, 17, 19, 21, 23, 25, 26, 28, 30, 32, 33, 34, 35, 36]
    return integral_conserved_subset_1d(x_column, level_p, ind_select)

In [None]:
# wrapper_function returns one value per layer (len(ind_select) - 1 values), not
# a scalar, so the new output dimension must be declared with `output_core_dims`.
# A new dimension name ('layer') is used because the output is shorter than the
# input 'level' dimension.
n_layer = len(ind_select) - 1  # must match the ind_select used inside wrapper_function

result = xr.apply_ufunc(
    wrapper_function,
    ERA5_1h_var,
    input_core_dims=[['level']],
    output_core_dims=[['layer']],
    vectorize=True,
    dask='parallelized',
    # only used for dask-backed inputs, which need the size of the new dimension
    dask_gufunc_kwargs=dict(output_sizes={'layer': n_layer}),
    output_dtypes=[float]
)

In [None]:
# Display the output of the apply_ufunc call
result

In [None]:
# Load the dataset
# (chunks=None opens the store without dask chunking; token='anon' requests anonymous access)
ERA5_1h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3",
    chunks=None,
    storage_options=dict(token='anon'),
)

# Time slicing to get the relevant data: every hour of 1979
# NOTE(review): this keeps ALL variables of the store, not only those with a
# 'level' dimension -- confirm that is intended before applying a function over 'level'
time_start = '1979-01-01T00'
time_end = '1979-12-31T23'
ERA5_1h_var = ERA5_1h.sel(time=slice(time_start, time_end))

# Extract the level dimension as a numpy array
level_p = ERA5_1h['level'].values
# Indices into level_p of the levels that bound the output layers
ind_select = np.array([0, 8, 10, 12, 14, 16, 17, 19, 21, 23, 25, 26, 28, 30, 32, 33, 34, 35, 36])

# Original function for subsetting and conserving the vertical integral
def integral_conserved_subset_1d(x_column, level_p, ind_select):
    '''
    Reduce a single column of pressure-level data to layer means that
    conserve the trapezoidal vertical integral.

    Each pair of consecutive entries in `ind_select` bounds one output layer.
    The value of a layer is the trapezoidal integral of `x_column` over
    `level_p` between the two bounding levels, divided by the layer thickness.
    As a result, `np.sum(out_column_a * np.diff(level_p[ind_select]))` equals
    (up to floating-point rounding) the trapezoidal integral of the full
    column between the first and the last selected level.

    Args:
        x_column: 1-D array-like, a single column of data (one value per level)
        level_p: 1-D array-like, pressure (low to high) as vertical coordinates
        ind_select: 1-D sequence of strictly increasing, non-negative int
            indices into `level_p`; the first and the last level must be
            selected to conserve the integral of the whole column
    Returns:
        out_column_a: float array of length `len(ind_select) - 1` with the
            layer-mean value between consecutive selected levels
    Raises:
        ValueError: if `x_column` and `level_p` are not 1-D arrays of the same
            length, or if `ind_select` has fewer than two entries or is not
            strictly increasing
        IndexError: if `ind_select` points outside of `level_p`
    '''
    x_column = np.asarray(x_column)
    level_p = np.asarray(level_p)
    ind_select = np.asarray(ind_select)

    if x_column.ndim != 1 or x_column.shape != level_p.shape:
        raise ValueError('x_column and level_p must be 1-D arrays of the same length')
    if ind_select.ndim != 1 or ind_select.size < 2:
        raise ValueError('ind_select must be a 1-D sequence with at least two indices')
    # Repeated or decreasing indices would give zero or negative layer thickness
    if np.any(np.diff(ind_select) <= 0):
        raise ValueError('ind_select must be strictly increasing')
    # Negative indices would silently wrap around in the slices below
    if ind_select[0] < 0 or ind_select[-1] >= level_p.size:
        raise IndexError('ind_select contains indices outside of level_p')

    # Area between each pair of neighbouring levels (trapezoidal rule)
    diff_level_p = np.diff(level_p)
    x_column_midpoint = 0.5 * (x_column[1:] + x_column[:-1])
    x_column_area = x_column_midpoint * diff_level_p

    # Allocate the output array: one value per layer
    out_column_a = np.empty(ind_select.size - 1)
    out_column_a.fill(np.nan)

    # Layer mean = (area between the two bounding levels) / (layer thickness)
    for i_layer, (ind_start, ind_end) in enumerate(zip(ind_select[:-1], ind_select[1:])):
        layer_area = np.sum(x_column_area[ind_start:ind_end])
        out_column_a[i_layer] = layer_area / (level_p[ind_end] - level_p[ind_start])

    return out_column_a

# Single-argument adapter so the subsetting can be mapped over every column
def wrapper_function(x_column):
    '''
    Apply `integral_conserved_subset_1d` to one column, using the
    notebook-level `level_p` and `ind_select`.
    '''
    layer_means = integral_conserved_subset_1d(x_column, level_p, ind_select)
    return layer_means

# Apply the function using xarray's apply_ufunc
# ERA5_1h_var holds every variable of the store; variables without a 'level'
# dimension cannot be processed along 'level', so keep only those that have it.
level_vars = [name for name, var in ERA5_1h_var.data_vars.items() if 'level' in var.dims]

# wrapper_function returns one value per layer (len(ind_select) - 1 values), not
# a scalar, so the new output dimension must be declared with `output_core_dims`.
# A new dimension name ('layer') is used because the output is shorter than the
# input 'level' dimension.
n_layer = len(ind_select) - 1

result = xr.apply_ufunc(
    wrapper_function,
    ERA5_1h_var[level_vars],
    input_core_dims=[['level']],
    output_core_dims=[['layer']],
    vectorize=True,
    dask='parallelized',
    # only used for dask-backed inputs, which need the size of the new dimension
    dask_gufunc_kwargs=dict(output_sizes={'layer': n_layer}),
    output_dtypes=[float]
)

In [4]:
# np.array(ERA5_1h['temperature'].isel(time=slice(800000, 800001)))

In [5]:
# Variables to save for all years: five pressure-level fields followed by
# five single-level fields
varname_save = [
    'geopotential',
    'u_component_of_wind',
    'v_component_of_wind',
    'temperature',
    'specific_humidity',
    'surface_pressure',
    'skin_temperature',
    '2m_temperature',
    '10m_u_component_of_wind',
    '10m_v_component_of_wind',
]

# Water-budget variables. 'total_column_water' is only used to check how well
# (1/g)*\int(Q) = TWC holds, so these are not needed for all years.
varname_water = [
    'total_precipitation',
    'total_column_water',
    'mean_surface_latent_heat_flux',
    'evaporation',
]

# Static fields
varname_static = [
    'land_sea_mask',
    'geopotential_at_surface',
]

In [15]:
# all variables
# list(ERA5_1h.keys())

## How to pre-process

### Vertical coordinate subsetting with conserved total column properties

In [4]:
# use a smaller dataset as example (the 1959-2022 hourly store)
# chunks=None opens the store without dask chunking; token='anon' requests anonymous access
ERA5_1h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2/",
    chunks=None,
    storage_options=dict(token='anon'),)

In [16]:
# get the full pressure level coordinates as a NumPy array (37 values, from 1 to 1000)
level_p = np.array(ERA5_1h['level'])

In [41]:
# Display the full pressure-level coordinate
level_p

array([   1,    2,    3,    5,    7,   10,   20,   30,   50,   70,  100,
        125,  150,  175,  200,  225,  250,  300,  350,  400,  450,  500,
        550,  600,  650,  700,  750,  775,  800,  825,  850,  875,  900,
        925,  950,  975, 1000])

In [32]:
# Display the full pressure-level coordinate again (same values as the previous cell)
level_p

array([   1,    2,    3,    5,    7,   10,   20,   30,   50,   70,  100,
        125,  150,  175,  200,  225,  250,  300,  350,  400,  450,  500,
        550,  600,  650,  700,  750,  775,  800,  825,  850,  875,  900,
        925,  950,  975, 1000])

In [33]:
# Display the pressure levels picked by the current ind_select
level_p[ind_select]

array([   1,   50,  100,  150,  200,  250,  300,  400,  500,  600,  700,
        750,  800,  850,  900,  925,  950,  975, 1000])

In [None]:
# Scratch cell: the 19 pressure levels listed by level_p[ind_select]
1, 50, 100, 150, 200, 250, 300, 400, 500, 600, 700, 750, 800, 850, 900, 925, 950, 975, 1000

In [None]:
# Shorter selection: the first six entries of the 19-index list used elsewhere in this notebook
ind_select = [0, 8, 10, 12, 14, 16]

In [7]:
# a piece of example data: the first two time steps of temperature, loaded as a NumPy array
ds_full_level = ERA5_1h['temperature'].isel(time=slice(0, 2))
test_data = np.array(ds_full_level)
# one vertical column: index 1 on the first axis, all levels, and one grid point
# (assumes the axis order is time, level, latitude, longitude -- TODO confirm)
x_column = test_data[1, :, 100, 200]

In [8]:
# Display the example column (one value per pressure level)
x_column

array([255.67235, 242.60074, 238.53818, 221.50595, 206.83128, 197.17053,
       196.06165, 196.99301, 200.18994, 203.85724, 206.67328, 209.04404,
       210.58057, 210.52606, 210.01845, 210.13292, 209.95409, 213.27097,
       218.48499, 224.57101, 231.41153, 237.77028, 242.69957, 246.7081 ,
       250.43092, 253.82133, 257.235  , 258.3148 , 259.16742, 259.84662,
       259.82944, 258.4964 , 256.75854, 255.96848, 255.05963, 255.07288,
       256.65472], dtype=float32)

In [9]:
# compute the vertical integral that we need to conserve
# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid (same function);
# use whichever name this NumPy version provides
trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
int_original = trapezoid(x_column, level_p)
print('vertical integral of the original data: {}'.format(int_original))

vertical integral of the original data: 233575.8140258789


In [10]:
# select levels (indices into level_p)
# !! the FIRST and the LAST index must be selected,
# otherwise the integral of the whole column cannot be conserved
ind_select = np.array([0, 1, 2, 3, 5, 17, 25, 32, 36])

In [37]:


# # # original version optimized by ChatGPT
# def integral_conserved_subset_ChatGPT(x_column, level_p, ind_select):
#     # Compute the level difference and the midpoints
#     diff_level_p = np.diff(level_p)
#     x_column_midpoint = 0.5 * (x_column[1:] + x_column[:-1])
    
#     # Compute the area of each level using the trapezoidal rule
#     x_column_area = x_column_midpoint * diff_level_p
    
#     # Compute the cumulative sum of the areas and levels
#     cumulative_area = np.zeros_like(x_column)
#     cumulative_area[1:] = np.cumsum(x_column_area)
    
#     cumulative_level_p = np.zeros_like(level_p)
#     cumulative_level_p[1:] = np.cumsum(diff_level_p)

#     # Use advanced indexing to vectorize the selection and summation
#     selected_areas = cumulative_area[ind_select[1:]] - cumulative_area[ind_select[:-1]]
#     selected_levels = cumulative_level_p[ind_select[1:]] - cumulative_level_p[ind_select[:-1]]
    
#     # Compute the output array
#     out_column_a = selected_areas / selected_levels
    
#     return out_column_a

In [12]:
# Layer means of the example column between the selected levels (len(ind_select) - 1 values)
out_column_a = integral_conserved_subset_1d(x_column, level_p, ind_select)

In [21]:
# Display the layer means
out_column_a

array([249.13653564, 240.56945801, 230.02206421, 206.86799316,
       207.40590299, 235.70281982, 257.96346664, 255.70190811])

In [13]:
# Pressure of the selected levels; np.diff of it gives the thickness of each output layer
level_p_select = level_p[ind_select]
# Integral of the subsetted column: sum of (layer mean * layer thickness)
int_x = np.sum(out_column_a * np.diff(level_p_select))
print('vertical integral after subsetting using conserved area: {}'.format(int_x))

vertical integral after subsetting using conserved area: 233575.8140258789


In [14]:
# bad example: what happens if we simply index the selected levels
x_column_select = x_column[ind_select]
# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid (same function);
# use whichever name this NumPy version provides
trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
int_x_bad = trapezoid(x_column_select, level_p_select)
print('vertical integral after subsetting using a simple indexing: {}'.format(int_x_bad))

vertical integral after subsetting using a simple indexing: 231657.5684967041


## How to accumulate hourly to 6 hourly

* The time coordinate marks the end of each accumulation period ("ending-time").
* To accumulate hourly quantities to 6-hourly, sum six consecutive hourly values (relative indices 0, 1, 2, 3, 4, 5); the result is the 6-hourly accumulation, stamped with the time of the last hourly value.
* Example: the hourly quantities at 1959-01-02T01Z, 02Z, 03Z, 04Z, 05Z and 06Z accumulate to the 6-hourly value at 1959-01-02T06Z.

In [15]:
# Hourly store (1959-2022), opened without dask chunking and with anonymous access
ERA5_1h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/1959-2022-full_37-1h-0p25deg-chunk-1.zarr-v2/",
    chunks=None,
    storage_options=dict(token='anon'),)

In [16]:
# First 96 hourly time steps of total precipitation, loaded as a NumPy array
tp_1h = ERA5_1h['total_precipitation']
tp_1h = tp_1h.isel(time=slice(0, 96))
tp_1h_np = np.array(tp_1h)

In [17]:
# Sum the six hourly fields at indices 25..30 along the first axis
# (the printed time stamps below show these are 1959-01-02T01Z ... 06Z)
tp_6h_accum = np.sum(tp_1h_np[25:31, ...], axis=0)
# Print the time stamps of the six hours that were summed
print(tp_1h['time'][25:31])

<xarray.DataArray 'time' (time: 6)> Size: 48B
array(['1959-01-02T01:00:00.000000000', '1959-01-02T02:00:00.000000000',
       '1959-01-02T03:00:00.000000000', '1959-01-02T04:00:00.000000000',
       '1959-01-02T05:00:00.000000000', '1959-01-02T06:00:00.000000000'],
      dtype='datetime64[ns]')
Coordinates:
  * time     (time) datetime64[ns] 48B 1959-01-02T01:00:00 ... 1959-01-02T06:...


In [18]:
# 6-hourly store, used as the reference for the accumulation check
ERA5_6h = xr.open_zarr(
    "gs://gcp-public-data-arco-era5/ar/1959-2022-6h-1440x721.zarr",
    chunks=None,
    storage_options=dict(token='anon'),)

In [19]:
# First 16 time steps of the 6-hourly total precipitation, loaded as a NumPy array
tp_6h = ERA5_6h['total_precipitation_6hr']
tp_6h = tp_6h.isel(time=slice(0, 16))
tp_6h_np = np.array(tp_6h)

In [20]:
# Sum over all grid points of (reference 6-hourly field at index 5) minus (our accumulation)
# NOTE(review): a summed difference can hide errors of opposite sign that cancel;
# the maximum absolute difference would be a stricter check
print(np.sum(tp_6h_np[5, ...] - tp_6h_accum))
# Time stamp of the reference 6-hourly field being compared
print(tp_6h['time'][5])

0.0
<xarray.DataArray 'time' ()> Size: 8B
array('1959-01-02T06:00:00.000000000', dtype='datetime64[ns]')
Coordinates:
    time     datetime64[ns] 8B 1959-01-02T06:00:00


A difference of 0.0 means the hourly values were accumulated correctly.