# Global-scale atmospheric moisture and mass budgets on ERA5 pressure level data

In [1]:
import numpy as np
import xarray as xr

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

ERA5 pressure level data:

* `/glade/derecho/scratch/ksha/CREDIT_data/ERA5_plevel_base/upper_air/*.zarr`
* `/glade/derecho/scratch/ksha/CREDIT_data/ERA5_plevel_base/surf/*.zarr`
* `/glade/derecho/scratch/ksha/CREDIT_data/ERA5_plevel_base/accum/*.zarr`

In [3]:
base_dir = '/glade/derecho/scratch/ksha/CREDIT_data/ERA5_plevel_base/'
ds_surf = xr.open_zarr(base_dir + 'surf/ERA5_plevel_6h_surf_1979.zarr')
ds_accum = xr.open_zarr(base_dir + 'accum/ERA5_plevel_6h_accum_1979.zarr')
ds_upper = xr.open_zarr(base_dir + 'upper_air/ERA5_plevel_6h_upper_air_1979.zarr')
ds_static = xr.open_zarr(base_dir + 'static/ERA5_plevel_6h_static.zarr')

In [4]:
t0 = 100
t1 = 101

In [5]:
GRAVITY = 9.80665
R = 6371000  # m
RHO_WATER = 1000.0 # kg/m^3

In [6]:
x = ds_surf['longitude']
y = ds_surf['latitude']
lon, lat = np.meshgrid(x, y)
level_p = 100*np.array(ds_upper['level'])

In [7]:
level_p # Pa or kg/m/s2

array([   100.,    200.,    300.,    500.,    700.,   1000.,   2000.,
         3000.,   5000.,   7000.,  10000.,  12500.,  15000.,  17500.,
        20000.,  22500.,  25000.,  30000.,  35000.,  40000.,  45000.,
        50000.,  55000.,  60000.,  65000.,  70000.,  75000.,  77500.,
        80000.,  82500.,  85000.,  87500.,  90000.,  92500.,  95000.,
        97500., 100000.])

In [8]:
# level_diff = np.diff(level_p)
# level_diff_cumsum = np.concatenate(([0], np.cumsum(level_diff)))

In [9]:
q = np.array(ds_upper['Q'].isel(time=slice(t0, t1+1))) # kg/kg
u = np.array(ds_upper['U'].isel(time=slice(t0, t1+1))) # m/s
v = np.array(ds_upper['V'].isel(time=slice(t0, t1+1)))

In [10]:
precip = np.array(ds_accum['total_precipitation'].isel(time=slice(t0, t1+1)))
evapor = np.array(ds_accum['evaporation'].isel(time=slice(t0, t1+1)))

In [11]:
# def geometric_mean(data):
#     return np.exp(np.mean(np.log(data)))

# def weighted_mean(data, weights, axis, keepdims=False):
    
#     expanded_weights = np.broadcast_to(weights, data.shape)
#     weighted_sum = np.sum(data * expanded_weights, axis=axis, keepdims=keepdims)
#     weights_sum = np.sum(expanded_weights, axis=axis, keepdims=keepdims)
#     return weighted_sum / weights_sum

def weighted_sum(data, weights, axis, keepdims=False):
    '''
    Compute the weighted sum of a given quantity

    Args:
        data: the quantity to be sum-ed
        weights: weights that can be broadcasted to the shape of data
        axis: dims to compute the sum
        keepdims: keepdims

    Returns:
        weighted sum
    '''
    expanded_weights = np.broadcast_to(weights, data.shape)
    return np.sum(data * expanded_weights, axis=axis, keepdims=keepdims)

def pressure_integral(q, level_p, output_shape):
    '''
    Compute the pressure level integral of a given quantity using np.trapz

    Args:
        q: the quantity with dims of (level, lat, lon) or (time, level, lat, lon)
        level_p: the pressure level of q as [Pa] and with dims of (level,)
        output_shape: either (lat, lon) or (time, lat, lon)

    Returns:
        Pressure level integrals of q
    '''
    # (level, lat, lon) --> (lat, lon)
    if len(output_shape) == 2:
        Q = np.empty(output_shape)
        for ix in range(output_shape[0]):
            for iy in range(output_shape[1]):
                Q[ix, iy] = np.trapz(q[:, ix, iy], level_p)
                
    # (time, level, lat, lon) --> (time, lat, lon)
    elif len(output_shape) == 3:
        Q = np.empty(output_shape)
        for i_time in range(output_shape[0]):
            for ix in range(output_shape[1]):
                for iy in range(output_shape[2]):
                    Q[i_time, ix, iy] = np.trapz(q[i_time, :, ix, iy], level_p)
                    
    else:
        print('wrong output_shape')
        raise
        
    return Q
    
def dx_dy(lat, lon):
    '''
    Compute the grid spacing from 2D lat/lon grids using central difference
    for center grids and forward/backward difference for edge grids

    Args:
        lat, lon: 2D arrays of latitude and longitude.

    Return:
        dy, dx: 2D arrays of grid spacings
    '''
    
    # Convert latitude and longitude from degrees to radians
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    
    # Compute the grid spacing in the latitude direction (dy)
    dy = np.zeros_like(lat)
    dy[1:-1, :] = R * (lat_rad[2:, :] - lat_rad[:-2, :]) / 2.0
    dy[0, :] = R * (lat_rad[1, :] - lat_rad[0, :])
    dy[-1, :] = R * (lat_rad[-1, :] - lat_rad[-2, :])
    
    # Compute the grid spacing in the longitude direction (dx)
    dx = np.zeros_like(lon)
    dx[:, 1:-1] = R * np.cos(lat_rad[:, 1:-1]) * (lon_rad[:, 2:] - lon_rad[:, :-2]) / 2.0
    dx[:, 0] = R * np.cos(lat_rad[:, 0]) * (lon_rad[:, 1] - lon_rad[:, 0])
    dx[:, -1] = R * np.cos(lat_rad[:, -1]) * (lon_rad[:, -1] - lon_rad[:, -2])

    return dy, dx

def compute_grid_area(lat, lon):
    '''
    Compute grid cell areas from 2D lat/lon grids

    Args:
        lat, lon: latitude and longitude with dims of (lat, lon)

    Return:
        grid cell area, dims are (lat, lon)
    '''
    dy, dx = dx_dy(lat, lon)
    area = dy*dx

    return area

area = np.abs(compute_grid_area(lat, lon))

# w_lat = np.cos(np.deg2rad(lat))
w_lat = area #/ np.sum(area)

## Negative humidity fixes

**Plan A**

Given $q_{min}=10^{-12}$ as the lowest possible specific humidity in an atmospheric column. Negative humidity at pressure layer $p_k$ will be corrected to $q_{min}$

\begin{equation}
q_{fill}\left(p_k\right) = q_{min}
\end{equation}

The humidity at layer $k-1$ will be adjusted to conserve the mass of water vapor

\begin{equation}
\Delta q\left(p_{k-1}\right) = \left(q_{min} - q_k\right)\frac{\Delta p_k}{\Delta p_{k-1}}
\end{equation}

\begin{equation}
q_{fill}\left(p_{k-1}\right) = q\left(p_{k-1}\right) - \Delta q\left(p_{k-1}\right)
\end{equation}

If $k=0$ or $q_{fill}\left(p_{k-1}\right)$ is negative, i.e., not enough mass above $p_k$ to be relocated, no adjustments will be made for $q\left(p_{k-1}\right)$.

**Plan B**

Fix the layer without any adjustments on other layers:

\begin{equation}
q_{fill}\left(p_k\right) = q_{min}
\end{equation}

The conservation of total dry air mass will fix the mass imbalance.

## Conservation of total dry air mass

Equation on a single air column (flux form equation with unit of kg/m^2/s):

\begin{equation}
\mathbf{\nabla} \cdot \frac{1}{g} \int_{p_0}^{p_1}{\left[\left(1-q\right)\mathbf{v}\right]}dp + \frac{1}{g}\frac{\partial}{\partial t}\int_{p_0}^{p_1}{\left(1-q\right)}dp = 0
\end{equation}

For global sum, the first term (the divergence of vertically integrated dry air mass flux) is zero. So the second term (the time tendency of vertically integrated dry air mass per area) is also zero. 

From the second term = 0, the global sum of dry air mass $\overline{m_d}$ stays unchanged (kg) :

\begin{equation}
m_d = \sum{\frac{1}{g}\int_{p_0}^{p_1}{\left(1-q\right)}dp}
\end{equation}

\begin{equation}
m_d\left(\mathrm{y_input}\right) - m_d\left(\mathrm{y_pred}\right) = 0
\end{equation}

For any residuals of this conservation, we apply multiplicative correction to specific humidty to close the budget:

\begin{equation}
q^*\left(\mathrm{y_pred}\right) = 1 - \left[1 - q\left(\mathrm{y_pred}\right)\right]*\frac{m_d\left(\mathrm{y_input}\right)}{m_d\left(\mathrm{y_pred}\right)}
\end{equation}

<!-- This can be modified to consider the vertical variability of specific humidity $q_w$:
\begin{equation}
r = 1 + q_w * \left(\frac{m_d\left(\mathrm{y_input}\right)}{m_d\left(\mathrm{y_pred}\right)} - 1\right)
\end{equation}

\begin{equation}
q^*\left(\mathrm{y_pred}\right) = 1 - r*\left[1 - q\left(\mathrm{y_pred}\right)\right]
\end{equation} -->

In [12]:
# a = np.array([[0.1, 0.2, 0.3, 0.4, 0.5], [0.15, 0.25, 0.35, 0.45, 0.55]])
# w = np.array([150, 100])
# y_level = np.array([100, 200, 300, 700, 1000])

# amount = np.trapz(1-a, y_level)
# amount_weighted_sum = np.sum(amount*w)

# correct_sum = 151750

# ratio = correct_sum / amount_weighted_sum
# a_correct = 1 - (1 - a) * ratio
# amount_fix = np.trapz(1-a_correct, y_level)
# amount_sum_fix = np.sum(amount_fix*w)

# error = correct_sum - amount_sum_fix

In [13]:
# a = np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
#               [0.15, 0.25, 0.35, 0.45, 0.55]])
# w = np.array([150, 100])
# y_level = np.array([100, 200, 300, 700, 1000])

# # Calculate the original amount
# amount = np.trapz(1 - a, y_level)
# amount_weighted_sum = np.sum(amount * w)

# # Desired total sum after correction
# correct_sum = 151750

# # Calculate the additive increment (delta)
# delta = (correct_sum - amount_weighted_sum) / ((y_level[-1] - y_level[0]) * np.sum(w))

# # Adjust 'a' using the additive increment
# a_correct = a - delta

# # Ensure that 'a_correct' remains within valid limits [0, 1]
# a_correct = np.clip(a_correct, 0, 1)

# # Recalculate the amount after correction
# amount_fix = np.trapz(1 - a_correct, y_level)
# amount_sum_fix = np.sum(amount_fix * w)

# # Calculate the error
# error = correct_sum - amount_sum_fix

# print("Additive increment (delta):", delta)
# print("Corrected 'a':\n", a_correct)
# print("Corrected total amount:", amount_sum_fix)
# print("Error after correction:", error)

In [14]:
# # Original data
# a = np.array([[0.1, 0.2, 0.3, 0.4, 0.5],
#               [0.15, 0.25, 0.35, 0.45, 0.55]])
# w = np.array([150, 100])
# y_level = np.array([100, 200, 300, 700, 1000])

# # Calculate the original amount
# amount = np.trapz(1 - a, y_level)
# amount_weighted_sum = np.sum(amount * w)

# # Desired total sum after correction
# correct_sum = 151750

# # Calculate the maximum possible delta at each level without making 'a_correct' negative
# max_delta = np.min(a, axis=0)

# # Calculate the total possible correction
# # Compute the integration weights (differences between y_levels)
# dy = np.diff(y_level)
# # For trapezoidal rule, calculate the weights for each level
# weights = np.zeros_like(a[0])
# weights[0] = (dy[0]) / 2
# weights[1:-1] = (dy[:-1] + dy[1:]) / 2
# weights[-1] = (dy[-1]) / 2

# # Total possible correction
# total_possible_correction = np.sum(max_delta * weights)

# # Calculate the required total correction
# total_correction_needed = correct_sum - amount_weighted_sum

# # Check if the correction is possible
# if total_possible_correction < total_correction_needed / np.sum(w):
#     raise ValueError("Cannot achieve the desired correction without making 'a_correct' negative.")

# # Calculate the scaling factor
# scaling_factor = (total_correction_needed / np.sum(w)) / total_possible_correction

# # Calculate delta as an array varying with 'level'
# delta = max_delta * scaling_factor

# # Adjust 'a' using the additive delta
# a_correct = a - delta

# # Ensure that 'a_correct' remains non-negative
# a_correct = np.maximum(a_correct, 0)

# # Recalculate the amount after correction
# amount_fix = np.trapz(1 - a_correct, y_level)
# amount_sum_fix = np.sum(amount_fix * w)

# # Calculate the error
# error = correct_sum - amount_sum_fix

# print("Additive increment array (delta):", delta)
# print("Corrected 'a':\n", a_correct)
# print("Corrected total amount:", amount_sum_fix)
# print("Error after correction:", error)

In [15]:
# an example of std(q) that varies on pressure levels
# top of atmos --> surface

# damping_factor = 1.0
# q_std = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
#                   4, 4, 4, 4, 4, 2, 2])
# q_std_norm = q_std / np.mean(q_std) # geometric_mean(q_std) # q_w

# q_correct_ratio = 1 + q_std_norm * (mass_dry_sum[0] / mass_dry_sum[1] - 1)
# q_correct[1, ...] = 1 - (1 - q_correct[1, ...]) * (1 + damping_factor * (q_correct_ratio[:, None, None] - 1))

In [17]:
q_correct = np.copy(q) # <-- corrected y_pred on q
output_shape = (2,)+lon.shape

correction_cycle_num = 1 # iterative to handle numrical precision

for i in range(correction_cycle_num):
    
    mass_dry_per_area = pressure_integral(1-q_correct, level_p, output_shape) / GRAVITY # kg/m^2
    mass_dry_sum = weighted_sum(mass_dry_per_area, w_lat, axis=(1, 2), keepdims=False) # kg
    
    # ----------------------------------------------------------------------- #
    # check residual term
    mass_dry_res = mass_dry_sum[0] - mass_dry_sum[1]
    print('Residual to conserve the dry air mass [kg]: {}'.format(mass_dry_res))
    print('Ratio to the total amount of air [kg/kg]: {}'.format(mass_dry_res/5.148e18))
    # ----------------------------------------------------------------------- #
    # correction
    print('Correction iter {}'.format(i))
    # get correction ratio
    q_correct_ratio = mass_dry_sum[0] / mass_dry_sum[1] # no p level weighting
    q_correct[1, ...] = 1 - (1 - q_correct[1, ...]) * q_correct_ratio

# final checks
mass_dry_per_area = pressure_integral(1-q_correct, level_p, output_shape) / GRAVITY
mass_dry_sum = weighted_sum(mass_dry_per_area, w_lat, axis=(1, 2), keepdims=False)

# ----------------------------------------------------------------------- #
# check residual term
mass_dry_res = mass_dry_sum[0] - mass_dry_sum[1]
print('Residual to conserve the dry air mass [kg]: {}'.format(mass_dry_res))
print('Ratio to the total amount of air [kg/kg]: {}'.format(mass_dry_res/5.148e18))

Residual to conserve the dry air mass [kg]: -272134672384.0
Ratio to the total amount of air [kg/kg]: -5.2862212972804974e-08
Correction iter 0
Residual to conserve the dry air mass [kg]: 51641950208.0
Ratio to the total amount of air [kg/kg]: 1.003145885936286e-08


## Moisture budget

Equation on a single air column (flux form equation with unit of kg/m^2/s):

\begin{equation}
\mathbf{\nabla} \cdot \frac{1}{g} \int_{0}^{p_s}{\left(\mathbf{v}q\right)}dp = -\frac{1}{g}\frac{\partial}{\partial t}\int_{0}^{p_s}{q}dp - E - P
\end{equation}

For global sum, the first term (the divergence of integrated moisture flux) is zero. So the second term (the time tendency of total column water $Q$) is balanced by evaporation $E$ and precipitation $P$ (kg/s):

\begin{equation}
\overline{\left(\frac{\partial Q}{\partial t}\right)} =  - \overline{E} - \overline{P}
\end{equation}

For any residuals of this conservation, we use precipitation to close the budge:

\begin{equation}
\overline{P}^* = \overline{\left[\frac{Q\left(\mathrm{y_pred}\right) - Q\left(\mathrm{y_input}\right)}{\mathrm{second}}\right]} - \overline{E}
\end{equation}

\begin{equation}
P^* = P * \frac{\overline{P}^*}{\overline{P}}
\end{equation}

In [14]:
# q_correct = np.copy(q)

In [15]:
N_seconds = 3600 * 6 # 6 hourly data
output_shape = (2,)+lon.shape

precip_flux = precip[1, ...] * RHO_WATER / N_seconds # m/hour --> kg/m^2/s, positive
evapor_flux = evapor[1, ...] * RHO_WATER / N_seconds # kg/m^2/s, negative

precip_correct = np.copy(precip_flux) # <-- corrected y_pred on precip

correction_cycle_num = 1

# pre-compute TWC
TWC = pressure_integral(q_correct, level_p, output_shape) / GRAVITY # kg/m^2
dTWC_dt = (TWC[1, ...] - TWC[0, ...]) / N_seconds # kg/m^2/s
TWC_sum = weighted_sum(dTWC_dt, w_lat, axis=(0, 1), keepdims=False) # kg/s

# pre-compute evaporation
E_sum = weighted_sum(evapor_flux, w_lat, axis=(0, 1), keepdims=False) # kg/s

for i in range(correction_cycle_num):
    
    P_sum = weighted_sum(precip_correct, w_lat, axis=(0, 1), keepdims=False) # kg/s
    residual = -TWC_sum - E_sum - P_sum # kg/s
    print('Residual to conserve moisture budge [kg/s]: {}'.format(residual))
    P_correct = P_sum + residual # kg/s
    P_correct_ratio = (P_sum + residual) / P_sum
    print('correction ratio: {}'.format(P_correct_ratio))
    precip_correct = precip_correct * P_correct_ratio

# final checks
P_sum = weighted_sum(precip_correct, w_lat, axis=(0, 1), keepdims=False)
residual = -TWC_sum - E_sum - P_sum
print('Residual to conserve moisture budge [kg/s]: {}'.format(residual))

Residual to conserve moisture budge [kg/s]: -256320051.6473465
correction ratio: 0.9829969112402189
Residual to conserve moisture budge [kg/s]: -442.4025478363037


## Conclosions

* ERA5 conserves global mean quantities well with very little residuals.
* The correction method can close the budge subjet to numerical precision.
* Possible improvement: the corrections were applied equally to all lat/lon gids.
    * (1) 3D corrections can be applied to consider the spatial variability of specific humdity and precipitation flux.
    * (2) Alternatively, such spatial variability can be considered using single-column-based budget calculations.