In [2]:
from glob import glob
import rioxarray as rio
import xarray as xr
from rasterio.enums import Resampling
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

The main aim of this preprocessing step is to:
1. Perform different corrections for the L-VOD data to reduce the impact of Radio Frequency Interference (RFI).

2. Convert the data from an EASE2 grid to a regular grid.

# 1. Correct L-VOD data for RFI 

One of the limitations of the L-VOD data is that it suffers from Radio Frequency Interference (RFI), which leads to time-varying coverage of the data. The main aim of this preprocessing step is to reduce the impact of this effect.

In our analysis, we rely on the approach used in [Yang et al. 2023](https://www.nature.com/articles/s41561-023-01274-4).

The Yang et al. 2023 approach is based on the following steps:

1. Calculate a long-term trend using a curve fitting method that has been used to filter and smooth CO2 measurements by NOAA

2. Correct for RFI by identifying invalid pixels based on the following conditions:
    - if the absolute difference between the ASC and DESC modes is greater than 10% of the mean of the two

    - if either of the ASC and DESC modes is not null and the ASC_DESC mode is less than 90% of the mean of the ASC and DESC modes
    
    The method to correct for invalid pixels is to use the maximum of the ASC and DESC modes for invalid pixels. 

Because this analysis produces an average rate of change for each pixel, but our pipeline works with estimates of the stocks at different years, we combine the estimated rate of change with the stock estimate for the year 2019 to infer the stock at the year 2010. The output of the analysis is thus stocks at two points in time: 2010 and 2019.

In [3]:

def correct_invalid_RFI_pixels(data_yr:xr.DataArray,data_other_yr:xr.DataArray) -> xr.DataArray:
    '''
    Replace pixels of the combined ASC_DESC estimate that look affected by Radio Frequency Interference (RFI).

    data_yr is indexed by position along its first axis, which must be named 'type':
    index 0 is the ASC_DESC mode, indices 1 and 2 are the ASC and DESC modes.

    A pixel is treated as invalid when either rule holds:
    - the absolute difference between ASC and DESC exceeds 10% of the mean of the two
    - at least one of ASC and DESC is null and ASC_DESC is below 90% of the mean of ASC and DESC
      (xarray's mean skips NaN by default for float data, so this is the mean of the values that are present)

    Invalid pixels take the maximum of ASC and DESC; all other pixels keep the ASC_DESC value.

    NOTE(review): the second rule is coded with isnull(), while the written description of the method
    says "not null" - confirm which of the two is intended.
    NOTE(review): data_other_yr is accepted but never read by this function.

    Parameters:
    data_yr (xarray.DataArray): data for the year, stacked along 'type' as described above
    data_other_yr (xarray.DataArray): data for a neighbouring year (currently unused)

    Returns:
    final_data (xarray.DataArray): corrected data
    '''

    # split the stack into the combined estimate and the two single-mode estimates
    combined = data_yr[0,:,:]
    single_modes = data_yr[1:,:,:]
    single_modes_mean = single_modes.mean(dim='type')

    # rule 1: ASC and DESC disagree by more than 10% of their mean
    modes_disagree = np.abs(data_yr[1,:,:]-data_yr[2,:,:]) > single_modes_mean*0.1

    # rule 2: a single-mode value is missing and the combined value is below 90% of the single-mode mean
    missing_and_low = single_modes.isnull().any(dim='type') & (combined < single_modes_mean*0.9)

    # invalid pixels take the larger single-mode value, valid pixels keep the combined value
    invalid = modes_disagree | missing_and_low
    final_data = xr.where(invalid,single_modes.max(dim='type'),combined)

    return final_data

def calculate_trend(ds: xr.Dataset,calibration_mode='') -> xr.DataArray:
    '''
    Calculate a per-pixel rate of change between the first valid time step and time index 9.

    The first valid time step of every pixel is located from ds['AGC_ASC_DESC'] (the un-suffixed
    variable, whatever calibration_mode is). The ASC_DESC, ASC and DESC variables of the requested
    calibration mode are sampled at that step and at time index 9, passed through
    correct_invalid_RFI_pixels, and their difference is divided by the number of time steps
    separating them.

    The code assumes 11 time steps on dims 'time', 'lat', 'lon'; according to the original
    comments index 9 is the year 2019 and index 8 is 2018.

    Parameters:
    ds (xarray.Dataset): data to calculate the trend for
    calibration_mode (str): suffix selecting the calibration of L-VOD to biomass (either '', 'min' or 'max')

    Returns:
    trend xarray.DataArray: trend of the data (in units of MgC/ha/yr per the original documentation);
        NaN where the pixel has no valid AGC_ASC_DESC value or its first valid index is 8 or later
    '''

    # make sure the calibration mode is either '', 'min' or 'max'
    assert calibration_mode in ['','min','max']

    # variable names in the order expected by correct_invalid_RFI_pixels: ASC_DESC, ASC, DESC
    modes = [base+calibration_mode for base in ('AGC_ASC_DESC','AGC_ASC','AGC_DESC')]

    def _stack_modes(time_index):
        # sample every acquisition mode at the per-pixel time index and stack the results along 'type'
        return xr.concat([ds[name].isel(time=time_index) for name in modes],dim='type')

    # where the combined ASC_DESC estimate has data
    has_value = ds['AGC_ASC_DESC'].notnull()

    # 1-based position of each valid time step, 0 where the value is missing
    positions = has_value.transpose('lat','lon','time') * np.arange(1,12,dtype=int)

    # smallest 1-based position per pixel; 13 acts as a sentinel for "never valid"
    first_position = xr.where(positions == 0,13,positions).min(dim='time')

    # back to a 0-based index, with nan for pixels that are never valid
    ASC_DESC_start = xr.where(first_position == 13, np.nan, first_position) -1

    # integer version usable for indexing (never-valid pixels point at index 0 and are masked out below)
    start_index = ASC_DESC_start.fillna(0).astype(int)

    # the time step after the first valid one, capped at the last index (10)
    next_index = xr.where(start_index<10,start_index+1,start_index)

    # the end of the trend period is index 9 for every pixel
    end_yr = 9*xr.ones_like(start_index)

    # RFI-correct the values at the start and at the end of the period
    start_data_final = correct_invalid_RFI_pixels(_stack_modes(start_index),_stack_modes(next_index))
    end_data_final = correct_invalid_RFI_pixels(_stack_modes(end_yr),_stack_modes(end_yr-1))

    # keep pixels with at least one valid value whose first valid index is below 8
    keep = has_value.any(dim='time') & (ASC_DESC_start < 8)
    start_data_final = xr.where(keep,start_data_final,np.nan)
    end_data_final = xr.where(keep,end_data_final,np.nan)

    # change in value divided by the number of elapsed time steps
    trend = ((end_data_final-start_data_final)/(end_yr-ASC_DESC_start))

    return trend

In [4]:
# load the L-VOD data, leaving the time axis as raw numbers so it can be selected by year value below
ds = xr.open_dataset('../data/biomass_estimates/LVOD/transfer_7684148_files_1514bd66/AGC_vod_annual_NOAA_Trend.nc',decode_times=False)

# the corrected dataset keeps only the combined ASC_DESC variables, and only the time values 2010 and 2019
single_mode_vars = ['AGC_ASC','AGC_ASCmin','AGC_ASCmax','AGC_DESC','AGC_DESCmin','AGC_DESCmax']
yang_correction = ds.drop_vars(single_mode_vars).sel(time=[2010,2019])

# one pass per calibration mode
for m in ['','min','max']:
    var_name = 'AGC_ASC_DESC'+m

    # per-pixel rate of change for this calibration mode
    correction = calculate_trend(ds,calibration_mode=m)

    # blank out pixels for which no trend could be inferred
    yang_correction[var_name] = yang_correction[var_name].where(correction.notnull())

    # walk back 9 steps from the last time slice (2019) to get the value for the first one (2010)
    first_year = yang_correction[var_name][-1,:,:] - correction*9

    # inferred values that are not strictly positive are stored as nan
    yang_correction[var_name][0,:,:] = xr.where(first_year>0,first_year,np.nan)

# save the corrected data
yang_correction.to_netcdf('../results/00_preprocessing/AGC_vod_annual_NOAA_Trend_corrected.nc')

# 2. Convert L-VOD maps from EASE2 grid to regular grid

In [17]:
# variables of the corrected dataset to convert; each one is written to its own output file
# NOTE(review): the name `vars` shadows the Python builtin of the same name
vars = ['AGC_ASC_DESC','AGC_ASC_DESCmin','AGC_ASC_DESCmax']

for var in vars:
    # first transform the coords of the netcdf files to meters as the projection is in meters
    # based on https://gis.stackexchange.com/questions/376463/gdal-translate-outputs-coordinates-in-metres-but-i-need-degrees
    #
    # The line below is an IPython shell escape (`!`); {var} is replaced by the Python value of `var`.
    # gdal_translate options used:
    #   -of NetCDF       write the output as NetCDF
    #   -a_nodata -9999  assign -9999 as the nodata value of the output
    #   -a_ullr ...      assign the georeferenced bounds (upper-left x/y, lower-right x/y), here in meters
    #   -a_srs "..."     assign the projection: cylindrical equal area (cea), lat_ts=30, WGS84 ellipsoid, units in meters
    # input : the {var} subdataset of the corrected NetCDF file
    # output: ..._corrected_{var}_meters.nc in the same folder
    # NOTE(review): a failing command does not raise in Python, so the loop continues silently - check the printed output
    !gdal_translate -of NetCDF -a_nodata -9999 -a_ullr -17367530.45 7314540.83 17367530.45 -7314540.83 -a_srs "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +ellps=WGS84 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs" NETCDF:"../results/00_preprocessing/AGC_vod_annual_NOAA_Trend_corrected.nc":{var} "../results/00_preprocessing/AGC_vod_annual_NOAA_Trend_corrected_"{var}"_meters.nc"


Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file size is 1388, 584
0...10...20...30...40...50...60...70...80...90...100 - done.
Input file

In [26]:
# collect the per-variable "_meters" files written by the gdal_translate step, sorted by path
meters_pattern = '../results/00_preprocessing/AGC_vod_annual_NOAA_Trend_corrected_*meters.nc'
files = sorted(glob(meters_pattern))
def load_LVOD(file):
    '''
    Open one raster file and reproject it to EPSG:4326.

    Parameters:
    file: path handed to rioxarray's open_rasterio

    Returns:
    the object returned by rio.reproject (nearest-neighbour resampling)
    '''
    # open without decoding times; masked=True lets rioxarray represent nodata as NaN
    raster = rio.open_rasterio(file,decode_times=False,masked=True)

    # write the CRS read from the file back onto the object before reprojecting
    raster.rio.write_crs(raster.rio.crs,inplace=True)

    # reproject to epsg:4326 using nearest-neighbour resampling
    return raster.rio.reproject('epsg:4326',resampling=Resampling.nearest)

# reproject every file and stack the results along a new 'method' dimension
das = xr.concat([load_LVOD(path) for path in files],dim='method')

# label each entry of 'method' with the variable name embedded in its file name
# (the text between '_corrected_' and '_meters' in the base name)
var_names = [path.split('/')[-1].split('_corrected_')[-1].split('_meters')[0] for path in files]
das['method'] = var_names

# save to netcdf
das.to_netcdf('../data/biomass_estimates/LVOD/AGC_vod_annual_NOAA_Trend_corrected_lat_lon_merged.nc')