In [1]:
import xarray as xr
import rioxarray as rio
from rasterio.enums import Resampling
from glob import glob
import os
from tqdm import tqdm
from utils import *

# Convert above ground biomass to total biomass

## 1. Introduction

## 2. Load data

### 2.1. Above to below ground biomass ratios

#### [Ma et al. (2021)](https://www.nature.com/articles/s41559-021-01485-1)

In [2]:
# Root mass fraction maps of Ma et al.: the merged product and the grass layer
ma_data = rio.open_rasterio(
    '../results/00_preprocessing/ma_et_al_processed_all.nc', masked=True
).sel(band=1)
ma_grass = rio.open_rasterio(
    '../results/00_preprocessing/ma_et_al_processed.nc', masked=True
).sel(landcover='grass')

# our product covers woody vegetation, so keep the merged values only
# where the grass layer has no data
ma_data = ma_data.where(ma_grass.isnull())

# percent -> fraction (assigning to .data keeps the array's attributes)
ma_data.data = ma_data.data / 100

# register NaN as the nodata value so the interpolation below can find the gaps
ma_data.rio.write_nodata(np.nan, inplace=True)

# fill the gaps from nearby valid values
ma_data = ma_data.rio.interpolate_na(method='linear')

# use the same map for both landcovers: duplicate it along 'landcover',
# put that dimension first and label the two copies
ma_data = (
    xr.concat([ma_data] * 2, dim='landcover')
    .transpose('landcover', 'y', 'x')
    .assign_coords(landcover=['forest', 'nonforest'])
)

#### [Huang et al. (2021)](https://essd.copernicus.org/articles/13/4263/2021/)

In [3]:
# Root mass fraction map of Huang et al. (first band only)
huang_data = rio.open_rasterio(
    '../results/00_preprocessing/huang_et_al_processed.nc', masked=True
).sel(band=1)

# the same map is used for forest and nonforest: duplicate it along a new
# 'landcover' dimension and label the two copies
huang_data = (
    xr.concat([huang_data] * 2, dim='landcover')
    .assign_coords(landcover=['forest', 'nonforest'])
)

### 2.2. Regions

In [4]:
# GFED basis regions: read the 'basisregions' variable, keep the first band
# and drop the leftover 'band' coordinate
GFED_regions = (
    rio.open_rasterio(
        '../data/regions_data/GFED/GFED5_Beta_monthly_2002.nc',
        variable='basisregions',
    )
    .sel(band=1)['basisregions']
    .drop_vars('band')
)

# tag the raster with EPSG:4326
GFED_regions.rio.write_crs(4326, inplace=True);

# region id 0 is turned into NaN ...
GFED_regions = GFED_regions.where(GFED_regions != 0)

# ... and NaN is registered as the nodata value
GFED_regions.rio.write_nodata(np.nan, inplace=True);

# table holding the name of each region
GFED_region_names = pd.read_excel(
    '../data/biomass/besnard_et_al_2021/data.xlsx',
    sheet_name='region_names',
)

## 3. Define functions for analysis

### 3.1. For gridded data sources

In [5]:
def analyze_gridded(AGB:xr.DataArray,RMF:xr.DataArray) -> xr.DataArray:
    '''
    Convert gridded above ground biomass (AGB) to total biomass.

    The root mass fraction (RMF) is first reprojected onto the AGB grid.
    The AGB is then multiplied by the conversion factor 1/(1-RMF); wherever
    that factor is NaN it is replaced by 1, so the AGB value is kept as is.

    Parameters:
    AGB (xr.DataArray): The above ground biomass data.
    RMF (xr.DataArray): The root mass fraction data.

    Returns:
    xr.DataArray: The total biomass data.
    '''

    # bring the RMF onto the grid of the AGB data
    rmf_on_agb_grid = RMF.rio.reproject_match(AGB, nodata=np.nan)

    # AGB -> total biomass factor; a missing factor falls back to 1
    agb_to_total = (1 / (1 - rmf_on_agb_grid)).fillna(1)

    return AGB * agb_to_total

### 3.2. For regional data sources

In [6]:
def analyze_regional(biomass:pd.DataFrame, regions:xr.DataArray, RMF:xr.DataArray, metadata:pd.DataFrame) -> pd.DataFrame:
    '''
    Convert regional above ground biomass (AGB) to total biomass (TB).

    The area-weighted mean RMF is calculated for each region and landcover,
    turned into a conversion factor with the formula 1/(1-RMF), and the
    biomass of each region and landcover is multiplied by that factor.

    Parameters:
    biomass: pd.DataFrame
        DataFrame with index 'region','landcover' columns 'time' giving the biomass values
    regions: xr.DataArray
        2D array with region ids
    RMF: xr.DataArray
        Array with RMF values. It has to provide a 'landcover' dimension,
        because the conversion factor is indexed by region and landcover.
    metadata: pd.DataFrame
        DataFrame with columns 'id' and 'name' giving the region names

    Returns:
    pd.DataFrame
        The biomass DataFrame with every row multiplied by the conversion
        factor from AGB to TB of its region and landcover

    '''

    # reproject RMF data to match regions data
    RMF = RMF.rio.reproject_match(regions,nodata=np.nan)

    # surface area of the grid cells - calculated once and used both as the weight and as the normalization
    cell_area = calc_area(RMF)

    # create a merged dataset with RMF scaled by surface area (for an area-weighted mean), region ids and the surface area
    merged_ds = (RMF*cell_area).to_dataset(name='RMF')
    merged_ds['id'] = regions
    # count the area only where RMF is valid, so that missing cells do not dilute the mean
    merged_ds['area'] = cell_area.where(RMF.notnull())

    # group by region id and calculate the sum of RMF and area
    merged_ds = merged_ds.set_coords('id')
    ds_zonal_sum = merged_ds.groupby('id').sum()

    # normalize RMF by total area to get the mean RMF for each region and convert to DataFrame
    zonal_mean_df = (ds_zonal_sum['RMF']/ds_zonal_sum['area']).to_dataframe(name='RMF')

    # merge the DataFrame with the names of each region
    zonal_mean_df = zonal_mean_df.reset_index().merge(metadata.reset_index()[['name','id']],on='id')

    # set index to region name and landcover
    zonal_mean_rmf = zonal_mean_df.set_index(['name','landcover'])['RMF']

    # calculate the regional conversion factor based on the formula 1/(1-RMF)
    # NOTE(review): a region without a valid mean RMF keeps a NaN factor here, whereas
    # analyze_gridded falls back to a factor of 1 - confirm that this difference is intended
    conversion = 1/(1-zonal_mean_rmf)

    # name the index levels so they align with the index of the biomass data
    conversion.index.names = ['region','landcover']

    # apply the conversion factor to the biomass data (rows are matched on the index)
    result = biomass.mul(conversion,axis=0)

    return result

## 4. Run analysis

The data sources that report only above ground biomass are:
1. Liu et al. (2015)
2. L-VOD data
3. Besnard et al. (2021)

### 4.1. For gridded data sources

The gridded data sources that report only above ground biomass are:
1. Liu et al. (2015)
2. L-VOD data

In [7]:
# find all gridded files from previous stage
files = glob('../results/01_split_forest_nonforest/*.nc')

# directory for the outputs of this stage - create it if it is missing
out_dir = '../results/02_convert_AGB_TB'
os.makedirs(out_dir, exist_ok=True)

# loop over all files
for file in tqdm(files):
    # work on the file name itself, so the checks below do not depend on the path separator
    stem, ext = os.path.splitext(os.path.basename(file))

    # load the data and close the file once it has been processed
    with xr.open_dataarray(file) as data:
        # if the file is from the Liu et al. or LVOD data (above ground biomass only)
        if stem.startswith(('liu_', 'LVOD')):
            # for each RMF data source
            for RMF,rmf_name in zip([ma_data,huang_data],['ma','huang']):
                # analyze the data
                result = analyze_gridded(data,RMF)

                # define the output file name
                out_file = os.path.join(out_dir, f'{stem}_{rmf_name}_TB{ext}')

                # save the data
                result.to_netcdf(out_file)
        else:
            # any other file is written to the output directory unchanged
            # define the output file name
            out_file = os.path.join(out_dir, f'{stem}_TB{ext}')

            # save the data
            data.to_netcdf(out_file)

100%|██████████| 18/18 [01:38<00:00,  5.47s/it]


### 4.2. For regional data sources

The only data source that is regional is Besnard et al. (2021).

In [8]:
# Besnard et al. regional biomass: rows are (region, landcover), columns are time.
# The unstack/stack round trip gives every region a row for every landcover.
besnard_data = (
    pd.read_csv(
        '../results/01_split_forest_nonforest/besnard_biomass_regional.csv',
        index_col=[0, 1],
    )
    .unstack('landcover')
    .stack(future_stack=True)
)
besnard_data.columns.name = 'time'

# convert to total biomass once per RMF data source and save each result
for rmf_name, RMF in {'ma': ma_data, 'huang': huang_data}.items():
    result = analyze_regional(besnard_data, GFED_regions, RMF, GFED_region_names)
    result.to_csv(f'../results/02_convert_AGB_TB/besnard_biomass_regional_{rmf_name}_TB.csv')