In [1]:
import os
from glob import glob
from typing import Union

import geopandas as gpd
import numpy as np
import pandas as pd
import rioxarray as rio
import xarray as xr
from tqdm import tqdm

from utils import *

# Aggregate data into RECCAP regions

## 1. Introduction

We aggregate the data into the RECCAP regions. For gridded data sources we use a GeoDataFrame that defines the RECCAP regions. The GeoDataFrame is created in the preprocessing notebook `00c_define_regions.ipynb`

For regional data sources whose regions match the RECCAP regions (for example, country-level data), we simply sum all regions that are found in each RECCAP region. The mapping between country-level data and RECCAP regions is also defined in the preprocessing notebook `00c_define_regions.ipynb`.

In case the regions do not match with RECCAP regions, we use our gridded data source to estimate the contribution of each region to the respective RECCAP regions.

## 2. Load data

In [2]:
# Country polygons; the columns used below are 'RECCAP reg' and 'Pan region'
countries_data = gpd.read_file('../data/country_data/country_data_w_RECCAP_Pan_FAO.shp')

# expose the row label as an explicit 'id' column
countries_data = countries_data.assign(id=countries_data.index)

# RECCAP regions: merge all countries that share the same 'RECCAP reg' value
reccap_regions = countries_data.dissolve(by='RECCAP reg')

# Pan regions: same procedure on 'Pan region'; the dissolve key ends up in the index,
# so it is copied into a regular 'name' column as well
pan_regions = countries_data.dissolve(by='Pan region')
pan_regions = pan_regions.assign(name=pan_regions.index)

In [3]:
# GFED region raster: read the 'basisregions' variable and keep its first band
gfed_dataset = rio.open_rasterio('../data/regions_data/GFED/GFED5_Beta_monthly_2002.nc',variable='basisregions')
GFED_regions = gfed_dataset.sel(band=1)['basisregions']

# tag the raster as geographic coordinates (EPSG:4326)
GFED_regions = GFED_regions.rio.write_crs(4326)

# cells with value 0 are masked out (NaN), and NaN is registered as the nodata value
GFED_regions = GFED_regions.where(GFED_regions!=0)
GFED_regions = GFED_regions.rio.write_nodata(np.nan)

# lookup table with the name of each region
GFED_region_names = pd.read_excel('../data/biomass/besnard_et_al_2021/data.xlsx',sheet_name='region_names')

## 3. Define functions for analysis

### 3.1. For gridded data sources

In [4]:
def analyze_gridded(regions:gpd.GeoDataFrame, biomass:xr.DataArray, dominance_threshold:float=0.95) -> pd.DataFrame:
    """
    Sum gridded biomass data to regions.

    Two paths are taken depending on the spatial coverage of the dataset:
    if a single region holds more than `dominance_threshold` of the area of
    the cells that contain data, the whole grid is summed and attributed to
    that region; otherwise the grid is aggregated to every region with zonal
    statistics.

    Parameters:
    regions (gpd.GeoDataFrame): The region definition; its index provides the region labels.
    biomass (xr.DataArray): The biomass data to analyze, with 'landcover', 'time', 'x' and 'y' dimensions.
    dominance_threshold (float): Area fraction above which a single region is
        considered to dominate the dataset. Defaults to 0.95.

    Returns:
    pd.DataFrame: The analyzed biomass data, indexed by (region, landcover) with one column per time step.
    """

    # cell areas are computed once and reused (assumes calc_area is a pure function of the grid)
    cell_area = calc_area(biomass)

    # NOTE(review): the factor 100 is presumably a unit conversion between the biomass
    # density and the units returned by calc_area — confirm against utils
    biomass_per_cell = biomass*cell_area*100

    ## 1. check if dataset covers mainly one region

    # calculate the total area covered by each region, counting only cells that hold data
    has_data = biomass.sum(dim=['landcover','time'])>0
    area_fracs = raster_vector_zonal_stats(regions.reset_index(),cell_area.where(has_data),'sum')

    # if one region is dominating the area, aggregate to that region
    if (area_fracs/area_fracs.sum()).max() > dominance_threshold:

        # calculate the total biomass in the data set (rows: landcover, columns: time)
        res = biomass_per_cell.sum(dim=['x','y']).to_dataframe(name='biomass')['biomass'].unstack()

        # find the region with the largest area fraction from the regions GeoDataFrame
        r = regions.index[area_fracs.index[area_fracs.argmax()].astype(int)]

        # prepend the dominating region to the landcover index
        res.index = pd.MultiIndex.from_product([[r],res.index])

        return res

    ## 2. otherwise, aggregate to all regions

    # calculate the sum of the biomass in each region
    res = raster_vector_zonal_stats(regions.reset_index(),biomass_per_cell,'sum',interp=True).unstack()

    # rename the indices to be the region names and the landcover types
    res.index = pd.MultiIndex.from_product([regions.index,biomass['landcover'].values])

    # set the columns as the time values
    res.columns = biomass['time'].values

    return res

### 3.2. For regional data sources

In [5]:
def calculate_conversion(crosstab:gpd.GeoDataFrame,biomass:xr.DataArray) -> pd.Series:
    '''
    Calculate the conversion factors between regions and RECCAP regions for a given dataset.

    Parameters:
    crosstab (gpd.GeoDataFrame): The intersection of the regions and the RECCAP regions.
        Must contain the columns 'name' and 'RECCAP reg'.
    biomass (xr.DataArray): The biomass data to analyze.

    Returns:
    pd.Series: The conversion factors, averaged over time, indexed by
        (name, RECCAP reg, landcover).
    '''

    # calculate the total biomass in each intersection polygon
    region_biomass = raster_vector_zonal_stats(crosstab,biomass*calc_area(biomass),'sum').unstack()

    # attach the region name and RECCAP region of each polygon, then index the table
    # by region name, RECCAP region name and landcover type
    region_biomass = (
        region_biomass
        .reset_index()
        .set_index('index')
        .merge(crosstab[['name','RECCAP reg']],left_index=True,right_index=True)
        .set_index(['name','RECCAP reg','landcover'])
    )

    # calculate the fraction of biomass in each region that is in each RECCAP region per landcover type
    conversion_df = region_biomass/region_biomass.groupby(['name','landcover']).sum()

    # the conversion factors are treated as constant in time: average over the time columns
    return conversion_df.mean(axis=1)

def analyze_dataset_regional(biomass:pd.DataFrame,region:Union[xr.DataArray,gpd.GeoDataFrame],metadata) -> pd.DataFrame:
    '''
    Redistribute regional biomass estimates onto the RECCAP regions.

    The source regions are intersected with the module-level `reccap_regions`.
    For every gridded biomass dataset found in '../results/02_convert_AGB_TB/'
    (excluding files with 'chen' in their path), the share of each source
    region's biomass that falls in each RECCAP region is computed. The shares
    are averaged across datasets and applied to `biomass`.

    Parameters:
    biomass (pd.DataFrame): Regional biomass with a two-level index (region, landcover).
        The caller's object is not modified.
    region (xr.DataArray or gpd.GeoDataFrame): The source region definition. A raster is
        polygonized first. After `reset_index()` it must expose an 'id' column.
    metadata: Table with an 'id' column (join key) and a 'name' column giving the region
        label used in the index of `biomass`.

    Returns:
    pd.DataFrame: Biomass summed per (RECCAP reg, landcover).
    '''

    # if the region is a raster, polygonize it into a GeoDataFrame
    if isinstance(region, xr.DataArray):
        region = polygonize(region)

    # calculate the intersection of the regions and the RECCAP regions
    crosstab = reccap_regions.reset_index()[['RECCAP reg','geometry']].overlay(region.reset_index()).merge(metadata,left_on='id',right_on='id')

    # take only regions with area larger than 50 Mha (5e11 m2, measured in EPSG:6933)
    crosstab['area'] = crosstab.to_crs(epsg=6933).area
    crosstab = crosstab[crosstab['area']>5e11]

    # take all of the global gridded biomass estimates (Liu et al., Xu et al., LVOD data)
    biomass_files =  [x for x in glob('../results/02_convert_AGB_TB/*.nc') if ('chen' not in x)]

    # calculate the conversion factors between regions and RECCAP regions for each dataset,
    # closing each file as soon as its factors have been computed
    per_dataset = []
    for f in biomass_files:
        with xr.open_dataarray(f) as gridded:
            per_dataset.append(calculate_conversion(crosstab,gridded))
    conversions = pd.concat(per_dataset)

    # calculate the mean conversion across all datasets
    mean_conversion = conversions.groupby(['name','RECCAP reg','landcover']).mean()

    # rename indices so both operands share the 'region' and 'landcover' level names;
    # rename_axis returns a new object, leaving the caller's DataFrame untouched
    mean_conversion.index.names = ['region','RECCAP reg','landcover']
    biomass = biomass.rename_axis(index=['region','landcover'])

    # apply the conversion to get the biomass in the RECCAP regions
    result = biomass.mul(mean_conversion,axis=0).dropna().groupby(['RECCAP reg','landcover']).sum()

    return result
        

### 3.3. For DGVMs

In [6]:
def preprocess_TRENDY(fn:str) -> xr.DataArray:
    '''
    Open a TRENDY file and return its variable ready for regional aggregation.

    The variable name is read from the file name (third underscore-separated
    token). The result has integer years as its time coordinate, model names
    as its model coordinate, 'x'/'y' spatial dimensions and EPSG:4326 as CRS.

    Parameters:
    fn (str): The file path to the TRENDY data.

    Returns:
    xr.DataArray: The preprocessed TRENDY data, ordered as (model, time, y, x).
    '''

    # e.g. 'trendyv10_S3_cVeg_...' -> 'cVeg'
    basename = fn.rsplit('/', 1)[-1]
    variable = basename.split('_')[2]

    dataset = xr.open_dataset(fn)

    # the model names are stored as a space-separated global attribute
    model_names = dataset.attrs['models'].split(' ')

    # pick the variable of interest
    field = dataset[variable]

    # label the model axis and express time as calendar years
    field['model'] = model_names
    field['time'] = field['time'].dt.year

    # spatial dimensions are called x and y downstream
    field = field.rename({'lon':'x','lat':'y'})

    if variable != 'nbp':
        # kg -> g (presumably kgC/m2 to gC/m2 — confirm against the TRENDY metadata)
        field = field * 1e3
    else:
        # flux: convert units from kgC/m2/s to gC/m2/yr
        field = field * 60*60*24*365*1e3

    # set CRS
    field.rio.write_crs(4326,inplace=True)

    # put the model and time dimensions first
    return field.transpose('model','time','y','x')

In [7]:
def analyze_gridded_DGVM(region:gpd.GeoDataFrame, biomass:xr.DataArray) -> pd.DataFrame:
    """
    Sum gridded DGVM output to regions, separately for every model.

    Parameters:
    region (gpd.GeoDataFrame): The region definition; its index provides the region labels.
    biomass (xr.DataArray): The gridded data to analyze, carrying 'model' and 'time' coordinates.

    Returns:
    pd.DataFrame: The regional sums, indexed by (region, model) with one column per time value.
    """

    # weight every cell by its area and sum the cells inside each region
    area_weighted = biomass*calc_area(biomass)
    zonal_sums = raster_vector_zonal_stats(region.reset_index(),area_weighted,'sum',interp=True)
    table = zonal_sums.unstack()

    # label rows with (region, model) and columns with the time values
    table.index = pd.MultiIndex.from_product(
        [region.index,biomass['model'].values],
        names=['region','model'],
    )
    table.columns = biomass['time'].values

    return table

## 4. Run analysis

### 4.1. For gridded data sources

In [17]:
# set to False to skip inputs whose aggregated output already exists
overwrite = True
files =  glob('../results/02_convert_AGB_TB/*.nc')
for file in tqdm(files):
    # output name: input file name with the '.nc' suffix replaced by '_regions.csv'
    out_path = f'../results/03_aggregate_regions/{os.path.basename(file)[:-3]}_regions.csv'
    if os.path.exists(out_path) and not overwrite:
        print(f'{out_path} already exists')
        continue
    # missing values are replaced by zero before aggregation; the file is closed
    # as soon as its regional sums have been computed
    with xr.open_dataarray(file,decode_times=False) as biomass_grid:
        res = analyze_gridded(reccap_regions,biomass_grid.fillna(0))
    res.to_csv(out_path)

100%|██████████| 90/90 [25:03<00:00, 16.70s/it]


### 4.2. For regional data sources

We have three regional data sources to analyze:

1. Besnard et al. (2021)

2. Pan et al. (2011)

3. FRA data from Tubiello et al. (2021)

#### 4.2.1 Besnard et al. (2021)

In [18]:
files = glob('../results/02_convert_AGB_TB/besnard*.csv')
for file in tqdm(files):
    # the first two columns form the row index; the column labels are read as
    # strings and converted to integers
    df = pd.read_csv(file,index_col=[0,1])
    df.columns = df.columns.astype(int)

    # redistribute the GFED-region estimates onto the RECCAP regions
    res = analyze_dataset_regional(df,GFED_regions,GFED_region_names)

    # output name: input file name up to its first '.', plus '_regions.csv'
    stem = file.split("/")[-1].split(".")[0]
    res.to_csv(f'../results/03_aggregate_regions/{stem}_regions.csv')

  return data.astype(dtype, **kwargs)
  return data.astype(dtype, **kwargs)
100%|██████████| 2/2 [16:49<00:00, 504.67s/it]


#### 4.2.2 Pan et al. (2011)

For Pan et al., all regions are consistent with the RECCAP regions, except for the "Americas". For all regions other than the "Americas", we can directly aggregate the data.

The "Americas" region contains Central America. Central America is considered in the "North America" RECCAP region. We calculate a conversion factor between the two regions and apply it to the "Americas" region.


In [8]:
# Table S3 of Pan et al.
pan_data = pd.read_excel('../data/biomass/pan_et_al_2011/data.xlsx')

# keep the "Total living biomass" rows; "Tropical Intact" and "Tropical Regrowth"
# are left out because they are already included in the "All Tropics" rows
is_living_biomass = pan_data.Type=='Total living biomass'
is_tropical_subset = pan_data.Biome.isin(['Tropical Intact','Tropical Regrowth'])
pan_data_df = pan_data[is_living_biomass & ~is_tropical_subset].pivot_table(index='Region',columns='Year',values='Carbon',aggfunc='sum')

# place the reported years on an annual 1990-2007 axis and interpolate the gaps;
# the factor 1e15 presumably converts PgC to gC — confirm against the table units
annual_axis = pd.DataFrame(0,index=pan_data_df.index,columns=np.arange(1990,2008),dtype=float)
pan_data_df = (annual_axis+pan_data_df.astype(float)).interpolate(axis=1)*1e15

# all rows describe forest: index the table by (name, landcover)
pan_data_df = (
    pan_data_df
    .assign(landcover='forest')
    .reset_index()
    .set_index(['Region','landcover'])
    .rename_axis(index=['name','landcover'])
)


In [10]:
# `pan_data` (Table S3 of Pan et al.) and `pan_data_df` (annual living biomass per Pan region,
# indexed by (name, landcover)) are built in the previous cell and reused here as-is

# for all regions except "Americas", the conversion factor is 1
conversion = pan_data_df/pan_data_df
# regions without a RECCAP region assigned are removed by dropna()
conversion = conversion.merge(pan_data[['Region','RECCAP region']].drop_duplicates(),left_on='name',right_on='Region').dropna().set_index(['Region','RECCAP region'])
conversion.index.names = ['name','RECCAP reg']

# calculate the biomass in RECCAP regions for the fully matching regions
pan_RECCAP = pan_data_df.mul(conversion,axis=0).groupby(['RECCAP reg','landcover']).sum()

# calculate the biomass in RECCAP regions for the non-matching "Americas" region.
# A copy is passed so that `pan_data_df` keeps its ('name','landcover') index names even if
# the callee relabels the index of its argument; this keeps the cell re-runnable.
# NOTE(review): this assumes `pan_regions` only contains Pan regions that do not map
# one-to-one onto a RECCAP region; otherwise those regions would be counted twice after
# the concatenation below — confirm against 00c_define_regions.ipynb
pan_conv = analyze_dataset_regional(pan_data_df.copy(),pan_regions.drop(columns=['name','RECCAP reg']),pan_regions.reset_index()[['name','id']])

# concatenate the fully matching and non matching regions
pan_RECCAP = pd.concat([pan_RECCAP,pan_conv])

# sum over RECCAP regions and landcover types
pan_RECCAP = pan_RECCAP.groupby(['RECCAP reg','landcover']).sum()

# save data
pan_RECCAP.to_csv('../results/03_aggregate_regions/pan_regions.csv')

  return geopandas.overlay(


#### 4.2.3. FRA data from Tubiello et al. (2021)

In [11]:
# Load data
FRA_raw = pd.read_csv('../data/biomass/tubiello_et_al_2021/GF_GHG_ForestLand_Total_2020_ZENODO.csv', encoding='latin-1')

# Fix the name for Sudan to match the FAO names
FRA_raw = FRA_raw.assign(AreaName=FRA_raw['AreaName'].str.replace(' (former)', '', regex=False))

# Take the stocks data and convert to a DataFrame with the region names as the index and years as columns
FRA_stocks = FRA_raw[FRA_raw['ElementName'] == 'Carbon Stock (million tonnes)']
FRA_stocks_wide = FRA_stocks[['AreaName','Year','Value']].groupby(['AreaName','Year']).mean()['Value'].unstack()

# convert units from MtC to gC
FRA_data = FRA_stocks_wide*1e12

# merge data with country data; areas without a matching FAO name get a missing
# 'RECCAP reg' and are therefore left out of the regional sums below
FRA_data_merge = FRA_data.merge(countries_data[['FAO_name','RECCAP reg']],left_index=True,right_on='FAO_name',how='left')

# calculate total biomass for each RECCAP region; the country-name column is dropped
# first so that only the yearly values are summed
FRA_RECCAP_sum = FRA_data_merge.drop(columns='FAO_name').groupby('RECCAP reg').sum()

# set index to be the RECCAP region and landcover type, which is forest
FRA_RECCAP_sum['landcover'] = 'forest'
FRA_RECCAP_sum = FRA_RECCAP_sum.reset_index().set_index(['RECCAP reg','landcover'])

# save data
FRA_RECCAP_sum.to_csv('../results/03_aggregate_regions/FRA_regions.csv')


### 4.3 Analyze DGVM data

#### 4.3.2 cVeg

In [19]:
# read the gridded cVeg fields and aggregate them to the RECCAP regions
cVeg_path = '../data/DGVMs/trendyv10_S3_cVeg_1901-2020_annual_gridded.nc'
cVeg_grid = preprocess_TRENDY(cVeg_path)
trendy_cVeg_regional = analyze_gridded_DGVM(reccap_regions,cVeg_grid)

# models without data show up as all-zero rows: total each model over all
# regions and all years, and keep only the models whose total is non-zero
model_sum = trendy_cVeg_regional.groupby('model').sum().sum(axis=1)
valid_models = model_sum.loc[model_sum!=0].index

is_valid_row = trendy_cVeg_regional.index.get_level_values('model').isin(valid_models)
trendy_cVeg_regional = trendy_cVeg_regional.loc[is_valid_row]

# write the filtered table to disk
trendy_cVeg_regional.to_csv('../results/03_aggregate_regions/regional_DGVM_cVeg.csv')

#### 4.3.1 NBP

In [20]:
# read the gridded NBP fields
nbp_path = '../data/DGVMs/trendyv10_S3_nbp_1901-2020_annual_gridded.nc'
nbp_grid = preprocess_TRENDY(nbp_path)

# aggregate them to the RECCAP regions, per model
trendy_nbp_regional = analyze_gridded_DGVM(reccap_regions,nbp_grid)

# write the table to disk
trendy_nbp_regional.to_csv('../results/03_aggregate_regions/regional_DGVM_nbp.csv')