In [373]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

# Supplementary Analysis - Quantifying the sources of uncertainty in our final predictions of biomass stock gains

## 1. Introduction

The sources of uncertainty we consider here are:

1. Uncertainty associated with each data source (we assume it to be 30% CV based on [Yang et al. (2023)](https://www.nature.com/articles/s41561-023-01274-4), [Pan et al. (2024)](https://www.nature.com/articles/s41586-024-07602-x) and [Xu et al. (2021)](https://www.science.org/doi/10.1126/sciadv.abe9829))
2. Variation between data sources
3. Variation across different analysis pipelines (e.g. different methods for splitting forest and non-forest areas, above- to below-ground biomass ratios, etc.)
4. Variation associated with different assumptions about undetectable growth in mature forests (e.g. no such effect, an effect in forests with >100 MgC ha<sup>-1</sup> biomass stocks or an effect in forests with >200 MgC ha<sup>-1</sup> biomass stocks)


## 2. Load data

In [374]:
# read the temporally harmonized biomass table; its first four columns form the row index
harmonized_biomass_path = '../results/04_temporal_harmonization/harmonized_biomass_data.csv'
biomass_data_obs = pd.read_csv(harmonized_biomass_path, index_col=[0, 1, 2, 3])

# the column labels are parsed as strings; cast them to integers so they can be matched against the time bins
biomass_data_obs.columns = biomass_data_obs.columns.astype(int)

## 3. Define functions for analysis

In [375]:
# first year of the analysis; the first time bin starts the year after it
start_year = 1992

# how many bootstrap samples to draw
n = 10_000

# three consecutive time bins, each stored as the array of years it contains
bin_years = [
    np.arange(start_year + 1, 2001),
    np.arange(2001, 2011),
    np.arange(2011, 2020),
]
time_bins = pd.DataFrame(pd.Series(bin_years), columns=['year'])

# label every bin as "<first year>-<last year>"
time_bin_names = time_bins.apply(
    lambda row: '-'.join(str(edge) for edge in (row.iloc[0].min(), row.iloc[0].max())),
    axis=1,
)

# use the labels as the index of the time bins
time_bins.index = time_bin_names

# share of all analysed years that falls into each time bin
years_per_bin = time_bins.apply(lambda row: len(row['year']), axis=1)
year_bin_freq = years_per_bin / years_per_bin.sum()

In [376]:
def sample_uncertainty(df:pd.DataFrame,time_bin:np.ndarray,bin_vars:list,utype:str,n_samples=10000,random_state=None) -> "list | np.ndarray":
    """
    Create random samples from the data for one time bin, considering only one source of uncertainty.

    Parameters:
    df: pd.DataFrame
        The data frame with the data. Its columns are matched against the years in time_bin and its
        index must contain a 'source' level, the levels listed in bin_vars and (for the 'pipeline' and
        'dense_forests' types) a 'method' level. The frame is not modified.
    time_bin: np.ndarray
        The time bin to sample from (the years whose columns are averaged)
    bin_vars: list
        The variables to use as columns
    utype: str
        The type of uncertainty to be considered - must be one of 'intra', 'pipeline', 'inter' or 'dense_forests'
    n_samples: int
        The number of samples to create
    random_state: None, int, np.random.RandomState or np.random.Generator
        Source of randomness. None (the default) keeps using NumPy's global random state, exactly as
        before this parameter existed. An int seeds a private np.random.RandomState, which makes the
        samples reproducible (also inside joblib workers). A Generator is forwarded to pandas as is,
        which requires a pandas version whose sample() accepts Generators.

    Returns:
    np.ndarray or list
        'intra': an array of shape (n_samples,) with one total (summed over bin_vars) per sample.
        'inter': a list of n_samples pd.Series, each indexed by the bin_vars combinations.
        'pipeline' / 'dense_forests': a list of n_samples totals (summed over bin_vars).

    Raises:
    ValueError
        If utype is not one of the supported types.
    """

    # validate explicitly: an assert would be stripped when Python runs with -O
    if utype not in ('intra','pipeline','inter','dense_forests'):
        raise ValueError("type must be one of 'intra','pipeline','inter' or 'dense_forests'")

    # resolve the source of randomness once; `rng` is used for NumPy draws and `pd_state` is handed to pandas.
    # An int must be turned into ONE stateful object here - passing the raw int to every pandas .sample() call
    # would re-seed each call and return the same draw n_samples times.
    if random_state is None:
        rng, pd_state = np.random, None
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = pd_state = random_state
    else:
        rng = pd_state = np.random.RandomState(random_state)

    # average the years that belong to time_bin, drop missing values and spread the bin_vars into columns
    df_tb = df.loc[:,df.columns.isin(time_bin)].mean(axis=1).dropna().unstack(bin_vars)

    if utype == 'intra':
        # perturb the per-source means with noise drawn from a 30% coefficient of variation, then
        # average across data sources and sum across bin_vars.

        # mean rate of change of each data source across its variants (sources x bin_vars combinations)
        source_means = df_tb.groupby('source').mean().to_numpy(dtype=float)

        # coefficient of variation of the noise
        cv = 0.3 # typical number from the literature

        # draw all samples in one call; missing estimates stay NaN and are skipped by nanmean below
        noise = rng.normal(0,cv*np.abs(source_means),size=(n_samples,)+source_means.shape)
        samples = source_means + noise

        # mean across data sources (axis 1), then sum across bin_vars
        return np.nanmean(samples,axis=1).sum(axis=1)

    if utype == 'inter':
        # for each sample pick one data source at random; wherever it has no estimate, fall back to further
        # randomly picked data sources until every bin_vars combination is filled.

        # mean rate of change of each data source across its variants (identical for every sample)
        source_means = df_tb.groupby('source').mean()

        res = []
        for _ in range(n_samples):
            # a random ordering of the sources is equivalent to redrawing sources until all gaps are
            # filled (the order of first appearance of repeated draws is a uniform permutation), but it
            # always finishes after a single draw
            shuffled = source_means.sample(frac=1,random_state=pd_state)

            # back-filling moves the first available estimate of every column into the first row
            res.append(shuffled.bfill().iloc[0])
        return res

    # utype is 'pipeline' or 'dense_forests': for each sample pick one variant per data source at random,
    # then average across data sources and sum across bin_vars.

    # the variant label is the part of the method name enclosed by these two delimiters
    if utype == 'dense_forests':
        prefix, suffix = 'TB_', "_regions"
    else:
        prefix, suffix = 'biomass_', "_TB"

    # NOTE(review): method names that lack the prefix yield a missing variant label, and groupby drops
    # such rows silently - confirm that every method name follows the expected pattern
    methods = df_tb.index.get_level_values('method')
    variant = methods.str.split(prefix).str[1].str.split(suffix).str[0].rename('variant')

    # mean rate of change of every (source, variant) pair; identical for every sample
    variant_means = df_tb.set_index(variant,append=True).groupby(['source','variant']).mean()

    return [
        variant_means.groupby('source').sample(1,random_state=pd_state).mean().sum()
        for _ in range(n_samples)
    ]

## 4. Run analysis

### 4.1. Intra-data source uncertainty

In [377]:
# every sample is resolved by region and landcover
bin_vars = ['region', 'landcover']

# queue one sampling task per time bin (N random samples each) and run the tasks in parallel
intra_tasks = [
    delayed(sample_uncertainty)(biomass_data_obs, bin_row['year'], bin_vars=bin_vars, utype='intra', n_samples=n)
    for _, bin_row in time_bins.iterrows()
]
trajectories = Parallel(n_jobs=-1)(intra_tasks)

# weight the samples of each time bin by the bin's share of years ...
intra_weighted = np.stack(trajectories).T @ year_bin_freq

# ... and take the spread across samples, scaled by 1e15 (presumably a unit conversion - TODO confirm)
intra_sd = intra_weighted.std() / 1e15

### 4.2. Inter-data source uncertainty

In [378]:
# queue one sampling task per time bin (N random samples each) and run the tasks in parallel
inter_tasks = [
    delayed(sample_uncertainty)(biomass_data_obs, bin_row['year'], bin_vars=bin_vars, utype='inter', n_samples=n)
    for _, bin_row in time_bins.iterrows()
]
trajectories = Parallel(n_jobs=-1)(inter_tasks)

# per time bin, put the N sampled series side by side (one column per sample) ...
samples_per_bin = [pd.concat(bin_samples, axis=1, keys=range(n)) for bin_samples in trajectories]

# ... then stack the time bins on top of each other, keyed by the time bin label
trajectories_df = pd.concat(samples_per_bin, keys=time_bins.index)
trajectories_df.index.names = ['time_bin'] + bin_vars

# total per time bin and sample, weighted by each bin's share of years
inter_weighted = trajectories_df.groupby('time_bin').sum().T @ year_bin_freq

# spread across samples, scaled by 1e15 (presumably a unit conversion - TODO confirm).
# NOTE(review): inter_weighted is a pandas Series, so .std() uses ddof=1 here, whereas the ndarray-based
# cells appear to use NumPy's ddof=0; the difference is negligible for large n - confirm if exact
# consistency matters.
inter_source_sd = inter_weighted.std() / 1e15

### 4.3. Inter-analysis pipeline uncertainty

In [379]:
# queue one sampling task per time bin (N random samples each) and run the tasks in parallel
pipeline_tasks = [
    delayed(sample_uncertainty)(biomass_data_obs, bin_row['year'], bin_vars=bin_vars, utype='pipeline', n_samples=n)
    for _, bin_row in time_bins.iterrows()
]
trajectories = Parallel(n_jobs=-1)(pipeline_tasks)

# weight the samples of each time bin by the bin's share of years ...
pipeline_weighted = np.array(trajectories).T @ year_bin_freq

# ... and take the spread across samples, scaled by 1e15 (presumably a unit conversion - TODO confirm)
pipeline_sd = pipeline_weighted.std() / 1e15

### 4.4. Uncertainty associated with undetectable growth in mature forests

In [380]:
# queue one sampling task per time bin (N random samples each) and run the tasks in parallel
dense_forest_tasks = [
    delayed(sample_uncertainty)(biomass_data_obs, bin_row['year'], bin_vars=bin_vars, utype='dense_forests', n_samples=n)
    for _, bin_row in time_bins.iterrows()
]
trajectories = Parallel(n_jobs=-1)(dense_forest_tasks)

# weight the samples of each time bin by the bin's share of years ...
mature_forest_weighted = np.array(trajectories).T @ year_bin_freq

# ... and take the spread across samples, scaled by 1e15 (presumably a unit conversion - TODO confirm)
mature_forest_sd = mature_forest_weighted.std() / 1e15

### 4.5. Summary

In [381]:
# collect the standard deviation attributed to each source of uncertainty in one labelled series
uncertainty_df = pd.Series(
    {
        'intra_source': intra_sd,
        'inter_source': inter_source_sd,
        'pipeline': pipeline_sd,
        'mature_forest': mature_forest_sd,
    }
)
uncertainty_df

intra_source     0.046135
inter_source     0.207648
pipeline         0.006185
mature_forest    0.078274
dtype: float64