In [1]:
import pandas as pd
import numpy as np
import pyreadr
from joblib import Parallel, delayed

# Generate estimates for change in total carbon stocks, living biomass and non-living carbon stocks from observations and DGVMs

## 1. Introduction

The following notebook takes all of the data produced in previous steps and generates from it two main types of estimates for two different data types. The two kinds of estimates we produce are:

- decadal estimates of change in total carbon stocks, living biomass and non-living carbon stocks

- annual estimates of cumulative change in total carbon stocks, living biomass and non-living carbon

The two data types we analyze are observation-based estimates and DGVM-based estimates. 

For each type of estimate and data type, we generate both mean estimates as well as uncertainty estimates. 

In [2]:
# baseline year: cumulative changes are set to zero in this year and the analysed fluxes run from start_year+1 to 2019
start_year = 1992

## 2. Load data

### 2.1. Observation-based

#### 2.1.1 Total carbon stocks

In [3]:
# Global

# load global data
GCB_data = pd.read_csv('../results/05_estimate_land_sink/global.csv',index_col=[0])
# keep the rows from the baseline year through 2019 (label-based slice on the index)
GCB_data = GCB_data.loc[start_year:2019]
# NOTE(review): the whole 'land_sink' column is recomputed later (section 4.1.1.1), which overwrites this value — confirm this line is still needed
GCB_data.loc[start_year,'land_sink'] = 0 

# anthropogenic perturbation of the land to ocean flux from Regnier et al. 2022
# presumably in gC per year (0.05e15 gC = 0.05 PgC), matching the ≈0.05 GtC quoted in section 4.1.1.1 — TODO confirm units
F_lat = 0.05e15
F_lat_std = 0.025e15

In [4]:
# Regional

# read the regional NEE and fossil-fuel (FF) flux tables; the first two CSV columns form the row index
NEE = pd.read_csv('../results/05_estimate_land_sink/NEE_regional.csv',index_col=[0,1])
FF = pd.read_csv('../results/05_estimate_land_sink/FF_regional.csv',index_col=[0,1])

# the column labels are years: store them as integers and name the axis 'year'
NEE.columns = NEE.columns.astype(int).rename('year')
FF.columns = FF.columns.astype(int).rename('year')

# restrict both tables to the study period (start_year+1 through 2019)
NEE = NEE.loc[:,start_year+1:2019]
FF = FF.loc[:,start_year+1:2019]

# load lateral trade fluxes from Ciais et al. 2021
trade_data = pd.read_excel('../data/carbon_cycle/lateral_fluxes/ciais_et_al_2021/data.xlsx')

# keep only the regional (non-global) trade-flux rows, indexed by region
regional_trade = trade_data[(trade_data.Region!='Globe') & (trade_data.Parameter=='Ftrade')].set_index('Region')

# values are converted from TgC to PgC (/1e3); the uncertainty is converted further to gC (*1e15)
trade_std = regional_trade['Uncertainty estimation']/1e3*1e15
trade_data = regional_trade['Value']/1e3


# load lateral river fluxes and express them in PgC (/1e15)
river_data = pd.read_csv('../results/05_estimate_land_sink/river_fluxes.csv',index_col=0)['river_flux']/1e15

# total lateral flux per region in gC; regions without a trade estimate contribute zero trade
F_lateral = (river_data +trade_data.reindex(river_data.index,fill_value=0))*1e15


#### 2.1.2 Living biomass

In [5]:
# load data; the first four CSV columns form the row index
# (later cells group this index by the level names 'source', 'region' and 'landcover')
biomass_data_obs = pd.read_csv('../results/04_temporal_harmonization/harmonized_biomass_data.csv',index_col=[0,1,2,3])
# column labels are years; store them as integers so they can be sliced and compared numerically
biomass_data_obs.columns = biomass_data_obs.columns.astype(int)

### 2.2. DGVMs

In [6]:
# load regional nbp and cVeg data; the first two CSV columns form the row index
trendy_nbp_regional = pd.read_csv('../results/03_aggregate_regions/regional_DGVM_nbp.csv',index_col=[0,1])
trendy_cVeg_regional = pd.read_csv('../results/03_aggregate_regions/regional_DGVM_cVeg.csv',index_col=[0,1])
# column labels are years; store them as integers
trendy_nbp_regional.columns = trendy_nbp_regional.columns.astype(int)
trendy_cVeg_regional.columns = trendy_cVeg_regional.columns.astype(int)

# calculate the time difference and select the time period between 1993-2019
# diff(axis=1) turns the cVeg columns into year-to-year changes (the first column becomes NaN)
trendy_cVeg_regional = trendy_cVeg_regional.diff(axis=1)
trendy_cVeg_regional = trendy_cVeg_regional.loc[:,start_year+1:2019]
trendy_nbp_regional = trendy_nbp_regional.loc[:,start_year+1:2019]

In [7]:
# load global estimates from the models
# the tables are transposed and multiplied by 1e15 (presumably PgC -> gC for cVeg — TODO confirm input units)
trendy_global_nbp = pyreadr.read_r('../data/DGVMs/nbp_S3.RData')['nbp_S3'].T*1e15
trendy_global_cVeg = pyreadr.read_r('../data/DGVMs/cVeg_S3.RData')['cVeg_S3'].T*1e15

# convert units from kgC per second to PgC per year
# NOTE(review): combined with the *1e15 applied on load, the net factor is 3600*24*365*1e3,
# i.e. kgC per second -> gC per year rather than PgC per year — confirm the intended unit
trendy_global_nbp = trendy_global_nbp*(3600*24*365*1e3/1e15)

# set model names and years from the regional data
# assumes the rows of the global tables are in the same model order as the regional table — TODO confirm
trendy_global_nbp.index = trendy_nbp_regional[start_year+1].unstack().columns
trendy_global_nbp.columns = np.arange(1901,2021)
trendy_global_cVeg.index = trendy_nbp_regional[start_year+1].unstack().columns


# drop out missing data (removes every model row that has at least one missing value)
trendy_global_cVeg = trendy_global_cVeg.dropna()

# take time difference for cVeg data
# the first column becomes NaN after diff and is removed by dropna(axis=1)
trendy_global_cVeg = trendy_global_cVeg.diff(axis=1)
trendy_global_cVeg = trendy_global_cVeg.dropna(axis=1)

# set the columns of the cVeg data (one column fewer than nbp because of the diff)
trendy_global_cVeg.columns = np.arange(1902,2021)

# take only the times in the study period
trendy_global_nbp = trendy_global_nbp.loc[:,start_year+1:2019]
trendy_global_cVeg = trendy_global_cVeg.loc[:,start_year+1:2019]

## 3. Define functions for analysis

In [8]:
# define time bins
# one row per period; the single 'year' column holds the array of years that belong to that period
time_bins = pd.DataFrame(pd.Series([np.arange(start_year+1,2001),np.arange(2001,2011),np.arange(2011,2020)]),columns=['year'])

# create time bin names of the form "<first year>-<last year>", e.g. "2001-2010"
time_bin_names = time_bins.apply(lambda x: '-'.join([str(x.iloc[0].min()),str(x.iloc[0].max())]),axis=1)

# set the index to be the names
time_bins.index = time_bin_names

# calculate the frequency of each time bin (number of years in the bin divided by the total number of years)
year_bin_freq = (time_bins.apply(lambda x: len(x['year']),axis=1)/time_bins.apply(lambda x: len(x['year']),axis=1).sum())

In [9]:
def calc_regional_stat(df:pd.DataFrame, year_bins:list, stat:str,grouping_vars=['inversion','region']) -> pd.DataFrame:
    '''
    Bin the yearly columns of `df` into periods and summarise each period across group members.

    The yearly values are first averaged within every period, separately for each
    combination of `grouping_vars` (e.g. each inversion in each region). These
    per-member period means are then aggregated with `stat` over the last grouping
    variable (e.g. per region, across inversions).

    Parameters:
    df (pd.DataFrame): data with years as column labels and `grouping_vars` as index level names
    year_bins (list): edges of the year bins; bin i covers the years year_bins[i]+1 ... year_bins[i+1]
    stat (str): statistic to calculate across members (mean or std)
    grouping_vars (list): names of the index levels to group by

    Returns:
    pd.DataFrame: one row per value of the last grouping variable and one column per period
    '''

    # readable labels such as "1993-2000", one per pair of consecutive bin edges
    labels = [f'{lower+1}-{upper}' for lower, upper in zip(year_bins[:-1], year_bins[1:])]

    # only the mean and the standard deviation are supported
    assert stat in ['mean','std'], 'stat must be either mean or std'

    # work on a copy so that relabelling the columns does not touch the caller's frame
    binned = df.copy()

    # replace every year label by the interval (bin) it falls into
    binned.columns = pd.cut(binned.columns,bins=year_bins)
    binned.columns.name = 'period'

    # per member (full grouping) and period: mean value and number of years with data
    per_member = binned.stack().groupby(grouping_vars+['period'],observed=False).agg(['mean','count'])

    # a member needs more than one yearly value in a period to be kept for that period
    per_member = per_member.loc[per_member['count']>1]

    # the final aggregation runs over the last grouping variable only
    outer_keys = grouping_vars[-1:]

    # aggregate the member means per period and put the periods in the columns
    regional = per_member.groupby(outer_keys + ['period'],observed=False)['mean'].agg(stat).unstack()

    # swap the interval labels for the readable period names
    regional.columns = labels

    return regional


### 3.1. Total carbon stocks

In [10]:
def calc_land_std(land_df,start:int, end:int, ocean_random:float, ocean_sys:float,F_lat_std=F_lat_std) -> float:
    '''
    Function to calculate the uncertainty of the mean net land sink for a given time period.

    The period means of the fossil fuel, atmospheric growth rate and cement
    uncertainties, the systematic ocean uncertainty and the lateral-flux
    uncertainty enter unchanged. Only the random ocean uncertainty is reduced by
    averaging over the period. All components are combined in quadrature.

    Parameters:
    land_df (pd.DataFrame): land sink data, indexed by year, with the columns 'FF_std', 'AGR_std' and 'cement_std'
    start (int): start year of the time period
    end (int): end year of the time period
    ocean_random (float): random uncertainty of the ocean sink for a single year
    ocean_sys (float): systematic uncertainty of the ocean sink
    F_lat_std (float): uncertainty of the anthropogenic perturbation of the lateral flux from land to ocean

    Returns:
    float: uncertainty of the net land sink
    '''

    # define component stds (mean of the yearly values over the period, start and end inclusive)
    FF_std = land_df.loc[start:end,'FF_std'].mean()
    AGR_std = land_df.loc[start:end,'AGR_std'].mean()
    cement_std = land_df.loc[start:end,'cement_std'].mean()

    # Independent yearly errors average down with the square root of the number of
    # years (standard error of the mean), not with the number of years itself.
    # The period length keeps the original end-start convention; it is floored at 1 so
    # that a single-year window returns the un-averaged error instead of dividing by zero.
    # NOTE(review): callers pass either the first averaged year or the baseline year as
    # `start`, so end-start can be one less than the number of averaged years — confirm.
    n_years = max(end-start,1)
    ocean_random_mean = ocean_random/np.sqrt(n_years)

    component_std = np.array([FF_std,AGR_std,cement_std,ocean_random_mean,ocean_sys,F_lat_std])

    # propagate uncertainties (root of the sum of squares)
    land_sink_std = np.sqrt((component_std**2).sum())

    return land_sink_std


### 3.2. Living biomass

In [11]:
def create_dist(df:pd.DataFrame,time_bin:np.array,bin_vars:list,n_samples=1000,stat='mean') -> list:
    """
    Draw random samples of the data for one time bin.

    The data for the years in `time_bin` are summarised (mean over the years, or
    cumulative sum along the years) and reshaped so that `bin_vars` become columns.
    Each sample is one randomly chosen row: first one row is drawn per 'source',
    then one of those rows is drawn. Missing values in a sample are filled from
    further random draws until no missing value is left.

    Parameters:
    df: pd.DataFrame
        The data frame with the data (years as columns; the index must contain a 'source' level and `bin_vars`)
    time_bin: np.array
        The years of the time bin to sample from
    bin_vars: list
        The index levels to use as columns
    n_samples: int
        The number of samples to create
    stat: str
        The statistic to calculate in each sample (mean or cumsum)

    Returns:
    list
        A list with the samples (one pd.Series per sample)
    """

    # only the two supported statistics are accepted
    assert stat in ['mean','cumsum'], 'stat must be either mean or cumsum'

    # keep only the columns (years) that belong to the time bin
    in_bin = df.loc[:,df.columns.isin(time_bin)]

    if stat == 'mean':
        # mean over the years, drop rows without data and move bin_vars to the columns
        wide = in_bin.mean(axis=1).dropna().unstack(bin_vars)
    else:
        # cumulative sum along the years; rows whose cumulative values sum to zero are dropped
        cumulative = in_bin.cumsum(axis=1)
        wide = cumulative[cumulative.sum(axis=1)!=0].unstack(bin_vars)

    def draw():
        # one random row per source, then one random row among those
        return wide.groupby('source').sample(1).sample(1).T.iloc[:,0]

    samples = []
    for _ in range(n_samples):
        current = draw()

        # keep filling the gaps from fresh draws until the sample has no missing values
        while current.isna().any():
            pair = pd.concat([current,draw()],axis=1,keys=['first_sample','second_sample'])
            current = pair['first_sample'].fillna(pair['second_sample'])

        samples.append(current)

    return samples


def sort_df(df:pd.DataFrame)->pd.DataFrame:
    """
    Reorder the columns of a data frame from the largest to the smallest column sum.

    The reordered columns are relabelled 0, 1, 2, ... so that the position of a
    column is its rank. The input frame is left unchanged.

    Parameters:
    df: pd.DataFrame
        The data frame to sort

    Returns:
    pd.DataFrame
        The sorted data frame
    """

    # column labels ordered by descending column total
    column_totals = df.sum()
    ranked_labels = column_totals.sort_values(ascending=False).index

    # pick the columns in that order and relabel them by rank
    ranked = df.loc[:,ranked_labels]
    ranked.columns = range(len(df.columns))
    return ranked


In [12]:
def generate_std_df(trajectories:pd.DataFrame,year_bin_freq:pd.Series) -> pd.DataFrame:
    '''
    Function to calculate the regional and global uncertainty for different time periods from the given random samples.

    Random normal noise with a coefficient of variation of 0.3 is added to the summed
    samples (drawn from numpy's global random state, so results differ between runs
    unless the caller seeds it). The whole-period row is labelled with the
    module-level `start_year` (f'{start_year+1}-2019'), independent of the bins passed in.

    Parameters:
    trajectories (pd.DataFrame): the random samples (one column per sample; the index has the levels 'time_bin' and 'region')
    year_bin_freq (pd.Series): the frequency of each time bin

    Returns:
    pd.DataFrame: the standard deviation of the data for the different time periods (rows) and regions plus 'Global' (columns)
    '''

    # sum over the landcovers to get regional trajectories
    trajectories_regional = trajectories.groupby(['time_bin','region']).sum()

    # add random noise to the data with a certain coefficient of variation
    cv = 0.3 # typical number from the literature
    trajectories_regional = trajectories_regional + np.random.normal(0,cv*trajectories_regional.abs())

    # calculate the standard deviation between samples as a measure of uncertainty
    std_df = trajectories_regional.std(axis=1).unstack()

    # for the entire period, calculate the overall uncertainty assuming no correlation between time periods
    # (square root of the sum of the period variances weighted by the squared bin frequencies)
    std_df.loc[f'{start_year+1}-2019'] = np.sqrt(std_df.T**2 @ year_bin_freq**2)

    # Global
    # sum over all regions and landcovers to get global trajectories
    trajectories_global = trajectories.groupby('time_bin').sum()

    # add random noise to the data with a certain coefficient of variation
    cv = 0.3 # typical number from the literature
    trajectories_global = trajectories_global + np.random.normal(0,cv*trajectories_global.abs())

    # calculate the standard deviation between samples as a measure of uncertainty
    global_std = trajectories_global.std(axis=1)

    # for the entire period, take the standard deviation across samples of the frequency-weighted mean rate
    # (unlike the regional case above, this keeps whatever correlation between periods the samples carry)
    global_std.loc[f'{start_year+1}-2019'] = (trajectories_global.T @ year_bin_freq).std()

    # combine the regional and global estimates
    std_df['Global'] = global_std

    return std_df


### 3.3. DGVMs

In [13]:
def calc_global(df:pd.DataFrame, time_bins_edges:list) -> pd.DataFrame:
    '''
    Function to calculate the global mean and standard deviation for different time periods of DGVM data.

    For each period the yearly values are averaged per row of `df` (per model),
    and the mean and standard deviation of these averages across rows are stored.
    A final row holds the same statistics for the average over all columns of `df`;
    its label is derived from the first and last bin edge (e.g. '1993-2019' for
    edges starting at 1992 and ending at 2019).

    Parameters:
    df (pd.DataFrame): the data, one row per model and one column per year
    time_bins_edges (list): the edges of the time bins; bin i covers edges[i]+1 ... edges[i+1]

    Returns:
    pd.DataFrame: the global mean and standard deviation (columns 'mean' and 'std') for the different time periods (rows)
    '''

    edges = list(time_bins_edges)

    # statistics across models of the per-model period mean, keyed by period name
    period_stats = {}
    for start,end in zip(edges[:-1],edges[1:]):
        period_stats[f'{start+1}-{end}'] = df.loc[:,start+1:end].mean(axis=1).agg(['mean','std'])

    # whole period: average over every column of df, labelled from the outer bin edges
    period_stats[f'{edges[0]+1}-{edges[-1]}'] = df.mean(axis=1).agg(['mean','std'])

    # periods in the rows, 'mean' and 'std' in the columns
    global_df = pd.DataFrame(period_stats).T[['mean','std']]
    return global_df

def analyze_DGVM(df:pd.DataFrame, global_df:pd.DataFrame, time_bins_edges:list, stat:str) -> pd.DataFrame:
    '''
    Summarise DGVM output per region and for the globe over several time periods.

    Regional period statistics come from `calc_regional_stat` (grouped by model and
    region). A whole-period column, labelled with the module-level `start_year`, and a
    'Global' row taken from `calc_global` are added.

    Parameters:
    df (pd.DataFrame): the regional data (index levels 'model' and 'region', years as columns)
    global_df (pd.DataFrame): the global data
    time_bins_edges (list): the edges of the time bins
    stat (str): the statistic to calculate (mean or std)

    Returns:
    pd.DataFrame: the regional and global statistics for the different time periods
    '''

    # only the mean and the standard deviation are supported
    assert stat in ['mean','std'], 'stat must be either mean or std'

    # per region and period: the statistic across models
    regional = calc_regional_stat(df,time_bins_edges,stat,['model','region'])

    # whole period: average all years per row, then take the statistic across rows of the same region
    whole_period_by_region = df.mean(axis=1).groupby('region').agg(['mean','std'])
    regional[f'{start_year+1}-2019'] = whole_period_by_region[stat]

    # global row: the same statistic from the global model data
    global_stats = calc_global(global_df,time_bins_edges)
    regional.loc['Global',:] =  global_stats[stat]

    return regional



## 4. Run analysis

### 4.1. Observation-based

#### 4.1.1. Total carbon stocks

##### 4.1.1.1. Global estimate

The net land flux is calculated according to the following formula:

$$ F_{land} = E_{FOS} - S_{O} - AGR - S_{cement} $$

Where $F_{land}$ is the net land sink, $E_{FOS}$ is the emissions from fossil fuels, $S_{O}$ is the sink from the ocean, $AGR$ is the atmospheric growth rate, and $S_{cement}$ is the sink from cement carbonation.

To calculate from the net land sink the change in total carbon stocks on land, we remove an estimate for the anthropogenic perturbation of the lateral fluxes between land and the ocean of ≈0.05 GtC yr<sup>-1</sup> based on [Regnier et al. (2022)](https://www.nature.com/articles/s41586-021-04339-9).

In [14]:
# calculate the net land sink (fossil emissions minus atmospheric growth, ocean sink and cement carbonation sink)
GCB_data['land_sink'] = GCB_data['FF'] - GCB_data['AGR'] - GCB_data['ocean'] - GCB_data['cement']

# calculate the mean net land sink for the different time bins
global_land = time_bins.apply(lambda x: GCB_data.loc[x['year'],'land_sink'].mean(),axis=1)
global_land.index = time_bin_names
# add the mean over the whole study period as an extra entry
global_land.loc[f'{start_year+1}-2019'] = GCB_data.loc[start_year+1:2019,'land_sink'].mean()

# remove 0.05 anthropogenic perturbation of the land to ocean flux (F'_EC) from Regnier et al. 2022
global_land = global_land - F_lat

To calculate the uncertainty of $F_{land}$ at each year we need to combine the uncertainties of each component based on estimates from the Global Carbon Budget report ([Friedlingstein et al. 2023](https://essd.copernicus.org/articles/15/5301/2023/)), assuming that the uncertainties reported represent systematic errors.

For the ocean sink, the overall uncertainty is ≈0.4 GtC yr<sup>-1</sup>, but it is composed of random and systematic errors. The random part is reduced when averaging over several years, whereas the systematic part is not.

The ocean sink is calculated using two methods - Global Ocean Biogeochemical Models (GOBMs) and data products. 
The random error for both methods is 0.3 GtC yr<sup>-1</sup> each. 
The systematic uncertainty of GOBMs is 0.4 GtC yr<sup>-1</sup>. 
By propagating uncertainties from observations (0.2 GtC yr<sup>-1</sup>), 
gas-transfer velocity (0.2 GtC yr<sup>-1</sup>), 
wind product (0.1 GtC yr<sup>-1</sup>), 
river flux adjustment (0.3 GtC yr<sup>-1</sup>), 
and $fCO_2$ mapping (0.2 GtC yr<sup>-1</sup>), 
we calculate the systematic uncertainty of the data-products to be $\sqrt{0.2^2 + 0.2^2 + 0.1^2 + 0.3^2 + 0.2^2}$ ≈ 0.5 GtC yr<sup>-1</sup>. 
Using these estimates, we can calculate the random and systematic components of the global ocean sink estimate:

- random error: $\sqrt{0.5^2 * 0.3^2+0.5^2 * 0.3^2}$ = 0.21
- systematic error: $\sqrt{0.5^2 * 0.4^2 + 0.5^2 * 0.47^2}$ = 0.32

In [15]:
# calculate random and systematic ocean uncertainty
# NOTE(review): the text above quotes 0.3 for the random error of both methods and ≈0.47 for the
# systematic error of the data products, while 0.31e15 and 0.5e15 are used here — confirm the intended values
GOBM_random = 0.3e15
GOBM_sys = 0.4e15
data_prod_random = 0.31e15
data_prod_sys = 0.5e15
# the ocean sink is treated as the average of the two methods, so each variance is weighted by 0.5**2
ocean_random = np.sqrt(GOBM_random**2*0.5**2 + data_prod_random**2*0.5**2)
ocean_sys = np.sqrt(GOBM_sys**2*0.5**2 + data_prod_sys**2*0.5**2)

# calculate the uncertainty of the net land sink for different time periods
# (each period is passed as its first and last year)
global_land_std = time_bins.apply(lambda x: calc_land_std(GCB_data,x['year'].min(),x['year'].max(),ocean_random,ocean_sys),axis=1)
global_land_std.index = time_bin_names
global_land_std.loc[f'{start_year+1}-2019'] = calc_land_std(GCB_data,start_year+1,2019,ocean_random,ocean_sys)

In [16]:
# Generate annual estimates

# calculate mean
# cumulative sum of the yearly land sink after removing the lateral-flux perturbation
delta_C_obs_annual_cum = (GCB_data.loc[start_year+1:2019,'land_sink']-F_lat).cumsum() 
# the baseline year is added with a cumulative change of zero and moved to the front by sorting
delta_C_obs_annual_cum[start_year] = 0
delta_C_obs_annual_cum.sort_index(inplace=True)

# calculate uncertainty
# entry 0 (the baseline year) stays zero; entry i+1 belongs to the year e
delta_C_obs_annual_std_cum = np.zeros(GCB_data.loc[start_year:].shape[0])
for i,e in enumerate(GCB_data.loc[start_year+1:].index):
    # uncertainty of the mean rate between the baseline year and e, times the number of years, gives the cumulative uncertainty
    delta_C_obs_annual_std_cum[i+1] = calc_land_std(GCB_data,start_year,e,ocean_random,ocean_sys)*(e-start_year)

delta_C_obs_annual_std_cum = pd.Series(delta_C_obs_annual_std_cum,index=GCB_data.loc[start_year:].index)

##### 4.1.1.2. Regional estimate

For the mean regional total carbon stock change, we rely on the mean of the 14 different inversions we use. We bin the data in 4 time bins to keep the coverage of different inversions consistent across each time bin. 

We remove from the vertical exchange between land and the atmosphere the lateral fluxes between each region based on estimates of lateral fluxes into the ocean and lateral trade across RECCAP regions.

In [17]:
# bin edges: the four bins cover 1993-2000, 2001-2010, 2011-2014 and 2015-2019
year_bins = np.array([start_year,2000,2010,2014,2019])

# calculate the mean regional NEE
NEE_regional_mean = calc_regional_stat(NEE,year_bins,'mean')

# calculate the time length of each year bin
bin_length = year_bins[1:]-year_bins[:-1]

# calculate the regional mean change in total carbon stocks by removing from NEE lateral fluxes
# the sign is flipped after adding the lateral flux; regions with a missing value in any period are dropped
nbp_regional_mean = -NEE_regional_mean.add(F_lateral,axis=0).dropna()

For calculating the uncertainty of the regional changes in the total carbon stocks on land, we calculate three different uncertainty components and propagate their respective uncertainty to the final estimate:

1. The variability between the estimates of different inversions on the average rate of exchange of CO<sub>2</sub> between the atmosphere and the land surface. We calculate the standard deviation of the mean of the 14 inversions for each region and time bin.

2. The uncertainty associated with lateral fluxes - we rely on the uncertainty associated with land to ocean fluxes from the Global Carbon Budget [Friedlingstein et al. (2023)](https://essd.copernicus.org/articles/15/5301/2023/) of ≈0.3 GtC yr<sup>-1</sup> and the uncertainty associated with lateral trade from [Ciais et al. 2021](https://academic.oup.com/nsr/article/8/2/nwaa145/5868251).

3. The uncertainty of regional fossil fuel emissions, which contributes to the uncertainty of each individual inversion.

In [18]:
# 1. calculate the spread (standard deviation) between inversions of the regional NEE
NEE_regional_std = calc_regional_stat(NEE,year_bins,'std')

# 2. take land to ocean uncertainty from the Global Carbon Budget
river_std = 0.3e15

# scale the uncertainty of the lateral fluxes to the total river flux
# (each region gets a share proportional to its share of the total river flux)
river_std = river_data/river_data.sum()*river_std

# calculate the uncertainty of the lateral fluxes
# regions without a trade estimate are given zero trade uncertainty, mirroring how the
# mean lateral flux treats a missing trade flux as zero; without the reindex those
# regions would end up with a NaN uncertainty
lat_std = np.sqrt(river_std**2 + trade_std.reindex(river_std.index,fill_value=0)**2)

# 3. calculate the uncertainty of the fossil fuel flux assuming 5% uncertainty
FF_std = calc_regional_stat(FF,year_bins,'mean')*0.05

# calculate the overall uncertainty of the carbon stock change (components combined in quadrature)
nbp_regional_std = np.sqrt((NEE_regional_std**2 + FF_std**2).add(lat_std**2,axis=0))

##### 4.1.1.3. Combine global and regional data

In [19]:
# mean
# take the regional estimates for 1993-2000 and 2001-2010
# (transposed so that periods are rows and regions are columns while the table is assembled)
delta_C_obs = nbp_regional_mean.iloc[:,:2].T
delta_C_obs.index = time_bin_names[:2]

# for the 2011-2019 period, calculate the mean regional change by the weighted average of the rates for 2011-2014 and 2015-2019
delta_C_obs.loc['2011-2019'] = nbp_regional_mean.iloc[:,2:] @ (bin_length[2:]/sum(bin_length[2:]))

# for the 1993-2019 period, calculate the mean regional change by the weighted average of all periods
delta_C_obs.loc[f'{start_year+1}-2019'] = nbp_regional_mean @ (bin_length/sum(bin_length))

# set the global estimates from the result of the global analysis
delta_C_obs['Global'] = global_land

#std
# take the regional estimates for 1993-2000 and 2001-2010
delta_C_obs_std = nbp_regional_std.iloc[:,:2].T
delta_C_obs_std.index = time_bin_names[:2]

# for the 2011-2019 period, propagate uncertainties assuming periods are uncorrelated by using the weighted average of the variances for 2011-2014 and 2015-2019
# (variances are weighted by the squared share of years in each sub-period)
delta_C_obs_std.loc['2011-2019'] = np.sqrt(nbp_regional_std.iloc[:,2:]**2 @ (bin_length[2:]/sum(bin_length[2:]))**2)

# for the 1993-2019 period, propagate uncertainties assuming periods are uncorrelated by using the weighted average of the variances for all periods
delta_C_obs_std.loc[f'{start_year+1}-2019'] = np.sqrt(nbp_regional_std**2 @ (bin_length/sum(bin_length))**2)

# set the global uncertainty estimates from the result of the global analysis
delta_C_obs_std['Global'] = global_land_std

# transpose data to make it consistent with other datasets (regions as rows, periods as columns)
delta_C_obs = delta_C_obs.T
delta_C_obs_std = delta_C_obs_std.T

#### 4.1.2. Living biomass

##### 4.1.2.1. Mean estimate

###### 4.1.2.1.1. Annual

In [20]:
# calculate annual level data by:
# 1. grouping over source, region and landcover and taking the mean (mean across all methods in each source)
# 2. grouping over region and landcover and taking the mean (mean across all sources in each region)
# 3. grouping over region and taking the sum (sum across all landcovers in each region)
# finally only the years of the study period (start_year+1 to 2019) are kept
delta_B_obs_annual_mean = biomass_data_obs.groupby(['source','region','landcover']).mean()\
                                            .groupby(['region','landcover']).mean()\
                                            .groupby(['region']).sum()\
                                            .loc[:,start_year+1:2019]
# set columns name to time
delta_B_obs_annual_mean.columns.name = 'time'

###### 4.1.2.1.2. Decadal

In [21]:
# set time bins
time_bins_edges = [start_year,2000,2010,2019]

# average the annual regional means within each time bin (grouping by region only)
delta_B_obs = calc_regional_stat(delta_B_obs_annual_mean,time_bins_edges,'mean',['region'])

# for the mean across the entire study period (1993-2019), just take the mean of the annual mean
delta_B_obs.loc[:,'-'.join([str(time_bins_edges[0]+1),str(time_bins_edges[-1])])] = delta_B_obs_annual_mean.mean(axis=1)

# for the global estimate, sum over all regions
delta_B_obs.loc['Global',:] = delta_B_obs.sum()

##### 4.1.2.2. Uncertainty

###### 4.1.2.2.1. Uncorrelated

In [22]:
# the basic variables that each sample should have are region and landcover
bin_vars=['region','landcover']

# define the number of samples to take
n = 10_000

# generate for each time period N random samples of the data
# (one parallel job per time bin; no random seed is set here, so results vary between runs)
trajectories = Parallel(n_jobs=-1)(delayed(create_dist)(biomass_data_obs,x['year'],bin_vars=bin_vars,n_samples=n) for i,x in time_bins.iterrows())

# convert the result into a dataframe (samples as columns 0..n-1, time bins stacked along the rows)
trajectories = pd.concat([pd.concat(i,axis=1,keys=range(n)) for i in trajectories],keys=time_bins.index)

# set the index names
trajectories.index.names=['time_bin'] + bin_vars

# use the trajectories to calculate the uncertainty of the data
# (transposed so that regions are rows and periods are columns)
delta_B_obs_std = generate_std_df(trajectories,year_bin_freq).T

###### 4.1.2.2.2. Correlated

In [23]:
# sort the trajectories so that they are fully correlated in time
# (within every time bin the samples are ranked by their total, so sample k is the k-th largest in each bin)
# group_keys=False keeps the original ['time_bin','region','landcover'] index: from pandas 2.0 the default
# prepends the group key as an extra 'time_bin' level, which makes the later groupby on 'time_bin' ambiguous
trajectories_sorted = trajectories.groupby('time_bin',group_keys=False).apply(sort_df)

# use the trajectories to calculate the uncertainty of the data
delta_B_obs_std_cor = generate_std_df(trajectories_sorted,year_bin_freq).T

###### 4.1.2.2.3. Annual estimates

In [24]:
# calculate the mean cumulative sum of the annual rate of change
# (annual rates are first summed over all regions, then accumulated over the years)
delta_B_obs_annual_cum = np.cumsum(delta_B_obs_annual_mean.sum())
# the baseline year is added with a cumulative change of zero and moved to the front by sorting
delta_B_obs_annual_cum.loc[start_year] = 0
delta_B_obs_annual_cum.sort_index(inplace=True)

In [None]:

# the basic variables that each sample should have are region and landcover
bin_vars=['region','landcover']

# define the number of samples to take
n = 10_000

# generate for each time period N random samples of the data
# (stat='cumsum' makes every sample a cumulative series that restarts at the beginning of its time bin)
trajectories_cum = Parallel(n_jobs=-1)(delayed(create_dist)(biomass_data_obs,x['year'],bin_vars=bin_vars,n_samples=n,stat='cumsum') for i,x in time_bins.iterrows())

# convert the result into a dataframe
trajectories_cum = pd.concat([pd.concat(i,axis=1,keys=range(n)) for i in trajectories_cum],keys=time_bins.index)

# set the index names
trajectories_cum.index.names = ['time_bin','year'] + bin_vars

# sum the samples for each year to get global samples
trajectories_cum = trajectories_cum.groupby('year').sum()

# for the second and third time periods, add the cumulative biomass of the previous period to beginning of the current period
# (i counts from 0 while the loop starts at the second bin, so time_bins.iloc[i] is always the preceding bin)
for i,(period,row) in enumerate(time_bins[1:].iterrows()):
    sy = row['year'][0]
    ey = row['year'][-1]
    previous_ey = time_bins.iloc[i]['year'][-1]
    # shift every year of the current bin by the value reached at the end of the preceding bin, sample by sample
    trajectories_cum.loc[sy:ey] = trajectories_cum.loc[sy:ey] + trajectories_cum.loc[previous_ey]

# the uncertainty for each year is the standard deviation across the samples
delta_B_obs_annual_std_cum = trajectories_cum.std(axis=1)


##### 4.1.2.3. Different starting years

In [29]:

# the basic variables that each sample should have are region and landcover
bin_vars=['region','landcover']

# define the number of samples to take
n = 10_000

# define the results
results = []

# for each candidate starting year (1993 to 2010)
for sy in np.arange(1993,2011):
    
    # define time bins
    # the first bin starts at sy; a start year after 2000 leaves only two bins
    if sy < 2001:
        tb = pd.DataFrame(pd.Series([np.arange(sy,2001),np.arange(2001,2011),np.arange(2011,2020)]),columns=['year'])
    elif sy < 2011:
        tb = pd.DataFrame(pd.Series([np.arange(sy,2011),np.arange(2011,2020)]),columns=['year'])

    # create time bin names
    tb_names = tb.apply(lambda x: '-'.join([str(x.iloc[0].min()),str(x.iloc[0].max())]),axis=1)

    # set the index to be the names
    tb.index = tb_names

    # calculate the frequency of each time bin
    yb_freq = (tb.apply(lambda x: len(x['year']),axis=1)/tb.apply(lambda x: len(x['year']),axis=1).sum())

    # generate for each time period N random samples of the data
    tjs = Parallel(n_jobs=-1)(delayed(create_dist)(biomass_data_obs,x['year'],bin_vars=bin_vars,n_samples=n) for i,x in tb.iterrows())

    # convert the result into a dataframe
    tjs = pd.concat([pd.concat(i,axis=1,keys=range(n)) for i in tjs],keys=tb.index)

    # set the index names
    tjs.index.names=['time_bin'] + bin_vars

    # append the results
    results.append(generate_std_df(tjs,yb_freq).T)

# calculate the mean and std of the global estimates
# `res` is only a scaffold: its index supplies the start year (x.name) to the apply below
res = pd.DataFrame(np.nan,index=np.arange(1993,2011),columns=['mean_rate','std'])
different_starts = pd.DataFrame(index=np.arange(1993,2011),columns=['mean','std'])
# generate_std_df labels the whole-period entry f'{start_year+1}-2019' whatever the start year of the bins,
# so with start_year = 1992 the key '1993-2019' holds the whole-period value for every sy
different_starts['std'] = [r.loc['Global','1993-2019'] for r in results]
# mean global rate from each start year to the end of the annual series
different_starts['mean'] = (res.apply(lambda x: delta_B_obs_annual_mean.sum().loc[x.name:].mean(),axis=1))

different_starts.to_csv('../results/06_make_estimates/delta_B_obs_different_starts.csv')



In [30]:
# calculate the delta_C for different starting years
# For every start year the mean rate and its uncertainty are computed over the same
# years (start year to 2019). The uncertainty previously started one year later than
# the mean, unlike the whole-period global estimate, which uses the same first year for both.
start_years = np.arange(1993,2011)
delta_C_different_start = pd.DataFrame(
    {
        # mean net land sink over the period, minus the lateral-flux perturbation
        'mean': [GCB_data['land_sink'].loc[sy:2019].mean()-F_lat for sy in start_years],
        # uncertainty of the mean net land sink over the same period
        'std': [calc_land_std(GCB_data,sy,2019,ocean_random,ocean_sys) for sy in start_years],
    },
    index=start_years,
)

delta_C_different_start.to_csv('../results/06_make_estimates/delta_C_obs_different_starts.csv')

#### 4.1.3. Infer changes in non-living carbon

In [31]:
# calculate the mean estimate (non-living carbon = total carbon stock change minus living biomass change)
delta_OC_obs = delta_C_obs-delta_B_obs

# calculate the uncertainty estimate by propagating the uncertainties of the total carbon stock and living biomass estimates
# (combined in quadrature, once with the uncorrelated and once with the correlated biomass uncertainty)
delta_OC_obs_std = np.sqrt(delta_C_obs_std**2+delta_B_obs_std**2)
delta_OC_obs_std_cor = np.sqrt(delta_C_obs_std**2+delta_B_obs_std_cor**2)


#### 4.1.4. Save estimates

In [32]:
# stack the mean estimates of the three pools into one long series keyed by pool
obs_mean = pd.concat([delta_B_obs.stack(),delta_OC_obs.stack(),delta_C_obs.stack()],keys=['delta_B','delta_OC','delta_C'])

# method 1 uncorrelated uncertainty
obs_std = pd.concat([delta_B_obs_std.stack(),delta_OC_obs_std.stack(),delta_C_obs_std.stack()],keys=['delta_B','delta_OC','delta_C'])
obs_periods = pd.concat([obs_mean,obs_std],keys=['mean','std'])
obs_periods.index.names = ['stat','pool','time','period']
obs_periods.to_csv('../results/06_make_estimates/obs_periods.csv')

# method 2 correlated uncertainty
# the delta_OC entry must use the correlated non-living carbon uncertainty
# (it previously repeated the living biomass uncertainty)
obs_std_cor = pd.concat([delta_B_obs_std_cor.stack(),delta_OC_obs_std_cor.stack(),delta_C_obs_std.stack()],keys=['delta_B','delta_OC','delta_C'])
obs_periods_cor = pd.concat([obs_mean,obs_std_cor],keys=['mean','std'])
obs_periods_cor.index.names = ['stat','pool','time','period']
obs_periods_cor.to_csv('../results/06_make_estimates/obs_periods_cor.csv')

In [51]:
# annual estimates
# stack the cumulative means and their uncertainties for living biomass and total carbon,
# then combine both into one long series keyed by statistic, pool and year
obs_annual_cum = pd.concat([delta_B_obs_annual_cum,delta_C_obs_annual_cum],keys=['delta_B','delta_C'])
obs_annual_cum_std = pd.concat([delta_B_obs_annual_std_cum,delta_C_obs_annual_std_cum],keys=['delta_B','delta_C'])
obs_annual_cum = pd.concat([obs_annual_cum,obs_annual_cum_std],keys=['mean','std'])
obs_annual_cum.index.names = ['stat','pool','time']
obs_annual_cum.to_csv('../results/06_make_estimates/obs_annual_cum.csv')

### 4.2. DGVMs

#### 4.2.1. Annual

In [31]:
def calc_cumsum(df):
    """
    Cumulative change since the baseline year, summarised across the rows of `df`.

    A column for the module-level `start_year` is added with the value zero, the
    columns are put in ascending order and accumulated along the years. The result
    has one row per year with the columns 'mean' and 'std' taken across the rows
    (models) of `df`. The input frame is not modified.
    """
    # work on a copy and add the baseline year with a value of zero
    with_baseline = df.copy()
    with_baseline[start_year] = 0 

    # order the years so that the baseline comes first, then accumulate along the years
    ordered_years = with_baseline.columns.sort_values()
    cumulative = with_baseline[ordered_years].cumsum(axis=1)

    # mean and standard deviation across the rows, with the years as rows of the result
    return cumulative.agg(['mean','std']).T

# cumulative statistics across models, computed once per variable
cVeg_cumulative_stats = calc_cumsum(trendy_global_cVeg)
nbp_cumulative_stats = calc_cumsum(trendy_global_nbp)

# mean of the cumulative changes
delta_B_DGVMs_annual_cum = cVeg_cumulative_stats['mean']
delta_C_DGVMs_annual_cum = nbp_cumulative_stats['mean']

# standard deviation of the cumulative changes
delta_B_DGVMs_annual_std_cum = cVeg_cumulative_stats['std']
delta_C_DGVMs_annual_std_cum = nbp_cumulative_stats['std']

#### 4.2.2. Decadal

In [32]:
# bin edges: the three bins cover 1993-2000, 2001-2010 and 2011-2019
time_bins_edges = [start_year,2000,2010,2019]

# calculate mean for total carbon stocks and living biomass
delta_B_DGVMs = analyze_DGVM(trendy_cVeg_regional,trendy_global_cVeg,time_bins_edges,'mean')
delta_C_DGVMs = analyze_DGVM(trendy_nbp_regional,trendy_global_nbp,time_bins_edges,'mean')

# calculate std for total carbon stocks and living biomass
delta_B_DGVMs_std = analyze_DGVM(trendy_cVeg_regional,trendy_global_cVeg,time_bins_edges,'std')
delta_C_DGVMs_std = analyze_DGVM(trendy_nbp_regional,trendy_global_nbp,time_bins_edges,'std')


# define the regional and global non-living carbon data
# (nbp minus the change in cVeg, per model; models with missing values are dropped from the global table)
trendy_OC_global = (trendy_global_nbp-trendy_global_cVeg).dropna()
trendy_OC_regional = trendy_nbp_regional-trendy_cVeg_regional

# remove models with missing data
# (keeps only the rows whose second index level is a model that survived in the global table)
trendy_OC_regional = trendy_OC_regional.loc[pd.IndexSlice[:,trendy_OC_global.index],:]

# calculate the mean for non-living organic carbon
delta_OC_DGVMs = delta_C_DGVMs-delta_B_DGVMs

# calculate the std for non-living organic carbon
# (taken from the per-model differences rather than by combining the two std tables)
delta_OC_DGVMs_std = analyze_DGVM(trendy_OC_regional,trendy_OC_global,time_bins_edges,'std')

#### 4.2.3. Single model data

In [33]:
# concatenate the cVeg and nbp data
single_model_data = pd.concat([trendy_global_cVeg,trendy_global_nbp],keys=['delta_B','delta_C'],names=['pool'])

# copy the DataFrame to calculate the whole period stats
whole_period = single_model_data.copy()

res = []

# first pass: the three time bins; second pass: one single bin covering the whole period
for df,bins, bin_name in zip([single_model_data,whole_period],[time_bins_edges,[time_bins_edges[0],time_bins_edges[-1]]],[time_bin_names,[f'{start_year+1}-2019']]):
    
    # cut the columns into the different periods
    # (this relabels the columns of the frame in place, replacing each year by its period name)
    df.columns = pd.cut(df.columns,bins=bins,labels=bin_name)
    df.columns.name = 'period'

    # groupby pool, model and period and calculate the mean and std
    # (mean and std across the years within each period, separately for every model)
    res.append(df.stack().groupby(['pool','model','period'],observed=False).agg(['mean','std']))

# concatenate the results
single_model_data = pd.concat(res)

#### 4.2.4. Save estimates

In [34]:
# stack the mean estimates of the three pools into one long series keyed by pool
DGVMs_mean = pd.concat([delta_B_DGVMs.stack(),delta_OC_DGVMs.stack(),delta_C_DGVMs.stack()],keys=['delta_B','delta_OC','delta_C'])

# standard deviation across models for the three pools
DGVMs_std = pd.concat([delta_B_DGVMs_std.stack(),delta_OC_DGVMs_std.stack(),delta_C_DGVMs_std.stack()],keys=['delta_B','delta_OC','delta_C'])
# combine means and standard deviations, name the index levels and save
DGVMs_periods = pd.concat([DGVMs_mean,DGVMs_std],keys=['mean','std'])
DGVMs_periods.index.names = ['stat','pool','time','period']
DGVMs_periods.to_csv('../results/06_make_estimates/DGVMs_periods.csv')

In [35]:
# stack the cumulative means and standard deviations for living biomass and total carbon,
# then combine both into one long series keyed by statistic, pool and year
DGVMs_annual_cum = pd.concat([delta_B_DGVMs_annual_cum,delta_C_DGVMs_annual_cum],keys=['delta_B','delta_C'])
DGVMs_annual_std_cum = pd.concat([delta_B_DGVMs_annual_std_cum,delta_C_DGVMs_annual_std_cum],keys=['delta_B','delta_C'])
DGVMs_annual_cum     = pd.concat([DGVMs_annual_cum,DGVMs_annual_std_cum],keys=['mean','std'])
DGVMs_annual_cum.index.names = ['stat','pool','time']
DGVMs_annual_cum.to_csv('../results/06_make_estimates/DGVMs_annual_cum.csv')

In [36]:
# save the per-model period statistics (mean and std per pool, model and period)
single_model_data.to_csv('../results/06_make_estimates/single_DGVM_data.csv')