In [1]:
%load_ext autoreload
%autoreload 2
import xarray as xr
import rioxarray as rio
import numpy as np
from rasterio.enums import Resampling
from utils import *
from glob import glob
from tqdm import tqdm

This script is intended to preprocess auxiliary datasets to bring them to the same spatial resolution as the highest-resolution data source in our analysis, which is 0.1˚ x 0.1˚ (used in [Xu et al. 2021](https://www.science.org/doi/10.1126/sciadv.abe9829)).

There are five main steps performed in this script:
1. Create a gridded land area map from the ESA CCI land cover data

2. Convert the ESA CCI land cover data into three land cover classes: forest, cropland/grassland, and shrubland, based on the definitions of [Tagesson et al. 2020](https://www.nature.com/articles/s41559-019-1090-0)

3. Preprocess the Song et al. (2018) dataset

4. Preprocess the Ma et al. (2021) dataset

5. Preprocess the Huang et al. (2021) dataset

# Create land area map

In [2]:
# Open the 2016 ESA CCI land cover file lazily (chunked, with nodata masked)
# and keep only its `lccs_class` variable.
cci_2016_path = '../data/land_cover/ESA_CCI/C3S-LC-L4-LCCS-Map-300m-P1Y-2016-v2.1.1.nc'
ESA_CCI_2016 = rio.open_rasterio(
    cci_2016_path,
    masked=True,
    chunks='auto',
    variable='lccs_class',
)['lccs_class']

# Tag the array with EPSG:4326 in place (the trailing `;` hides the cell echo)
ESA_CCI_2016.rio.write_crs(4326, inplace=True);

# Remove the `time` coordinate, then collapse all length-1 dimensions
ESA_CCI_2016 = ESA_CCI_2016.drop_vars('time').squeeze()


In [3]:
# Land indicator: 1.0 wherever the class differs from 210 (water), 0.0 otherwise
land = (ESA_CCI_2016 != 210).astype(float)

# Register NaN as the nodata value of the indicator (modifies `land` in place)
land.rio.set_nodata(np.nan, inplace=True)

# Surface area of every pixel on the native grid (helper imported from `utils`)
area = calc_pixel_area(land)

# Land area per native pixel, summed onto an 1800 x 3600 (0.1 degree) grid.
# NaN is registered as nodata again on the product before reprojecting.
land_surface_area = (
    (land * area)
    .rio.set_nodata(np.nan)
    .rio.reproject(land.rio.crs, shape=[1800, 3600], resampling=Resampling.sum)
)

# Write the aggregated land surface area to a netcdf file
land_surface_area.to_netcdf('../results/00_preprocessing/land_surface_area.nc')

# Preprocess ESA CCI land cover data

In [3]:
# Open the Xu et al. 2021 biomass raster, intended as the target projection and
# resolution, and keep only the first layer along its leading dimension.
xu_data = rio.open_rasterio('../data/biomass/xu_et_al_2021/test10a_cd_ab_pred_corr_2000_2019_v2.tif')
xu_data = xu_data[0, :, :]

# define function to preprocess the ESA CCI data
def process_CCI(raster: xr.DataArray, factor: int = 36) -> xr.DataArray:
    '''
    Aggregate an ESA CCI land cover raster into per-class surface area.

    Every pixel gets a membership weight for each of three land cover classes
    (1 for the classes fully assigned to it, 0.1 for its sparse-vegetation
    sub-class, 0.03 for the generic sparse class 150, 0 otherwise). The weight
    is multiplied by the pixel surface area and summed over blocks of
    `factor` x `factor` pixels.

    Parameters:
    raster (xarray.DataArray): ESA CCI land cover class raster to preprocess
    factor (int): number of pixels aggregated along each of x and y. The
        default of 36 is the value this notebook uses to reach its 0.1 degree
        target grid (assumes 1/360 degree input pixels - TODO confirm).

    Returns:
    xarray.DataArray: the three aggregated layers stacked along a new leading
        `landcover` dimension, in the order cropland, forest, shrubland. The
        dimension carries no labels; the caller assigns them.
    '''
    ## Using the definitions in Tagesson et al. DOI: 10.1038/s41559-019-1090-0
    ## Categories:
        ## forest - 50,60,61,62,70,71,72,80,81,82,90,100,160,170 + 0.1*151 + 0.03*150
        ## cropland/grassland - 10,11,20,30,110,130 + 0.1*153 + 0.03*150
        ## shrubland - 12, 40, 120, 121,122,140,180 + 0.1*152 + 0.03*150
        ## bare - 190, 200,201,202,210,220 (not returned)

    # surface area of every input pixel (helper imported from `utils`)
    surface_area = calc_pixel_area(raster)

    # (fully assigned class codes, sparse sub-class code weighted by 0.1),
    # listed in the order of the output `landcover` dimension
    class_definitions = {
        'cropland': ([10,11,20,30,110,130], 153),
        'forest': ([50,60,61,62,70,71,72,80,81,82,90,100,160,170], 151),
        'shrub': ([12, 40, 120, 121,122,140,180], 152),
    }

    lcs = []
    for full_codes, sparse_code in class_definitions.values():
        # per-pixel membership weight of this land cover class
        weight = raster.isin(full_codes) + 0.1 * (raster == sparse_code) + 0.03 * (raster == 150)

        # area covered by the class, summed over factor x factor pixel blocks
        lcs.append(down_sample(weight*surface_area,x_factor=factor,y_factor=factor,stat='sum'))

    # stack the layers; a new dimension name is inserted as the first axis
    return xr.concat(lcs,dim='landcover')

In [9]:
# collect every ESA CCI land cover file
files = glob('../data/land_cover/ESA_CCI/*.nc')

# process the files one by one, skipping years whose output already exists
for file in tqdm(files):

    # the year is the second-to-last '-'-separated token of the file name,
    # looking only at the part of the name before its first '.'
    file_name = file.split("/")[-1]
    name_head = file_name.split(".")[0]
    year = name_head.split('-')[-2]
    print(year)

    # nothing to do when the output for this year is already on disk
    out_path = f'../results/00_preprocessing/ESA_CCI_landcover_processed_{year}.nc'
    if glob(out_path):
        continue

    # open the land cover classes lazily and tag them with EPSG:4326
    da = rio.open_rasterio(file, masked=True, chunks='auto', variable='lccs_class')['lccs_class']
    da = da.rio.write_crs(4326, inplace=True)

    # aggregate to per-class surface area
    res = process_CCI(da)

    # label the landcover dimension and name the variable
    res = res.assign_coords(landcover=['cropland','forest','shrubland'])
    res = res.rename('ESA_CCI_landcover_processed')

    # write the result for this year
    res.to_netcdf(out_path)

  0%|          | 0/29 [00:00<?, ?it/s]

2018


  3%|▎         | 1/29 [01:39<46:36, 99.89s/it]

2020


  7%|▋         | 2/29 [03:39<50:13, 111.62s/it]

2016


 10%|█         | 3/29 [05:02<42:35, 98.28s/it] 

2019


 14%|█▍        | 4/29 [06:36<40:15, 96.60s/it]

2017


 17%|█▋        | 5/29 [07:59<36:44, 91.84s/it]

2012


 21%|██        | 6/29 [09:24<34:14, 89.33s/it]

2014


 24%|██▍       | 7/29 [10:51<32:29, 88.63s/it]

2013


 28%|██▊       | 8/29 [12:15<30:30, 87.16s/it]

2015


 31%|███       | 9/29 [13:46<29:26, 88.34s/it]

1994


 34%|███▍      | 10/29 [15:06<27:11, 85.86s/it]

2001


 38%|███▊      | 11/29 [16:29<25:27, 84.86s/it]

1992
1999


 45%|████▍     | 13/29 [17:50<17:08, 64.25s/it]

1996


 48%|████▊     | 14/29 [19:21<17:45, 71.04s/it]

1998


 52%|█████▏    | 15/29 [20:44<17:15, 74.00s/it]

1995


 55%|█████▌    | 16/29 [22:15<17:02, 78.66s/it]

2000


 59%|█████▊    | 17/29 [23:38<15:58, 79.91s/it]

1993


 62%|██████▏   | 18/29 [25:00<14:47, 80.67s/it]

1997


 66%|██████▌   | 19/29 [26:24<13:34, 81.44s/it]

2009


 69%|██████▉   | 20/29 [27:43<12:08, 80.91s/it]

2003


 72%|███████▏  | 21/29 [29:11<11:02, 82.84s/it]

2011


 76%|███████▌  | 22/29 [30:40<09:52, 84.62s/it]

2006


 79%|███████▉  | 23/29 [32:04<08:26, 84.41s/it]

2004


 83%|████████▎ | 24/29 [33:26<06:59, 83.95s/it]

2010


 86%|████████▌ | 25/29 [34:48<05:32, 83.13s/it]

2008


 90%|████████▉ | 26/29 [36:08<04:06, 82.24s/it]

2005


 93%|█████████▎| 27/29 [37:29<02:43, 81.87s/it]

2002


 97%|█████████▋| 28/29 [38:48<01:21, 81.04s/it]

2007


100%|██████████| 29/29 [40:09<00:00, 83.09s/it]


# Preprocess Song et al. (2018) data

In [3]:
# Open one processed ESA CCI file; its grid is the target for the Song et al. data
files = glob('../results/00_preprocessing/ESA_CCI_landcover_processed_*.nc')
CCI_data = xr.open_dataset(files[0])['ESA_CCI_landcover_processed']
CCI_data.rio.write_crs('EPSG:4326', inplace=True)


# Open every Song et al. VCF raster, match it to the CCI grid and stack along time
files = glob('../data/land_cover/song_et_al_2018/VCF*.tif')
song_layers = []
for path in files:
    layer = rio.open_rasterio(path, masked=True, chunks='auto')
    song_layers.append(layer.rio.reproject_match(CCI_data))
song_data = xr.concat(song_layers, dim='time')

# the year is the first four characters of the second '_'-separated token of
# the file name; the list follows the same file order as the stacked layers
years = []
for path in files:
    years.append(int(path.split('/')[-1].split('_')[1][:4]))
song_data = song_data.assign_coords(time=years)

# band dimension -> landcover, and percent -> fraction
song_data = song_data.rename({'band': 'landcover'}) / 100

# keep the first two layers (drops the "bare land" class) and label them
song_data = song_data[:, :2, :, :].assign_coords(landcover=['forest', 'nonforest'])

# blank out everything that is not greater than 0 and register NaN as nodata
song_data = song_data.where(song_data > 0).rio.set_nodata(np.nan)

# chronological order, then name the array
song_data = song_data.sortby('time').rename('Song et al. 2018')

# interpolate onto every year 1982..2019, then keep 1992 to 2019
all_years = np.arange(1982, 2020)
song_data = song_data.interp(time=all_years).sel(time=slice(1992, 2019))

# dimension order: landcover, time, y, x
song_data = song_data.transpose('landcover', 'time', 'y', 'x')

# write the result
song_data.to_netcdf('../results/00_preprocessing/song_et_al_landcover_processed.nc')


# Preprocess Ma et al. (2021) data

In [11]:
# define the landcovers to use
landcover_types = ['forest','shrub','grass']

# Load the tiles of each land cover type and join them side by side along x.
# glob returns paths in arbitrary order, so each mosaic is sorted by its x
# coordinate after concatenation; otherwise the tiles could end up spatially
# scrambled, depending on the file system.
ma_data = xr.concat(
    [
        xr.concat(
            [
                rio.open_rasterio(path, masked=True, chunks='auto').sel(band=1)
                for path in glob(f'../data/RMF/ma_et_al_2021/{lc_type}*.tif')
            ],
            dim='x',
        ).sortby('x')
        for lc_type in landcover_types
    ],
    dim='landcover',
)

# label the new dimension in the same order the mosaics were built
ma_data['landcover'] = landcover_types

# resample the data to the same resolution as the Xu et al. data
ma_data = ma_data.rio.reproject(ma_data.rio.crs,shape=[1800,3600],resampling=Resampling.nearest)

# name the data
ma_data.name = 'Ma et al. 2021'

# save the data
ma_data.to_netcdf('../results/00_preprocessing/ma_et_al_processed.nc')

In [3]:
# Load the `rmf_all` tiles and join them side by side along x.
# glob returns paths in arbitrary order, so the mosaic is sorted by its x
# coordinate after concatenation; otherwise the tiles could end up spatially
# scrambled, depending on the file system.
ma_data = xr.concat(
    [
        rio.open_rasterio(path, masked=True, chunks='auto').sel(band=1)
        for path in glob('../data/RMF/ma_et_al_2021/rmf_all*.tif')
    ],
    dim='x',
).sortby('x')

# resample the data to the same resolution as the Xu et al. data
ma_data = ma_data.rio.reproject(ma_data.rio.crs,shape=[1800,3600],resampling=Resampling.nearest)

# name the data
ma_data.name = 'Ma et al. 2021'

# save the data
ma_data.to_netcdf('../results/00_preprocessing/ma_et_al_processed_all.nc')

# Preprocess Huang et al. (2021) data

In [13]:
# vegetation compartments, in the order their labels are assigned below
veg_parts = ['shoot','root']

# find the per-grid-area files
files = glob('../data/RMF/huang_et_al_2021/data_code_to_submit/pergridarea_*.nc')

# The labels are assigned by position, but glob returns paths in arbitrary
# order. Put each file whose name contains a compartment name at that
# compartment's position. The sort is stable, so files that match no
# compartment name keep their glob order (i.e. the previous behaviour).
# NOTE(review): assumes the file names contain 'shoot' / 'root' - confirm.
files = sorted(
    files,
    key=lambda path: next(
        (rank for rank, part in enumerate(veg_parts) if part in path.split('/')[-1].lower()),
        len(veg_parts),
    ),
)

# load data and merge into one DataArray
huang_data = xr.concat([rio.open_rasterio(x,masked=True,chunks='auto').sel(band=1) for x in files],dim='veg_part')

# name the different parts of the vegetation
huang_data['veg_part'] = veg_parts

# calculate the RMF from Huang: root / (shoot + root)
huang_data = huang_data.sel(veg_part='root')/huang_data.sum(dim='veg_part')

# set CRS
huang_data.rio.write_crs(4326,inplace=True)

# set no data to nan
huang_data = huang_data.rio.set_nodata(np.nan)

# resample the data to the same resolution as the Xu et al. data
huang_data = huang_data.rio.reproject(huang_data.rio.crs,shape=[1800,3600],resampling=Resampling.average,nodata=np.nan)

# name the DataArray
huang_data.name = 'Huang et al. 2021'

# save the data
huang_data.to_netcdf('../results/00_preprocessing/huang_et_al_processed.nc')

In [11]:
# NOTE(review): as far as visible here, this cell repeats the loading steps of
# the Huang et al. preprocessing cell and does not save its result - confirm
# whether it is still needed.

# vegetation compartments, in the order their labels are assigned below
veg_parts = ['shoot','root']

# find the per-grid-area files
files = glob('../data/RMF/huang_et_al_2021/data_code_to_submit/pergridarea_*.nc')

# The labels are assigned by position, but glob returns paths in arbitrary
# order. Put each file whose name contains a compartment name at that
# compartment's position. The sort is stable, so files that match no
# compartment name keep their glob order (i.e. the previous behaviour).
# NOTE(review): assumes the file names contain 'shoot' / 'root' - confirm.
files = sorted(
    files,
    key=lambda path: next(
        (rank for rank, part in enumerate(veg_parts) if part in path.split('/')[-1].lower()),
        len(veg_parts),
    ),
)

# load data and merge into one DataArray
huang_data = xr.concat([rio.open_rasterio(x,masked=True,chunks='auto').sel(band=1) for x in files],dim='veg_part')

# name the different parts of the vegetation
huang_data['veg_part'] = veg_parts

# calculate the RMF from Huang: root / (shoot + root)
huang_data = huang_data.sel(veg_part='root')/huang_data.sum(dim='veg_part')
