# CMIP6 data normalization

In [1]:
import numpy as np
import pandas as pd
import h5py

## Qualified datasets

In [2]:
# Names of the CMIP6 datasets considered qualified, kept as two lists
# according to where each entry comes from.
# NOTE(review): neither list is referenced by the other cells visible in
# this notebook; they appear to serve as a reference record - confirm.
dataset_mce = [
    # originally used for MCE calibration
    'ACCESS-CM2',
    'ACCESS-ESM1-5',
    'AWI-CM-1-1-MR',
    'BCC-CSM2-MR',
    'BCC-ESM1',
    'CAMS-CSM1-0',
    'CanESM5',
    'CESM2',
    'CESM2-FV2',
    'CESM2-WACCM',
    'CESM2-WACCM-FV2',
    'CNRM-CM6-1',
    'CNRM-CM6-1-HR',
    'CNRM-ESM2-1',
    'E3SM-1-0',
    'EC-Earth3',
    'EC-Earth3-Veg',
    'FGOALS-f3-L',
    'FGOALS-g3',
    'GFDL-CM4',
    'GFDL-ESM4',
    'GISS-E2-1-G', # partly replaced with 1pctCO2-4xext
    'GISS-E2-1-H',
    'GISS-E2-2-G',
    'HadGEM3-GC31-LL',
    'HadGEM3-GC31-MM',
    'INM-CM4-8',
    'IPSL-CM6A-LR',
    'MIROC-ES2L',
    'MIROC6',
    'MPI-ESM1-2-HR',
    'MPI-ESM1-2-LR',
    'MRI-ESM2-0',
    'NESM3',
    'NorESM2-LM',
    'NorESM2-MM',
    'SAM0-UNICON',
    'UKESM1-0-LL',
]
# Datasets added on top of the MCE calibration set, grouped by source
dataset_add = [
    # from RCMIP
    'CIESM',
    'CMCC-CM2-SR5',
    'IITM-ESM',
    'NorCPM1',
    'TaiESM1',
    # from Chapter-7 repository
    'CAS-ESM2-0', # 1pctCO2 not available
    'INM-CM5-0', # 1pctCO2 not available
    # TLM parameters available, but currently no CMIP6 data
    'MCM-UA-1-0',
]

## Normalization

Anomalies are defined as deviations from a linear fit of piControl.

In [3]:
# Pre-processed CMIP6 global mean data
# abrupt-4xCO2 and 1pctCO2 series are stitched to piControl series
# using the piControl time coordinates in common
# The file is opened read-only; the handle stays open across cells until
# f1.close() is called, so the reading cell has to run before that.
f1 = h5py.File('./datain/cmip6_global_mean.h5', 'r')

In [4]:
# Read rtnt and tas time series over 150-year and 140-year periods
# of abrupt-4xCO2 and 1pctCO2, respectively,
# and corresponding piControl series
#
# Result: dfset maps each top-level key of the HDF5 file to a DataFrame
# whose rows are (experiment, variable) pairs and whose columns are years.
dfset = {}

# Leading part of each experiment's series to keep, keyed by experiment name
map_slice = {
    'abrupt-4xCO2': slice(None, 150),
    '1pctCO2': slice(None, 140),
    'piControl': slice(None, None),
}

for k, v in f1.items():
    # One year-indexed DataFrame per experiment group
    frames = {}
    for k1, v1 in v.items():
        # Arrays within a group are cut to their shortest common length
        # before the experiment-specific slice is applied
        dlen = min(len(v2) for v2 in v1.values())
        frames[k1] = (
            pd.DataFrame({
                k2: v2[:dlen][map_slice[k1]] for k2, v2 in v1.items()
            })
            .set_index('year')
        )

    # Align the experiments on the year index; columns become
    # (experiment, variable) pairs and non-overlapping years are NaN
    df = pd.concat(frames, axis=1)
    # Truncate the period to keep the range covering abrupt-4xCO2 and 1pctCO2.
    # The length of truncated periods is mostly 150 years, but can be longer
    # when the branch time is different between abrupt-4xCO2 and 1pctCO2
    if df.shape[1] == 4:
        # Presumably two experiments with two variables each (e.g. when
        # 1pctCO2 is not available): keep only fully populated years
        df = df.dropna()
    else:
        # A year is "sparse" when fewer than 4 columns hold data.
        # idxmin on a boolean Series gives the first label that is False,
        # i.e. the first (or, on the reversed Series, the last) year with
        # at least 4 values; every year in between is kept.
        sparse = df.count(axis=1) < 4
        i0 = sparse.idxmin()
        i1 = sparse.iloc[::-1].idxmin()
        df = df.loc[i0:i1]

    # Store with years along the columns
    dfset[k] = df.T

In [5]:
# Release the HDF5 file handle; dfset was built from it in the previous
# cell and nothing below reads from f1
f1.close()

In [6]:
def linear_trend(d1):
    """Return linear trend of a given time series

    A first-degree polynomial is fitted by least squares to the series
    values against its index. Points whose index or value is not finite
    (NaN or inf) are left out of the fit, so gaps in the series do not
    break the regression; the fitted line is still evaluated at every
    index label of the input.

    Parameters
    ----------
    d1
        Input time series (pandas Series) with a numeric index
        used as the time coordinate

    Returns
    -------
        Linear trend time series, with the same index as the input
    """
    x = np.asarray(d1.index, dtype=float)
    y = np.asarray(d1, dtype=float)
    # np.polyfit cannot handle NaN/inf, so fit on the finite points only
    valid = np.isfinite(x) & np.isfinite(y)
    slope, intercept = np.polyfit(x[valid], y[valid], 1)
    return pd.Series(slope * d1.index.values + intercept, index=d1.index)

In [7]:
# Anomalies per (dataset, variable): every experiment row of a variable has
# the linear piControl trend of that same variable subtracted from it.
anomalies = {}

for dataset, table in dfset.items():
    # Level 1 of the row index is the variable name
    for varname, rows in table.groupby(level=1):
        control_trend = linear_trend(rows.loc[('piControl', varname)])
        # The trend is indexed like the columns, so sub() aligns on them
        detrended = rows.sub(control_trend)
        # Replace the column labels by 0.5, 1.5, ... counted from the
        # first retained column
        detrended.columns = np.arange(rows.shape[1]) + 0.5
        # The variable is already part of the dict key
        anomalies[(dataset, varname)] = detrended.droplevel(1)

# Rows end up indexed by (dataset, variable, experiment)
df_norm1 = pd.concat(anomalies)

In [8]:
# Save the normalized anomalies; rows are indexed by
# (dataset, variable, experiment) and columns are the 0.5, 1.5, ...
# time coordinates assigned in the previous cell
df_norm1.to_csv('./dataout/cmip6_normalized_1.csv')