# Normalization of CMIP climate data

In [1]:
import numpy as np
import pandas as pd
import h5py
from scipy.stats import linregress

In [2]:
def read_cmip(path, map_slice, map_slice_specific=None):
    """Read stitched CMIP global mean data

    Parameters
    ----------
    path
        Input CMIP data in HDF5 format
        key structure: source_id/experiment_id/variable_id
    map_slice
        Time slice by experiments, keyed by experiment_id.
        Every experiment_id found in the file must be present.
    map_slice_specific, optional
        Time slice for specific (source_id, experiment_id), by default None,
        which means no model-specific overrides

    Returns
    -------
        Dictionary of DataFrame keyed by source_id.
        Each DataFrame has (experiment_id, variable_id) rows and one column
        per value of the 'year' variable.
    """
    # A fresh dictionary per call avoids sharing a mutable default argument.
    if map_slice_specific is None:
        map_slice_specific = {}

    dfset = {}

    with h5py.File(path, 'r') as f1:
        for source_id, gl1 in f1.items():
            df = {}

            for experiment_id, gl2 in gl1.items():
                # A (source_id, experiment_id) override takes precedence
                # over the experiment-wide slice.
                slc = map_slice_specific.get(
                    (source_id, experiment_id),
                    map_slice[experiment_id],
                )

                # ensure to use the same length across variables
                dlen = min(len(data) for data in gl2.values())

                df[experiment_id] = (
                    pd.DataFrame({
                        variable_id: data[:dlen][slc]
                        for variable_id, data in gl2.items()
                    })
                    .set_index('year')
                )

            # Columns become (experiment_id, variable_id); rows are the
            # union of years over the experiments.
            df = pd.concat(df, axis=1).sort_index()

            # Truncate the period to cover all experiments at least.
            # The truncated length is typically 150 years,
            # but can be longer depending on branch times.
            if df.shape[1] == 4:
                # 1pctCO2 may be missing
                df = df.dropna()
            else:
                # Flag years with fewer than 4 non-missing values
                # (presumably fewer than two complete experiments), then
                # keep the span from the first to the last unflagged year.
                d1 = df.count(axis=1) < 4
                (i0, i1) = (d1.idxmin(), d1.iloc[::-1].idxmin())
                df = df.loc[i0:i1]

            dfset[source_id] = df.T

    return dfset

In [3]:
def reg_polyfit(d1):
    """Fit a straight line with numpy.polyfit

    The index of the series is used as the independent variable and its
    values as the dependent variable.

    Parameters
    ----------
    d1
        Input data in pandas.Series

    Returns
    -------
        Regression coefficients of slope and intercept
    """
    # A degree-1 fit returns the coefficients highest power first.
    slope, intercept = np.polyfit(d1.index, d1, deg=1)
    return slope, intercept


def reg_linregress(d1):
    """Fit a straight line with scipy.stats.linregress

    The index of the series is used as the independent variable and its
    values as the dependent variable.

    Parameters
    ----------
    d1
        Input data in pandas.Series

    Returns
    -------
        Regression coefficients of slope and intercept
    """
    x, y = d1.index, d1
    fit = linregress(x, y)
    return (fit.slope, fit.intercept)
    

class LinearRegression:
    """Straight-line fit that keeps its latest coefficients

    The fit itself is delegated to a function supplied at construction;
    the resulting slope and intercept are stored on the instance so the
    same line can be evaluated on other data.
    """

    def __init__(self, f_regress):
        """Set up the regression with a fitting function

        Parameters
        ----------
        f_regress
            Regression function, called with a pandas.Series and
            returning the pair (slope, intercept)
        """
        # No coefficients are available until regress() has been called.
        self.slope = None
        self.intercept = None
        self.f_regress = f_regress

    def regress(self, d1, ret=True):
        """Fit the line to a series and store the coefficients

        Parameters
        ----------
        d1
            Input data in pandas.Series
        ret, optional
            Return fitting data when True, by default True

        Returns
        -------
            Fitting data in pandas.Series, or None when ret is False
        """
        slope, intercept = self.f_regress(d1)
        self.slope = slope
        self.intercept = intercept
        return self.trend(d1) if ret else None

    def trend(self, d1):
        """Evaluate the stored line on the index of a series

        Parameters
        ----------
        d1
            Input data in pandas.Series; only its index is used

        Returns
        -------
            Fitting data in pandas.Series
        """
        x = d1.index.values
        fitted = self.slope * x + self.intercept
        return pd.Series(fitted, index=d1.index)

## CMIP5

In [4]:
# Input file of CMIP5 global mean data
path = 'datain/cmip5_global_mean.h5'

# Leading part of each experiment to read; piControl is read in full
map_slice = {
    'abrupt4xCO2': slice(150),
    '1pctCO2': slice(140),
    'piControl': slice(None),
}

# Overrides for individual (source_id, experiment_id) pairs
map_slice_specific = {
    # add 1 year to discard the first December-only year
    # NOTE(review): the 1pctCO2 default above is 140, so "+1 year" would
    # suggest 141 rather than 151 for that entry -- confirm the intent.
    ('HadGEM2-ES', 'abrupt4xCO2'): slice(151),
    ('HadGEM2-ES', '1pctCO2'): slice(151),
}

dfset = read_cmip(
    path,
    map_slice=map_slice,
    map_slice_specific=map_slice_specific,
)

In [5]:
# Inspect the table read for HadGEM2-ES (display only)
dfset['HadGEM2-ES']

Unnamed: 0,year,1860,1861,1862,1863,1864,1865,1866,1867,1868,1869,...,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010
1pctCO2,rtnt,8.654499,0.068563,0.806606,0.723264,0.348339,0.129695,0.230886,0.392473,0.619887,0.518593,...,,,,,,,,,,
1pctCO2,tas,284.849294,286.750466,286.812354,286.856291,286.903964,286.834005,286.794261,286.819748,286.679494,286.668729,...,,,,,,,,,,
abrupt4xCO2,rtnt,15.486268,6.516709,5.312812,5.244224,4.256972,4.066469,4.586454,3.656675,4.087922,3.664238,...,2.568495,2.21499,1.930989,1.979264,2.174504,2.152034,2.026579,1.76436,2.033359,2.356366
abrupt4xCO2,tas,285.328525,288.04226,288.65891,289.097644,289.362987,289.636346,289.854225,290.207715,290.30712,290.312133,...,293.30299,293.321769,293.381265,293.35456,293.317238,293.326335,293.300609,293.30914,293.188434,293.258565
piControl,rtnt,0.377321,0.341524,0.409079,0.171715,0.75743,0.840114,0.096753,-0.14809,0.402539,-0.261519,...,0.345578,0.284344,0.265751,0.296517,0.058186,-0.084898,0.640702,0.312847,0.342174,0.018246
piControl,tas,286.819238,286.688588,286.700063,286.667869,286.639383,286.706139,286.728179,286.70434,286.706739,286.619953,...,286.993375,286.889575,286.798084,286.826043,286.873622,286.919306,286.798485,286.879491,286.809728,286.806749


In [6]:
# Discard the first December-only year of abrupt4xCO2 and 1pctCO2 by
# moving their values one column earlier and dropping the emptied cells.
# piControl is not shifted; its last column is dropped instead.
dfset['HadGEM2-ES'] = pd.concat(
    [
        row.iloc[:-1] if key[0] == 'piControl' else row.shift(-1).dropna()
        for key, row in dfset['HadGEM2-ES'].iterrows()
    ],
    axis=1,
).transpose()
dfset['HadGEM2-ES']

Unnamed: 0,year,1860,1861,1862,1863,1864,1865,1866,1867,1868,1869,...,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
1pctCO2,rtnt,0.068563,0.806606,0.723264,0.348339,0.129695,0.230886,0.392473,0.619887,0.518593,1.209177,...,,,,,,,,,,
1pctCO2,tas,286.750466,286.812354,286.856291,286.903964,286.834005,286.794261,286.819748,286.679494,286.668729,286.785603,...,,,,,,,,,,
abrupt4xCO2,rtnt,6.516709,5.312812,5.244224,4.256972,4.066469,4.586454,3.656675,4.087922,3.664238,3.698053,...,2.568495,2.21499,1.930989,1.979264,2.174504,2.152034,2.026579,1.76436,2.033359,2.356366
abrupt4xCO2,tas,288.04226,288.65891,289.097644,289.362987,289.636346,289.854225,290.207715,290.30712,290.312133,290.578927,...,293.30299,293.321769,293.381265,293.35456,293.317238,293.326335,293.300609,293.30914,293.188434,293.258565
piControl,rtnt,0.377321,0.341524,0.409079,0.171715,0.75743,0.840114,0.096753,-0.14809,0.402539,-0.261519,...,0.351226,0.345578,0.284344,0.265751,0.296517,0.058186,-0.084898,0.640702,0.312847,0.342174
piControl,tas,286.819238,286.688588,286.700063,286.667869,286.639383,286.706139,286.728179,286.70434,286.706739,286.619953,...,286.772108,286.993375,286.889575,286.798084,286.826043,286.873622,286.919306,286.798485,286.879491,286.809728


In [7]:
# Example of different branch times: the experiments of this model
# do not all start in the same year (display only)
dfset['GISS-E2-H']

Unnamed: 0,year,2410,2411,2412,2413,2414,2415,2416,2417,2418,2419,...,2800,2801,2802,2803,2804,2805,2806,2807,2808,2809
1pctCO2,rtnt,0.147972,0.435169,0.386536,0.506856,0.331663,0.363981,1.041346,0.106986,0.47727,0.824659,...,,,,,,,,,,
1pctCO2,tas,287.836942,287.782097,287.824151,287.800052,287.905455,287.903676,287.892335,287.992342,288.004186,288.040941,...,,,,,,,,,,
abrupt4xCO2,rtnt,,,,,,,,,,,...,1.288718,1.322129,1.46452,1.782507,1.227172,1.3037,1.056435,1.33472,1.210032,1.49304
abrupt4xCO2,tas,,,,,,,,,,,...,292.019406,291.955491,291.944403,292.013557,292.033161,292.011194,292.022663,292.009316,292.069662,292.017555
piControl,rtnt,0.279262,0.001227,0.360351,0.480223,0.143552,0.346735,0.189185,0.612083,0.239171,0.212441,...,0.232556,0.041047,0.038915,0.103205,0.32369,0.224003,-0.332247,0.289748,0.349664,0.529816
piControl,tas,287.908142,287.832775,287.704217,287.909406,287.919895,287.882836,287.940941,287.880221,287.962323,288.010841,...,287.992369,288.050284,288.030441,288.069289,287.955552,288.039637,288.037361,288.016488,288.060721,287.989573


In [11]:
# Remove the linear piControl trend from every experiment,
# separately for each model and variable.
linreg = LinearRegression(f_regress=reg_linregress)
data = {}

for source_id, df in dfset.items():
    for variable, df1 in df.groupby(level=1):
        # Fit the trend on the piControl row only, then subtract it
        # column by column from all experiments of this variable.
        trend = linreg.regress(df1.loc[('piControl', variable)], ret=True)
        df1n = df1 - trend
        # Relabel the columns as 0.5, 1.5, ... in place of the years
        df1n.columns = 0.5 + np.arange(df1.shape[1])
        data[(source_id, variable)] = df1n.droplevel(level=1)

# Index levels after concat: (source_id, variable, experiment);
# reorder to (source_id, experiment, variable), sorted case-insensitively.
df_norm1 = pd.concat(data)
df_norm1 = df_norm1.reorder_levels([0, 2, 1])
df_norm1 = df_norm1.sort_index(key=lambda labels: labels.str.lower())

In [12]:
# Store the normalized CMIP5 data in the output HDF5 file
outpath = 'data/cmip_normalized.h5'
# `key` is passed by keyword: positional use is deprecated since
# pandas 2.1 and becomes keyword-only in pandas 3.0.
df_norm1.to_hdf(outpath, key='CMIP5/climate_norm1')

In [13]:
# Read the stored CMIP5 table back and compare it with the in-memory one.
with pd.HDFStore(outpath, mode='r') as store:
    df_chk = store.get('CMIP5/climate_norm1')

# Only the cell values are compared (NaN counted as equal to NaN);
# index and column labels are not checked.
np.array_equal(df_norm1.to_numpy(), df_chk.to_numpy(), equal_nan=True)

True

## CMIP6

In [14]:
# Input file of CMIP6 global mean data
path = 'datain/cmip6_global_mean.h5'

# Leading part of each experiment to read; piControl is read in full.
# No model-specific overrides are used for CMIP6.
map_slice = {
    'abrupt-4xCO2': slice(150),
    '1pctCO2': slice(140),
    'piControl': slice(None),
}

dfset = read_cmip(path, map_slice=map_slice)

In [15]:
# Remove the linear piControl trend from every experiment,
# separately for each model and variable.
linreg = LinearRegression(f_regress=reg_linregress)
data = {}

for source_id, df in dfset.items():
    for variable, df1 in df.groupby(level=1):
        # Fit the trend on the piControl row only, then subtract it
        # column by column from all experiments of this variable.
        trend = linreg.regress(df1.loc[('piControl', variable)], ret=True)
        df1n = df1 - trend
        # Relabel the columns as 0.5, 1.5, ... in place of the years
        df1n.columns = 0.5 + np.arange(df1.shape[1])
        data[(source_id, variable)] = df1n.droplevel(level=1)

# Index levels after concat: (source_id, variable, experiment);
# reorder to (source_id, experiment, variable), sorted case-insensitively.
df_norm1 = pd.concat(data)
df_norm1 = df_norm1.reorder_levels([0, 2, 1])
df_norm1 = df_norm1.sort_index(key=lambda labels: labels.str.lower())

In [16]:
# Store the normalized CMIP6 data in the output HDF5 file
outpath = 'data/cmip_normalized.h5'
# `key` is passed by keyword: positional use is deprecated since
# pandas 2.1 and becomes keyword-only in pandas 3.0.
df_norm1.to_hdf(outpath, key='CMIP6/climate_norm1')

In [17]:
# Read the stored CMIP6 table back and compare it with the in-memory one.
with pd.HDFStore(outpath, mode='r') as store:
    df_chk = store.get('CMIP6/climate_norm1')

# Only the cell values are compared (NaN counted as equal to NaN);
# index and column labels are not checked.
np.array_equal(df_norm1.to_numpy(), df_chk.to_numpy(), equal_nan=True)

True