# CSS 120: Environmental Data Science

## Paleoclimate

### Umberto Mignozzetti (UCSD)

(Based on Project Pythia and ClimateMatch)

# Packages


In [None]:
# System helpers
import os
import sys
from io import StringIO
import tempfile
import xarray as xr

# Data analysis
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pooch

# Principal Component Analysis
from sklearn.decomposition import PCA

# Packages


In [None]:
# Paleodata analysis
import lipd
import pyleoclim as pyleo

# Climlab
from climlab import constants as const
from climlab.solar.orbital import OrbitalTable
from climlab.solar.insolation import daily_insolation

# Maps
import cartopy, cartopy.crs as ccrs, cartopy.feature as cfeature
import cartopy.util as cutil, cartopy.io.shapereader as shapereader

##  Helper functions


In [None]:
# Pooch Load
def pooch_load(filelocation=None, filename=None, processor=None):
    """
    Return a local path for `filename`, downloading it only when necessary.

    A copy of `filename` sitting in the user's home directory is used as-is.
    Otherwise the file is fetched from `filelocation` with `pooch.retrieve`
    (no hash check) into the system temporary directory.

    Parameters:
    ----------
        filelocation: str or None
          URL handed to `pooch.retrieve` when no local copy is found
        filename: str
          Name of the file to look for / store. Required in practice: the
          `None` default cannot be joined to a directory path
        processor: callable or None
          Optional post-download processor forwarded to `pooch.retrieve`

    Returns:
    -------
        file:
          Path of the local copy, or whatever `pooch.retrieve` returns
    """
    # os.path.exists/os.path.join do not expand "~", so resolve the home directory explicitly;
    # without this the local-copy shortcut below could never trigger
    shared_location = os.path.expanduser("~/")
    user_temp_cache = tempfile.gettempdir()

    shared_file = os.path.join(shared_location, filename)
    if os.path.exists(shared_file):
        return shared_file

    return pooch.retrieve(
        filelocation,
        known_hash=None,
        fname=os.path.join(user_temp_cache, filename),
        processor=processor,
    )

##  Helper functions


In [None]:
# Function to convert the PAGES2k LiPD files into a pandas.DataFrame
def lipd2df(
    lipd_dirpath,
    pkl_filepath=None,
    col_str=None,
):
    """
    Convert a bunch of PAGES2k LiPD files to a `pandas.DataFrame` to boost data loading.

    Only the timeseries flagged with
    `paleoData_useInGlobalTemperatureAnalysis == "TRUE"` are kept.
    If `pkl_filepath` isn't `None`, save the DataFrame as a pickle file.

    Parameters:
    ----------
        lipd_dirpath: str
          Path of the PAGES2k LiPD files
        pkl_filepath: str or None
          Path of the converted pickle file. Default: `None`
        col_str: list of str or None
          Name of the variables to extract from the LiPD files.
          Default (`None`): the twelve PAGES2k fields listed in the function body

    Returns:
    -------
        df: `pandas.DataFrame`
          Converted Pandas DataFrame, one row per selected timeseries and one
          column per entry of `col_str`; variables missing from a timeseries are NaN
    """
    # Build the default inside the function so no mutable object is shared between calls
    if col_str is None:
        col_str = [
            "paleoData_pages2kID", "dataSetName", "archiveType", "geo_meanElev", "geo_meanLat",
            "geo_meanLon", "year", "yearUnits", "paleoData_variableName", "paleoData_units",
            "paleoData_values", "paleoData_proxy",
        ]

    # Save the current working directory, as the LiPD utility will change it in the background
    work_dir = os.getcwd()
    try:
        # LiPD utility requires the absolute path
        lipds = lipd.readLipd(os.path.abspath(lipd_dirpath))
        # Extract timeseries from the list of LiPD objects
        ts_list = lipd.extractTs(lipds)
    finally:
        # Recover the working directory even when loading fails
        os.chdir(work_dir)

    # One record per timeseries used for global temperature analysis;
    # variables absent from a timeseries are recorded as NaN
    rows = [
        {name: ts.get(name, np.nan) for name in col_str}
        for ts in ts_list
        if ts.get("paleoData_useInGlobalTemperatureAnalysis") == "TRUE"
    ]
    # dtype=object keeps list-valued cells (e.g. year / paleoData_values) untouched
    df_tmp = pd.DataFrame(rows, columns=col_str, dtype=object)
    # Drop the rows with all NaNs (selected records carrying none of the requested variables)
    df = df_tmp.dropna(how="all")

    # Save the dataframe to a pickle file for later use
    if pkl_filepath:
        save_path = os.path.abspath(pkl_filepath)
        print(f"Saving pickle file at: {save_path}")
        df.to_pickle(save_path)
    return df

##  Helper functions


In [None]:
class SupressOutputs(list):
    """
    Context manager that silences `print` output and keeps it as a list of lines.

    While the `with` block runs, `sys.stdout` points to an in-memory buffer.
    On exit the buffered text is split into lines and appended to this list,
    and the previous `sys.stdout` is put back.
    """

    def __enter__(self):
        # remember the stream to restore, then swap in an in-memory buffer
        self._stdout = sys.stdout
        buffer = StringIO()
        sys.stdout = buffer
        self._stringio = buffer
        return self

    def __exit__(self, *args):
        captured_text = self._stringio.getvalue()
        self.extend(captured_text.splitlines())
        del self._stringio  # the buffer is no longer needed
        sys.stdout = self._stdout

##  Helper functions


# Paleoclimate Models

## Paleoclimate Models

Another useful tool in paleoclimate is the use of climate models.  

We will now explore data from the [Paleoclimate Modelling Intercomparison Project 3 (PMIP3)](https://www.nature.com/articles/nclimate1456).

More specifically, we will analyze global mean surface temperature (GMST) data from  simulations for the past 1,000 years. 

We will also compare the PMIP3 GMST data to a proxy-based reconstruction of temperature from Lake Tanganyika in East Africa [(Tierney et al., 2010)](https://www.nature.com/articles/ngeo865). 

Through this proxy-model comparison, we may assess the differences and similarities between the two datasets.

## Load Proxy-Based Temperature Reconstructions

The proxy record we'll be analyzing is a 1,000 year-long lake surface temperature reconstruction from [Tierney et al., 2010](https://doi.org/10.1038/ngeo865). 

This record is from Lake Tanganyika in equatorial East Africa and is based on the [TEX86 ratio](https://en.wikipedia.org/wiki/TEX86), which is a temperature proxy derived from the distribution of the isoprenoid glycerol dialkyl glycerol tetraether (iGDGT) of archaeal membrane lipids.

The organisms producing these iGDGT compounds alter the structure of the compound in response to changes in temperature, so by measuring changes in the ratio of the different compounds, we can infer past changes in temperature.   

Let's start by loading the proxy data, saving it as a `Series` in Pyleoclim, and plotting a time series.

## Load Proxy-Based Temperature Reconstructions

In [None]:
# Lake Tanganyika surface temperature record: local file name and download URL
filename_tang = "tang_sst.csv"
url_tang = "https://osf.io/p8tx3/download"

# resolve a local path (downloading if needed), then read the CSV into a DataFrame
tang_path = pooch_load(filelocation=url_tang, filename=filename_tang)
proxy_temp = pd.read_csv(tang_path)
proxy_temp.head()

## Load Proxy-Based Temperature Reconstructions

In [None]:
# descriptive metadata attached to the Pyleoclim series
tang_metadata = {
    "time_name": "Time",
    "time_unit": "yrs",
    "value_name": "Surface Temperature",
    "value_unit": "ºC",
    "label": "Lake Tanganyika Surface Temperature",
}

# wrap the "Year" and "LST" columns of the proxy table in a pyleoclim.Series
proxy_t = pyleo.Series(
    time=proxy_temp["Year"],
    value=proxy_temp["LST"],
    **tang_metadata,
)

proxy_t.plot(color="C1")

## Your turn

1. What is the overall temperature trend over the past 2,000 years? Where are the major increases or decreases in temperature? 

2. What could be the cause of these shifts in lake surface temperature? 

## Last Millennium PMIP3 GMST Data

We can now load GMST anomaly data from the PMIP3 simulations for the past 1,000 years ([Braconnot et al., 2012](https://doi.org/10.1038/nclimate1456) and [PAGES 2k-PMIP3 group, 2015](https://cp.copernicus.org/articles/11/1673/2015/)).

The anomalies are computed compared to the mean of the time series over the full length of temporal overlap between simulations.

## Last Millennium PMIP3 GMST Data

In [None]:
# PMIP3 GMST table: local file name and download URL
filename_PMIP3 = "PMIP3_GMST.txt"
url_PMIP3 = "https://osf.io/gw2m5/download"

# resolve a local path for 'PMIP3_GMST.txt' (downloading if needed) and parse it
pmip3_path = pooch_load(filelocation=url_PMIP3, filename=filename_PMIP3)
df = pd.read_table(pmip3_path)

# show the raw table
df

## Last Millennium PMIP3 GMST Data

Note that the data file includes several ensemble members for [Community Earth System Model (CESM)](https://www.cesm.ucar.edu/) and [Goddard Institute for Space Studies (GISS)](https://www.giss.nasa.gov/) simulations. 

Ensembles are essentially groups of climate model simulations used for climate projections, or in this case, reconstructions. 

You will learn about this in much more detail next week, when we study Climate Modeling.

For now, we can replace these with their ensemble mean series.

## Last Millennium PMIP3 GMST Data

In [None]:
# names of the individual CESM and GISS ensemble-member columns
cesm_member_cols = [f"CESM_member_{k}" for k in range(1, 11)]
giss_member_cols = [
    "GISS-E2-R_r1i1p127.1",
    "GISS-E2-R_r1i1p127",
    "GISS-E2-R_r1i1p121",
]

# new pandas.DataFrame without the per-member columns; `df` itself is left untouched
df_new = df.drop(columns=cesm_member_cols + giss_member_cols)
df_new.head()

## Last Millennium PMIP3 GMST Data

In [None]:
# ensemble-member columns of the raw table, grouped by model
cesm_members = [f"CESM_member_{n}" for n in range(1, 11)]
giss_members = [
    "GISS-E2-R_r1i1p127.1",
    "GISS-E2-R_r1i1p127",
    "GISS-E2-R_r1i1p121",
]

# row-wise (per time step) mean across the members of each model, stored as one column per model
df_new["CESM"] = df[cesm_members].mean(axis=1)
df_new["GISS"] = df[giss_members].mean(axis=1)

## Last Millennium PMIP3 GMST Data

In [None]:
# display the processed data (the notebook renders the value of the last expression in a cell)
df_new

## Last Millennium PMIP3 GMST Data

In our new dataframe, you can now see that the ensemble members for CESM and GISS are now replaced with one ensemble mean for each model simulation.

Now we can create a Pyleoclim `Series` object for each simulated GMST time series, which will allow us to easily plot the time series for each simulation and perform data analysis using various built-in tools.

## Last Millennium PMIP3 GMST Data

In [None]:
# one pyleoclim.Series() per simulation, keyed by its column name
# (the first column is skipped — presumably it is "Year"; verify against the table)
ts_dict = {
    model: pyleo.Series(
        time=df_new["Year"].values,  # shared time axis
        value=df_new[model].values,  # simulated values for this model
        label=model,  # nickname of the series
        time_name="Time",
        time_unit="yrs",
        value_name="GMST anomaly",
        value_unit="ºC",
    )
    for model in df_new.columns[1:]
}

## Last Millennium PMIP3 GMST Data

We can now plot each simulation. For example, let's plot the CCSM4 GMST anomaly:

In [None]:
# plot the series stored under the "CCSM4" key of the dictionary built above
ts_dict["CCSM4"].plot()

## Last Millennium PMIP3 GMST Data

But what if we wanted to plot all of the PMIP3 time series at once? 

We can do that using the `MultipleSeries` object in Pyleoclim, which takes a list of `Series` objects as input. 

To do so, we have to convert the dictionary of `Series` into a list and then create a `MultipleSeries` object.

In [None]:
# the dictionary values are the pyleo.Series objects; the keys are not needed here
ts_list = list(ts_dict.values())
ms_pmip = pyleo.MultipleSeries(ts_list)

## Last Millennium PMIP3 GMST Data

We can now plot all PMIP3 simulation time series at once:

In [None]:
# anchor the legend at (1.25, 1) in axes coordinates, i.e. to the right of the plot
ms_pmip.plot(lgd_kwargs=dict(bbox_to_anchor=(1.25, 1)))

## Your turn

Note that like the paleo proxy data we have looked at, these model simulations are also referred to as reconstructions as they are attempts to recreate past climates.

The reconstructed GMST anomalies from all of the PMIP3 simulations follow the same overall trend of relatively stable, long-term temperature from 800-1800 AD, followed by an increase in temperature over the past 200 years. 

> What do you think is driving this recent warming trend?

Despite the long-term similarities, there are also noticeable differences between the GMST time series from each simulation.  

## Your turn (again)

1. How are the GMST anomalies from each simulation different? What could be causing these differences?
2. How do we know which simulation is the most accurate and reliable?

## Proxy-Model Comparisons

Proxy-based reconstructions of climate variables in the past can provide direct measurements of temperature, precipitation, greenhouse gas concentration, etc.

Comparing proxy paleoclimate records with paleoclimate model simulations can help to clarify the interpretation of the proxy record and also help to improve the ability of climate models to simulate past variations in climate.

Here, we'll compare the proxy-based Lake Tanganyika surface temperature reconstruction we downloaded and plotted before, with the GMST anomaly PMIP3 simulations.

But first, we need to standardize the Lake Tanganyika data since we're comparing the data to the GMST anomaly.

## Proxy-Model Comparisons

In [None]:
# standardize the proxy data so it can share a plot with the GMST anomalies
# NOTE(review): presumably rescales to zero mean / unit variance — confirm in the Pyleoclim docs
proxy_stnd = proxy_t.standardize()

## Proxy-Model Comparisons

In [None]:
# draw every PMIP3 simulation, with the legend moved to the right side
fig, ax = ms_pmip.plot(lgd_kwargs=dict(bbox_to_anchor=(1.25, 1)))
ax.set_ylabel("GMST anomaly (ºC)")

# second y-axis sharing the same x-axis, used for the standardized proxy record
ax1 = ax.twinx()
ax1.set_ylabel("Tanganyika Surface Temperature Anomaly (ºC)")
# NOTE(review): the plot call below may set its own y-label on ax1 and override the one above — confirm
proxy_stnd.plot(ax=ax1, color="black")

# same x window and identical y ranges on both axes
ax.set_xlim(850, 2000)
ax.set_ylim(-4, 2)
ax1.set_ylim(-4, 2)

## Your turn

How do the model simulated GMST and proxy-based surface temperature compare?

1. Is there more variability in the proxy or model temperatures? What might be causing this?

2. Are the long-term trends over the last millennium in the simulated GMST anomaly and proxy-based surface temperature record similar or different?

# Paleoclimate Reanalysis Products

## Paleoclimate Reanalysis Products

Proxies and models both have advantages and limitations for reconstructing past changes in Earth's climate system. 

One approach for combining the strengths of both paleoclimate proxies and models is data assimilation. 

This is the same approach used before, except instead of simulations of Earth's recent past, we are using a simulation that spans many thousands of years back in time.

***The results of this process are called reanalysis products.***

To do that, let us look at paleoclimate reconstructions from the Last Glacial Maximum Reanalysis (LGMR) product from [Osman et al. (2021)](https://www.nature.com/articles/s41586-021-03984-4), which contains temperature for the past 24,000 years.

## Load the LGMR Paleoclimate Reconstruction

This dataset contains a reconstruction of surface air temperature (SAT) from the product [Last Glacial Maximum Reanalysis (LGMR)](https://www.ncdc.noaa.gov/paleo/study/33112). 

Note that this data starts from 100 years before present and goes back in time to ~24,000 BP.

The period of time from ~21,000 to 18,000 years ago is referred to as the Last Glacial Maximum (LGM).

The LGM was the most recent glacial period in Earth's history. 

During this time, northern hemisphere ice sheets were larger, global sea level was lower, atmospheric CO<sub>2</sub> was lower, and global mean temperature was cooler.

## Load the LGMR Paleoclimate Reconstruction

We will calculate the global mean temperature from the LGM to 100 years before present from a paleoclimate data assimilation to assess how Earth's climate varied over the past 24,000 years.

First let's download the paleoclimate data assimilation reconstruction for surface air temperature (SAT). 

In [None]:
# LGMR surface air temperature climatology: local file name and download URL
filename_LGMR_SAT_climo = "LGMR_SAT_climo.nc"
url_LGMR_SAT_climo = (
    "https://www.ncei.noaa.gov/pub/data/paleo/reconstructions/osman2021/LGMR_SAT_climo.nc"
)

# resolve a local path (downloading if needed) and open it as an xarray Dataset
lgmr_path = pooch_load(
    filelocation=url_LGMR_SAT_climo,
    filename=filename_LGMR_SAT_climo,
)
ds = xr.open_dataset(lgmr_path)
ds

## Plotting the Temperature Time Series

Now that the data is loaded, we can plot a time series of the temperature data to assess global changes.

However, the dimensions of the `sat` variable are age-lat-lon, so we first need to weight the data and calculate a global mean.

In [None]:
# latitude weights: cosine of the latitude, after converting degrees to radians
lat_radians = np.deg2rad(ds["lat"])
weights = np.cos(lat_radians)

# weighted average over both spatial dimensions, leaving one value per age
weighted_sat = ds["sat"].weighted(weights)
sat_global_mean = weighted_sat.mean(dim=["lat", "lon"])
sat_global_mean

## Plotting the Temperature Time Series

Now that we calculated our global mean, we can plot the results as a time series to assess global changes in temperature over the past 24,000 years:

In [None]:
# plot the global mean surface temperature since the LGM
f, ax1 = plt.subplots(1, 1, figsize=(12, 6))
ax1.plot(ds["age"], sat_global_mean, linewidth=3)

# x-limits run from the largest age to the smallest, so the oldest values sit on the left
ax1.set_xlim(ds["age"].max().values, ds["age"].min().values)
# raw string: "\c" is not a valid Python escape (SyntaxWarning on recent Pythons),
# and mathtext needs the literal backslash in "\circ"
ax1.set_ylabel(r"Global Mean SAT for LGM ($^\circ$C)", fontsize=16)
ax1.set_xlabel("Age (yr BP)", fontsize=16)

## Your turn:

1. How has global temperature varied over the past 24,000 years?

2. What climate forcings may have contributed to the increase in temperature ~17,000 years ago? 

## Plotting a Temperature Anomaly Map

The reanalysis contains *spatial* reconstructions, so we can also make figures showing spatial temperature anomalies for different time periods (i.e., the change in temperature between two specified times).

The anomaly that we'll interpret is the difference between global temperature from 18,000 to 21,000 years ago (i.e. "LGM") and 100 to 1,000 years ago (i.e. "modern").

## Plotting a Temperature Anomaly Map

First, we'll calculate the average temperatures for each time period.

In [None]:
# NOTE: the age bounds are numbers rather than strings. `age` is used as a number elsewhere
# in this file (it is plotted and its min/max set the axis limits), and label-based slicing
# should use the coordinate's own type instead of relying on implicit string coercion.

# calculate the LGM (18,000-21,000 year) mean temperature
lgm = ds.sat.sel(age=slice(18000, 21000), lon=slice(0, 357.5), lat=slice(-90, 90))
lgm_mean = lgm.mean(dim="age")

# calculate the "modern" (100-1000 year) mean temperature
modern = ds.sat.sel(age=slice(100, 1000), lon=slice(0, 357.5), lat=slice(-90, 90))
modern_mean = modern.mean(dim="age")

## Plotting a Temperature Anomaly Map

Now we can calculate the anomaly and create a map to visualize the change in temperature from the LGM to present in different parts of Earth.

In [None]:
# anomaly: "modern" mean minus LGM mean, so positive values mean the modern period is warmer
sat_change = modern_mean - lgm_mean

## Plotting a Temperature Anomaly Map

In [None]:
# make a map of changes
fig, ax = plt.subplots(figsize=(12, 8), subplot_kw={"projection": ccrs.Robinson()})
ax.set_global()
sat_change.plot(
    ax=ax,
    transform=ccrs.PlateCarree(),  # the data are on a regular lon/lat grid
    x="lon",
    y="lat",
    cmap="Reds",
    vmax=30,
    # raw string: "\D" and "\c" are not valid Python escapes; mathtext needs the backslashes
    cbar_kwargs={"orientation": "horizontal", "label": r"$\Delta$SAT ($^\circ$C)"},
)
ax.coastlines()
# raw string instead of an f-string: there is nothing to interpolate
ax.set_title(r"Modern - LGM SAT ($^\circ$C)", loc="center", fontsize=16)
ax.gridlines(color="k", linewidth=1, linestyle=(0, (1, 5)))
ax.spines["geo"].set_edgecolor("black")

## Plotting a Temperature Anomaly Map

Before we interpret this data, another useful way to visualize this data is through a plot of zonal mean temperature (the average temperature for all locations at a single latitude).

Once we calculate this zonal mean, we can create a plot of LGM to present temperature anomalies versus latitude.

In [None]:
# average the anomaly over longitude, leaving one value per latitude
zonal_mean = sat_change.mean(dim="lon")
# latitude coordinate of the dataset, used as the vertical axis of the zonal-mean figure
latitude = ds.lat

## Plotting a Temperature Anomaly Map

In [None]:
# Make a zonal mean figure of the changes
fig, ax1 = plt.subplots(1, 1)
# anomaly on the x-axis, latitude on the y-axis
ax1.plot(zonal_mean, latitude)
# dotted reference line at zero change
ax1.axvline(x=0, color="gray", alpha=1, linestyle=":", linewidth=2)
ax1.set_ylim(-90, 90)
# raw strings: "\D" and "\c" are not valid Python escapes; mathtext needs the backslashes
ax1.set_xlabel(r"$\Delta$T ($^\circ$C)")
ax1.set_ylabel(r"Latitude ($^\circ$)")
ax1.set_title(
    r"Zonal-mean $\Delta$T ($^\circ$C) changes",
    loc="center",
)

## Your turn

Looking at both the map and zonal mean plot, consider the following questions: 

1. How does the temperature anomaly vary with latitude? 
2. What might be causing spatial differences in the temperature anomaly?

## Questions?

## See you in the next lecture!