In [1]:
import os
import sys
import pickle

import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import xarray as xr
import xesmf as xe
import xcdat as xc
import xsearch as xs
import xskillscore as xscore

from glob import glob 
from global_land_mask import globe
from typing import List, Tuple, Dict, Union, Optional, Any, Callable, Iterable, Sequence, cast
from scipy.stats import linregress

# Ignore xarray warnings (bad practice)
import warnings
warnings.simplefilter("ignore") 

In [2]:
# Working directory for the notebook: the relative output paths written by
# later cells are resolved against it. The location can be overridden with the
# TROPICAL_PACIFIC_CLOUDS_DIR environment variable so the notebook is not tied
# to a single user's home directory; the default is the original path.
PROJECT_DIR = os.environ.get(
    "TROPICAL_PACIFIC_CLOUDS_DIR",
    '/home/espinosa10/tropical_pacific_clouds',
)
os.chdir(PROJECT_DIR)

In [6]:
def ingest_and_process(
    output_grid: "xr.Dataset",
    var: str = "tos",
    cmipTable: str = "Omon",
    era: str = "CMIP6",
    testing: bool = False,
    calc_anoms: bool = False
) -> "xr.DataArray":
    """
    Ingest one variable from the CMIP *historical* experiment for every
    available model, regrid it to a common grid, and stack the models.

    For each model the data are regridded bilinearly onto ``output_grid``,
    optionally deseasonalized, truncated to the first 150 years of monthly
    steps, and given a shared monthly time axis starting at 1850-01. Models
    with fewer than 150 years of data are skipped; models that raise during
    processing are reported and skipped.

    Args:
        output_grid: Target grid handed to ``regridder.horizontal`` (the
            caller builds it with ``xc.create_grid``).
        var: CMIP variable name. "ta" is taken at plev=70000 and
            "hur"/"ua"/"va"/"zg" at plev=85000 before regridding.
        cmipTable: CMIP table to search (e.g. "Omon", "Amon").
        era: "CMIP6" uses member r1i1p1f1; anything else uses r1i1p1.
        testing: If True, plot input vs. regridded output for each processed
            model and stop after two models have been processed successfully.
        calc_anoms: If True, remove the monthly climatology
            (``temporal.departures(var, "month")``). No detrending is done.

    Returns:
        xarray DataArray concatenated along a new "model" dimension, with the
        "model" coordinate set to the names of the models that were kept.

    Raises:
        ValueError: If no model could be processed for the request.
    """

    # Leading monthly steps kept per model (150 years).
    nmonths = 150 * 12
    # In testing mode, stop once this many models were processed successfully.
    max_test_models = 2

    # Specify the ensemble member
    if era == "CMIP6":
        member = "r1i1p1f1"
    else:
        member = "r1i1p1"

    # Find all the paths to the data
    dpaths = xs.findPaths(
        experiment="historical",
        variable=var,
        frequency="mon",
        cmipTable=cmipTable,
        mip_era=era,
        activity="CMIP",
        member=member
    )
    models = xs.getGroupValues(dpaths, 'model')
    print("Models: ", len(models), models)
    dpaths = list(dpaths.keys())

    # Per-model regridded fields and the names of the models that produced them
    fields = []
    valid_models = []

    for model_path, model in zip(dpaths, models):
        print("Starting model: ", model)
        print("Model path: ", model_path)

        try:
            # Load data
            da = xc.open_mfdataset(glob(model_path + "/*.nc")) #, parallel=True, chunks="auto")
            if var == "ta":
                da = da.sel(plev=70000)
            if var in ["hur", "ua", "va", "zg"]:
                da = da.sel(plev=85000)

            # Regrid Data
            output = da.regridder.horizontal(var, output_grid, tool='xesmf', method='bilinear')

            # Deseasonalize only; the historical data are not detrended here
            if calc_anoms:
                output = output.temporal.departures(var, "month")

            # The 3-way unpack also rejects anything that is not 3-D at this
            # point (the resulting ValueError is reported as a failed model).
            ntime, _, _ = output[var].shape

            if ntime < nmonths:
                print("Skipping model (fewer than", nmonths, "months): ", model)
                continue

            # Keep the first `nmonths` steps and replace the model calendar
            # with a shared monthly axis so that the models can be stacked.
            # NOTE(review): this assumes every run starts in 1850-01 -- confirm.
            output = output[var][:nmonths]
            time = np.arange(
                np.datetime64("1850-01"),
                np.datetime64("1850-01") + np.timedelta64(nmonths, 'M'),
                dtype="datetime64[M]",
            )
            output["time"] = time
            coords = output.coords.keys()
            print(coords)
            if "height" in coords:
                # `DataArray.drop` is deprecated in favour of `drop_vars`
                output = output.drop_vars('height')
            fields.append(output)
            valid_models.append(model)

            # Plot a before/after comparison and stop early if testing
            if testing:
                _, axes = plt.subplots(ncols=2, figsize=(16, 4))
                da[var].isel(time=0).plot(ax=axes[0])
                axes[0].set_title('Input data')
                output.isel(time=0).plot(ax=axes[1])
                axes[1].set_title('Output data')
                plt.tight_layout()

                # Count successes rather than loop positions, so a skipped or
                # failed model cannot turn a test run into a full run.
                if len(fields) >= max_test_models:
                    break

        except Exception as e:
            # Best effort: one bad model must not abort the whole collection
            print("Model failed: ", model, e)

    if not fields:
        raise ValueError(
            f"No models could be processed for variable '{var}' "
            f"(table={cmipTable}, era={era}, experiment=historical)"
        )

    combined = xr.concat(fields, dim='model', coords='minimal')
    combined.coords['model'] = list(valid_models)

    return combined


def collect_data():
    """
    Iterate through all the configured eras and variables, ingest each one
    with `ingest_and_process`, and write the result to a netCDF file in the
    current working directory.

    The variable list, the anomaly switch and the 1 x 1 degree output grid
    are configured inside the function; the module-level `TESTING` flag is
    read at call time and forwarded to `ingest_and_process`.

    Returns:
        None. One netCDF file is written per (era, variable) pair.
    """

    # Define Constants
    eras = ["CMIP6"]
    # variables = ["rsutcs", "rsut", "tos"] # Shortwave cloud forcing
    # variables = ["hur", "tas", "psl", "ta"] # EIS
    # variables = ["rlut", "rlutcs"] # Longwave cloud forcing
    # Every entry ends with a comma so that lines can be (un)commented freely
    # without adjacent string literals being concatenated.
    variables = [
        # "hur",
        # "rlds", # surface downwelling longwave flux, all sky
        # "rlus", # surface upwelling longwave flux, all sky
        # "rldscs", # surface downwelling longwave flux, clear sky
        # "rsds", # surface downwelling shortwave flux, all sky
        # "rsdscs", # surface downwelling shortwave flux, clear sky
        # "rsus", # surface upwelling shortwave flux, all sky (probably don't need)
        # "rsuscs", # surface upwelling shortwave flux, clear sky (probably don't need)
        # "hfls", # surface upward latent heat flux
        # "hfss", # surface upward sensible heat flux
        # "sfcWind", # surface wind speed
        # "uas", # zonal 10 meter wind speed
        # "vas", # meridional 10 meter wind speed
        # "psl", # Mean sea-level pressure
        # "ua", # zonal wind speed (lowest level bc not all models have 10m wind speed)
        # "va", # meridional wind speed (lowest level bc not all models have 10m wind speed)
        "zg",
        # "tauu", # zonal wind stress
        # "tauv", # meridional wind stress
        # "tas", # surface air temperature
        # "pr", # precipitation flux (kg m^-2 s^-1)
    ]
    # Single switch for both the processing and the output file name
    calc_anoms = False

    # Create output grid (1 x 1 degree cell centres)
    # 2.5 x 2.5 degree alternative:
    # lat = np.arange(-88.75, 90, 2.5)
    # lon = np.arange(1.25, 360, 2.5)
    lat = np.arange(-89.5, 90.5, 1)
    lon = np.arange(.5, 360.5, 1)
    output_grid = xc.create_grid(lat, lon)

    for era in eras:
        print("Starting era: ", era)
        for var in variables:
            if var == "sfcWind" and era == "CMIP6":
                continue

            print("Starting variable: ", var)

            # Only the sea surface temperature lives in the ocean table
            table = "Omon" if var == "tos" else "Amon"

            ds = ingest_and_process(
                var=var,
                cmipTable=table,
                era=era,
                testing=TESTING,
                output_grid=output_grid,
                # Forward the local flag so that the file name chosen below
                # always matches what was actually computed.
                calc_anoms=calc_anoms,
            )

            print(ds)

            # File-name label, kept separate from the loop variable.
            # NOTE(review): "hur" and "ta" are tagged "-surface" although
            # ingest_and_process selects them at 85000 Pa and 70000 Pa.
            label = f"{var}-surface" if var in ("hur", "ta") else var

            # NOTE(review): the "850hpa" tag is applied to every variable and
            # "1850-2100" does not match the 150 years starting 1850-01 that
            # ingest_and_process keeps. Names are left unchanged so existing
            # readers of these files keep working.
            if calc_anoms:
                out_path = f"{label}_mon_1850-2100_anoms_{era}_historical.nc"
            else:
                out_path = f"{label}_850hpa_mon_1850-2100_{era}_historical.nc"

            try:
                ds.to_netcdf(out_path)
            finally:
                # Release the data even if the write fails
                ds.close()


# Module-level switch read by collect_data() and forwarded to
# ingest_and_process(testing=...). It must be assigned before the call below.
TESTING = False
# Run the full collection; writes one netCDF file per (era, variable) pair.
collect_data()

Starting era:  CMIP6
Starting variable:  zg
Models:  57 ['E3SM-1-1', 'E3SM-1-0', 'E3SM-1-1-ECA', 'KIOST-ESM', 'CIESM', 'BCC-ESM1', 'BCC-CSM2-MR', 'CESM2-WACCM', 'CESM2-WACCM-FV2', 'CESM2-FV2', 'CESM2', 'FGOALS-f3-L', 'CAS-ESM2-0', 'FGOALS-g3', 'SAM0-UNICON', 'AWI-CM-1-1-MR', 'AWI-ESM-1-1-LR', 'GFDL-ESM4', 'GFDL-CM4', 'GISS-E2-1-G-CC', 'GISS-E2-2-H', 'GISS-E2-1-H', 'GISS-E2-2-G', 'GISS-E2-1-G', 'CanESM5', 'CAMS-CSM1-0', 'MCM-UA-1-0', 'KACE-1-0-G', 'INM-CM5-0', 'INM-CM4-8', 'MPI-ESM-1-2-HAM', 'TaiESM1', 'EC-Earth3-CC', 'EC-Earth3-Veg-LR', 'EC-Earth3', 'EC-Earth3-Veg', 'EC-Earth3-AerChem', 'CMCC-ESM2', 'CMCC-CM2-HR4', 'CMCC-CM2-SR5', 'ACCESS-ESM1-5', 'MRI-ESM2-0', 'ACCESS-CM2', 'NESM3', 'MIROC6', 'IPSL-CM6A-LR', 'IPSL-CM6A-LR-INCA', 'NorCPM1', 'NorESM2-MM', 'FIO-ESM-2-0', 'ICON-ESM-LR', 'MPI-ESM1-2-LR', 'MPI-ESM1-2-HR', 'E3SM-2-0', 'CanESM5-1', 'E3SM-2-0-NARRM', 'NorESM2-LM']
Starting model:  E3SM-1-1
Model path:  /p/user_pub/work/CMIP6/CMIP/E3SM-Project/E3SM-1-1/historical/r1i1p1f1/Amon/