In [None]:
import xarray as xr
from siphon.catalog import TDSCatalog
import pandas as pd
import numpy as np

def contains_all(list1, list2):
    """Tell whether every element of ``list2`` also occurs in ``list1``.

    Membership is checked with ``in`` against ``list1``. An empty ``list2``
    gives True because there is nothing that could be missing.
    """
    for wanted in list2:
        if wanted not in list1:
            # First missing element settles the answer.
            return False
    return True

def get_file_url(cat_xml, filename):
    """Derive the data URL of one file from the URL of its catalog.

    Three textual substitutions are applied to ``cat_xml``, in this order:
    ``'https://'`` becomes ``'dap2://'``, ``'/catalog/'`` becomes
    ``'/dodsC/'``, and ``'catalog.xml'`` becomes ``filename``. Every
    occurrence of each pattern is replaced.

    cat_xml: URL of a ``catalog.xml`` document.
    filename: name of a file listed in that catalog.
    Returns the rewritten URL as a string.
    """
    # (text to find, replacement) pairs; the order matters because the
    # filename is inserted last and is therefore never rewritten itself.
    substitutions = (
        ('https://', 'dap2://'),
        ('/catalog/', '/dodsC/'),
        ('catalog.xml', filename),
    )
    file_url = cat_xml
    for old, new in substitutions:
        file_url = file_url.replace(old, new)
    return file_url

def access_data(file0):
    """Open a remote dataset through pydap and make sure ``time`` is decoded.

    The dataset is first opened with xarray's own time decoding. If that
    attempt raises, the dataset is reopened with ``decode_times=False`` and
    the ``time`` coordinate is rebuilt with pandas from the ``units``
    attribute, which must read ``'days since <origin>'``.

    file0: dataset URL handed unchanged to ``xr.open_dataset``.
    Returns the opened ``xarray.Dataset``.
    Raises ValueError when the fallback path is taken and the time units
    are not of the form ``'days since <origin>'``.
    """
    try:
        remote_data = xr.open_dataset(file0, decode_times=True, engine="pydap")
    except Exception:
        # `Exception` rather than a bare `except` so that Ctrl-C and
        # SystemExit still interrupt a long remote crawl.
        print('Try ..')
        remote_data = xr.open_dataset(file0, decode_times=False, engine="pydap")
        time_values = remote_data.time.values
        # maxsplit=1: only the first ' since ' separates unit from origin.
        raw_time_units = remote_data.time.units.split(' since ', 1)
        print(raw_time_units)
        if len(raw_time_units) != 2:
            # Without an origin the dates cannot be reconstructed at all.
            raise ValueError('Cannot parse time units: ' + repr(remote_data.time.units))
        if raw_time_units[0] == 'days':
            time_unit = 'D'
            print(raw_time_units[1])
        else:
            raise ValueError('The time unit is not day ...')
        # NOTE(review): pandas counts days on the standard (proleptic
        # Gregorian) calendar. If a file that ends up here declares another
        # calendar (e.g. noleap / 360_day) the rebuilt dates will drift;
        # confirm before relying on this path.
        pandas_dates = pd.to_datetime(time_values, unit=time_unit, origin=raw_time_units[1])
        remote_data['time'] = pandas_dates
    return remote_data
    
def concat_data(cat_xml, lat = [40,55], lon = [80,95], time=[1980, 2014], plev = [85000, 50000]):
    """
    Clip every file listed in a TDS catalog to a region/period and join the
    pieces along the 'time' dimension.

        cat_xml: REQUIRED. the URL link of TDS data server, e.g., 'https://esgf.ceda.ac.uk/thredds/catalog/esg_cmip6/CMIP6/CMIP/catalog.xml'
        lat: set the latitude range to clip, default [40,55]
        lon: set the longitude range to clip, default [80,95]
        time: set the time range (first year, last year) to clip, default [1980, 2014]
        plev: set a list to select pressure levels, default [85000, 50000]

    Returns the clipped data of all overlapping files as one xarray Dataset.
    Raises ValueError if no file in the catalog overlaps the requested period.

    The list defaults are only read, never modified, so sharing them between
    calls is harmless.
    """
    # read a file name list from TDS server ...
    full_list = list(TDSCatalog(cat_xml).datasets)
    pieces = []  # clipped datasets, in catalog order
    for filename in full_list:
        file0 = get_file_url(cat_xml, filename)  # real link of the file on the server
        remote_data = access_data(file0)
        # Plain ints keep the overlap test free of DataArray truth-value logic.
        first_year = int(remote_data.time[0].dt.year)
        last_year = int(remote_data.time[-1].dt.year)
        if first_year > time[1] or last_year < time[0]:
            # File lies entirely outside the requested period: release the
            # remote handle and move on.
            remote_data.close()
            continue
        # NOTE(review): label slices assume lat/lon are stored in ascending
        # order; a descending axis would give an empty selection. Confirm
        # per model.
        dump = remote_data.sel(lat = slice(lat[0],lat[1]),
                               lon = slice(lon[0],lon[1]),
                               plev= plev,
                               time = slice(str(time[0]),str(time[1])))
        pieces.append(dump)
        print('Loaded ... ' + filename)
    if not pieces:
        # Previously this fell through to `return out` with `out` unbound.
        raise ValueError('No file in the catalog overlaps the period '
                         + str(time[0]) + '-' + str(time[1]) + ': ' + cat_xml)
    if len(pieces) == 1:
        return pieces[0].copy()  # a single file needs no concatenation
    # One concat over all pieces instead of re-copying the growing result
    # once per file.
    return xr.concat(pieces, dim = 'time')

In [None]:
# IPython help syntax (not plain Python): displays the docstring of concat_data.
# NOTE(review): development aid; consider removing it from the finished notebook.
?concat_data

In [None]:
sel = 'CMIP'

# Root of the catalog tree for the selected activity; every catalog URL below
# is built from it.
BASE_URL = 'https://esgf.ceda.ac.uk/thredds/catalog/esg_cmip6/CMIP6/' + sel
# Catalog levels that must exist, in this order, below each model.
EXPERIMENT_PATH = ['historical', 'r1i1p1f1', 'day']
# Variables that must all be listed in the daily catalog.
REQUIRED_VARS = ['ta', 'ua', 'va', 'hus', 'psl']
TABLE_COLUMNS = ['Institute', 'Model', 'nLat', 'nLon', 'nFiles', 'rLat', 'rLon']

records = []  # one dict per qualifying model; turned into a frame once at the end

cat0 = TDSCatalog(BASE_URL + '/catalog.xml')
for i0 in list(cat0.catalog_refs):  # institute
    cat1 = TDSCatalog(BASE_URL + '/' + i0 + '/catalog.xml')
    for i1 in list(cat1.catalog_refs):  # MODEL NAME
        # Walk down historical/r1i1p1f1/day, stopping at the first level the
        # server does not list.
        node_url = BASE_URL + '/' + i0 + '/' + i1
        has_all_levels = True
        for level in EXPERIMENT_PATH:
            if level not in list(TDSCatalog(node_url + '/catalog.xml').catalog_refs):
                has_all_levels = False
                break
            node_url = node_url + '/' + level
        if not has_all_levels:
            continue

        # node_url now points at the daily catalog of this model.
        if not contains_all(list(TDSCatalog(node_url + '/catalog.xml').catalog_refs), REQUIRED_VARS):
            continue
        # Only models that publish 'ta' under a 'gn' entry are kept.
        if 'gn' not in list(TDSCatalog(node_url + '/ta/catalog.xml').catalog_refs):
            continue

        cat_xml = node_url + '/ta/gn/latest/catalog.xml'
        ta_list = list(TDSCatalog(cat_xml).datasets)
        if not ta_list:
            # Catalog exists but lists no files: nothing to probe.
            continue

        # Open only the first file; it is used to read the grid description.
        s0 = get_file_url(cat_xml, ta_list[0])
        s1 = access_data(s0)
        records.append({
            'Institute': i0,
            'Model': i1,
            'nLat': s1.lat.shape[0],
            'nLon': s1.lon.shape[0],
            'nFiles': len(ta_list),
            'rLat': np.diff(s1.lat.values)[0],  # Resolution (first grid step)
            'rLon': np.diff(s1.lon.values)[0],  # Resolution (first grid step)
        })
        s1.close()  # everything needed has been read; release the remote handle

# Built in one go: exactly one row per qualifying model, no padding rows.
tab_info = pd.DataFrame(records, columns=TABLE_COLUMNS)

tab_info

In [None]:
# NOTE(review): these imports sit in a late cell, while the other imports are
# in the first cell; consider moving them there so dependencies are visible up front.
import intake_esm
import intake
# Ask intake-esm's tutorial helper for the URL registered under "google_cmip6"
# (presumably the Google Cloud CMIP6 catalog; confirm against the intake-esm docs).
cat_url = intake_esm.tutorial.get_url("google_cmip6")
# Open that URL as an ESM datastore; `cat` is the resulting catalog object.
cat = intake.open_esm_datastore(cat_url)