In [12]:
import pandas as pd
import numpy as np
import gcsfs
import sys
import xarray as xr

In [2]:
df = pd.read_csv('https://cmip6.storage.googleapis.com/cmip6-zarr-consolidated-stores.csv')

In [3]:
# define a simple search on keywords
def search_df(df, verbose= False, **search):
    "search by keywords - if list, then match exactly, otherwise match as substring"
    keys = ['activity_id','institution_id','source_id','experiment_id','member_id', 'table_id', 'variable_id', 'grid_label']
    d = df
    for skey in search.keys():
        
        if isinstance(search[skey], str):  # match a string as a substring
            d = d[d[skey].str.contains(search[skey])]
        else:
            dk = []
            for key in search[skey]:       # match a list of strings exactly
                dk += [d[d[skey]==key]]
            d = pd.concat(dk)
            keys.remove(skey)
    if verbose:
        for key in keys:
            print(key,' = ',list(d[key].unique()))      
    return d

In [4]:
dfp = search_df(df, experiment_id=['historical','ssp585'], table_id=['Omon','Amon'], variable_id=["uas","vas","tos"], grid_label=['gr'])


In [5]:
dm = dfp[['experiment_id','source_id','variable_id','member_id',]].groupby(['experiment_id','source_id','variable_id']).nunique()[['member_id']]

table = pd.DataFrame.pivot_table(dm, values='member_id', index=['experiment_id','source_id'],
                                 columns=['variable_id'], aggfunc=np.sum, fill_value=0)
table

Unnamed: 0_level_0,variable_id,tos,uas,vas
experiment_id,source_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
historical,CESM2,11,0,0
historical,CESM2-FV2,1,0,0
historical,CESM2-WACCM,1,0,0
historical,CESM2-WACCM-FV2,1,0,0
historical,CIESM,0,2,0
historical,CNRM-CM6-1,0,28,27
historical,CNRM-ESM2-1,0,5,5
historical,E3SM-1-0,1,0,0
historical,E3SM-1-1,1,0,0
historical,E3SM-1-1-ECA,1,0,0


In [6]:
table[(table.uas>0)&(table.vas>0)&(table.tos>0)]

Unnamed: 0_level_0,variable_id,tos,uas,vas
experiment_id,source_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
historical,EC-Earth3,2,3,2
historical,KACE-1-0-G,1,3,3
ssp585,KACE-1-0-G,1,1,1


In [13]:
def get_and_organize_cmip6_data(conf):
    # Dictionary to hold the queried variables
    first = True
    for experiment_id in conf.experiment_ids:
        for grid_label in conf.grid_labels:
            for source_id in conf.source_ids:
                for member_id in conf.member_ids:
                    for variable_id, table_id in zip(conf.variable_ids, conf.table_ids):
                        
                        # Create unique key to hold dataset in dictionary
                        key="{}_{}_{}_{}_{}".format(variable_id,experiment_id,grid_label,source_id,member_id)
                        # Historical query string
                        query_string = "source_id=='{}'and table_id=='{}' and grid_label=='{}' and experiment_id=='historical' and variable_id=='{}'".format(source_id, 
                        table_id, 
                        grid_label,
                        variable_id)
                        
                        print(
                            "Running historical query on data: \n ==> {}\n".format(query_string)
                        )
                        ds_hist = perform_cmip6_query(conf,query_string)
                       
                        # Future projection depending on choice in experiment_id
                        query_string = "source_id=='{}'and table_id=='{}' and member_id=='{}' and grid_label=='{}' and experiment_id=='{}' and variable_id=='{}'".format(
                                source_id,
                                table_id,
                                member_id,
                                grid_label,
                                experiment_id,
                                variable_id,
                            )
                            print(
                                "Running projections query on data: \n ==> {}\n".format(
                                    query_string
                                )
                            )
                        ds_proj = perform_cmip6_query(conf,query_string)

                        if first:
                            df_area = conf.df.query(
                                    "variable_id == 'areacella' and source_id =='{}'".format(
                                        source_id
                                    )
                                )
                            ds_area = xr.open_zarr(
                                    conf.fs.get_mapper(df_area.zstore.values[0]), consolidated=True
                                )
                            first = False

                        # Concatentate the historical and projections datasets
                    #    ds_hist=ds_hist.sel(time=slice(ds_hist["time"][0],"2000-12-15"))
                        #  print("Time in projection {} - {}".format(ds_proj["time"][0],ds_proj["time"][-1]))
                        ds = xr.concat([ds_hist, ds_proj], dim="time")

                        # Remove the duplicate overlapping times (e.g. 2001-2014)
                        #  _, index = np.unique(ds["time"], return_index=True)
                        #  ds = ds.isel(time=index)

                        # Extract the time period of interest
                        ds=ds.sel(time=slice(conf.start_date,conf.end_date))
                        print("{} => Dates extracted range from {} to {}\n".format(source_id,ds["time"].values[0], ds["time"].values[-1]))

                        # Save the dataset for variable_id in the dictionary
                        conf.dset_dict[key] = ds

def perform_cmip6_query(conf,query_string):
    df_sub = conf.df.query(query_string)
    if (df_sub.zstore.values.size==0):
        return df_sub
    
    mapper = conf.fs.get_mapper(df_sub.zstore.values[-1])
    ds = xr.open_zarr(mapper, consolidated=True)

    time_object = ds["time"].values[0]
    
    # Convert if necesssary
    if time_object.year == 1:
        
        times = ds["time"].values
        times_plus_2000 = []
        for t in times:
            times_plus_2000.append(
                cftime.DatetimeNoLeap(t.year + 2000, t.month, t.day, t.hour)
            )
        ds["time"].values = times_plus_2000
        ds = xr.decode_cf(ds)                    
    return ds

IndentationError: unexpected indent (<ipython-input-13-95a0c17a701e>, line 34)

In [7]:
class Config_pices():
    df = pd.read_csv(
    "https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv"
)
    fs = gcsfs.GCSFileSystem(token="anon", access="read_only")
   
    grid_labels = ["gr"]  # Can be gr=grid rotated, or gn=grid native
    member_ids = ["r1i1p1f1"]  #
    experiment_ids = ["ssp585"]  #'abrupt-4xCO2',
    source_ids = ["KACE-1-0-G"]
    variable_ids = ["tos","uas","vas"]
    table_ids = ["Omon","Amon","Amon"]  # Amon=atmospheric variables, Omon=Ocean variables, SImon=sea-ice variables
    dset_dict = {}
    start_date="1900-01-01"
    end_date="2100-08-01"

In [11]:
config_pices_obj=Config_pices()
get_and_organize_cmip6_data(config_pices_obj)

Running historical query on data: 
 ==> source_id=='KACE-1-0-G'and table_id=='Omon' and grid_label=='gr' and experiment_id=='historical' and variable_id=='tos'



NameError: name 'xr' is not defined