# Program - Read dataset related to Sc-diagnostic work

**Purpose**

**Content**
- Read datasets (CERES, ERA5, TaiESM1 hindcast simulations, etc.) and return them as xarray Datasets. The data location is selected per machine (e.g. WD, Mac Studio, rcec_300T).

**Author:** Yi-Hsuan Chen (yihsuan@umich.edu)

**Date:** 
January 2024

**Reference program:**

**Convert ipynb to py:**

jupyter nbconvert read_data_big.ipynb --to python

**import:**

import read_data as read_data


In [179]:
#import cartopy.crs as ccrs
#import cartopy.feature as cfeature
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import io, os, sys, types

#import yhc_module as yhc

xr.set_options(keep_attrs=True)  # set globally: keep attributes after xarray operations

<xarray.core.options.set_options at 0x7f19eb1ef220>

In [180]:
#yhc.lib('xr')

## Set machine name

In [181]:
# Select the machine this code runs on; it determines the root data directory.
#machine_name = "Mac_studio"
machine_name = "rcec_300T"
#machine_name = "WD"

# Root data directory for every supported machine.
# Each path MUST end with "/" because the reader functions build sub-paths by
# plain string concatenation (e.g. datapath+"data.CERES/").
machine_datapaths = {
    "Mac_studio": "/Users/yi-hsuanchen/Downloads/yihsuan/research/projects/Sc_diag/data/",
    "WD": "/Volumes/My_WD_Passport/manuscript/Sc_diag/data/",
    "rcec_300T": "/lfs/home/yihsuanc/data/",
}

if machine_name in machine_datapaths:
    datapath = machine_datapaths[machine_name]
else:
    error_meg = f"ERROR: machine_name [{machine_name}] is not supported"
    datapath = ""  # reset so a stale path from an earlier run is never reused
    raise ValueError(error_meg)

#datapath

## Read CERES data

In [182]:
def read_ceres_data(choice, datapath = datapath+"data.CERES/"):
    """
    Open the file set selected by `choice` with xarray.

    Input arguments:
      choice: a string selecting which files to open. Supported values:
        "merge_time_dims"        : three MERRA2 sample files opened together
                                   with xr.open_mfdataset.
        "individual"             : the first two MERRA2 sample files, each
                                   opened with xr.open_dataset.
        "CERES_July01_30_hourly" : two CERES SYN1deg-1H subset files
                                   (20010701-20010715 and 20010716-20010730)
                                   opened together with xr.open_mfdataset.
      datapath: directory prefix prepended to the CERES file names.
        The two MERRA2 choices ignore it and use bare file names.

    Return:
      One Xarray Dataset, except for "individual", which returns a tuple of
      two Datasets.

    Raises:
      ValueError if `choice` is not one of the supported values.

    Example (also see do_test section) :
      da_ceres = read_ceres_data("CERES_July01_30_hourly")
      da1, da2 = read_ceres_data("individual")
    """

    func_name = "read_ceres_data"

    # MERRA2 sample files shared by the two template choices below
    merra2_files = ["MERRA2_300.tavg3_3d_tdt_Np.20010710.SUB.nc",
                    "MERRA2_300.tavg3_3d_tdt_Np.20010711.SUB.nc",
                    "MERRA2_300.tavg3_3d_tdt_Np.20010712.SUB.nc"]

    #--- template: read multiple files and merge them along the time dimension
    if choice == "merge_time_dims":
        # no directory prefix: the names are used exactly as listed
        return xr.open_mfdataset(list(merra2_files))

    #--- template: read files one by one and return each dataset
    if choice == "individual":
        first_ds = xr.open_dataset(merra2_files[0])
        second_ds = xr.open_dataset(merra2_files[1])
        return first_ds, second_ds

    #--- CERES hourly data, July 1-30 2001, split over two files
    if choice == "CERES_July01_30_hourly":
        ceres_files = [
            "CERES_SYN1deg-1H_Terra-Aqua-MODIS_Ed4.1_Subset_20010701-20010715.nc",
            "CERES_SYN1deg-1H_Terra-Aqua-MODIS_Ed4.1_Subset_20010716-20010730.nc",
        ]
        ceres_paths = [datapath + name for name in ceres_files]
        return xr.open_mfdataset(ceres_paths)

    #--- to add a new choice, follow the pattern above:
    #    if choice == "<name>":
    #        new_files = ["<file>.nc"]
    #        return xr.open_mfdataset([datapath + name for name in new_files])

    raise ValueError(f"ERROR: function [{func_name}] does not support [{choice}].")


#-----------
# do_test
#-----------

# Set do_test to True to try read_ceres_data on the current machine.
do_test = False
#do_test = True

if do_test:
    choice = "CERES_July01_30_hourly"
    da1 = read_ceres_data(choice)
    
#print(da1)

## Read ERA5 data

In [183]:
def read_era5_data(choice, datapath = datapath+"data.ERA5/"): 
    """
    Open one of the ERA5 files with xarray.

    Input arguments:
      choice: a string selecting the file. Supported values:
        "era5_2001July_toa_sw"     -> ERA5-2001July-toa_sw.nc
        "era5_DYCOMS_state"        -> ERA5-DYCOMS_state.nc
        "era5_DYCOMS_single_level" -> ERA5-2001July-single_level.nc
      datapath: directory prefix prepended to the file name.

    Return:
      Xarray Dataset opened with decode_cf=False. The original note says this
      is because ERA5 variables are in short format, so the caller applies
      scale_factor/add_offset (see the commented example in the do_test section).

    Raises:
      ValueError if `choice` is not supported.
    """

    func_name = "read_era5_data"

    # (choice, file name) pairs, checked in order
    era5_catalog = (
        ("era5_2001July_toa_sw", "ERA5-2001July-toa_sw.nc"),
        ("era5_DYCOMS_state", "ERA5-DYCOMS_state.nc"),
        ("era5_DYCOMS_single_level", "ERA5-2001July-single_level.nc"),
    )

    for known_choice, era5_file in era5_catalog:
        if choice == known_choice:
            era5_paths = [datapath + era5_file]
            break
    else:
        # no entry matched
        raise ValueError(f"ERROR: function [{func_name}] does not support [{choice}].")

    #--- read files
    return xr.open_mfdataset(era5_paths, decode_cf=False)  # ERA5 variables are in short format

#-----------
# do_test
#-----------

# Set do_test to False to skip reading ERA5 data when this cell/module runs.
do_test = True
#do_test = False

if do_test:
    # other available choices: "era5_2001July_toa_sw", "era5_DYCOMS_state"
    choice = "era5_DYCOMS_single_level"

    da1 = read_era5_data(choice)
    #var1_short = da1.tisr
    #var1_float = (var1_short*var1_short.scale_factor + var1_short.add_offset) / 3600.
    #var1_float = var1_float.astype('float32')
    
    #yhc.printv(var1_short.values[0,0:10,20],'short','g')
    #yhc.printv(var1_float.values[0,0:10,20],'float','r')

#print(var1[0,0:10].values)

    #print(var1_float.mean(['time','latitude','longitude']).values)

    #print(da1.tdt_vadv)
#da1

## Read TaiESM1 hindcast data

### function - get_hindcast_filenames

In [184]:
from datetime import datetime, timedelta

############################
############################
############################
def get_previous_date(input_date, days_prior):
    """
    Return the date that lies `days_prior` days before `input_date`.

    Input argument:
        input_date : date string in 'YYYY-MM-DD' format, e.g. '2001-01-04'
        days_prior : number of days to go back, e.g. 3

    Output argument:
        a dictionary of strings with keys
            'date'     : 'YYYY-MM-DD'
            'YYYY'     : year
            'MM'       : month
            'DD'       : day
            'YYYYMMDD' : year, month and day joined without separators

    Example:
        date_prior3 = get_previous_date('2001-01-04', 3)

        date_prior3['date'] is '2001-01-01'
        date_prior3['YYYYMMDD'] is '20010101'

    Author: Yi-Hsuan Chen
    January 2024
    """
    # Parse the input and step back by the requested number of days
    earlier_day = datetime.strptime(input_date, '%Y-%m-%d') - timedelta(days=days_prior)

    # Format each component separately, then assemble the combined strings
    yyyy = earlier_day.strftime('%Y')
    mm = earlier_day.strftime('%m')
    dd = earlier_day.strftime('%d')

    return {
        'date': f"{yyyy}-{mm}-{dd}",
        'YYYY': yyyy,
        'MM': mm,
        'DD': dd,
        'YYYYMMDD': f"{yyyy}{mm}{dd}",
    }

############################
############################
############################
def get_hindcast_filenames(start_date_str, end_date_str, dayN_ensemble,
                           filename_head="hindcast01_2001July-taiesm1.F_2000_TAI.f09_f09.icdate_",
                           cam_h=".cam.h0.",
                          ):

    """
    Get hindcast simulation file names, one per day of the requested period.

    For every day D from start_date_str to end_date_str (both inclusive) the
    file name is

        {filename_head}{IC date, YYYYMMDD}{cam_h}{D, YYYY-MM-DD}-00000.nc

    where the IC date is D minus (dayN_ensemble - 1) days, i.e.
    dayN_ensemble=1 uses the file whose IC date equals D, dayN_ensemble=2
    the file whose IC date is one day earlier, and so on.

    Input:
        start_date_str = '2001-07-01'   (format 'YYYY-MM-DD')
        end_date_str = '2001-07-31'     (format 'YYYY-MM-DD')
        dayN_ensemble = 1
        filename_head : leading part of the file name, up to the IC date
        cam_h : part of the file name between the IC date and the file date

    Output:
        list of file names (no directory), e.g.
        {filename_head}20010701.cam.h0.2001-07-01-00000.nc
        The list is empty when end_date_str is earlier than start_date_str.
    """

    # Convert the date strings to datetime objects
    first_day = datetime.strptime(start_date_str, '%Y-%m-%d')
    last_day = datetime.strptime(end_date_str, '%Y-%m-%d')

    # Offset between a file's date and the IC date written in its name
    ic_offset = timedelta(days=dayN_ensemble - 1)

    # Number of days in the inclusive range (<= 0 gives an empty list)
    n_days = (last_day - first_day).days + 1

    filenames = []
    for day_index in range(n_days):
        file_day = first_day + timedelta(days=day_index)
        ic_day = file_day - ic_offset
        filenames.append(
            filename_head + ic_day.strftime('%Y%m%d')
            + cam_h + file_day.strftime('%Y-%m-%d') + "-00000.nc"
        )

    return filenames
    #print(filenames)

#------------
# do test
#-------------

# Example: day-3 hindcast file names for every day of July 2001
start_date_str, end_date_str = '2001-07-01', '2001-07-31'
dayN_ensemble = 3
filenames = get_hindcast_filenames(start_date_str, end_date_str, dayN_ensemble)
#filenames

### read_TaiESM1_hindcast_data

In [185]:
#yhc.lib('fdef')

In [205]:
def read_TaiESM1_hindcast_data(choice, dayN_ensemble,
                               icdata_option="ERA5", 
                               hindcast_period="July2001", 
                               start_date_str=None, end_date_str=None, 
                               do_print_filename=False): 
    """
    Read TaiESM1 hindcast output for a range of days and a given forecast day.

    Input arguments:
      choice: a string selecting the output stream and, for the presets, the dates.
        "TaiESM1_July01_30_2d_1hr"    : .cam.h0_2d_1h.    2001-07-01 to 2001-07-30
        "TaiESM1_July01_30_state_3hr" : .cam.h1_state_3h. 2001-07-01 to 2001-07-30
        "TaiESM1_July01_30_Ttend_3hr" : .cam.h2_Ttend_3h. 2001-07-01 to 2001-07-30
        "TaiESM1_July01_30_Qtend_3hr" : .cam.h3_Qtend_3h. 2001-07-01 to 2001-07-30
        "custom_2d_1hr", "custom_state_3hr", "custom_Ttend_3hr", "custom_Qtend_3hr"
                                      : same streams, dates taken from
                                        start_date_str / end_date_str
        "test"                        : .cam.h3_Qtend_3h. 2001-07-20 to 2001-07-30
      dayN_ensemble: forecast day; passed to get_hindcast_filenames, which
        names each day's file with the IC date (dayN_ensemble-1) days earlier.
      icdata_option, hindcast_period: select the experiment directory and file
        name head. Supported pairs: (ERA5, July2001), (JRA3Q, July2001),
        (ERA5, Oct_Nov2008).
      start_date_str, end_date_str: 'YYYY-MM-DD' strings. Required for the
        "custom_*" choices; the presets and "test" override them.
      do_print_filename: if True, print the full path of every file read.

    Return:
      Xarray Dataset from xr.open_mfdataset; the file names (without
      directory) are stored in the attribute "fnames".

    Raises:
      ValueError for an unsupported choice or icdata_option/hindcast_period
      pair, for a "custom_*" choice without dates, or when a file is missing.
    """

    func_name = "read_TaiESM1_hindcast_data"

    #--- set icdata_option
    # NOTE(review): this root is hard-coded and does not follow the
    #   machine-dependent datapath used by the other readers.
    datapath_big = "/lfs/home/yihsuanc/data/data.TaiESM1_hindcast"
    if (icdata_option == "ERA5" and hindcast_period == "July2001"):
        datapath_TaiESM = datapath_big+"/"+"data.TaiESM1.July2001_hindcast02_old/run/"
        filename_head = "hindcast01_2001July-taiesm1.F_2000_TAI.f09_f09.icdate_"

    elif (icdata_option == "JRA3Q" and hindcast_period == "July2001"):
        datapath_TaiESM = datapath_big+"/"+"data.TaiESM1.July2001_hindcast03/run/"
        filename_head = "hindcast03-taiesm1.F_2000_TAI.f09_f09.JRA3Q_icdate_"

    elif (icdata_option == "ERA5" and hindcast_period == "Oct_Nov2008"):
        datapath_TaiESM = datapath_big+"/"+"data.TaiESM1.Oct_Nov2008_hindcast03/run/"
        filename_head = "hindcast03-taiesm1.F_2000_TAI.f09_f09.ERA5_icdate_"

    else:
        # report both selectors: either one can make the pair unsupported
        error_msg = (f"ERROR: function [{func_name}] does not support "
                     f"icdata_option=[{icdata_option}] with hindcast_period=[{hindcast_period}]. "
                     f"Available: [ERA5/July2001, JRA3Q/July2001, ERA5/Oct_Nov2008]")
        raise ValueError(error_msg)

    #--- set choice: output stream
    if choice in ("TaiESM1_July01_30_2d_1hr", "custom_2d_1hr"):
        cam_h = ".cam.h0_2d_1h."

    elif choice in ("TaiESM1_July01_30_state_3hr", "custom_state_3hr"):
        cam_h = ".cam.h1_state_3h."

    elif choice in ("TaiESM1_July01_30_Ttend_3hr", "custom_Ttend_3hr"):
        cam_h = ".cam.h2_Ttend_3h."

    elif choice in ("TaiESM1_July01_30_Qtend_3hr", "custom_Qtend_3hr", "test"):
        cam_h = ".cam.h3_Qtend_3h."

    else:
        error_msg = f"ERROR: function [{func_name}] does not support [{choice}]."
        raise ValueError(error_msg)

    #--- set choice: dates (presets override the caller's dates)
    if choice in ("TaiESM1_July01_30_2d_1hr", "TaiESM1_July01_30_state_3hr",
                  "TaiESM1_July01_30_Ttend_3hr", "TaiESM1_July01_30_Qtend_3hr"):
        start_date_str = '2001-07-01'
        end_date_str = '2001-07-30'

    elif choice == "test":
        start_date_str = '2001-07-20'
        end_date_str = '2001-07-30'

    # the custom choices rely on the caller's dates; fail clearly if they are missing
    if start_date_str is None or end_date_str is None:
        error_msg = (f"ERROR: function [{func_name}] requires start_date_str and end_date_str "
                     f"(format YYYY-MM-DD) when choice=[{choice}].")
        raise ValueError(error_msg)

    #-- get file names
    filenames = get_hindcast_filenames(start_date_str, end_date_str, dayN_ensemble,
                                       filename_head=filename_head, cam_h=cam_h)

    fnames = [datapath_TaiESM+fname1 for fname1 in filenames]

    #--- check whether all files exist
    for fname1 in fnames:
        if not os.path.exists(fname1):
            error_msg = f"ERROR: The file '{fname1}' does not exist."
            raise ValueError(error_msg)

    #--- read files
    da_taiesm = xr.open_mfdataset(fnames) #, concat_dim='time', combine='by_coords')
    da_taiesm.attrs["fnames"]=filenames

    if (do_print_filename):
        for ff in fnames:
            print(ff)

    return da_taiesm
    #return fnames    

#-----------
# do_test
#-----------

# Set do_test to False to skip reading TaiESM1 hindcast data when this cell/module runs.
do_test = True
#do_test = False

if do_test:

    # preset choices (fixed dates):
    #   "TaiESM1_July01_30_2d_1hr", "TaiESM1_July01_30_state_3hr",
    #   "TaiESM1_July01_30_Ttend_3hr", "TaiESM1_July01_30_Qtend_3hr", "test"
    # custom choices (dates given below):
    #   "custom_2d_1hr", "custom_state_3hr", "custom_Ttend_3hr", "custom_Qtend_3hr"
    choice = "custom_2d_1hr"

    #start_date_str = '2008-11-01' ; end_date_str = '2008-11-30'
    start_date_str = '2008-10-03'
    end_date_str = '2008-10-10'

    dayN_ensemble = 2
    icdata_option = "ERA5"      # or "JRA3Q"
    hindcast_period = "Oct_Nov2008"

    da_taiesm = read_TaiESM1_hindcast_data(
        choice, dayN_ensemble, icdata_option,
        hindcast_period=hindcast_period,
        start_date_str=start_date_str,
        end_date_str=end_date_str,
        do_print_filename=True,
    )

#da_taiesm

/lfs/home/yihsuanc/data/data.TaiESM1_hindcast/data.TaiESM1.Oct_Nov2008_hindcast03/run/hindcast03-taiesm1.F_2000_TAI.f09_f09.ERA5_icdate_20081002.cam.h0_2d_1h.2008-10-03-00000.nc
/lfs/home/yihsuanc/data/data.TaiESM1_hindcast/data.TaiESM1.Oct_Nov2008_hindcast03/run/hindcast03-taiesm1.F_2000_TAI.f09_f09.ERA5_icdate_20081003.cam.h0_2d_1h.2008-10-04-00000.nc
/lfs/home/yihsuanc/data/data.TaiESM1_hindcast/data.TaiESM1.Oct_Nov2008_hindcast03/run/hindcast03-taiesm1.F_2000_TAI.f09_f09.ERA5_icdate_20081004.cam.h0_2d_1h.2008-10-05-00000.nc
/lfs/home/yihsuanc/data/data.TaiESM1_hindcast/data.TaiESM1.Oct_Nov2008_hindcast03/run/hindcast03-taiesm1.F_2000_TAI.f09_f09.ERA5_icdate_20081005.cam.h0_2d_1h.2008-10-06-00000.nc
/lfs/home/yihsuanc/data/data.TaiESM1_hindcast/data.TaiESM1.Oct_Nov2008_hindcast03/run/hindcast03-taiesm1.F_2000_TAI.f09_f09.ERA5_icdate_20081006.cam.h0_2d_1h.2008-10-07-00000.nc
/lfs/home/yihsuanc/data/data.TaiESM1_hindcast/data.TaiESM1.Oct_Nov2008_hindcast03/run/hindcast03-taiesm1.F_200

### read_TaiESM1_hindcast_icdate_files

In [176]:
def read_TaiESM1_hindcast_icdate_files(choice, icdate,
                                       file_dates,
                                       icdata_option="ERA5", 
                                       do_print_filenames=False,  
                                       ):
    """
    Read TaiESM1 hindcast files that all belong to ONE initial-condition date.

    Each file path is built as
        {experiment directory}{filename_head}{icdate}{cam_h}{file date}.nc

    Input arguments:
      choice: a string selecting the output stream.
        "custom_2d_1hr"    : .cam.h0_2d_1h.
        "custom_state_3hr" : .cam.h1_state_3h.
        "custom_Ttend_3hr" : .cam.h2_Ttend_3h.
        "custom_Qtend_3hr" : .cam.h3_Qtend_3h.
        "test"             : .cam.h3_Qtend_3h.
      icdate: IC date as it appears in the file name, e.g. "20010710".
      file_dates: list of file date strings as they appear in the file name,
        e.g. ["2001-07-12-00000", "2001-07-13-00000"].
      icdata_option: selects the experiment directory and file name head.
        Available: "ERA5_DYCOMS", "ERA5", "JRA3Q".
      do_print_filenames: if True, print each file path found.

    Return:
      Xarray Dataset from xr.open_mfdataset; the full file paths are stored
      in the attribute "fnames".

    Raises:
      ValueError for an unsupported choice or icdata_option, or when a file
      does not exist.
    """

    func_name = "read_TaiESM1_hindcast_icdate_files"

    #--- set icdata_option
    datapath_big = "/lfs/home/yihsuanc/data/data.TaiESM1_hindcast"
    if (icdata_option == "ERA5_DYCOMS"):
        datapath_TaiESM = datapath_big+"/"+"data.TaiESM1.July2001_hindcast02/run/"
        filename_head = "hindcast02_2001July-taiesm1.F_2000_TAI.f09_f09.icdate_"
    elif (icdata_option == "ERA5"):
        datapath_TaiESM = datapath_big+"/"+"data.TaiESM1.July2001_hindcast02_old/run/"
        filename_head = "hindcast01_2001July-taiesm1.F_2000_TAI.f09_f09.icdate_"
    elif (icdata_option == "JRA3Q"):
        datapath_TaiESM = datapath_big+"/"+"data.TaiESM1.July2001_hindcast03/run/"
        filename_head = "hindcast03-taiesm1.F_2000_TAI.f09_f09.JRA3Q_icdate_"
    else:
        error_msg = f"ERROR: function [{func_name}] does not support icdata_option=[{icdata_option}]. Available: [ERA5_DYCOMS, ERA5, JRA3Q]"
        raise ValueError(error_msg)

    #--- set choice
    if (choice == "custom_2d_1hr"):
        cam_h=".cam.h0_2d_1h."

    elif (choice == "custom_state_3hr"):
        cam_h=".cam.h1_state_3h."

    elif (choice == "custom_Ttend_3hr"):
        cam_h=".cam.h2_Ttend_3h."

    elif (choice == "custom_Qtend_3hr" or choice == "test"):
        cam_h=".cam.h3_Qtend_3h."

    else:
        error_msg = f"ERROR: function [{func_name}] does not support [{choice}]."
        raise ValueError(error_msg)

    #--- get file names, stopping at the first missing file
    filenames = []
    for date1 in file_dates:
        fname1 = f"{datapath_TaiESM}{filename_head}{icdate}{cam_h}{date1}.nc"
        if not os.path.exists(fname1):
            error_msg = f"ERROR: The file '{fname1}' does not exist."
            raise ValueError(error_msg)

        filenames.append(fname1)
        if (do_print_filenames): print(fname1)

    #--- read files
    ds = xr.open_mfdataset(filenames) #, concat_dim='time', combine='by_coords')
    ds.attrs["fnames"]=filenames

    return ds

#############
#############
#############
def get_lat_lon_indexes(ds, lat, lon):
    """
    Find the indexes of the grid point closest to (lat, lon).

    Input arguments:
      ds  : object providing ds['lat'] and ds['lon'] coordinate arrays
            (e.g. an Xarray Dataset)
      lat : target latitude
      lon : target longitude

    Return:
      (lat_index, lon_index): for each coordinate, the index where the
      absolute difference from the target value is smallest.
    """
    nearest = []
    for coord_name, target in (('lat', lat), ('lon', lon)):
        distance = abs(ds[coord_name] - target)
        nearest.append(distance.argmin().item())

    return nearest[0], nearest[1]

#-----------
# do_test
#-----------

# Set do_test to True to try read_TaiESM1_hindcast_icdate_files on the current machine.
do_test = False
#do_test = True

if do_test:
    choice = "custom_Ttend_3hr"
    icdate = "20010710"
    file_dates = ["2001-07-12-00000", "2001-07-13-00000"]
    icdata_option = "ERA5_DYCOMS"

    ds = read_TaiESM1_hindcast_icdate_files(choice, icdate, file_dates,
                                            icdata_option=icdata_option)

    # pick the grid point nearest to the target location
    lat = 30
    lon = 25
    lat_index, lon_index = get_lat_lon_indexes(ds, lat, lon)
    subset = ds.isel(lat=lat_index, lon=lon_index)
        #subset = ds.sel(lat=20, lon=240, method='nearest')

#ds