In [None]:
import xarray as xr
import dask
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from eofs.xarray import Eof
import time

# Create large dataset (full US domain, saved as dataset_CONUS.nc)

In [None]:
# Root directory for every input and output file in this notebook. Later cells
# build paths by plain string concatenation (main_dir + 'data/...'), so the
# trailing slash is required.
main_dir = '/home/disk/eos12/wycheng/'

In [None]:
# Open every WWLLN file matching the glob as one dataset, with dask choosing
# the chunk sizes along Time, lat and lon.
wwlln_chunks = {'Time': 'auto', 'lat': 'auto', 'lon': 'auto'}
wwlln_raw = xr.open_mfdataset(
    main_dir + 'data/US/WWLLN/WWLLN_*_F_cg_1deg3hr_US.nc',
    parallel=True,
    chunks=wwlln_chunks,
)

# Rescale F by 1 / km_per_degree**2 and by the number of 3-hour steps in a year.
# NOTE(review): presumably this turns a count per 1-degree cell per 3 hours into
# a rate per km^2 per year. It treats every cell as a 111.19 km square, i.e. it
# does not shrink the cell area with latitude -- confirm this is intended.
km_per_degree = 111.19492664455873
steps_per_year = 365.25 * 8
flash_scale = (1 / (km_per_degree ** 2)) * steps_per_year
WWLLN_dataset = wwlln_raw.assign(F=lambda ds: flash_scale * ds['F'])
display(WWLLN_dataset)

In [None]:
# Open every TRMM file matching the glob as one dataset, with dask choosing
# the chunk sizes along Time, lat and lon.
trmm_raw = xr.open_mfdataset(
    main_dir + 'data/US/TRMM/TRMM_*_pcp_cg_1deg3hr_US.nc',
    parallel=True,
    chunks={'Time': 'auto', 'lat': 'auto', 'lon': 'auto'},
)

# Keep strictly positive pcp values; every other entry (zero, negative or NaN,
# since NaN > 0 is False) is replaced by 0.
TRMM_dataset = trmm_raw.assign(pcp=lambda ds: ds['pcp'].where(ds['pcp'] > 0, 0))
display(TRMM_dataset)

In [None]:
# Open the single ERA5 CAPE file (no chunks argument is passed here, unlike the
# WWLLN/TRMM multi-file opens).
era5_dir = main_dir + 'data/US/ERA5/dataset/'
ERA5_cape_dataset = xr.open_dataset(era5_dir + 'ERA5_cape_dataset.nc')
display(ERA5_cape_dataset)

In [None]:
# Open the three remaining single-file ERA5 datasets (t, q, z) from the same
# directory, in the same order as before.
era5_dir = main_dir + 'data/US/ERA5/dataset/'
ERA5_t_dataset, ERA5_q_dataset, ERA5_z_dataset = (
    xr.open_dataset(era5_dir + 'ERA5_' + short_name + '_dataset.nc')
    for short_name in ('t', 'q', 'z')
)

In [None]:
# Build the combined dataset in one chain:
#   1. merge the six source datasets into one,
#   2. keep only Time labels from 2010-01-01 through 2019-12-31,
#   3. add CP, the product of cape and pcp,
#   4. add TO, which is 1.0 where F > 0 and 0.0 elsewhere (NaN F gives 0.0),
#   5. persist the result so later cells reuse the computed arrays.
dataset = (
    xr.merge([
        WWLLN_dataset,
        TRMM_dataset,
        ERA5_cape_dataset,
        ERA5_t_dataset,
        ERA5_q_dataset,
        ERA5_z_dataset,
    ])
    .sel(Time=slice("2010-01-01", "2019-12-31"))
    .assign(CP=lambda ds: ds['cape'] * ds['pcp'])
    .assign(TO=lambda ds: 1.0 * (ds['F'] > 0))
    .persist()
)
display(dataset)

In [None]:
# Write the persisted merged dataset to a netCDF file; mode='w' replaces any
# file already at that path.
conus_path = main_dir + 'data/US/dataset/dataset_CONUS.nc'
dataset.to_netcdf(path=conus_path, mode='w')

In [None]:
# Group the merged dataset into 1-day bins along Time and average each bin,
# then persist the daily result.
# Because TO holds 0.0/1.0 values, its daily mean is the fraction of time steps
# in that day with F > 0; CP becomes the daily mean of cape * pcp.
daily_bins = dataset.resample(Time='1D')
dataset_1dd = daily_bins.mean().persist()

In [None]:
# Write the daily-mean dataset to its own netCDF file; mode='w' replaces any
# file already at that path.
conus_daily_path = main_dir + 'data/US/dataset/dataset_CONUS_1dd.nc'
dataset_1dd.to_netcdf(path=conus_daily_path, mode='w')

# Create small dataset (lat 32 to 42, lon -120 to -110 subset, saved as dataset_CAL.nc)

In [None]:
# Same root directory as in the large-dataset section, re-declared here. Paths
# below are built by plain string concatenation, so the trailing slash is required.
main_dir = '/home/disk/eos12/wycheng/'

In [None]:
# Bounds of the regional subset, used below as label-based slice limits on the
# lat and lon coordinates: (lower, upper) for each axis.
latm, latp = 32, 42
lonm, lonp = -120, -110

In [None]:
# Open every WWLLN file matching the glob as one dataset and cut it down to the
# lat/lon box given by latm/latp and lonm/lonp.
# NOTE(review): slice(low, high) selects nothing if a coordinate is stored in
# descending order -- confirm lat and lon are ascending in these files.
wwlln_box = {'lat': slice(latm, latp), 'lon': slice(lonm, lonp)}
wwlln_regional = xr.open_mfdataset(
    main_dir + 'data/US/WWLLN/WWLLN_*_F_cg_1deg3hr_US.nc',
    parallel=True,
    chunks={'Time': 'auto', 'lat': 'auto', 'lon': 'auto'},
).sel(**wwlln_box)

# Rescale F by 1 / km_per_degree**2 and by the number of 3-hour steps in a year.
# NOTE(review): presumably this turns a count per 1-degree cell per 3 hours into
# a rate per km^2 per year, treating every cell as a 111.19 km square -- confirm.
km_per_degree = 111.19492664455873
steps_per_year = 365.25 * 8
flash_scale = (1 / (km_per_degree ** 2)) * steps_per_year
WWLLN_dataset = wwlln_regional.assign(F=lambda ds: flash_scale * ds['F'])
display(WWLLN_dataset)

In [None]:
# Open every TRMM file matching the glob as one dataset and cut it down to the
# lat/lon box given by latm/latp and lonm/lonp.
trmm_box = {'lat': slice(latm, latp), 'lon': slice(lonm, lonp)}
trmm_regional = xr.open_mfdataset(
    main_dir + 'data/US/TRMM/TRMM_*_pcp_cg_1deg3hr_US.nc',
    parallel=True,
    chunks={'Time': 'auto', 'lat': 'auto', 'lon': 'auto'},
).sel(**trmm_box)

# Keep strictly positive pcp values; every other entry (zero, negative or NaN,
# since NaN > 0 is False) is replaced by 0.
TRMM_dataset = trmm_regional.assign(pcp=lambda ds: ds['pcp'].where(ds['pcp'] > 0, 0))
display(TRMM_dataset)

In [None]:
# Open the single ERA5 CAPE file and keep only the lat/lon box given by
# latm/latp and lonm/lonp.
cape_path = main_dir + 'data/US/ERA5/dataset/ERA5_cape_dataset.nc'
cape_box = {'lat': slice(latm, latp), 'lon': slice(lonm, lonp)}
ERA5_cape_dataset = xr.open_dataset(cape_path).sel(**cape_box)
display(ERA5_cape_dataset)

In [None]:
# Open the three remaining single-file ERA5 datasets (t, q, z), in that order,
# each restricted to the lat/lon box given by latm/latp and lonm/lonp.
era5_dir = main_dir + 'data/US/ERA5/dataset/'
era5_box = {'lat': slice(latm, latp), 'lon': slice(lonm, lonp)}
ERA5_t_dataset, ERA5_q_dataset, ERA5_z_dataset = (
    xr.open_dataset(era5_dir + 'ERA5_' + short_name + '_dataset.nc').sel(**era5_box)
    for short_name in ('t', 'q', 'z')
)

In [None]:
# Build the combined regional dataset in one chain:
#   1. merge the six (already box-restricted) source datasets into one,
#   2. keep only Time labels from 2010-01-01 through 2019-12-31,
#   3. add CP, the product of cape and pcp,
#   4. add TO, which is 1.0 where F > 0 and 0.0 elsewhere (NaN F gives 0.0),
#   5. persist the result so later cells reuse the computed arrays.
# Note: this rebinds the name `dataset` used by the large-dataset section.
dataset = (
    xr.merge([
        WWLLN_dataset,
        TRMM_dataset,
        ERA5_cape_dataset,
        ERA5_t_dataset,
        ERA5_q_dataset,
        ERA5_z_dataset,
    ])
    .sel(Time=slice("2010-01-01", "2019-12-31"))
    .assign(CP=lambda ds: ds['cape'] * ds['pcp'])
    .assign(TO=lambda ds: 1.0 * (ds['F'] > 0))
    .persist()
)
display(dataset)

In [None]:
# Write the persisted regional dataset to a netCDF file; mode='w' replaces any
# file already at that path.
cal_path = main_dir + 'data/US/dataset/dataset_CAL.nc'
dataset.to_netcdf(path=cal_path, mode='w')