In [17]:
import os
import sys
import yaml
from glob import glob
from datetime import datetime, timedelta

import numpy as np
import xarray as xr
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
sys.path.insert(0, os.path.realpath('../libs/'))
import verif_utils as vu

In [4]:
# Resolve the verification config file to an absolute path (relative to the
# current working directory) and parse it into the `conf` dictionary.
config_name = os.path.realpath('verif_config.yml')

# safe_load builds plain Python objects only (no arbitrary object construction).
with open(config_name, 'r') as stream:
    conf = yaml.safe_load(stream)

In [18]:
def map_hour_to_clim_hour(clim_hours, hour):
    """Return the position of ``hour`` inside ``clim_hours``.

    Parameters
    ----------
    clim_hours : array-like
        Hours available in the climatology reference. Lists and tuples are
        accepted and converted to a NumPy array before the lookup.
    hour : int
        Hour of the verification target to look up.

    Returns
    -------
    numpy integer
        Index (along the first axis) of the first entry equal to ``hour``.

    Raises
    ------
    ValueError
        If ``hour`` does not occur in ``clim_hours``, i.e. the verification
        target and the climatology reference have different time resolutions.
    """
    clim_hours = np.asarray(clim_hours)
    matches = np.where(clim_hours == hour)[0]

    # Fail with an explicit, catchable error instead of a bare `raise`, which
    # has no active exception to re-raise and surfaces as a RuntimeError.
    if matches.size == 0:
        raise ValueError(
            'Verification target and climatology reference have different time resolutions: '
            f'hour {hour} not found in climatology hours {clim_hours.tolist()}'
        )

    return matches[0]

def accum_6h_24h(ds_ours, ini=0, copy=True):
    """Sum a dataset into 24-hour windows along its ``time`` coordinate.

    The ``time`` coordinate is shifted back by ``ini + 6`` hours so that
    ``resample(time='24h')`` groups the steps into 24-hour windows, the
    windows are summed, and the resulting time stamps are moved forward by
    ``24 + ini`` hours.

    NOTE(review): this assumes 6-hourly steps whose time stamps mark the END
    of each accumulation period -- confirm against the data producer.

    Parameters
    ----------
    ds_ours : xarray.Dataset or xarray.DataArray
        Input with a ``time`` coordinate. It is never modified.
    ini : int, default 0
        Hour offset added to both the backward shift and the final forward shift.
    copy : bool, default True
        If True, work on a deep copy of ``ds_ours``. If False, skip the deep
        copy; the shifted coordinate is attached to a new object, so the
        caller's ``time`` values stay untouched either way.

    Returns
    -------
    Same type as ``ds_ours``
        24-hour sums, with ``time`` shifted forward by ``24 + ini`` hours
        relative to the resample labels.
    """
    h_shift = ini + 6
    h_convert_ending_time = 24 + ini

    ds_source = ds_ours.copy(deep=True) if copy else ds_ours

    # convert to start time to work with xarray resample; assign_coords returns
    # a new object, so re-running a calling cell does not shift the input again
    ds_shift = ds_source.assign_coords(
        time=ds_source['time'] - pd.Timedelta(hours=h_shift)
    )

    # accumulate
    ds_ours_24h = ds_shift.resample(time='24h').sum()

    # convert the window labels back to ending times
    ds_ours_24h = ds_ours_24h.assign_coords(
        time=ds_ours_24h['time'] + pd.Timedelta(hours=h_convert_ending_time)
    )

    return ds_ours_24h

### Weatherbench2 to ours mapping

In [6]:
# Flux variables selected for verification. Every name maps to None;
# presumably None means "no vertical-level selection" -- confirm against
# vu.ds_subset_everything.
variable_levels = dict.fromkeys(
    (
        'surface_net_solar_radiation',
        'surface_net_thermal_radiation',
        'surface_sensible_heat_flux',
        'surface_latent_heat_flux',
        'top_net_solar_radiation',
        'top_net_thermal_radiation',
    )
)

# variable_levels_clim = {
#     'mean_surface_net_short_wave_radiation_flux': None,
#     'mean_surface_net_long_wave_radiation_flux': None,
#     'mean_surface_sensible_heat_flux': None,
#     'mean_surface_latent_heat_flux': None,
#     'mean_top_net_short_wave_radiation_flux': None,
#     'mean_top_net_long_wave_radiation_flux': None,
# }

# rename_IFS_to_ERA5 = {
#     'mean_surface_net_short_wave_radiation_flux': 'surface_net_solar_radiation',
#     'mean_surface_net_long_wave_radiation_flux': 'surface_net_thermal_radiation',
#     'mean_surface_sensible_heat_flux': 'surface_sensible_heat_flux',
#     'mean_surface_latent_heat_flux': 'surface_latent_heat_flux',
#     'mean_top_net_short_wave_radiation_flux': 'top_net_solar_radiation',
#     'mean_top_net_long_wave_radiation_flux': 'top_net_thermal_radiation',
# }

### ERA5 verif target

In [7]:
# ---------------------------------------------------------------------------------------- #
# ERA5 verif target
# Collect the paths matched by the configured glob pattern, in sorted order.
filename_ERA5 = sorted(glob(conf['ERA5_ours']['save_loc']))

# pick years
# year_range is read as [first, last]; the +1 makes the last year inclusive.
year_range = conf['ERA5_ours']['year_range']
years_pick = np.arange(year_range[0], year_range[1]+1, 1).astype(str)
# Keep a path when any selected year string occurs anywhere in it
# (substring match, so a year inside a directory name matches as well).
filename_ERA5 = [fn for fn in filename_ERA5 if any(year in fn for year in years_pick)]

# merge yearly ERA5 as one
# NOTE(review): vu.get_forward_data is defined outside this notebook; presumably
# it opens one file as an xarray Dataset -- confirm in verif_utils.
ds_ERA5 = [vu.get_forward_data(fn) for fn in filename_ERA5]
ds_ERA5_merge = xr.concat(ds_ERA5, dim='time')

# NOTE(review): presumably restricts the dataset to the variables named in
# variable_levels -- confirm in verif_utils.
ds_ERA5_merge = vu.ds_subset_everything(ds_ERA5_merge, variable_levels)

# latitude weighting
# The first file is opened as zarr only to read its latitude coordinate.
# Weights are cos(latitude), with latitude converted from degrees, then
# normalized by their mean.
lat = xr.open_zarr(filename_ERA5[0])["latitude"]
w_lat = np.cos(np.deg2rad(lat))
w_lat = w_lat / w_lat.mean()

In [19]:
# Build 24-hour sums from the merged ERA5 dataset (ini keeps its default of 0);
# copy=False skips the deep copy of the input.
ds_ERA5_merge_24h = accum_6h_24h(ds_ERA5_merge, copy=False)

### ERA5 clim

In [26]:
# Earlier 6-hourly climatology variant, kept for reference:
# ERA5_clim = xr.open_dataset(conf['ERA5_weatherbench']['save_loc_clim']+'ERA5_clim_1990_2019_6h_1deg_interp.nc')
# ERA5_clim = vu.ds_subset_everything(ERA5_clim, variable_levels_clim)
# Open the 'daily' climatology file from the configured climatology folder.
# The path is built by plain string concatenation, so save_loc_clim must end
# with a path separator.
ERA5_clim = xr.open_dataset(conf['ERA5_weatherbench']['save_loc_clim']+'ERA5_clim_1990_2019_daily_1deg_interp.nc')

### RMSE from clim

In [27]:
# Names to verify: the keys of the merged ERA5 dataset, in iteration order.
varname_verif = list(ds_ERA5_merge)

In [28]:
# ======================================================================================== #
# RMSE computation
# For each variable: latitude-weighted squared difference between climatology
# and the 24-hour analysis, averaged over latitude/longitude, then square-rooted.
rmse_dict = {}

for varname in varname_verif:
    var_analysis = ds_ERA5_merge_24h[varname]
    var_clim = ERA5_clim[varname]

    # weighted mean squared error over the horizontal grid
    var_rmse = (w_lat * (var_clim - var_analysis)**2).mean(dim=['latitude', 'longitude'])
    var_rmse = np.sqrt(var_rmse)

    rmse_dict[varname] = var_rmse

# One data variable per verified name.
rmse_dataset = xr.Dataset(data_vars=rmse_dict)

In [30]:
# Output path for the climatology RMSE file ('daily' variant), built by string
# concatenation onto the configured climatology folder.
path_verif = conf['ERA5_weatherbench']['save_loc_clim']+'combined_rmse_clim_2020_2022_daily_1deg.nc'
# The write is left disabled; uncomment to save rmse_dataset as NETCDF4.
# rmse_dataset.to_netcdf(path_verif, compute=True, format='NETCDF4')

In [24]:
# Display the resolved 'daily' output path as a quick visual check.
conf['ERA5_weatherbench']['save_loc_clim']+'combined_rmse_clim_2020_2022_daily_1deg.nc'

'/glade/campaign/cisl/aiml/ksha/CREDIT_physics/VERIF/ERA5_clim/combined_rmse_clim_2020_2022_daily_1deg.nc'

In [62]:
# chunk_size_clim = {'dayofyear': 1, 'hour': 1, 'latitude': 640, 'longitude': 1280}
# chunk_size_ERA5 = {'time': 1, 'latitude': 640, 'longitude': 1280}

# # get data in chunked version
# ds_actual = ds_ERA5_merge.chunk(chunk_size_ERA5)
# ds_clim = ERA5_clim.chunk(chunk_size_clim)

# # ======================================================================================== #
# # for ds_actual, convert its 'time' dimension to 'dayofyear' and 'hour'

# # extract 'dayofyear' and 'hour' from actual data
# dayofyear_da = ds_actual['time'].dt.dayofyear
# hour_da = ds_actual['time'].dt.hour

# # map actual hours to the corresponding climatology hour
# clim_hours = ds_clim['hour'].values
# mapped_hours = np.array([map_hour_to_clim_hour(clim_hours, h) for h in hour_da.values])

# # create xr.DataArrays to hold dayofyear and mapped hours
# dayofyear_da = xr.DataArray(dayofyear_da.values, dims='time', coords={'time': ds_actual['time']})
# mapped_hours_da = xr.DataArray(mapped_hours, dims='time', coords={'time': ds_actual['time']})

# # ======================================================================================== #
# # for ds_clim, identify the indices that match ds_actual on 'dayofyear' and 'hour'

# # get indices of ds_actual from climatology data
# clim_dayofyear_index = ds_clim.get_index('dayofyear')

# ds_clim['hour'] = np.array([0, 1, 2, 3]) # <----- convert actual hour values to hour indices
# clim_hour_index = ds_clim.get_index('hour')

# dayofyear_indices = clim_dayofyear_index.get_indexer(dayofyear_da.values)
# hour_indices = clim_hour_index.get_indexer(mapped_hours_da.values)

# # check for unmatched indices
# if np.any(dayofyear_indices == -1):
#     raise ValueError("Some 'dayofyear' values not found in climatology data")
# if np.any(hour_indices == -1):
#     raise ValueError("Some 'hour' values not found in climatology data")

# # create xr.DataArrays for indices from ds_actual to ds_clim
# dayofyear_indices_da = xr.DataArray(dayofyear_indices, dims='time', coords={'time': ds_actual['time']})
# hour_indices_da = xr.DataArray(hour_indices, dims='time', coords={'time': ds_actual['time']})

# # ======================================================================================== #
# # broadcast ds_clim to the size of ds_actual with matched 'dayofyear' and 'hour'
# clim_matched = ds_clim.isel(dayofyear=dayofyear_indices_da, hour=hour_indices_da)

# # ======================================================================================== #
# # RMSE computation
# rmse_dict = {}

# for varname in varname_verif:
#     var_analysis = ds_ERA5_merge[varname]
#     var_clim = clim_matched[varname]
#     var_rmse = np.sqrt((w_lat* (var_clim - var_analysis)**2).mean(['latitude', 'longitude']))
#     rmse_dict[varname] = var_rmse

# rmse_dataset = xr.Dataset(rmse_dict)

**GPH RMSE**

In [65]:
# Time-mean of the 'Z' RMSE entry, shown as a NumPy array.
# NOTE(review): 'Z' is not one of the variable_levels keys defined in this
# notebook, so this cell presumably relies on an rmse_dataset from an earlier
# version of the workflow -- confirm it still runs after Restart & Run All.
rmse_dataset['Z'].mean(dim='time').values

array([1225.0531 , 1020.5859 , 1086.869  , 1136.2949 , 1109.3829 ,
        959.7517 ,  815.8004 ,  705.2293 ,  625.3326 ,  559.9109 ,
        557.0431 ,  568.50244], dtype=float32)

**Save**

In [66]:
# NOTE(review): this reassigns path_verif to a '6h' filename, while the RMSE
# cell above works on ds_ERA5_merge_24h and an earlier cell sets a 'daily'
# filename -- confirm which output name is intended before enabling the write.
path_verif = conf['ERA5_weatherbench']['save_loc_clim']+'combined_rmse_clim_2020_2022_6h_1deg.nc'
# rmse_dataset.to_netcdf(path_verif, compute=True, format='NETCDF4')