# RMSE ERA5 vs. ERA5 climatology

In [1]:
import os
import sys
import yaml
from glob import glob
from datetime import datetime, timedelta

import numpy as np
import xarray as xr

In [2]:
# Put the sibling ``libs`` directory at the front of the module search path
# so that ``verif_utils`` can be imported from it.
libs_dir = os.path.realpath('../libs/')
sys.path.insert(0, libs_dir)
import verif_utils as vu

In [3]:
# Absolute path of the verification config, resolved from the working directory.
config_name = os.path.realpath('verif_config_6h.yml')

# Parse the YAML file with the safe loader; ``conf`` is what every later cell reads from.
with open(config_name, mode='r') as config_file:
    conf = yaml.safe_load(config_file)

In [4]:
# Output file for the RMSE scores, placed next to the climatology files.
# NOTE: the year span in the file name is hard-coded here; it is not derived
# from the year range configured elsewhere.
clim_save_loc = conf['ERA5_weatherbench']['save_loc_clim']
path_verif = clim_save_loc + 'combined_rmse_clim_2020_2022.nc'

### Get ERA5 analyzed states

In [5]:
# ---------------------------------------------------------------------------------------- #
# ERA5 verif target
# Collect every file matching the configured glob pattern, in sorted order.
filename_ERA5 = sorted(glob(conf['ERA5_ours']['save_loc']))

# pick years: year_range is read as [first, last], both ends inclusive
year_range = conf['ERA5_ours']['year_range']
years_pick = np.arange(year_range[0], year_range[1]+1, 1).astype(str)
# NOTE: this is a substring match on the whole path, so a year that appears
# in a directory name also selects the file.
filename_ERA5 = [fn for fn in filename_ERA5 if any(year in fn for year in years_pick)]

# Fail early with an actionable message; otherwise xr.concat would reject the
# empty list with an error that names neither the pattern nor the years.
if not filename_ERA5:
    raise ValueError(
        "No ERA5 files matched pattern {!r} for years {}-{}".format(
            conf['ERA5_ours']['save_loc'], year_range[0], year_range[1]
        )
    )

# merge yearly ERA5 as one
# (vu.get_forward_data presumably opens one file as an xarray Dataset -- see verif_utils)
ds_ERA5 = [vu.get_forward_data(fn) for fn in filename_ERA5]
ds_ERA5_merge = xr.concat(ds_ERA5, dim='time')

# Select the specified variables and their levels
variables_levels = conf['ERA5_ours']['verif_variables']

# subset merged ERA5 and unify coord names with the climatology ('lat' / 'lon')
ds_ERA5_merge = vu.ds_subset_everything(ds_ERA5_merge, variables_levels)
ds_ERA5_merge = ds_ERA5_merge.rename({'latitude':'lat','longitude':'lon'})

### Get ERA5 climatology

In [6]:
# Climatology reference file (the name suggests a 1990-2019, 6-hourly, interpolated product).
clim_file = conf['ERA5_weatherbench']['save_loc_clim'] + 'ERA5_clim_1990_2019_6h_interp.nc'
ERA5_clim = xr.open_dataset(clim_file)

**Latitude weights**

In [19]:
# Latitude weights: cosine of latitude (degrees converted to radians),
# normalised by its own mean so that the weights average to one.
lat = ERA5_clim["lat"]
cos_lat = np.cos(np.deg2rad(lat))
w_lat = cos_lat / cos_lat.mean()

### Compute RMSE

In [None]:
def map_hour_to_clim_hour(clim_hours, hour):
    """Return the position of ``hour`` within ``clim_hours``.

    Parameters
    ----------
    clim_hours : array-like
        Hour-of-day values available in the climatology, e.g. ``[0, 6, 12, 18]``.
    hour : int
        Hour of day of a verification-target time step.

    Returns
    -------
    numpy integer
        Zero-based position of the first entry of ``clim_hours`` equal to
        ``hour``. This is a position along the axis, not an hour label.

    Raises
    ------
    ValueError
        If ``hour`` does not occur in ``clim_hours``, i.e. the verification
        target and the climatology do not share the same time resolution.
    """
    # Single pass over the array; np.asarray also makes plain lists work.
    matches = np.flatnonzero(np.asarray(clim_hours) == hour)
    if matches.size == 0:
        # Raise a real exception carrying the message. The previous bare
        # `raise` had no active exception to re-raise and therefore surfaced
        # as an unrelated RuntimeError.
        raise ValueError(
            'Verification target and climatology reference have different '
            'time resolutions: hour {} is not in climatology hours {}'.format(
                hour, list(np.asarray(clim_hours))
            )
        )
    return matches[0]

In [15]:
# Chunk sizes: one time step (or one dayofyear/hour slot) per chunk.
# The 640 x 1280 lat/lon sizes are presumably the full horizontal grid -- confirm against the data.
chunk_size_clim = {'dayofyear': 1, 'hour': 1, 'lat': 640, 'lon': 1280}
chunk_size_ERA5 = {'time': 1, 'lat': 640, 'lon': 1280}

# get data in chunked version
ds_actual = ds_ERA5_merge.chunk(chunk_size_ERA5)
ds_clim = ERA5_clim.chunk(chunk_size_clim)

# ======================================================================================== #
# for ds_actual, convert its 'time' dimension to 'dayofyear' and 'hour'

# extract 'dayofyear' and 'hour' of every verification time step
dayofyear_da = ds_actual['time'].dt.dayofyear
hour_da = ds_actual['time'].dt.hour

# ======================================================================================== #
# for ds_clim, identify its indices that match ds_actual on 'dayofyear' and 'hour'

# coordinate indexes of the climatology
clim_dayofyear_index = ds_clim.get_index('dayofyear')
clim_hour_index = ds_clim.get_index('hour')

# Look up the actual coordinate VALUES (labels) in the climatology indexes;
# get_indexer returns the integer position of each label, or -1 if absent.
# The hour lookup must use the real hour of day: passing an already-converted
# position would be re-interpreted as an hour label (e.g. position 1 is not
# one of the labels 0, 6, 12, 18) and select the wrong slot or none at all.
dayofyear_indices = clim_dayofyear_index.get_indexer(dayofyear_da.values)
hour_indices = clim_hour_index.get_indexer(hour_da.values)

# check for unmatched indices (e.g. differing time resolution or missing day 366)
if np.any(dayofyear_indices == -1):
    raise ValueError("Some 'dayofyear' values not found in climatology data")
if np.any(hour_indices == -1):
    raise ValueError("Some 'hour' values not found in climatology data")

# create xr.DataArrays for indices from ds_actual to ds_clim; both share the
# 'time' dimension so that isel pairs them up element by element
dayofyear_indices_da = xr.DataArray(dayofyear_indices, dims='time', coords={'time': ds_actual['time']})
hour_indices_da = xr.DataArray(hour_indices, dims='time', coords={'time': ds_actual['time']})

# ======================================================================================== #
# broadcast ds_clim to the size of ds_actual with matched 'dayofyear' and 'hour'
clim_matched = ds_clim.isel(dayofyear=dayofyear_indices_da, hour=hour_indices_da)

# ======================================================================================== #
# RMSE computation: latitude-weighted mean of the squared difference over
# the horizontal grid, then square root; one value per time step and variable
rmse_dict = {}

for varname in variables_levels:
    # use the chunked dataset so the analysis side is processed chunk by chunk
    # like the climatology side; the values are the same as in ds_ERA5_merge
    var_analysis = ds_actual[varname]
    var_clim = clim_matched[varname]
    var_rmse = np.sqrt((w_lat * (var_clim - var_analysis)**2).mean(['lat', 'lon']))
    rmse_dict[varname] = var_rmse

rmse_dataset = xr.Dataset(rmse_dict)

In [30]:
# rmse_dataset.to_netcdf(path_verif, compute=True, format='NETCDF4')