# Quality Control

This notebook is for validating the drought indices dataset produced using the `scripts/process.py` script.

In [36]:
import numpy as np
import xarray as xr
import pandas as pd
from config import INDICES_DIR, DOWNLOAD_DIR, CLIM_DIR
import luts

## Index validation

For each drought index, re-compute a value manually and compare with the indices dataset.

We will be working with the climatologies, the downloaded ERA5 data, and of course the computed indices data. Set up connections to these datasets.

Indices dataset:

In [3]:
# one indices file per interval; they are concatenated along a new "interval" dimension
intervals = pd.Index([1, 7, 30, 60, 90, 180, 365], name="interval")
fps = [
    INDICES_DIR.joinpath(f"nws_drought_indices_{n_days}day.nc")
    for n_days in intervals
]
indices_ds = xr.open_mfdataset(
    fps,
    combine="nested",
    concat_dim=[intervals],
)

Define some fixed variables that will be used throughout:

In [47]:
# most recent day of available data: the last time step of one of the
# "current month" downloads (the choice of file is arbitrary)
current_month_fp = DOWNLOAD_DIR.joinpath("total_precipitation_current_month.nc")
with xr.open_dataset(current_month_fp) as ds:
    ref_date = ds.time.dt.date.values[-1]

Define helper functions for extracting grid-cell data from the ERA5 downloads and from the climatologies, and for building the matching time and day-of-year slices, since we will be doing this for every index:

In [248]:
def extract_era5(index, time_slice, lat, lon):
    """Extract a grid-cell time series from the downloaded ERA5 files for an index.

    Every file in DOWNLOAD_DIR whose name starts with the prefix registered in
    ``luts.varname_prefix_lu`` for the index's ERA5 variable is opened, the grid
    cell nearest to (lat, lon) is selected, and the requested time window is
    taken. The per-file pieces are then concatenated and sorted along "time".

    Args:
        index (str): one of "tp", "pntp", "swe", "pnswe"
        time_slice (slice): label slice applied to the "time" coordinate
        lat (float): latitude of the point of interest
        lon (float): longitude of the point of interest

    Returns:
        xarray.DataArray of the ERA5 variable ("tp" or "sd") at the nearest grid
        cell, loaded into memory and sorted by time.

    Raises:
        KeyError: if index is not one of the supported names
        ValueError: if no matching files are found in DOWNLOAD_DIR
    """
    varname_lu = {
        "tp": "tp",
        "pntp": "tp",
        "swe": "sd",
        "pnswe": "sd",
    }
    varname = varname_lu[index]
    file_pattern = f"{luts.varname_prefix_lu[varname]}*.nc"
    latlon_sel_di = {"latitude": lat, "longitude": lon}

    da_list = []
    for fp in DOWNLOAD_DIR.glob(file_pattern):
        with xr.open_dataset(fp) as ds:
            # select the grid cell once; both branches below start from it
            point_da = ds[varname].sel(latlon_sel_di, method="nearest")
            if "expver" in ds.dims:
                # if expver is present, combine from both into a single
                # series and drop the expver coordinate
                point_da = xr.merge(
                    [
                        point_da.sel(expver=expver).drop_vars("expver")
                        for expver in (1, 5)
                    ]
                )[varname]
            # load before the file is closed so the returned data does not
            # depend on lazily re-reading a closed dataset
            da_list.append(point_da.sel(time=time_slice).load())

    if not da_list:
        raise ValueError(
            f"No ERA5 files matching '{file_pattern}' found in {DOWNLOAD_DIR}"
        )

    return xr.concat(da_list, dim="time").sortby("time")


def extract_clim(varname, doy_slice, lat, lon):
    """Extract a grid-cell series from one of the climatology files in CLIM_DIR.

    Args:
        varname (str): "tp" or "swe"; selects the climatology file and is also
            the name of the data variable read from it
        doy_slice (slice): slice applied to the "time" coordinate (which is
            replaced with day-of-year values when the file's first year is 1980)
        lat (float): latitude of the point of interest
        lon (float): longitude of the point of interest

    Returns:
        xarray.DataArray of the climatology at the grid cell nearest to
        (lat, lon), restricted to doy_slice and loaded into memory.

    Raises:
        KeyError: if varname has no entry in the climatology file lookup
    """
    clim_lu = {
        "tp": "era5_daily_tp_climatology_1981_2020_leap.nc",
        "swe": "era5_swe_climo_81-20.nc",
    }
    with xr.open_dataset(CLIM_DIR.joinpath(clim_lu[varname])) as clim_ds:

        # need to re-index longitude in some cases, not consistent across clim datasets
        if clim_ds.longitude.values[0] == 180:
            clim_ds = clim_ds.assign_coords(
                longitude=(clim_ds.longitude.values) - 360
            )
        # same reason as above, inconsistency between clims... reindex coords with dayofyear
        # NOTE(review): files whose first year is not 1980 keep their original
        # time coordinate; presumably it is already day-of-year - confirm
        if clim_ds.time.dt.year.values[0] == 1980:
            clim_ds = clim_ds.assign_coords(
                time=clim_ds.time.dt.dayofyear
            )

        # load inside the context manager so the returned array does not
        # depend on lazily reading from a dataset that has been closed
        clim_da = (
            clim_ds[varname]
            .sel(time=doy_slice)
            .sel(latitude=lat, longitude=lon, method="nearest")
            .load()
        )

    return clim_da

    
def get_time_slice(ref_date, interval):
    """Build a date-string slice spanning `interval` days that ends on ref_date.

    The start is ref_date minus (interval - 1) days, so ref_date itself counts
    as one of the days. Both bounds are returned as their str() form.
    """
    lookback = pd.to_timedelta(interval - 1, unit="D")
    first_day = ref_date - lookback
    return slice(str(first_day), str(ref_date))


def get_doy_slice(ref_date, interval):
    """Build a day-of-year slice spanning `interval` days that ends on ref_date.

    The stop is ref_date's day of year and the start is (interval - 1) days
    before it, so ref_date itself counts as one of the days.
    """
    last_doy = ref_date.timetuple().tm_yday
    # NOTE(review): the start is zero or negative when ref_date falls fewer than
    # `interval` days into the year; no wrap into the previous year is done here
    return slice(last_doy - (interval - 1), last_doy)

Now work through each index and test the existing values against newly computed ones for the following interval and location:

In [None]:
# interval (in days) and point location used by every check below
interval = 30
lat = 65
lon = -148

#### Total precip

Total precip should be the sum of the precip values over the specified interval.

In [None]:
index = "tp"
test = (
    indices_ds[index]
    .sel(interval=interval)
    .sel(latitude=lat, longitude=lon, method="nearest")
    .compute()
)
time_slice = get_time_slice(ref_date, interval)
raw = extract_era5(index, time_slice, lat, lon)
# convert m to cm
check = (raw.sum() * 100).astype("float32")
# compare with a tight relative tolerance rather than exact equality: the two
# float32 sums may be accumulated in a different order and differ in the last bits
assert np.isclose(check.item(), test.item(), rtol=1e-6), (
    f"{index} mismatch: recomputed {check.item()} vs dataset {test.item()}"
)

#### Total precip % of normal

In [161]:
index = "pntp"
test = (
    indices_ds[index]
    .sel(interval=interval)
    .sel(latitude=lat, longitude=lon, method="nearest")
    .compute()
)
# extract the raw precip series in this cell rather than relying on whatever
# `raw` was left behind by the previously executed cell
time_slice = get_time_slice(ref_date, interval)
raw = extract_era5(index, time_slice, lat, lon)
doy_slice = get_doy_slice(ref_date, interval)
clim = extract_clim("tp", doy_slice, lat, lon)
# percent of normal: interval total (scaled by 100, as in the total precip check)
# over the climatological total, rounded to a whole percent
check = np.round(((raw.sum() * 100) / clim.sum()).astype("float32") * 100)
assert check.item() == test.item(), (
    f"{index} mismatch: recomputed {check.item()} vs dataset {test.item()}"
)

#### Snow water equivalent

In [215]:
index = "swe"
test = (
    indices_ds[index]
    .sel(interval=interval)
    .sel(latitude=lat, longitude=lon, method="nearest")
    .compute()
)
time_slice = get_time_slice(ref_date, interval)
raw = extract_era5(index, time_slice, lat, lon)
# mean over the interval, scaled by 100 (presumably m to cm, as for precip - confirm)
check = (raw.mean() * 100).astype("float32")
# compare with a tight relative tolerance rather than exact equality: the two
# float32 means may be accumulated in a different order and differ in the last bits
assert np.isclose(check.item(), test.item(), rtol=1e-6), (
    f"{index} mismatch: recomputed {check.item()} vs dataset {test.item()}"
)

#### SWE % of normal

In [242]:
index = "pnswe"
test = (
    indices_ds[index]
    .sel(interval=interval)
    .sel(latitude=lat, longitude=lon, method="nearest")
    .compute()
)
# extract the raw SWE series in this cell rather than relying on whatever
# `raw` was left behind by the previously executed cell
time_slice = get_time_slice(ref_date, interval)
raw = extract_era5(index, time_slice, lat, lon)
doy_slice = get_doy_slice(ref_date, interval)
clim = extract_clim("swe", doy_slice, lat, lon)
# percent of normal: interval mean over the climatological mean, rounded to a
# whole percent
check = np.round((raw.mean() / clim.mean()) * 100).astype("float32")
assert check.item() == test.item(), (
    f"{index} mismatch: recomputed {check.item()} vs dataset {test.item()}"
)

yes
