# Tests for the skill profiling framework

Run some tests on functions used in the skill profiling framework.

In [1]:
import pandas as pd
import numpy as np

# local imports
from config import data_dir
from analog_forecast import read_subset_era5
from run_profile import get_naive_sample_dates

Load a dataset (spatial subset) for doing some testing with.

In [2]:
%%time

varname = "t2m"
spatial_domain = "alaska"
ref_date = "2004-10-11"

raw_sub_da = read_subset_era5(spatial_domain, data_dir, varname, use_anom=False).load()

CPU times: user 4.98 s, sys: 9.69 s, total: 14.7 s
Wall time: 44.6 s


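`get_naive_sample_dates` should return a tuple of two lists: (1) the naive analog dates plus the reference date, each expanded into the run of consecutive daily dates covering its 14-day forecast period (`all_naive_dates` below), and (2) the naive analog dates themselves (`naive_analog_dates` below). Run some verifications.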

In [3]:
# Values of the `time` coordinate of the loaded subset; this is the pool of
# candidate times handed to the sampler.
all_times = raw_sub_da.time.values
# Unpack the two return values: every date to be queried, and the sampled analog dates.
all_naive_dates, naive_analog_dates = get_naive_sample_dates(all_times, ref_date)

Every date in `naive_analog_dates` should also appear in `all_naive_dates` (it is OK if the element types differ):

In [4]:
# every sampled analog date must also be present in the full list of queried dates
assert all(dt in all_naive_dates for dt in naive_analog_dates)

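`naive_analog_dates` should not contain dates that are within roughly 15 days of the forecast (reference) date; the exclusion window checked below runs from 16 days before to 15 days after it: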

In [5]:
# reference timestamp at 12:00 on the reference date
ref_dt = pd.to_datetime(f"{ref_date} 12:00:00")
forecast_length = 14
# flag any analog date falling in [ref_dt - (forecast_length + 2) d, ref_dt + (forecast_length + 1) d]
between = pd.Series(naive_analog_dates).between(
    left=ref_dt - pd.to_timedelta(forecast_length + 2, unit="D"),
    right=ref_dt + pd.to_timedelta(forecast_length + 1, unit="D"),
)
# none of the analog dates may be inside that window
assert not between.any()

`naive_analog_dates` should all be within +/- 45 days of day-of-year of `ref_date`:

In [6]:
# month and day-of-month of the reference timestamp, reused to rebuild the window for other years
ref_month, ref_day = ref_dt.month, ref_dt.day

def get_window_from_analog_date(dt, ref_month, ref_day):
    """Check whether an analog date lies inside the +/- 45 day window around the reference day-of-year.

    The window is rebuilt around ``ref_month``/``ref_day`` at 12:00 for the
    year of ``dt`` and for the two adjacent years, so that windows which
    straddle a New Year boundary are still recognised. Each window spans from
    46 days before to 45 days after its centre (both ends inclusive).

    Args:
        dt: the analog date to test; anything ``pd.to_datetime`` accepts
            (e.g. ``np.datetime64``, ``pd.Timestamp`` or a date string).
        ref_month: month number of the reference date.
        ref_day: day-of-month of the reference date.

    Returns:
        A length-1 boolean ``pd.Series`` that is True when ``dt`` falls inside
        at least one of the candidate windows.
    """
    dt = pd.to_datetime(dt)
    td_offset = pd.to_timedelta(45, "D")
    dt_series = pd.Series([dt])
    in_window = pd.Series([False])
    # Test the date that was passed in (not a fixed global date) against the
    # window centred on the reference month/day in the nearby years.
    for year in (dt.year - 1, dt.year, dt.year + 1):
        tmp_ref_dt = pd.to_datetime(f"{year}-{ref_month}-{ref_day} 12:00:00")
        in_window = in_window | dt_series.between(
            tmp_ref_dt - (td_offset + pd.to_timedelta(1, "D")),
            tmp_ref_dt + td_offset,
        )
    return in_window

# every sampled analog date must pass the day-of-year window check
assert all(
    get_window_from_analog_date(dt, ref_month, ref_day).all()
    for dt in naive_analog_dates
)

In [None]:
def get_naive_sample_dates(all_times, ref_date):
    """Construct the dates to be queried for one instance of the naive forecast.

    Candidate analog times are limited to a roughly 3-month window centred on
    the day-of-year of the reference date (46 days before to 45 days after the
    same month/day, evaluated for every year present in ``all_times``). Times
    falling inside an exclusion window around the reference date itself
    (``forecast_length + 2`` days before to ``forecast_length + 1`` days after)
    are then removed, and 5 analog times are drawn at random, without
    replacement, using numpy's global random state.

    Args:
        all_times: array-like of datetimes available for sampling.
        ref_date: reference date as a ``"YYYY-MM-DD"`` string; 12:00:00 is
            appended to form the reference timestamp.

    Returns:
        Tuple ``(all_dates, analog_times)`` where ``analog_times`` is the list
        of the 5 sampled analog times and ``all_dates`` is a flat list holding,
        for each analog time and for the reference timestamp, the daily dates
        from that time through ``forecast_length`` days later (inclusive).

    Raises:
        ValueError: raised by ``np.random.choice`` when fewer than 5 candidate
            times remain after filtering.
    """
    forecast_length = 14
    ref_dt = pd.to_datetime(ref_date + " 12:00:00")
    td_offset = pd.to_timedelta(45, "D")
    all_times = pd.Series(all_times)

    # Accumulate, with a logical "or" over the years, which times fall inside
    # the acceptance window built around the reference month/day of each year.
    # NOTE(review): a Feb 29 reference date would fail to parse for non-leap years.
    in_window = np.zeros(len(all_times), dtype=bool)
    for year in np.unique(all_times.dt.year):
        tmp_ref_dt = pd.to_datetime(f"{year}-{ref_dt.month}-{ref_dt.day}")
        in_window |= all_times.between(
            tmp_ref_dt - (td_offset + pd.to_timedelta(1, "D")),
            tmp_ref_dt + td_offset,
        ).to_numpy()
    keep_times = all_times[in_window]

    # Drop candidates inside the exclusion window around ref_date. The filtered
    # result must be assigned back, otherwise the exclusion has no effect.
    exclude = keep_times.between(
        ref_dt - pd.to_timedelta(forecast_length + 2, "D"),
        ref_dt + pd.to_timedelta(forecast_length + 1, "D"),
    )
    keep_times = keep_times[~exclude]

    # choose 5 times at random
    analog_times = list(np.random.choice(keep_times.to_numpy(), 5, replace=False))

    # expand every analog time, and the reference time, into its run of daily dates
    all_dates = []
    for t in analog_times + [ref_dt]:
        start = pd.Timestamp(t)
        all_dates.extend(
            pd.date_range(start, start + pd.to_timedelta(forecast_length, "D"))
        )

    return all_dates, analog_times

Results for anomaly-based and standard search options should be identical because we are randomly sampling the same windows for analogs for each, and forecasts should only be generated from raw data (i.e. not anomalies).

In [4]:
import xarray as xr
# local imports
from config import data_dir
from analog_forecast import read_subset_era5
from run_profile import profile_naive_forecast

In [6]:
# variable name, spatial subset name, and reference date used by the cells below
varname, spatial_domain, ref_date = "t2m", "alaska", "2004-10-11"

In [13]:
%%time
varname = "t2m"
spatial_domain = "alaska"

anom_sub_da = read_subset_era5(spatial_domain, data_dir, varname, use_anom=True).load()
raw_sub_da = read_subset_era5(spatial_domain, data_dir, varname, use_anom=False).load()

CPU times: user 5.46 s, sys: 15.7 s, total: 21.1 s
Wall time: 56 s


In [15]:
# bare expression: shows the notebook's representation of the raw subset for a quick look
raw_sub_da

CPU times: user 37.1 ms, sys: 6.15 ms, total: 43.3 ms
Wall time: 128 ms


In [12]:
%%time
# Time a .load() call on the anomaly subset.
# NOTE(review): the cell that creates anom_sub_da already calls .load(), so this
# looks redundant (the recorded wall time is well under a millisecond) -- confirm and consider removing.
anom_sub_da.load()

CPU times: user 107 µs, sys: 347 µs, total: 454 µs
Wall time: 465 µs
