# Quality control for regridding efforts

Use this notebook to check the quality of the regridded data.

In [61]:
# from multiprocessing import Pool
import cftime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from config import *
from regrid import open_and_crop_dataset, prod_lat_slice


### Evaluate regridding

Here we perform a qualitative assessment of the regridding by plotting side-by-side comparisons of regridded files against the source CMIP6 files they were derived from.

In [2]:
# Recursively collect every regridded netCDF file under regrid_dir.
# glob() yields paths in filesystem order, so sort them to get a stable list
# that does not change between runs or machines.
regrid_fps = sorted(regrid_dir.glob("**/*.nc"))
# Fail here with a clear message rather than later inside the sampling step.
assert regrid_fps, f"No regridded .nc files found under {regrid_dir}"


In [1]:
# define a function to return a single filepath if a given date is in the string filename
from pandas.errors import OutOfBoundsDatetime


def get_matching_time_filepath(fps, test_date):
    """Return the single filepath whose filename timespan contains ``test_date``.

    The timespan is taken from the last underscore-delimited token of each
    filename (after dropping ``.nc``), which must look like ``<start>-<end>``.
    Each bound is either ``YYYYMM`` (6 characters) or ``YYYYMMDD``. A 6-character
    start is treated as the first day of that month and a 6-character end as
    the last calendar day of that month.

    Args:
        fps: iterable of path objects exposing a ``.name`` attribute.
        test_date: timestamp to test; must be comparable with ``pd.Timestamp``.

    Returns:
        The one element of ``fps`` whose ``[start, end]`` range (both bounds
        inclusive) contains ``test_date``.

    Raises:
        AssertionError: if zero files or more than one file match.
    """
    matching_fps = []
    for fp in fps:
        start_str, end_str = fp.name.split(".nc")[0].split("_")[-1].split("-")
        try:
            if len(start_str) == 6:
                # monthly bound: parsing YYYYMM yields the first day of the month
                start_dt = pd.to_datetime(start_str, format="%Y%m")
            else:
                start_dt = pd.to_datetime(start_str, format="%Y%m%d")

            if len(end_str) == 6:
                # Roll forward to the true month end. Appending a literal "31"
                # is only valid for 31-day months and fails to parse otherwise.
                end_dt = pd.to_datetime(end_str, format="%Y%m") + pd.offsets.MonthEnd(0)
            else:
                end_dt = pd.to_datetime(end_str, format="%Y%m%d")
        except OutOfBoundsDatetime:
            # we should not be regridding files with time values that cause this (2300 etc)
            continue

        # The end bound is inclusive: a file named ...-20141231 contains 2014-12-31.
        if start_dt <= test_date <= end_dt:
            matching_fps.append(fp)

    # there should only be one
    assert len(matching_fps) == 1, (
        f"Expected exactly one file containing {test_date}, "
        f"found {len(matching_fps)}: {[fp.name for fp in matching_fps]}"
    )

    return matching_fps[0]


def generate_cmip6_filepath_from_regrid_filename(fn):
    """Locate the source CMIP6 file that a regridded file was derived from.

    The regridded filename (minus ``.nc``) is split on underscores into six
    fields: variable, frequency, model, scenario, one ignored field, and a
    ``<start>-<end>`` timespan. Candidate source files are globbed beneath the
    CMIP6 directory using those attributes, and the candidate whose own
    timespan contains the regridded file's start date is returned.

    Args:
        fn: regridded file name (string), not a full path.

    Returns:
        Path of the matching source CMIP6 file, as selected by
        ``get_matching_time_filepath``.
    """
    stem = fn.split(".nc")[0]
    var_id, freq, model, scenario, _, timespan = stem.split("_")

    institution = model_inst_lu[model]
    # scenarios listed in prod_scenarios live under ScenarioMIP, all others under CMIP
    if scenario in prod_scenarios:
        experiment_id = "ScenarioMIP"
    else:
        experiment_id = "CMIP"

    # The "grid type" path/filename components have no lookup, hence the wildcards.
    search_root = cmip6_dir.joinpath(f"{experiment_id}/{institution}/{model}/{scenario}")
    pattern = f"*/{freq}/{var_id}/*/*/{var_id}_{freq}_{model}_{scenario}_*.nc"
    candidates = list(search_root.glob(pattern))

    # start date of the regridded file: YYYYMM for 6 characters, else YYYYMMDD
    first_date = timespan.split("-")[0]
    if len(first_date) == 6:
        date_fmt = "%Y%m"
    else:
        date_fmt = "%Y%m%d"
    regrid_start = pd.to_datetime(first_date, format=date_fmt)

    return get_matching_time_filepath(candidates, regrid_start)


def plot_comparison(regrid_fp):
    """Plot a source CMIP6 field next to its regridded counterpart.

    Looks up the source file for ``regrid_fp``, opens both datasets, and draws
    the first time step of the regridded file alongside the nearest time step
    of the source file, both subset to lat 55-75 / lon 200-240.

    Args:
        regrid_fp: path object of a regridded netCDF file; ``regrid_fp.name``
            is used to find the source file.

    Returns:
        None. The figure is shown and then closed.
    """
    src_fp = generate_cmip6_filepath_from_regrid_filename(regrid_fp.name)
    src_ds = open_and_crop_dataset(src_fp, lat_slice=prod_lat_slice)
    try:
        # context manager releases the file handle even if plotting fails
        with xr.open_dataset(regrid_fp) as regrid_ds:
            lat_slice = slice(55, 75)
            lon_slice = slice(200, 240)
            time_val = regrid_ds.time.values[0]
            src_first = src_ds.time.values[0]
            var_id = src_ds.attrs["variable_id"]

            fig, axes = plt.subplots(1, 2, figsize=(15, 4))
            fig.suptitle(
                f"Variable: {var_id}     Model: {src_ds.attrs['source_id']}     Scenario: {src_ds.attrs['experiment_id']}"
            )

            # Re-express the regridded time in the source's time type so that
            # .sel() can compare it against the source time index.
            if isinstance(src_first, cftime.Datetime360Day):
                src_time = cftime.Datetime360Day(
                    year=time_val.year, month=time_val.month, day=time_val.day
                )
            elif isinstance(src_first, (np.datetime64, pd.Timestamp)) and not isinstance(
                time_val, np.datetime64
            ):
                # .values yields np.datetime64 for standard-calendar time axes;
                # time_val here is a non-numpy object exposing year/month/day.
                src_time = pd.Timestamp(
                    year=time_val.year, month=time_val.month, day=time_val.day
                )
            else:
                src_time = time_val

            src_ds[var_id].sel(time=src_time, method="nearest").sel(
                lat=lat_slice, lon=lon_slice
            ).plot(ax=axes[0])
            axes[0].set_title("Source dataset")
            regrid_ds[var_id].sel(time=time_val).sel(lat=lat_slice, lon=lon_slice).plot(
                ax=axes[1]
            )
            axes[1].set_title("Regridded dataset")
            plt.show()
            # avoid accumulating open figures when called repeatedly in a loop
            plt.close(fig)
    finally:
        # NOTE(review): assumes open_and_crop_dataset returns an xarray Dataset
        # (it is used as one above), so .close() is available - confirm.
        src_ds.close()


Now, randomly select a subset of the regridded files and plot each comparison over an Alaska-centered domain (55–75°N, 200–240°E) for some added zoom.

In [2]:
# Number of files to compare and a fixed seed, so the same files are drawn on
# every Restart & Run All.
n_comparisons = 100
rng = np.random.default_rng(42)

# Draw indices without replacement so no file is plotted twice; cap the sample
# at the number of files available.
sample_size = min(n_comparisons, len(regrid_fps))
for idx in rng.choice(len(regrid_fps), size=sample_size, replace=False):
    fp = regrid_fps[idx]
    plot_comparison(fp)