# Validate 2km tas/pr

Use this notebook to ensure that the data actually loaded into rasdaman matches what we expect.

## CRU TS 4.0 seasonal baseline stats point extraction check

This section validates the seasonal baseline statistics coverage by comparing point queries of the coverage against manually extracted and computed values. The goal in writing the code to re-compute the values was not efficiency, but rather to make the computations as straightforward as possible.

### Setup

The CRU TS 4.0 GeoTIFFs need to be in a single folder for each variable, and each folder should be in the same directory. That directory may be stored in the `$SCRATCH_DIR` environment variable or may be set in the code cell below.

The name of this coverage in rasdaman is `iem_cru_2km_taspr_seasonal_baseline_stats`.

In [1]:
import ast
import os
import xml.etree.ElementTree as ET
from calendar import monthrange
from pathlib import Path

import numpy as np
import rasterio as rio
import requests
from pyproj import Transformer
from pyproj.crs import CRS
from rasterio.windows import Window

In [2]:
# Root directory that holds the per-variable GeoTIFF folders.
# $SCRATCH_DIR wins when it is set and non-empty; otherwise fall back to the
# hard-coded default below.
scratch_path = os.environ.get("SCRATCH_DIR") or "/atlas_scratch/kmredilla/iem-webapp"
scratch_dir = Path(scratch_path)

In [3]:
# calendar month numbers that make up each season
seasons = {
    "DJF": [12, 1, 2],
    "MAM": [3, 4, 5],
    "JJA": [6, 7, 8],
    "SON": [9, 10, 11],
}

# Per-season month weights used for weighting seasonal averages by month
# length: each month's day count (taken from 2021, a non-leap year) divided
# by the total number of days in the season.
season_weights = {}
for season, months in seasons.items():
    month_lengths = [monthrange(2021, month)[1] for month in months]
    season_length = np.sum(month_lengths)
    season_weights[season] = [days / season_length for days in month_lengths]

# Filename templates keyed by variable name. The two "{}" placeholders are
# filled with the zero-padded month and the year, in that order.
temp_fps = {
    "tas": "tas_mean_C_CRU_TS40_historical_{}_{}.tif",
    "pr": "pr_total_mm_CRU_TS40_historical_{}_{}.tif",
}

#### Test coordinates

Coordinates for testing locations are set here.

In [4]:
# test locations as (lat, lon) pairs in WGS84 (EPSG:4326)
test_coords_4326 = [
    (65.857, -147.86),
    (60.128, -149.417),
    (66.565, -152.643),
    (59.24, -135.51),
    (67.57, -162.97),
]

# reproject the test locations from EPSG:4326 to EPSG:3338
transformer = Transformer.from_crs(4326, 3338)
test_coords_3338 = list(transformer.itransform(test_coords_4326))

# show each input location next to its projected counterpart
for latlon, xy in zip(test_coords_4326, test_coords_3338):
    print(f"WGS84 Lat/Lon: {latlon}, AK Albers X,Y: {xy}")

WGS84 Lat/Lon: (65.857, -147.86), AK Albers X,Y: (280430.7467019697, 1779502.4752015218)
WGS84 Lat/Lon: (60.128, -149.417), AK Albers X,Y: (253576.23264155164, 1135359.3852707124)
WGS84 Lat/Lon: (66.565, -152.643), AK Albers X,Y: (60453.058350979816, 1845893.8418211928)
WGS84 Lat/Lon: (59.24, -135.51), AK Albers X,Y: (1037973.6664627154, 1172743.5751594375)
WGS84 Lat/Lon: (67.57, -162.97), AK Albers X,Y: (-383400.2326436074, 1982739.5047352384)


In [5]:
# pull metadata from coverage to determine axis encodings
meta_request = requests.get(
    "http://zeus.snap.uaf.edu:8080/rasdaman/ows?&SERVICE=WCS&VERSION=2.0.1"
    "&REQUEST=DescribeCoverage&COVERAGEID=iem_cru_2km_taspr_seasonal_baseline_stats",
    timeout=60,
)
# fail here on an HTTP error rather than while parsing an error page as XML
meta_request.raise_for_status()
meta_xml = ET.ElementTree(ET.fromstring(meta_request.content.decode()))

# Walk metadata -> Extension -> covMetadata -> Encoding, taking the first
# match (document order) at every level, starting from the first child of
# the document root.
node = meta_xml.getroot()[0]
for tag in (
    "{http://www.opengis.net/gmlcov/1.0}metadata",
    "{http://www.opengis.net/gmlcov/1.0}Extension",
    "{http://www.rasdaman.org}covMetadata",
    "Encoding",
):
    node = next(node.iter(tag), None)
    if node is None:
        raise ValueError(f"DescribeCoverage response contains no {tag!r} element")

# The Encoding text is parsed as a Python literal. literal_eval is used
# instead of eval because the text comes from a network response and must
# never be able to execute code.
encodings = ast.literal_eval(node.text)

#### Run the test

In [6]:
def get_test_indices(fp, x, y):
    """Get the row/col indices of a raster for the given coordinates.

    Args:
        fp: path to a raster file that rasterio can open
        x: x coordinate of the location
        y: y coordinate of the location

    Returns:
        whatever the opened dataset's ``index(x, y)`` returns for the
        location (row/col indices)
    """
    with rio.open(fp) as raster:
        return raster.index(x, y)


def get_coverage_data(x, y):
    """Query the seasonal baseline stats coverage at a single point.

    Args:
        x: value placed in the WCS ``SUBSET=X(...)`` parameter
        y: value placed in the WCS ``SUBSET=Y(...)`` parameter

    Returns:
        numpy.ndarray built from the JSON body of the GetCoverage response

    Raises:
        requests.HTTPError: if the server responds with an error status
        requests.Timeout: if the server does not respond within 60 seconds
    """
    data_request = requests.get(
        "http://zeus.snap.uaf.edu:8080/rasdaman/ows?&SERVICE=WCS&VERSION=2.0.1"
        "&REQUEST=GetCoverage&COVERAGEID=iem_cru_2km_taspr_seasonal_baseline_stats"
        f"&SUBSET=X({x})&SUBSET=Y({y})&FORMAT=application/json",
        timeout=60,
    )
    # surface HTTP errors directly instead of as a JSON decoding failure
    data_request.raise_for_status()
    return np.array(data_request.json())


def extract_tif_pixel(fp, row, col):
    """Read the band 1 value of a single pixel from a raster file.

    Args:
        fp: path to a raster file that rasterio can open
        row: row index of the pixel
        col: column index of the pixel

    Returns:
        the single value read from band 1 at (row, col)
    """
    # one-row by one-column window, so only the requested pixel is read
    pixel_window = Window.from_slices(slice(row, row + 1), slice(col, col + 1))
    with rio.open(fp) as raster:
        return raster.read(1, window=pixel_window)[0, 0]


def compute_stat(arr, stat):
    """Compute a single named summary statistic for an array.

    Args:
        arr: numpy array of values to summarise
        stat: name of the statistic, one of:
            "hi_std" (mean plus one standard deviation),
            "lo_std" (mean minus one standard deviation),
            "max", "mean", "median" (50th percentile), "min",
            "q1" (25th percentile), "q3" (75th percentile)

    Returns:
        the requested statistic of ``arr``

    Raises:
        ValueError: if ``stat`` is not one of the supported names
    """
    stat_funcs = {
        "hi_std": lambda a: a.mean() + np.std(a),
        "lo_std": lambda a: a.mean() - np.std(a),
        "max": lambda a: a.max(),
        "mean": lambda a: a.mean(),
        "median": lambda a: np.percentile(a, 50),
        "min": lambda a: a.min(),
        "q1": lambda a: np.percentile(a, 25),
        "q3": lambda a: np.percentile(a, 75),
    }
    # an unknown name previously fell through every branch and failed on an
    # unbound local; report it explicitly instead
    if stat not in stat_funcs:
        raise ValueError(
            f"Unknown statistic {stat!r}; expected one of {sorted(stat_funcs)}"
        )

    return stat_funcs[stat](arr)

def compute_season_summaries(season, varname, coords):
    """Recompute the yearly seasonal summary values at one location from the GeoTIFFs.

    For each year from 1950 through 2009, the pixel value of every month in
    the season is read from that year's monthly files. Every month of a
    season (including December for "DJF") is taken from the same calendar
    year. "tas" values are combined into a month-length-weighted average and
    "pr" values are summed.

    Uses the notebook-level ``scratch_dir``, ``temp_fps``, ``seasons`` and
    ``season_weights``.

    Args:
        season: key of ``seasons`` / ``season_weights``
        varname: key of ``temp_fps``; "tas" and "pr" are the handled values
        coords: coordinates unpacked into ``get_test_indices`` after the file path

    Returns:
        numpy array holding one seasonal summary value per year
    """
    # full path template for this variable's monthly files; kept as a str so
    # that .format() can fill in the month and year
    temp_fp = str(
        scratch_dir.joinpath(f"cru_ts40_2km_monthly_{varname}", temp_fps[varname])
    )
    # indices are looked up once from a single reference file and reused for
    # every other file (assumes all files share the same grid)
    test_idx = get_test_indices(temp_fp.format("01", "2000"), *coords)

    summaries = []
    for year in range(1950, 2010):
        month_values = [
            extract_tif_pixel(temp_fp.format(str(month).zfill(2), year), *test_idx)
            for month in seasons[season]
        ]
        if varname == "tas":
            # weighted seasonal average, which is the basis for all stats
            summaries.append(np.average(month_values, weights=season_weights[season]))
        elif varname == "pr":
            summaries.append(np.sum(month_values))

    return np.array(summaries)

In [7]:
# Compare every statistic served by the coverage against the value recomputed
# from the source GeoTIFFs, one test location at a time.
results = []
for coords in test_coords_3338:
    print(f"  Testing location: {coords}")
    arr = get_coverage_data(*coords)
    # the response is walked as nested season -> varname -> stat levels, and
    # each position is decoded to a name through `encodings`
    for si, varname_arr in enumerate(arr):
        for vi, stat_arr in enumerate(varname_arr):
            season = encodings["season"][str(si)]
            varname = encodings["varname"][str(vi)]
            season_summaries = compute_season_summaries(season, varname, coords)
            # recompute the statistics in the same order the coverage stores them
            test_values = [
                compute_stat(season_summaries, encodings["stat"][str(sti)])
                for sti in range(len(stat_arr))
            ]
            result = np.isclose(test_values, stat_arr)
            print(f"Stats match for {season}/{varname}: {result}")
            results.append(result)
    print("")

print(f"Passing test for all locations: {np.all(results)}")
# stop a Restart & Run All on a mismatch instead of relying on someone
# reading the printed summary
assert np.all(results), (
    "rasdaman coverage values do not match the values recomputed from the GeoTIFFs"
)

  Testing location: (280430.7467019697, 1779502.4752015218)
Stats match for DJF/pr: [ True  True  True  True  True  True  True  True]
Stats match for DJF/tas: [ True  True  True  True  True  True  True  True]
Stats match for JJA/pr: [ True  True  True  True  True  True  True  True]
Stats match for JJA/tas: [ True  True  True  True  True  True  True  True]
Stats match for MAM/pr: [ True  True  True  True  True  True  True  True]
Stats match for MAM/tas: [ True  True  True  True  True  True  True  True]
Stats match for SON/pr: [ True  True  True  True  True  True  True  True]
Stats match for SON/tas: [ True  True  True  True  True  True  True  True]

  Testing location: (253576.23264155164, 1135359.3852707124)
Stats match for DJF/pr: [ True  True  True  True  True  True  True  True]
Stats match for DJF/tas: [ True  True  True  True  True  True  True  True]
Stats match for JJA/pr: [ True  True  True  True  True  True  True  True]
Stats match for JJA/tas: [ True  True  True  True  True  Tr