In [1]:
import xarray as xr
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from pyproj import Transformer, CRS, Proj, Geod

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

SELECTED_LOCATION = "Cordova"
YEAR = 1991 # picked to avoid any leap year stuff possibly confounding the analysis
SOURCE_DIR = Path(f"/beegfs/CMIP6/wrf_era5/04km/{YEAR}")

### change this to where your outputs are
PROCESSED_FILE = Path(f"/beegfs/CMIP6/cparr4/daily_downscaled_era5_for_rasdaman/t2_mean/t2_mean_{YEAR}_daily_era5_4km_3338.nc")
source_files = sorted(SOURCE_DIR.glob(f"era5_wrf_dscale_4km_{YEAR}-*.nc"))
# Fail fast with a readable message: later cells index source_files[0] and
# loop over the list, so an empty glob would otherwise surface as an
# unhelpful IndexError far from the misconfigured path.
if not source_files:
    raise FileNotFoundError(
        f"No source files matching 'era5_wrf_dscale_4km_{YEAR}-*.nc' found in {SOURCE_DIR}"
    )
# Kept open at module level because later cells read from it repeatedly.
ds_processed = xr.open_dataset(PROCESSED_FILE)

# ak geographic locations, not honed to any particular dataset
# Each value is a (lat, lon) pair in decimal degrees; the helper functions
# below unpack the tuples in that order.
ak_locations = {
    "Anchorage": (61.2181, -149.9003),
    "Fairbanks": (64.8378, -147.7164),
    "Utqiaġvik": (71.2906, -156.7886),
    "Bethel": (60.7922, -161.7558),
    "Cordova": (60.5438, -145.7573),
    "Nome": (64.5011, -165.4064),
    "Seward": (60.1044, -149.4458),
    "WRF": (64.0, -152.0)
}

We're going to compare point extractions across a few different methods.
We will do a "naive" extraction of the source WRF data using the `ak_locations` "community" lat-lons by finding the "nearest" corresponding "XLAT" and "XLONG" grid cell.
We will convert those same community lat-lons to EPSG:3338 and then extract data from the processed reprojected dataset.
But we will also find the "exact" lat-lon of the **center** of the "nearest" grid cell that we initially found, convert those coordinates to EPSG:3338 as well, and again extract from the processed reprojected dataset.

When we extract data, we will also extract data from the surrounding neighborhood of pixels (up to 1 row and 1 column away) because we know that in some cases the grid cell in the source dataset that matches the values
in the processed dataset may be offset when we don't use the "exact" lat-lon.


In [2]:
def project_locations(locations_lat_lon):
    """Reproject named lat/lon points from EPSG:4326 to EPSG:3338.

    Parameters
    ----------
    locations_lat_lon : dict
        Maps a location name to a ``(lat, lon)`` pair.

    Returns
    -------
    dict
        Maps each location name to whatever the transformer returns for
        that point, i.e. its projected ``(x, y)`` coordinates.
    """
    # always_xy=True: the transformer takes (lon, lat) and returns (x, y),
    # so the (lat, lon) input pairs are swapped at the call site.
    transformer = Transformer.from_crs("EPSG:4326", "EPSG:3338", always_xy=True)
    projected = {}
    for name, (lat, lon) in locations_lat_lon.items():
        projected[name] = transformer.transform(lon, lat)
    return projected


def find_nearest_grid_indices(ds, locations):
    """Find the grid cell nearest (on the sphere) to each lat/lon location.

    Candidate cells are ranked by the haversine term of the great-circle
    distance rather than by a Euclidean distance in raw degrees. In degree
    space one degree of longitude counts the same as one degree of latitude,
    which over-weights east-west separation at high latitudes and breaks
    across the +/-180 degree meridian; the haversine term has neither problem
    and is monotonic in the true angular distance, so its argmin is the
    nearest cell.

    Parameters
    ----------
    ds
        Mapping-like object where ``ds['XLAT'].values`` and
        ``ds['XLONG'].values`` are same-shaped 2-D arrays of grid-cell
        latitudes and longitudes in decimal degrees.
    locations : dict
        Maps a location name to a ``(lat, lon)`` pair in decimal degrees.

    Returns
    -------
    dict
        Maps each location name to
        ``{'south_north': row_index, 'west_east': col_index}``.
    """
    ak_grid_indices = {}
    # Work in float64 radians; the grid terms are loop-invariant.
    lats = np.asarray(ds['XLAT'].values, dtype=float)
    lons = np.asarray(ds['XLONG'].values, dtype=float)
    lats_rad = np.radians(lats)
    cos_lats = np.cos(lats_rad)
    for name, (lat, lon) in locations.items():
        lat_rad = np.radians(lat)
        half_dlat = (lats_rad - lat_rad) / 2
        half_dlon = np.radians(lons - lon) / 2
        # Haversine "a" term: increases monotonically with angular distance,
        # so there is no need to finish the arcsin/sqrt for an argmin.
        hav = np.sin(half_dlat) ** 2 + cos_lats * np.cos(lat_rad) * np.sin(half_dlon) ** 2
        idx = np.unravel_index(np.argmin(hav), lats.shape)
        ak_grid_indices[name] = {'south_north': idx[0], 'west_east': idx[1]}
    return ak_grid_indices


def find_exact_lat_lon_for_grid_cell(location_indices, sample_ds):
    """Read the grid's own lat/lon at each location's grid indices.

    Parameters
    ----------
    location_indices : dict
        Maps a location name to a dict with ``'south_north'`` and
        ``'west_east'`` integer indices.
    sample_ds
        Dataset-like object whose ``'XLAT'`` and ``'XLONG'`` variables
        support ``.isel(south_north=..., west_east=...)``.

    Returns
    -------
    dict
        Maps each location name to ``(lat, lon)`` as Python floats, taken
        from ``XLAT`` / ``XLONG`` at the given indices.
    """
    def _value_at(var_name, cell):
        # Select the single cell and collapse it to a plain float.
        selected = sample_ds[var_name].isel(
            south_north=cell['south_north'],
            west_east=cell['west_east'],
        )
        return float(selected)

    return {
        name: (_value_at('XLAT', cell), _value_at('XLONG', cell))
        for name, cell in location_indices.items()
    }


def distance_km_3338(p1, p2):
    """Euclidean distance between two EPSG:3338 points, in kilometres.

    Parameters
    ----------
    p1, p2
        Two-element sequences of *(x, y)* in metres (Alaska Albers).

    Returns
    -------
    float
        Planar distance in kilometres, rounded to 3 decimal places
        (i.e. to the nearest metre).
    """
    from math import hypot
    dx = p2[0] - p1[0]
    dy = p2[1] - p1[1]
    # Inputs are in metres; divide by 1000 so the result is in kilometres,
    # as the function name promises.
    distance_km = hypot(dx, dy) / 1000
    return round(distance_km, 3)


def wrf_sphere_geo_distance_km(p1, p2):
    """
    Geodesic distance between two lat/lon points on a perfect sphere (km).

    The sphere has a radius of 6,370,000 m and zero flattening, which this
    notebook uses as WRF's spherical Earth.

    Parameters
    ----------
    p1, p2
        ``(lat, lon)`` pairs in decimal degrees.

    Returns
    -------
    float
        Distance in kilometres, rounded to 3 decimal places.
    """
    sphere = Geod(a=6_370_000, f=0)
    (lat_a, lon_a) = p1
    (lat_b, lon_b) = p2
    # Geod.inv expects lon/lat order and returns (fwd azimuth, back azimuth,
    # distance in metres); only the distance is needed here.
    _fwd_az, _back_az, metres = sphere.inv(lon_a, lat_a, lon_b, lat_b)
    return round(metres / 1000, 3)


In [3]:
def _frame_by_location(mapping, columns):
    """Build a DataFrame with one row per location key and the given columns."""
    return pd.DataFrame.from_dict(mapping, orient="index", columns=columns)


def _community_to_exact_planar_km(row):
    """Planar EPSG:3338 distance (km) from the community point to the cell centre."""
    community_xy = (row["community_m_x_3338"], row["community_m_y_3338"])
    exact_xy = (row["exact_m_x_3338"], row["exact_m_y_3338"])
    return distance_km_3338(community_xy, exact_xy)


def _community_to_exact_sphere_km(row):
    """Spherical distance (km) from the community point to the cell centre."""
    community_lat_lon = (row["community_lat"], row["community_lon"])
    exact_lat_lon = (row["exact_lat"], row["exact_lon"])
    return wrf_sphere_geo_distance_km(community_lat_lon, exact_lat_lon)


# Community points in EPSG:3338.
ak_locations_3338 = project_locations(ak_locations)

# Snap each community point to the source grid using the first source file,
# then read back the lat/lon stored at the chosen cell.
with xr.open_dataset(source_files[0]) as sample_ds:
    grid_indices = find_nearest_grid_indices(sample_ds, ak_locations)
    exact_lat_lons = find_exact_lat_lon_for_grid_cell(grid_indices, sample_ds)

# Cell-centre points in EPSG:3338.
ak_exact_locations_3338 = project_locations(exact_lat_lons)

# One small frame per kind of coordinate, all indexed by location name.
ak_locations_df = _frame_by_location(ak_locations, ["community_lat", "community_lon"])
ak_grid_indices_df = _frame_by_location(grid_indices, ["south_north", "west_east"])
ak_exact_locations_df = _frame_by_location(exact_lat_lons, ["exact_lat", "exact_lon"])
ak_locations_3338_df = _frame_by_location(
    ak_locations_3338, ["community_m_x_3338", "community_m_y_3338"]
)
ak_exact_locations_3338_df = _frame_by_location(
    ak_exact_locations_3338, ["exact_m_x_3338", "exact_m_y_3338"]
)

# Side-by-side join on the shared location index.
ak_extraction_df = pd.concat(
    [
        ak_locations_df,
        ak_locations_3338_df,
        ak_grid_indices_df,
        ak_exact_locations_df,
        ak_exact_locations_3338_df,
    ],
    axis=1,
)

# How far each community point sits from the centre of the cell it snapped to,
# measured both on the projected plane and on the sphere.
ak_extraction_df["community_distance_to_exact_km"] = ak_extraction_df.apply(
    _community_to_exact_planar_km, axis=1
)
ak_extraction_df["community_wrf_distance_to_exact_km"] = ak_extraction_df.apply(
    _community_to_exact_sphere_km, axis=1
)
ak_extraction_df

Unnamed: 0,community_lat,community_lon,community_m_x_3338,community_m_y_3338,south_north,west_east,exact_lat,exact_lon,exact_m_x_3338,exact_m_y_3338,community_distance_to_exact_km,community_wrf_distance_to_exact_km
Anchorage,61.2181,-149.9003,219349.57922,1255302.0,150,241,61.218018,-149.915176,218555.20805,1255243.0,0.797,0.796
Fairbanks,64.8378,-147.7164,297698.80568,1667062.0,252,263,64.82412,-147.734268,296998.510903,1665463.0,1.746,1.74
Utqiaġvik,71.2906,-156.7886,-102347.938497,2368028.0,429,171,71.291115,-156.810669,-103155.066138,2368118.0,0.812,0.789
Bethel,60.7922,-161.7558,-419835.813796,1225436.0,149,79,60.794342,-161.77652,-420924.299218,1225805.0,1.149,1.149
Cordova,60.5438,-145.7573,449500.61297,1201042.0,135,299,60.537258,-145.773895,448690.657345,1200203.0,1.166,1.163
Nome,64.5011,-165.4064,-544971.699063,1662325.0,260,54,64.506271,-165.421295,-545577.585685,1663016.0,0.919,0.916
Seward,60.1044,-149.4458,252166.098298,1132617.0,119,249,60.107948,-149.427734,253137.514651,1133081.0,1.077,1.076
WRF,64.0,-152.0,97696.463114,1560950.0,227,213,63.990608,-151.981476,98632.925588,1559929.0,1.385,1.38


In [4]:
def _best_offset(offset_series, processed_series):
    """Pick the neighbourhood offset whose source series best matches a processed series.

    Parameters
    ----------
    offset_series : dict
        Maps an offset key to the source daily-mean series for that cell.
    processed_series
        Daily-mean series extracted from the processed dataset.

    Returns
    -------
    tuple
        ``(best_key, deltas)``: ``deltas`` maps every offset key to the mean
        absolute difference (processed minus source) over the shared time
        steps; ``best_key`` is the key with the smallest such value.
    """
    deltas = {}
    for key, src_series in offset_series.items():
        # Inner join: compare only time steps present in both series.
        aligned_src, aligned_proc = xr.align(src_series, processed_series, join="inner")
        deltas[key] = float(np.abs(aligned_proc - aligned_src).mean())
    return min(deltas, key=deltas.get), deltas


results = {}

for location in ak_locations:
    print(f"Processing {location}...")
    results[location] = {}
    # search the neighborhood around the grid cell
    # Keys are (di, dj): di is added to the west_east index and dj to the
    # south_north index, so reported "best offsets" read (west_east, south_north).
    daily_means_dict = { (di,dj): [] for di in [-1,0,1] for dj in [-1,0,1] }
    print(f"Processing {len(source_files)} source files in a loop for multiple offsets...")
    for f in source_files:
        # could use mf data open here, but this is fast enough
        with xr.open_dataset(f) as ds:
            # Use ds.sizes (dimension name -> length). Mapping-style access on
            # ds.dims is deprecated in xarray, and the FutureWarning filter at
            # the top of the notebook would hide that deprecation.
            n_west_east = ds.sizes['west_east']
            n_south_north = ds.sizes['south_north']
            for di,dj in daily_means_dict.keys():

                wn = grid_indices[location]['west_east'] + di
                sn = grid_indices[location]['south_north'] + dj

                # Clamp to the valid index range; a neighbour that would fall
                # off the grid is replaced by the edge cell.
                wn = max(0, min(wn, n_west_east-1))
                sn = max(0, min(sn, n_south_north-1))

                source_raw = ds['T2'].isel(west_east=wn, south_north=sn)

                # Daily mean, then subtract 273.15 (presumably Kelvin -> degC,
                # to match the processed file's units -- confirm).
                daily_mean = source_raw.resample(Time="1D").mean() - 273.15
                daily_means_dict[(di,dj)].append(daily_mean)

    print("Combining daily means for each offset...")
    offset_series = {}
    for key, lst in daily_means_dict.items():
        # Rename the dimension to 'time' so it can be aligned with the
        # processed dataset's series below.
        series = xr.concat(lst, dim="Time").rename({'Time':'time'}).rename("t2_mean_source")
        offset_series[key] = series

    print("Extracting data from processed file using the COMMUNITY x and y...")
    processed_daily_mean_community = ds_processed["t2_mean"].sel(
        x=ak_locations_3338[location][0],
        y=ak_locations_3338[location][1],
        method="nearest"
    )

    print("Extracting data from processed file using the EXACT x and y...")
    processed_daily_mean_exact = ds_processed["t2_mean"].sel(
        x=ak_exact_locations_3338[location][0],
        y=ak_exact_locations_3338[location][1],
        method="nearest"
    )

    # find the best offset for the initial COMMUNITY coordinates
    # (minimum mean abs delta)
    community_best_offset, community_delta_dict = _best_offset(
        offset_series, processed_daily_mean_community
    )

    # find the best offset for the EXACT coordinates
    # (minimum mean abs delta)
    exact_best_offset, exact_delta_dict = _best_offset(
        offset_series, processed_daily_mean_exact
    )

    results[location]["exact_best_offset"] = exact_best_offset
    results[location]["community_best_offset"] = community_best_offset

    results[location]["exact_mean_abs_delta_of_offset"] = round(exact_delta_dict[exact_best_offset], 2)
    results[location]["community_mean_abs_delta_of_offset"] = round(community_delta_dict[community_best_offset], 2)


Processing Anchorage...
Processing 365 source files in a loop for multiple offsets...
Combining daily means for each offset...
Extracting data from processed file using the COMMUNITY x and y...
Extracting data from processed file using the EXACT x and y...
Processing Fairbanks...
Processing 365 source files in a loop for multiple offsets...
Combining daily means for each offset...
Extracting data from processed file using the COMMUNITY x and y...
Extracting data from processed file using the EXACT x and y...
Processing Utqiaġvik...
Processing 365 source files in a loop for multiple offsets...
Combining daily means for each offset...
Extracting data from processed file using the COMMUNITY x and y...
Extracting data from processed file using the EXACT x and y...
Processing Bethel...
Processing 365 source files in a loop for multiple offsets...
Combining daily means for each offset...
Extracting data from processed file using the COMMUNITY x and y...
Extracting data from processed file us

In [5]:
# Per-location summary: for both the EXACT and COMMUNITY extractions, the
# neighbourhood offset with the smallest mean absolute delta against the
# source data, and that delta rounded to 2 decimals.
results

{'Anchorage': {'exact_best_offset': (0, 0),
  'community_best_offset': (0, 0),
  'exact_mean_abs_delta_of_offset': 0.0,
  'community_mean_abs_delta_of_offset': 0.0},
 'Fairbanks': {'exact_best_offset': (0, 0),
  'community_best_offset': (1, 1),
  'exact_mean_abs_delta_of_offset': 0.0,
  'community_mean_abs_delta_of_offset': 0.0},
 'Utqiaġvik': {'exact_best_offset': (0, 0),
  'community_best_offset': (0, 0),
  'exact_mean_abs_delta_of_offset': 0.0,
  'community_mean_abs_delta_of_offset': 0.0},
 'Bethel': {'exact_best_offset': (0, 0),
  'community_best_offset': (1, 0),
  'exact_mean_abs_delta_of_offset': 0.0,
  'community_mean_abs_delta_of_offset': 0.0},
 'Cordova': {'exact_best_offset': (0, 0),
  'community_best_offset': (0, 1),
  'exact_mean_abs_delta_of_offset': 0.0,
  'community_mean_abs_delta_of_offset': 0.0},
 'Nome': {'exact_best_offset': (0, 0),
  'community_best_offset': (0, 0),
  'exact_mean_abs_delta_of_offset': 0.0,
  'community_mean_abs_delta_of_offset': 0.0},
 'Seward': {'e