# Define shared utility functions for the notebook.
Import components and define utility functions.

Run this cell before other cells in the notebook.

In [2]:
import time
import datetime
import hf_hydrodata as hf
import xarray as xr
import pandas as pd
from parflow import read_pfb
from IPython.display import display

def get_snowtel_data(site_id, start_date, num_days):
    """
    Get SNOTEL average air temperature for one site for num_days starting at start_date.

    Parameters:
        site_id:    site identifier passed to the hf_hydrodata filter (e.g. "1000:OR:SNTL").
        start_date: first day to return, as a "YYYY-MM-DD" string.
        num_days:   number of consecutive days to return, starting at start_date.

    Returns: (data, x, y, lat, lon) where
        data     is the site's "temp_avg" values for the requested days with 273.15
                 added (presumably Celsius -> Kelvin; confirm against the dataset units),
        x, y     are the conus2 coordinates from hf.to_meters divided by 1000 and
                 truncated to int,
        lat, lon are the "latitude"/"longitude" attributes of the site dataset.
    """
    site_filter_options = {"dataset": "snotel", "variable": "air_temp", "site_id": site_id}
    site_path = hf.get_paths(site_filter_options)[0]

    # The selection slice is inclusive at both ends, so the last day is
    # start_date + (num_days - 1).
    first_day = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    end_date = (first_day + datetime.timedelta(days=num_days - 1)).strftime("%Y-%m-%d")

    # Read everything needed while the file is open, then release the handle.
    with xr.open_dataset(site_path) as ds:
        lat = ds.attrs["latitude"]
        lon = ds.attrs["longitude"]
        data = ds["temp_avg"].sel(date=slice(start_date, end_date)).values

    # The grid position is derived from the projected meters; the previous
    # hf.to_ij() call was overwritten immediately and has been removed.
    (x, y) = hf.to_meters("conus2", lat, lon)
    x = int(x / 1000)
    y = int(y / 1000)

    data = data + 273.15
    return (data, x, y, lat, lon)

def get_cw3e_data(x, y, start_date, num_days):
    """
    Get CW3E daily mean air temperature at one grid cell for num_days starting at start_date.

    Parameters:
        x, y:       grid cell position; the request covers grid_bounds [x, y, x+1, y+1].
        start_date: first day to return, as a "YYYY-MM-DD" string.
        num_days:   number of consecutive days to return.

    Return: a list of num_days floats, one per day, each the mean over the first
    axis of the hourly data returned by hf.get_gridded_data for that day.
    Returns an empty list when num_days is 0.
    """
    # Start from an empty list so a zero-day request yields [] rather than None.
    result = []
    grid_bounds = [x, y, x + 1, y + 1]
    one_day = datetime.timedelta(hours=24)
    start_time = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    for _ in range(num_days):
        end_time = start_time + one_day
        cw3e_filter_options = {
            "dataset": "CW3E", "variable": "air_temp", "temporal_resolution": "hourly",
            "grid_bounds": grid_bounds, "start_time": start_time, "end_time": end_time}
        cw3e_data = hf.get_gridded_data(cw3e_filter_options)
        # Collapse the leading (hourly) axis, then take the single cell requested.
        result.append(float(cw3e_data.mean(axis=0)[0][0]))
        start_time = end_time
    return result

def get_new_cw3e_data(
    x, y, start_date, num_days,
    forcing_dir="/hydrodata/temp/CW3E_Forcing_Update_21_Jan_24/processing/processed_data/hourly",
):
    """
    Get daily mean temperature from the updated CW3E hourly PFB files for num_days
    starting at start_date.

    Parameters:
        x, y:        grid cell position used for the PFB subset (start == stop == cell).
        start_date:  first day to return, as a "YYYY-MM-DD" string.
        num_days:    number of consecutive days to return.
        forcing_dir: directory holding the per-water-year "WY<year>" folders of
                     hourly PFB files. Defaults to the location this notebook used.

    Return: a list of num_days floats, one per day, each the mean of the values
    read from that day's PFB file. Returns an empty list when num_days is 0.
    """
    result = []
    one_day = datetime.timedelta(hours=24)
    start_time = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    for _ in range(num_days):
        # Water year WY runs from Oct 1 of WY-1; months Oct-Dec belong to the next year.
        wy = start_time.year if start_time.month < 10 else start_time.year + 1
        wy_start = datetime.datetime(wy - 1, 10, 1)
        # File names carry 1-based hour offsets within the water year, 24 hours per file.
        hr_start = int((start_time - wy_start).total_seconds() / 3600) + 1
        hr_end = hr_start + 23
        pfb_path = f"{forcing_dir}/WY{wy}/CW3E.Temp.{hr_start:06d}_to_{hr_end:06d}.pfb"
        # z spans 0..24, presumably the 24 hourly layers in the file (confirm).
        pfb_constraint = {
            "x": {"start": int(x), "stop": int(x)},
            "y": {"start": int(y), "stop": int(y)},
            "z": {"start": 0, "stop": 24},
        }
        data = read_pfb(pfb_path, pfb_constraint)
        # Plain float keeps the element type consistent with get_cw3e_data.
        result.append(float(data.mean()))
        start_time = start_time + one_day
    return result
 

def print_site_cw3e_differences(site_id, start_date, num_days, tolerance=5):
    """
    Print the difference in average temperature for the observation site compared with
    both the old and the new CW3E data for num_days from start_date.

    Only days where the old or the new CW3E value differs from the site value by
    more than `tolerance` are listed.

    Parameters:
        site_id:    observation site identifier.
        start_date: first day to compare, as a "YYYY-MM-DD" string.
        num_days:   number of consecutive days to compare.
        tolerance:  absolute difference above which a day is reported.

    Returns: the pandas DataFrame of reported rows (also shown with display()).
    """
    (site_data, x, y, lat, lon) = get_snowtel_data(site_id, start_date, num_days)
    cw3e_data = get_cw3e_data(x, y, start_date, num_days)
    new_cw3e_data = get_new_cw3e_data(x, y, start_date, num_days)

    first_day = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    num_old_diffs = 0
    num_new_diffs = 0
    diff_rows = []
    for i in range(num_days):
        old_value = cw3e_data[i]
        new_value = new_cw3e_data[i]
        diff_old = site_data[i] - old_value
        diff_new = site_data[i] - new_value
        old_exceeds = abs(diff_old) > tolerance
        new_exceeds = abs(diff_new) > tolerance
        if not (old_exceeds or new_exceeds):
            continue
        # Count old and new separately; a row is listed if either one exceeds.
        if old_exceeds:
            num_old_diffs += 1
        if new_exceeds:
            num_new_diffs += 1
        date = (first_day + datetime.timedelta(days=i)).strftime("%Y-%m-%d")
        diff_rows.append(
            [site_id, date, site_data[i], old_value, new_value, diff_old, diff_new, x, y, lat, lon]
        )

    columns = [
        "Site_id", "Date", "Site_data", "Old_CW3E_value", "New_CW3E_value",
        "Old_CW3E_DIFF", "New_CW3E_DIFF", "X", "Y", "Lat", "Lon",
    ]
    diff_df = pd.DataFrame(diff_rows, columns=columns)

    print(f"Site value differences that differ by more than {tolerance}.")
    display(diff_df)
    print(f"Found {num_old_diffs} Old and {num_new_diffs} New CW3E rows different from site greater than {tolerance}.")
    return diff_df
    
# Last statement of the cell: its output confirms the definitions above were executed.
print("Loaded new utility functions.")





Loaded new utility functions.


# Find and display differences between site values and CW3E gridded values for selected sites

In [8]:
# Sites to compare. Alternative site lists kept for reference (swap in as needed):
#   ["350:WY:SNTL", "347:MT:SNTL", "368:UT:SNTL", "396:UT:SNTL", "913:CO:SNTL", "2090:AR:SCAN"]
#   ["301:CA:SNTL"]
site_ids = ["1000:OR:SNTL"]

start_time = "2005-10-01"
num_days = 4
for site_id in site_ids:
    # The function prints and displays its own report for each site, so its
    # result does not need to be captured or echoed here.
    print_site_cw3e_differences(site_id, start_time, num_days, tolerance=0)

Site value differences that differ by more than 0.


Unnamed: 0,Site_id,Date,Site_data,Old_CW3E_value,New_CW3E_value,Old_CW3E_DIFF,New_CW3E_DIFF,X,Y,Lat,Lon
0,1000:OR:SNTL,2005-10-01,277.65,286.042731,282.285465,-8.392731,-4.635465,258,2286,42.87007,-122.16518
1,1000:OR:SNTL,2005-10-02,274.05,276.73635,273.693076,-2.68635,0.356924,258,2286,42.87007,-122.16518
2,1000:OR:SNTL,2005-10-03,273.75,273.67067,273.078968,0.07933,0.671032,258,2286,42.87007,-122.16518
3,1000:OR:SNTL,2005-10-04,273.75,272.700198,272.856351,1.049802,0.893649,258,2286,42.87007,-122.16518


Found 4 Old and 4 New CW3E rows different from site greater than 0.
