# pyWBM Future Cleaning
- This notebook cleans and processes future pyWBM & LOCA2 projections.
- The resulting panel data can then be combined with our coefficients from notebook 5 to produce future projections.

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import dask
import os
import glob as glob
import cftime 
import warnings
warnings.filterwarnings("ignore")

## Growing degree days

In [2]:
from functions_2a import degreeDays, yearlyCalculationSum

### Filepaths & Inputs

In [3]:
# --- LOCA2 inputs ---------------------------------------------------------
# Root directory holding one sub-directory per CMIP6 model used in LOCA2.
# Example of a full variable directory, for reference:
# "/storage/group/pches/default/public/LOCA2/ACCESS-CM2/0p0625deg/r1i1p1f1/ssp245/tasmin"
base_loca_paths_for_models = "/storage/group/pches/default/public/LOCA2/"
models = sorted(glob.glob(f"{base_loca_paths_for_models}*"))
# the last two entries of the sorted listing are dropped before taking basenames
model_names = list(map(os.path.basename, models[:-2]))

# LOCA2 files come in chunks of ~30 years
time_frames = ["2015-2044", "2045-2074", "2075-2100"]

# initializations: only r1i1p1f1 for now, although some runs have more than one
initializations = ["r1i1p1f1"]

# SSP scenarios used in pyWBM are 245 and 370
ssps = ["245", "370"]

# --- pyWBM / soil moisture inputs -----------------------------------------
nldas_lsm = "VIC"

# base path of the pyWBM soil moisture projections
pyWBM_file_path_base = "/storage/group/pches/default/users/dcl5300/wbm_soilM_uc_2024_DATA/projections/eCONUS/out/LOCA2"

# an arbitrary pyWBM run, used only as the target grid for regridding
arbritrary_pyWBM_run = "/storage/group/pches/default/users/dcl5300/wbm_soilM_uc_2024_DATA/projections/eCONUS/out/LOCA2/ACCESS-CM2_r1i1p1f1_ssp245_VIC_kge.nc"

# historical soil moisture normal
soil_moisture_normal_file_path = f"/storage/home/cta5244/work/avila_et_al_2025_pyWBM_yield/data/{nldas_lsm}_seasonal_average_alltime_average_soilmoisture.nc"

In [4]:
# each loca2 file is 10GB (large)
if __name__ == "__main__":
    # Reference pyWBM run and historical soil-moisture normal used for regridding.
    # Interpolating the normal onto the pyWBM grid once lets it be reused for every
    # pyWBM anomaly computed against the historical time frame below.
    ds_soilpyWBM_regrid = (xr.open_dataset(arbritrary_pyWBM_run)).chunk({"time": 1, "lon":100, "lat":100})
    ds_soil_normal = xr.open_dataset(soil_moisture_normal_file_path).SoilM_0_100cm
    ds_soil_normal_on_wbm_grid = ds_soil_normal.interp(
        lat=ds_soilpyWBM_regrid.lat,
        lon=ds_soilpyWBM_regrid.lon,
        method="linear"  # or "nearest" if you prefer
    ).chunk()

    # NOTE: the [:1] / [:10] slices below restrict the loops to a small subset of
    # models, scenarios and years.
    for model_name_i in model_names[:1]:
        for initialization_i in initializations:
            for ssp_i in ssps[:1]:
                # The pyWBM ensemble member depends only on model / initialization / ssp,
                # so it is located, opened and time-converted once here instead of once
                # per year.
                pywbm_combinations = sorted(glob.glob(f"{pyWBM_file_path_base}/{model_name_i}_{initialization_i}_ssp{ssp_i}_{nldas_lsm}*"))
                if not pywbm_combinations:
                    print(f"No pyWBM runs found for {model_name_i} {initialization_i} ssp{ssp_i} {nldas_lsm}, skipping")
                    continue
                ds_soilpyWBM_initial_chunk_open = (xr.open_dataset(pywbm_combinations[0])).chunk({"lon":100, "lat":100})
                # to_datetimeindex() only exists on a CFTimeIndex; a time axis that was
                # already decoded to datetime64 is left untouched.
                pywbm_time_index = ds_soilpyWBM_initial_chunk_open.indexes['time']
                if isinstance(pywbm_time_index, xr.CFTimeIndex):
                    ds_soilpyWBM_initial_chunk_open = ds_soilpyWBM_initial_chunk_open.assign_coords(
                        time=pywbm_time_index.to_datetimeindex()
                    )

                for time_frame_i in time_frames:
                    # tmax file
                    file_path_i_tmax = f"{base_loca_paths_for_models}{model_name_i}/0p0625deg/{initialization_i}/ssp{ssp_i}/tasmax"
                    file_name_i_tmax = f"tasmax.{model_name_i}.ssp{ssp_i}.{initialization_i}.{time_frame_i}.LOCA_16thdeg_v20220413.nc"
                    # tmin file
                    file_path_i_tmin = f"{base_loca_paths_for_models}{model_name_i}/0p0625deg/{initialization_i}/ssp{ssp_i}/tasmin"
                    file_name_i_tmin = f"tasmin.{model_name_i}.ssp{ssp_i}.{initialization_i}.{time_frame_i}.LOCA_16thdeg_v20220413.nc"

                    # combine tmax and tmin for the degree day calculation
                    try:
                        LOCA2_tmax = xr.open_dataset(f"{file_path_i_tmax}/{file_name_i_tmax}", chunks='auto').rename({"tasmax": "tmax"})
                        LOCA2_tmin = xr.open_dataset(f"{file_path_i_tmin}/{file_name_i_tmin}", chunks='auto').rename({"tasmin": "tmin"})
                        LOCA2_combined = xr.merge([LOCA2_tmax, LOCA2_tmin])

                    except FileNotFoundError:
                        print(f"Issue with file location, skipping {file_path_i_tmax}/{file_name_i_tmax} or {file_path_i_tmin}/{file_name_i_tmin}")
                        # skip only this time frame; the remaining ones are still attempted
                        continue

                    # one Dataset per growing season: gdd plus edd binned by soil moisture anomaly
                    results_season_and_soilm = []
                    start_year = int(LOCA2_combined.time.dt.year.values[0])
                    end_year = int(LOCA2_combined.time.dt.year.values[-1])
                    for year in range(start_year, end_year + 1)[:10]:
                        # pyWBM soil moisture for this growing season; a year with no pyWBM
                        # data is skipped up front rather than failing later on isel(time=0)
                        ds_soilpyWBM_initial_chunk_single_year = ds_soilpyWBM_initial_chunk_open.sel(time=slice(f"{year}-04-01", f"{year}-09-30"))
                        if ds_soilpyWBM_initial_chunk_single_year.sizes["time"] == 0:
                            print(f"{year} year error, pass this year", "no pyWBM soil moisture between 04-01 and 09-30")
                            continue

                        # single growing-season slice of LOCA2, longitudes wrapped to [-180, 180)
                        ds_slice = LOCA2_combined.sel(time=slice(f"{year}-04-01", f"{year}-09-30")).chunk({"time": 1})
                        ds_slice = ds_slice.assign_coords(
                            lon=((ds_slice.lon + 180) % 360) - 180
                        ).sortby("lon")

                        # mask of cells with non-null tmax on the first day, applied before
                        # interpolation to avoid pulling ocean values into linear interpolation
                        land_mask = ds_slice.tmax.isel(time=0).notnull().copy(deep=True)

                        # seasonal growing degree day total (one value per cell, "year" dimension)
                        gdd_future = degreeDays(ds_slice, 'gdd')
                        gdd_future_sum = gdd_future.groupby("time.year").sum("time").reset_coords(drop=True)

                        gdd_future_mask = gdd_future_sum.where(land_mask)  # mask applied on the cleaned grid
                        gdd_future_regrid = gdd_future_mask.interp(
                            lat=ds_soilpyWBM_regrid.lat,
                            lon=ds_soilpyWBM_regrid.lon,
                            method="linear",
                            kwargs={"fill_value": np.nan}
                        ).chunk()  # interpolated onto the pyWBM grid

                        # edd is kept per time step so it can be binned by soil moisture
                        # anomaly for compound extremes; regridded onto the pyWBM grid as well
                        edd_future = degreeDays(ds_slice, 'edd')
                        edd_future_mask = edd_future.where(land_mask)
                        edd_future_regrid = edd_future_mask.interp(
                            lat=ds_soilpyWBM_regrid.lat,
                            lon=ds_soilpyWBM_regrid.lon,
                            method="linear",
                            kwargs={"fill_value": np.nan}
                        ).chunk()

                        # restrict the historical normal to cells where pyWBM has data, then
                        # take the soil moisture deviation from that normal
                        ds_soil_normal_on_wbm_grid_masked = ds_soil_normal_on_wbm_grid.where(~np.isnan(ds_soilpyWBM_initial_chunk_single_year))
                        ds_soil_normal_on_wbm_grid_masked = ds_soil_normal_on_wbm_grid_masked.transpose('time', 'lat', 'lon')
                        deviation_from_normal = ds_soilpyWBM_initial_chunk_single_year - ds_soil_normal_on_wbm_grid_masked.soilMoist

                        # best effort per year: a failing year is reported and skipped
                        try:
                            # before binning, constrain gdd & edd to the pyWBM domain
                            pywbm_domain_mask = ~np.isnan(ds_soilpyWBM_initial_chunk_single_year.isel(time=0).soilMoist)
                            edd_future_regrid_mask = edd_future_regrid.where(pywbm_domain_mask)
                            gdd_future_regrid_mask = gdd_future_regrid.where(pywbm_domain_mask)
                            deviation_from_normal_dr = deviation_from_normal.soilMoist  # now an xarray.DataArray
                            deviation_from_normal_dr = deviation_from_normal_dr.assign_coords(
                                time=pd.to_datetime(deviation_from_normal_dr.time.values.astype(str))
                            )
                            deviation_from_normal_dr = deviation_from_normal_dr.transpose('lat', 'lon', 'time')
                            edd_future_regrid_mask   = edd_future_regrid_mask.transpose('lat', 'lon', 'time')

                            # NOTE(review): join='left' keeps the pyWBM time stamps; if the LOCA2
                            # time stamps differ (e.g. in time of day) edd would align to NaN
                            # and every bin would sum to 0 -- TODO confirm the two time axes match
                            deviation_from_normal_allign, edd_future_aligned = xr.align(
                                deviation_from_normal_dr, edd_future_regrid_mask, join='left'
                            )

                            # bin edd by soil moisture deviation; time steps outside a bin contribute 0
                            ds_bin_plus75 = xr.where(deviation_from_normal_allign >= 75, edd_future_aligned, 0)
                            ds_bin_plus25_75   = xr.where((deviation_from_normal_allign < 75) & (deviation_from_normal_allign > 25), edd_future_aligned, 0)
                            ds_bin_minus25_plus25 = xr.where((deviation_from_normal_allign <= 25) & (deviation_from_normal_allign >= -25), edd_future_aligned, 0)
                            ds_bin_minus25_75  = xr.where((deviation_from_normal_allign > -75) & (deviation_from_normal_allign < -25), edd_future_aligned, 0)
                            ds_bin_minus75     = xr.where(deviation_from_normal_allign <= -75, edd_future_aligned, 0)

                            # Sum edd over the season (time dimension). Each seasonal total gets
                            # the same length-1 "year" dimension that gdd already carries, so
                            # the per-season datasets can be concatenated along "year".
                            combined_dataset_bins = xr.Dataset({
                                "gdd":              gdd_future_regrid_mask,
                                "edd_plus75":       ds_bin_plus75.sum(dim='time').expand_dims(year=[year]),
                                "edd_plus25_75":    ds_bin_plus25_75.sum(dim='time').expand_dims(year=[year]),
                                "edd_minus25_plus25": ds_bin_minus25_plus25.sum(dim='time').expand_dims(year=[year]),
                                "edd_minus25_75":   ds_bin_minus25_75.sum(dim='time').expand_dims(year=[year]),
                                "edd_minus75":      ds_bin_minus75.sum(dim='time').expand_dims(year=[year])
                            })

                            results_season_and_soilm.append(combined_dataset_bins)

                        except Exception as e:
                            print(f"{year} year error, pass this year", e)

                    # xr.concat raises on an empty list, so a time frame with no usable year is skipped
                    if not results_season_and_soilm:
                        print(f"No seasons processed for {model_name_i} {initialization_i} ssp{ssp_i} {time_frame_i}, skipping")
                        continue

                    # NOTE: ds_all_seasons is overwritten on every time frame iteration
                    ds_all_seasons = xr.concat(results_season_and_soilm, dim="year")

2015 year error, pass this year Index 0 is out of bounds for axis 0 with size 0


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x150d2b554dc0>>
Traceback (most recent call last):
  File "/storage/home/cta5244/mambaforge/envs/pyWBM/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x150d2b554dc0>>
Traceback (most recent call last):
  File "/storage/home/cta5244/mambaforge/envs/pyWBM/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x150d2b554dc0>>
Traceback (most recent call last):
  File "/storage/home/cta5244/mambaforge/envs/pyWBM/lib/python3

In [None]:
# Quick-look map of every variable in the most recently built seasonal dataset
# (`combined_dataset_bins`). The model / initialization / ssp / time-frame labels
# reuse the loop variables left over from the processing cell.
# The year in the title is read from the dataset itself so it always matches the data shown.
plotted_year = int(combined_dataset_bins["year"].values[0])
keys_single_place = list(combined_dataset_bins.keys())
for key_i in keys_single_place:
    print(key_i)
    fig, ax = plt.subplots(figsize=(8,6))
    data = combined_dataset_bins.isel(year=0)[key_i].compute()
    data.plot(ax=ax)
    # the title is set after plotting because xarray's 2D plot assigns its own axes
    # title, which would otherwise replace this one
    ax.set_title(f"{key_i} {plotted_year} {model_name_i} {initialization_i} ssp{ssp_i} {time_frame_i}", fontsize=14)
    plt.show()
    plt.close(fig)

gdd



KeyboardInterrupt



Error in callback <function flush_figures at 0x15216ceeae60> (for post_execute), with arguments args (),kwargs {}:
