# Binned Calculations

## Packages

In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import dask
import os
import glob

## Inputs

In [2]:
# Both input archives are read from one sub-directory per year (see the glob patterns below).
#   degree day files are named like     NLDAS_FORA0125_H.A19791221_dday.nc
#   soil moisture files are named like  NLDAS_VIC0125_H.A19790328.nc  (VIC is replaced by `model`)
# April to September is taken as the growing season, per Haqiqi 2021.

model = "NOAH"
dday_base_path = "/storage/home/cta5244/work/pyWBM_yield_data/NCEPNARR_NLDAS_tmax_tmin/"
sm_base_path = f"/storage/home/cta5244/work/pyWBM_yield_data/{model}_daily/"
filepath_save_sm_mean = f"/storage/home/cta5244/work/avila_et_al_2025_pyWBM_yield/data/{model}_seasonal_average_alltime_average_soilmoisture.nc"
# the year loops use np.arange(start_year, end_year), so end_year itself is NOT processed
start_year, end_year = 1979, 2025
month_start = 4 # april
month_end = 9 # september
# the degree day files are read from the degree day directory defined above;
# reuse that variable instead of repeating the literal so the two cannot drift apart
edd_file_path = dday_base_path
binned_edd_path = f"/storage/home/cta5244/work/pyWBM_yield_data/{model}_edd_bins/"

## Function for the soil moisture normal (seasonal mean)

In [3]:
def normal_value_spatial(year, month_start, month_end, model):
    '''
    computes the mean over one season of every variable in a year's soil moisture files

    uses glob to collect that year's file paths month by month, concatenates the
    files along "time" and averages over that dimension

    inputs:
    year: year whose files are read (used as sub-directory name and inside the file name)
    month_start, month_end: first and last month of the season, both inclusive
    model: nldas model name used in the file name; note the directory itself comes from
           the module-level sm_base_path
    outputs:
    xarray Dataset with the "time" dimension averaged away (not a scalar)
    raises:
    ValueError if no file matches the year / month pattern
    '''
    files_arr = []
    for month_i in range(month_start, month_end + 1):
        files_arr.extend(sorted(glob.glob(f"{sm_base_path}/{year}/NLDAS_{model}0125_H.A{year}{str(month_i).zfill(2)}*.nc")))

    if not files_arr:
        # xr.concat would fail on an empty list anyway; say which year and directory were empty
        raise ValueError(
            f"no {model} soil moisture files found for {year} "
            f"(months {month_start}-{month_end}) under {sm_base_path}"
        )

    opened = []
    try:
        for file_i in files_arr:
            opened.append(xr.open_dataset(file_i))
        ds = xr.concat(opened, dim="time")
        # load() makes sure the result lives in memory before the files are closed
        seasonal_mean = ds.mean(dim='time').load()
    finally:
        # release every file handle, including when opening or concatenating fails part way
        for ds_i in opened:
            ds_i.close()
    return seasonal_mean
    

## dask

In [4]:
from dask_jobqueue import SLURMCluster

# describe the SLURM job that backs the dask workers
# (the captured client output below lists workers with 1 thread and 10 GiB each)
cluster = SLURMCluster(
    # account="pches",
    account="open",
    cores=1,
    memory="10GiB",
    walltime="03:00:00",
)

# request 10 of these jobs
cluster.scale(jobs=10) 

In [11]:
from dask.distributed import Client

# connect a client to the SLURM cluster object created in the previous cell
client = Client(cluster)

In [12]:
# bare expression: the notebook renders the client / worker summary shown below
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://146.186.150.14:8787/status,

0,1
Dashboard: http://146.186.150.14:8787/status,Workers: 8
Total threads: 8,Total memory: 80.00 GiB

0,1
Comm: tcp://146.186.150.14:34665,Workers: 8
Dashboard: http://146.186.150.14:8787/status,Total threads: 8
Started: Just now,Total memory: 80.00 GiB

0,1
Comm: tcp://10.6.8.33:43963,Total threads: 1
Dashboard: http://10.6.8.33:45829/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.33:46621,
Local directory: /tmp/dask-scratch-space/worker-lqs7s2ms,Local directory: /tmp/dask-scratch-space/worker-lqs7s2ms

0,1
Comm: tcp://10.6.8.35:38313,Total threads: 1
Dashboard: http://10.6.8.35:36063/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.35:40263,
Local directory: /tmp/dask-scratch-space/worker-rlp5upc6,Local directory: /tmp/dask-scratch-space/worker-rlp5upc6

0,1
Comm: tcp://10.6.8.49:40091,Total threads: 1
Dashboard: http://10.6.8.49:36429/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.49:46179,
Local directory: /tmp/dask-scratch-space/worker-lbht94nb,Local directory: /tmp/dask-scratch-space/worker-lbht94nb

0,1
Comm: tcp://10.6.8.33:45051,Total threads: 1
Dashboard: http://10.6.8.33:37931/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.33:42631,
Local directory: /tmp/dask-scratch-space/worker-0fuwf5ut,Local directory: /tmp/dask-scratch-space/worker-0fuwf5ut

0,1
Comm: tcp://10.6.8.49:41095,Total threads: 1
Dashboard: http://10.6.8.49:41365/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.49:33099,
Local directory: /tmp/dask-scratch-space/worker-z1idamky,Local directory: /tmp/dask-scratch-space/worker-z1idamky

0,1
Comm: tcp://10.6.8.32:33655,Total threads: 1
Dashboard: http://10.6.8.32:44369/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.32:36417,
Local directory: /tmp/dask-scratch-space/worker-chhm_x1e,Local directory: /tmp/dask-scratch-space/worker-chhm_x1e

0,1
Comm: tcp://10.6.8.17:38033,Total threads: 1
Dashboard: http://10.6.8.17:42941/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.17:33643,
Local directory: /tmp/dask-scratch-space/worker-11f3tncf,Local directory: /tmp/dask-scratch-space/worker-11f3tncf

0,1
Comm: tcp://10.6.8.48:45785,Total threads: 1
Dashboard: http://10.6.8.48:33033/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.48:42503,
Local directory: /tmp/dask-scratch-space/worker-cqdhuwan,Local directory: /tmp/dask-scratch-space/worker-cqdhuwan


## Calculations

In [13]:
# one lazy task per year; nothing is computed until dask.compute is called
results = [
    dask.delayed(normal_value_spatial)(
        year=year,
        month_start=month_start,
        month_end=month_end,
        model=model,
    )
    for year in np.arange(start_year, end_year)
]


In [14]:
# evaluate every per-year task; gives one seasonal-mean result per year
ds_seasonal_mean_arr = dask.compute(*results)
# each yearly result had its "time" dimension averaged away, so "time" here is a new
# dimension with one entry per year
data_concat = xr.concat(ds_seasonal_mean_arr, dim='time')
# average of the yearly seasonal means
data_concat_timemean = data_concat.mean(dim='time')    # this is the mean we want! 

In [15]:
# write the mean to disk; demean_and_bin reopens this file from filepath_save_sm_mean
data_concat_timemean.to_netcdf(filepath_save_sm_mean)

## demean soil moisture

In [17]:
def demean_and_bin(year, files_arr_soilm, files_arr_edd):
    '''
    bins one season of degree days by the soil moisture anomaly and writes the result

    the soil moisture files are concatenated along "time" (one season is held in memory at
    once) and the mean stored at filepath_save_sm_mean is subtracted from SoilM_0_100cm.
    every value of the degree day variable edd is assigned to one of five anomaly bins
    (>= 75, 25 to 75, -25 to 25, -75 to -25, <= -75), summed over "time" per bin and saved
    in binned_edd_path as NLDAS_FORA0125_{model}_H.A{year}_binned_dday.nc

    inputs:
    year: only used for the output file name and the error message
    files_arr_soilm: list of soil moisture file paths
    files_arr_edd: list of degree day file paths
    outputs:
    None. a ValueError raised while binning or writing is printed instead of raised, so one
    bad year does not stop the others; errors while opening / concatenating still raise
    '''
    opened = []

    def _open(path):
        # remember every handle so that all of them are closed in the finally below
        handle = xr.open_dataset(path)
        opened.append(handle)
        return handle

    try:
        ds_raw_sm = xr.concat([_open(f) for f in files_arr_soilm], dim="time")
        data_concat_timemean = _open(filepath_save_sm_mean)
        # only SoilM_0_100cm is used for binning, so only that variable is demeaned
        soilm_anomaly = ds_raw_sm.SoilM_0_100cm - data_concat_timemean.SoilM_0_100cm
        ds_raw_edd = xr.concat([_open(f) for f in files_arr_edd], dim="time")
        edd = ds_raw_edd.edd

        try:
            # these are all the bins for extreme degree days; together the masks cover
            # every anomaly value exactly once (the boundaries +-25 belong to the middle
            # bin, +-75 to the outer bins)
            bin_masks = {
                "edd_plus75": soilm_anomaly >= 75,
                "edd_plus25_75": (75 > soilm_anomaly) & (soilm_anomaly > 25),
                "edd_minus25_plus25": (25 >= soilm_anomaly) & (soilm_anomaly >= -25),
                "edd_minus25_75": (-75 < soilm_anomaly) & (soilm_anomaly < -25),
                "edd_minus75": soilm_anomaly <= -75,
            }
            # degree days outside a bin count as 0, then everything is summed over the season
            combined_dataset_bins = xr.Dataset({
                bin_name: xr.where(mask, edd, 0).sum(dim='time')
                for bin_name, mask in bin_masks.items()
            })

            os.makedirs(f"{binned_edd_path}", exist_ok=True)
            combined_dataset_bins.to_netcdf(f"{binned_edd_path}NLDAS_FORA0125_{model}_H.A{year}_binned_dday.nc")
        except ValueError as err:
            # keep going with the other years, but report why this one failed
            print(f'error with {year}: {err}')
    finally:
        for handle in opened:
            handle.close()

def file_path_load(year):
    '''
    collects the soil moisture and degree day file paths of one year and passes them on
    to demean_and_bin

    for every month from month_start to month_end (inclusive, module-level settings) the
    matching files are globbed and appended in sorted order
    inputs:
    year: year whose sub-directory and file names are searched
    outputs:
    None (demean_and_bin is called for its side effect)
    '''
    files_arr_soilm, files_arr_edd = [], []

    for month_i in np.arange(month_start, month_end+1, 1):
        files_arr_soilm.extend(
            sorted(glob.glob(f"{sm_base_path}/{year}/NLDAS_{model}0125_H.A{year}{str(month_i).zfill(2)}*.nc"))
        )
        files_arr_edd.extend(
            sorted(glob.glob(f"{edd_file_path}/{year}/NLDAS_FORA0125_H.A{year}{str(month_i).zfill(2)}*_dday.nc"))
        )

    demean_and_bin(year, files_arr_soilm, files_arr_edd)


### Create new dask workers if the seasonal mean was already calculated in an earlier session
- skip this cell if you have already run the dask setup cells above in this session

In [5]:
from dask_jobqueue import SLURMCluster

# same SLURM job description as in the first dask cell; running this rebinds `cluster`
cluster = SLURMCluster(
    # account="pches",
    account="open",
    cores=1,
    memory="10GiB",
    walltime="03:00:00",
)

# request 10 of these jobs
cluster.scale(jobs=10) 

In [7]:
from dask.distributed import Client

# connect a client to the cluster object created in the previous cell
client = Client(cluster)
# bare expression: the notebook renders the client / worker summary shown below
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://146.186.150.14:8787/status,

0,1
Dashboard: http://146.186.150.14:8787/status,Workers: 10
Total threads: 10,Total memory: 100.00 GiB

0,1
Comm: tcp://146.186.150.14:38235,Workers: 10
Dashboard: http://146.186.150.14:8787/status,Total threads: 10
Started: Just now,Total memory: 100.00 GiB

0,1
Comm: tcp://10.6.8.103:43595,Total threads: 1
Dashboard: http://10.6.8.103:36891/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.103:36919,
Local directory: /tmp/dask-scratch-space/worker-4hxke72w,Local directory: /tmp/dask-scratch-space/worker-4hxke72w

0,1
Comm: tcp://10.6.8.100:36611,Total threads: 1
Dashboard: http://10.6.8.100:42819/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.100:39529,
Local directory: /tmp/dask-scratch-space/worker-2g9x5nq2,Local directory: /tmp/dask-scratch-space/worker-2g9x5nq2

0,1
Comm: tcp://10.6.8.103:46309,Total threads: 1
Dashboard: http://10.6.8.103:45399/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.103:37155,
Local directory: /tmp/dask-scratch-space/worker-vtdodgee,Local directory: /tmp/dask-scratch-space/worker-vtdodgee

0,1
Comm: tcp://10.6.8.103:41291,Total threads: 1
Dashboard: http://10.6.8.103:44117/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.103:42819,
Local directory: /tmp/dask-scratch-space/worker-bygteifh,Local directory: /tmp/dask-scratch-space/worker-bygteifh

0,1
Comm: tcp://10.6.8.99:43569,Total threads: 1
Dashboard: http://10.6.8.99:37155/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.99:44491,
Local directory: /tmp/dask-scratch-space/worker-as376d4c,Local directory: /tmp/dask-scratch-space/worker-as376d4c

0,1
Comm: tcp://10.6.8.98:34229,Total threads: 1
Dashboard: http://10.6.8.98:42115/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.98:45293,
Local directory: /tmp/dask-scratch-space/worker-rjxt9ipx,Local directory: /tmp/dask-scratch-space/worker-rjxt9ipx

0,1
Comm: tcp://10.6.8.100:37447,Total threads: 1
Dashboard: http://10.6.8.100:42103/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.100:34391,
Local directory: /tmp/dask-scratch-space/worker-es8zuzhb,Local directory: /tmp/dask-scratch-space/worker-es8zuzhb

0,1
Comm: tcp://10.6.8.104:39503,Total threads: 1
Dashboard: http://10.6.8.104:41025/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.104:34429,
Local directory: /tmp/dask-scratch-space/worker-xy0bq9in,Local directory: /tmp/dask-scratch-space/worker-xy0bq9in

0,1
Comm: tcp://10.6.8.103:37667,Total threads: 1
Dashboard: http://10.6.8.103:34755/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.103:44921,
Local directory: /tmp/dask-scratch-space/worker-wb3bxm6d,Local directory: /tmp/dask-scratch-space/worker-wb3bxm6d

0,1
Comm: tcp://10.6.8.105:33771,Total threads: 1
Dashboard: http://10.6.8.105:33987/status,Memory: 10.00 GiB
Nanny: tcp://10.6.8.105:42029,
Local directory: /tmp/dask-scratch-space/worker-30mmjyi3,Local directory: /tmp/dask-scratch-space/worker-30mmjyi3


## dask calculation of all edd bins historically

In [18]:
# one lazy binning task per year; nothing runs until dask.compute is called
results = [
    dask.delayed(file_path_load)(year = year)
    for year in np.arange(start_year, end_year)
]

In [19]:
# run all binning tasks; file_path_load returns None, so the useful output is the
# netCDF files that demean_and_bin writes, not the values collected in `results`
results = dask.compute(*results)