## To Do

- What is a good chunk size?
- How can I verify that performance has actually improved?
- Check the accuracy of the results.
- Check the computation time on the cluster.

In [1]:
from toad import TOAD
import numpy as np
import xarray as xr
from toad.shifts_detection.methods import ASDETECT as ASDETECT
import dask

# Input file and the variable to analyse.
fp = "tutorials/test_data/garbe_2020_antarctica.nc"
# Alternative input: "tutorials/test_data/global_mean_summer_tas.nc"
var = "thk"

data = xr.open_dataset(fp)

# Every dimension of the target variable except the time axis.
# `.remove` raises ValueError if the variable has no "time" dimension.
spatial_dims = [*data[var].dims]
spatial_dims.remove("time")

# Coarsening windows: `c` cells along each spatial dimension, 3 steps in time.
c = 5
c_dict = {**dict.fromkeys(spatial_dims, c), "time": 3}

# Average over each window; boundary="trim" drops trailing entries that do not
# fill a complete window. np.mean is used as-is, so NaNs propagate.
data = (
    data
    .coarsen(c_dict, boundary="trim")
    .reduce(np.mean)
)

print(f"Dimensions after coarsening:\n{data.dims}")


Dimensions after coarsening:


In [6]:
from dask.distributed import Client

# Start a Dask distributed client with default arguments. According to the
# captured output below, this launched a LocalCluster with 4 single-threaded
# worker processes.
# NOTE(review): a later cell calls dask.config.set(scheduler='threads'), which
# selects the threaded scheduler globally -- confirm whether computations are
# actually meant to run on this cluster.
client = Client()
# Bare last expression: renders the client/cluster summary as the cell output.
client


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 7.62 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44725,Workers: 0
Dashboard: http://127.0.0.1:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:44117,Total threads: 1
Dashboard: http://127.0.0.1:36665/status,Memory: 1.90 GiB
Nanny: tcp://127.0.0.1:44793,
Local directory: /tmp/dask-scratch-space/worker-u3p_87lm,Local directory: /tmp/dask-scratch-space/worker-u3p_87lm

0,1
Comm: tcp://127.0.0.1:39657,Total threads: 1
Dashboard: http://127.0.0.1:39323/status,Memory: 1.90 GiB
Nanny: tcp://127.0.0.1:46719,
Local directory: /tmp/dask-scratch-space/worker-klvghyul,Local directory: /tmp/dask-scratch-space/worker-klvghyul

0,1
Comm: tcp://127.0.0.1:43091,Total threads: 1
Dashboard: http://127.0.0.1:43159/status,Memory: 1.90 GiB
Nanny: tcp://127.0.0.1:33331,
Local directory: /tmp/dask-scratch-space/worker-7jg059j8,Local directory: /tmp/dask-scratch-space/worker-7jg059j8

0,1
Comm: tcp://127.0.0.1:46025,Total threads: 1
Dashboard: http://127.0.0.1:35287/status,Memory: 1.90 GiB
Nanny: tcp://127.0.0.1:40231,
Local directory: /tmp/dask-scratch-space/worker-088twjwl,Local directory: /tmp/dask-scratch-space/worker-088twjwl


In [2]:
# Select Dask's local threaded scheduler. Called outside a `with` block, so the
# setting stays in effect for the following cells as well.
dask.config.set(scheduler="threads")

# Baseline run: detect shifts in `var` on the coarsened data with a fixed
# chunk size of 10, replacing any previously stored result.
td_new = TOAD(data)
td_new.compute_shifts(
    var,
    method=ASDETECT(),
    overwrite=True,
    chunk_size=10,
)

[########################################] | 100% Completed | 34.85 s


## Chunk Size

In [None]:
import time

# Chunk sizes to benchmark; None is passed through to compute_shifts unchanged.
chunk_sizes = [None, 5, 20, 50]

# Collected (chunk_size, elapsed_seconds) pairs, in execution order.
results = []

print("Benchmarking chunk sizes...\n")
for size in chunk_sizes:
    # Fresh TOAD instance for every run, so each measurement starts from the
    # same state and does not depend on an object mutated by an earlier cell.
    td = TOAD(data)

    # Time only the shift computation (instance creation is excluded).
    # perf_counter is monotonic and high-resolution, unlike time.time().
    start_time = time.perf_counter()
    td.compute_shifts(
        var,
        method=ASDETECT(),
        overwrite=True,
        chunk_size=size,
    )
    elapsed = time.perf_counter() - start_time

    results.append((size, elapsed))
    print(f"Chunk size {size}x{size}: {elapsed:.2f} seconds")

# Summary
print("\nSummary:")
for size, elapsed in results:
    print(f"Chunk size {size}x{size}: {elapsed:.2f} s")


Benchmarking chunk sizes...

[                                        ] | 0% Completed | 809.85 us

[########################################] | 100% Completed | 16.66 ss
Chunk size NonexNone: 16.75 seconds
[########################################] | 100% Completed | 16.19 s
Chunk size 5x5: 16.52 seconds
[########################################] | 100% Completed | 18.27 ss
Chunk size 20x20: 18.37 seconds
[########################################] | 100% Completed | 18.11 ss
Chunk size 50x50: 18.17 seconds

Summary:
Chunk size NonexNone: 16.75 s
Chunk size 5x5: 16.52 s
Chunk size 20x20: 18.37 s
Chunk size 50x50: 18.17 s


## Artificial Dataset

In [None]:
import xarray as xr
import numpy as np

# Shape of the synthetic cube: 500 time steps on a 1000x1000 spatial grid.
# As float64 this is about 3.7 GiB, generated eagerly in memory with NumPy.
shape = (500, 1000, 1000)  # (time, lat, lon)

# Coordinate values. The time axis is deliberately NOT called `time`, so the
# `time` module used by the benchmark cells is not shadowed.
time_steps = np.arange(shape[0])
lat = np.linspace(-90, 90, shape[1])
lon = np.linspace(-180, 180, shape[2])

# Seeded generator so the synthetic data is identical on every run.
rng = np.random.default_rng(42)

# Uniform random values in [0, 1).
# NOTE(review): the array is named `var` because the benchmark below requests
# that variable from `td_art` -- confirm how TOAD resolves variables when it is
# given a DataArray instead of a Dataset.
data = xr.DataArray(
    rng.random(shape),
    dims=["time", "lat", "lon"],
    coords={"time": time_steps, "lat": lat, "lon": lon},
    name=var,
)
td_art = TOAD(data)

print(td_art.data.dims)


('time', 'lat', 'lon')


: 

In [None]:
import time

# Chunk sizes to benchmark on the synthetic dataset.
chunk_sizes = [50, 100, 500]

# Collected (chunk_size, elapsed_seconds) pairs, in execution order.
results = []

print("Benchmarking chunk sizes...\n")
for size in chunk_sizes:
    # Fresh TOAD instance for every run, so each measurement starts from the
    # same state instead of reusing one object across all chunk sizes.
    td = TOAD(data)

    # Time only the shift computation (instance creation is excluded).
    # perf_counter is monotonic and high-resolution, unlike time.time().
    # NOTE(review): `var` was chosen for the Antarctica file earlier in the
    # notebook -- confirm the synthetic data exposes a variable of that name.
    start_time = time.perf_counter()
    td.compute_shifts(
        var,
        method=ASDETECT(),
        overwrite=True,
        chunk_size=size,
    )
    elapsed = time.perf_counter() - start_time

    results.append((size, elapsed))
    print(f"Chunk size {size}x{size}: {elapsed:.2f} seconds")

# Summary
print("\nSummary:")
for size, elapsed in results:
    print(f"Chunk size {size}x{size}: {elapsed:.2f} s")


Benchmarking chunk sizes...

