In [5]:
from netCDF4 import Dataset
import netCDF4
import numpy as np
import xarray as xr
import os

In [6]:
# Name of the data variable to process, and the names used for the
# longitude and latitude entries of the dataset.
variable = "<var>"
lon = "<lon>"
lat = "<lat>"

# Locations of the input data, the labels, and the file written at the end.
data_path = "../data/<path>"
label_path = "../data/<path>"
output_path = "../data/<path>"

# Open both files read-only through netCDF4 and wrap them as xarray datasets,
# decoding the time values.
data = xr.open_dataset(
    xr.backends.NetCDF4DataStore(netCDF4.Dataset(data_path, mode="r")),
    decode_times=True,
)
labels = xr.open_dataset(
    xr.backends.NetCDF4DataStore(netCDF4.Dataset(label_path, mode="r")),
    decode_times=True,
)

## De-Seasonalization

The next code block de-seasonalizes the data as described in the thesis: a smoothed day-of-year climatology is computed and subtracted from every time step.

In [7]:
### a simplistic method to subtract the (smoothed) daily average values
def deseasonalization(ds, window_size=10):
    """Remove the mean seasonal cycle from ``ds``.

    A day-of-year climatology is computed by averaging over ``time``. It is
    smoothed with a centered rolling mean that wraps around the year boundary
    and then subtracted from every time step.

    Parameters
    ----------
    ds : xarray object
        Must have a datetime-like ``time`` coordinate (``time.dayofyear`` is
        used for grouping and selection).
    window_size : int, default 10
        Width of the rolling mean, in day-of-year steps. The same value drives
        both the circular padding and the smoothing.

    Returns
    -------
    xarray object
        ``ds`` minus the smoothed climatology for each time step's day of year.
    """
    half_window = window_size // 2
    daily_climatology = ds.groupby('time.dayofyear').mean(dim='time')

    if half_window > 0:
        # The day-of-year axis is circular: pad each end with values from the
        # opposite end so the rolling mean near the year boundary sees real
        # neighbours instead of a truncated window.
        padded = xr.concat(
            [
                daily_climatology.isel(dayofyear=slice(-half_window, None)),
                daily_climatology,
                daily_climatology.isel(dayofyear=slice(None, half_window)),
            ],
            dim='dayofyear',
        )
        # Smooth the daily climatology, because it is noisy.
        smoothed = padded.rolling(
            dayofyear=window_size, center=True, min_periods=1
        ).mean()
        # Drop the padding again so only the original days remain.
        daily_climatology = smoothed.isel(
            dayofyear=slice(half_window, -half_window)
        )

    # Subtract each time step's day-of-year climatology from the original data.
    return ds - daily_climatology.sel(dayofyear=ds.time.dt.dayofyear)
    
### de-seasonalize the data (the division by the standard deviation happens later, in the Normalization step)
data = deseasonalization(data)

## De-Trending

The next code block de-trends the data as described in the thesis: a linear trend is fitted to the spatially averaged time series and subtracted from every grid point.

In [8]:
def detrending(dataset):
    """Subtract a linear trend in time from ``dataset``.

    A straight line is least-squares fitted to the spatial mean of
    ``dataset[variable]`` as a function of the time-step index. The fitted
    value of each time step is then subtracted from every grid point.

    Relies on the notebook-level ``variable`` name.
    NOTE(review): assumes ``dataset[variable]`` is ordered (time, lat, lon),
    since the spatial mean is taken over axes 1 and 2 -- confirm for new inputs.

    Parameters
    ----------
    dataset : xarray.Dataset
        Must expose ``time`` and the data variable named by ``variable``.

    Returns
    -------
    xarray.Dataset
        ``dataset`` with the fitted linear trend removed.
    """
    time_index = np.arange(len(dataset.time))
    spatial_mean = np.mean(dataset[variable].to_numpy(), axis=(1, 2))

    # Degree-1 least-squares fit of the spatial mean against the time index.
    coef = np.polyfit(time_index, spatial_mean, 1)
    trend = np.poly1d(coef)(time_index)

    # Shape (time, 1, 1) broadcasts against (time, lat, lon), so the same
    # trend value is removed from every grid point of a time step without
    # materialising a second full-size array.
    return dataset - trend[:, np.newaxis, np.newaxis]

# `data` must be an xarray dataset with a `time` dimension; detrending() also
# relies on the notebook-level `variable` setting.
data = detrending(data)

## Normalization

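The data is normalized by dividing by the standard deviation taken along the first axis (time) at each grid point. Because the mean is not subtracted in this step, the result is similar to, but not exactly, a z-score.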

In [9]:
# Scale every grid point by its standard deviation along the first axis.
data[variable].data = data[variable].data / data[variable].data.std(axis=0)

## Period Transformation

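The data is transformed into samples of shape (time, 5, 45, 100), i.e. (time, day_range, lat, lon): each sample holds 5 consecutive time steps, and the first 4 time steps are appended to the end of the record so that the last samples wrap around to the beginning. This allows blocking detection on each individual data sample.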

In [10]:
# Remove a stale output file before writing. A missing file is the expected
# case on a first run; any other failure (e.g. permissions) is left to surface
# instead of being silently swallowed.
try:
    os.remove(output_path)
except FileNotFoundError:
    pass

# Number of consecutive time steps in each sample (length of "day_range").
day_range_length = 5

# Append the first (day_range_length - 1) time steps to the end so that every
# original time step starts a full-length window; the last windows wrap around
# to the beginning of the record.
extended_data = np.concatenate(
    [data[variable][:], data[variable][: day_range_length - 1]], axis=0
)

# NOTE(review): the windows are stored under the fixed name "z" with the fixed
# dimension names "lat"/"lon", not under `variable`/`lat`/`lon` -- confirm
# that this matches the configured names.
data = data.assign(
    z=(
        ["time", "day_range", "lat", "lon"],
        [
            np.array(extended_data[i : i + day_range_length])
            for i in range(len(extended_data) - (day_range_length - 1))
        ],
    )
)

data.to_netcdf(output_path)