## Test notebook for inserting time stamps out of chronological order into a zarr store

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import zarr
import os
from shutil import copyfile
import sys

First create a minimal zarr store (a single time step), which will be built upon.

In [2]:
DATASET_PATH = "test.zarr"

In [3]:
def create_dataset(datetime):
    """Build a single-time-step dataset on a regular lat/lon grid.

    Parameters
    ----------
    datetime : anything accepted by ``pd.to_datetime``
        The time stamp of the one and only time step.

    Returns
    -------
    xr.Dataset
        Dataset with the variables ``precipitation`` and ``temperature``,
        each of shape ``(1, 2000, 4000)`` along ``(time, lat, lon)`` and
        filled with uniform random numbers.
    """
    width, height = 4000, 2000
    dim_names = ("time", "lat", "lon")
    # Coordinates are given in the same order as the dimension names.
    coords = (
        [pd.to_datetime(datetime)],
        np.linspace(50, 52, height),
        np.linspace(0, 4, width),
    )
    data_vars = {}
    for var_name in ("precipitation", "temperature"):
        values = np.random.rand(1, height, width)
        data_vars[var_name] = xr.DataArray(values, coords=coords, dims=dim_names)
    return xr.Dataset(data_vars)
    

## Save a single time step dataset with default chunking

In [4]:
ds = create_dataset("2018-01-01")

In [5]:
ds

<xarray.Dataset>
Dimensions:        (lat: 2000, lon: 4000, time: 1)
Coordinates:
  * time           (time) datetime64[ns] 2018-01-01
  * lat            (lat) float64 50.0 50.0 50.0 50.0 ... 52.0 52.0 52.0 52.0
  * lon            (lon) float64 0.0 0.001 0.002001 0.003001 ... 3.998 3.999 4.0
Data variables:
    precipitation  (time, lat, lon) float64 0.07173 0.6099 ... 0.3031 0.3511
    temperature    (time, lat, lon) float64 0.965 0.9713 0.8693 ... 0.9365 0.136

In [6]:
ds.time.encoding

{}

In [7]:
ds.precipitation.encoding

{}

In [8]:
ds.to_zarr(DATASET_PATH, mode="w")

<xarray.backends.zarr.ZarrStore at 0x7f798550e4a8>

In [9]:
ds.close()

## Inspect default chunking

In [10]:
ds = xr.open_zarr(DATASET_PATH)

In [11]:
ds

<xarray.Dataset>
Dimensions:        (lat: 2000, lon: 4000, time: 1)
Coordinates:
  * lat            (lat) float64 50.0 50.0 50.0 50.0 ... 52.0 52.0 52.0 52.0
  * lon            (lon) float64 0.0 0.001 0.002001 0.003001 ... 3.998 3.999 4.0
  * time           (time) datetime64[ns] 2018-01-01
Data variables:
    precipitation  (time, lat, lon) float64 dask.array<shape=(1, 2000, 4000), chunksize=(1, 250, 500)>
    temperature    (time, lat, lon) float64 dask.array<shape=(1, 2000, 4000), chunksize=(1, 250, 500)>

In [12]:
ds.time.encoding

{'chunks': (1,),
 'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),
 'filters': None,
 'units': 'days since 2018-01-01 00:00:00',
 'calendar': 'proleptic_gregorian',
 'dtype': dtype('int64')}

In [13]:
ds.precipitation.encoding

{'chunks': (1, 250, 500),
 'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),
 'filters': None,
 '_FillValue': nan,
 'dtype': dtype('float64')}

In [14]:
ds.close()

## -----------------------------------

In [15]:
root_group = zarr.open("test.zarr", mode='a')

This zarr will only contain time stamps for every other day:

In [16]:
# Append the time steps 2018-01-03, -05, -07 and -09 through the zarr API.
for i in range(3, 11, 2):
    ds = create_dataset(f"2018-01-0{i}")
    for var_name, var_array in root_group.arrays():
        var = ds[var_name]
        # Only arrays with a "time" dimension grow; the others are skipped.
        if 'time' in var.dims:            
            time_axis = var.dims.index('time')
            # NOTE(review): this also appends to the "time" array itself. The
            # values shown further down (0, 1514937600000000000, ...) suggest
            # the time stamps are stored as raw numbers rather than in the
            # store's "days since 2018-01-01" encoding, which would explain why
            # the store is later opened with decode_times=False -- confirm.
            var_array.append(var, axis=time_axis)

In [17]:
ds.close()

Create another dataset, stored in test_single.zarr, which contains only one new time stamp (in addition to the shared first time stamp 2018-01-01) and will be inserted into the first dataset, stored in test.zarr.

In [18]:
DATASET_SINGLE_PATH = "test_single.zarr"

In [19]:
ds = create_dataset("2018-01-01")
ds.time.encoding
ds.precipitation.encoding

{}

In [20]:
ds.to_zarr(DATASET_SINGLE_PATH, mode="w")
ds.close()

In [21]:
root_group = zarr.open("test_single.zarr", mode='a')

In [22]:
# Append the single new time step (2018-01-04) to test_single.zarr through the
# zarr API, using the same per-array append as for the destination store.
ds = create_dataset(f"2018-01-04")
for var_name, var_array in root_group.arrays():
    var = ds[var_name]
    # Only arrays with a "time" dimension grow; the others are skipped.
    if 'time' in var.dims:            
        time_axis = var.dims.index('time')
        var_array.append(var, axis=time_axis)

In [23]:
ds.close()

## -----------------------------------

In [24]:
ds = xr.open_zarr(DATASET_PATH, decode_times=False)

In [25]:
ds_single = xr.open_zarr(DATASET_SINGLE_PATH, decode_times=False)

In [26]:
ds

<xarray.Dataset>
Dimensions:        (lat: 2000, lon: 4000, time: 5)
Coordinates:
  * lat            (lat) float64 50.0 50.0 50.0 50.0 ... 52.0 52.0 52.0 52.0
  * lon            (lon) float64 0.0 0.001 0.002001 0.003001 ... 3.998 3.999 4.0
  * time           (time) int64 0 1514937600000000000 ... 1515456000000000000
Data variables:
    precipitation  (time, lat, lon) float64 dask.array<shape=(5, 2000, 4000), chunksize=(1, 250, 500)>
    temperature    (time, lat, lon) float64 dask.array<shape=(5, 2000, 4000), chunksize=(1, 250, 500)>

In [27]:
ds_single

<xarray.Dataset>
Dimensions:        (lat: 2000, lon: 4000, time: 2)
Coordinates:
  * lat            (lat) float64 50.0 50.0 50.0 50.0 ... 52.0 52.0 52.0 52.0
  * lon            (lon) float64 0.0 0.001 0.002001 0.003001 ... 3.998 3.999 4.0
  * time           (time) int64 0 1515024000000000000
Data variables:
    precipitation  (time, lat, lon) float64 dask.array<shape=(2, 2000, 4000), chunksize=(1, 250, 500)>
    temperature    (time, lat, lon) float64 dask.array<shape=(2, 2000, 4000), chunksize=(1, 250, 500)>

In [28]:
ds.time.encoding

{'chunks': (1,),
 'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),
 'filters': None,
 'dtype': dtype('int64')}

In [29]:
ds.precipitation.encoding

{'chunks': (1, 250, 500),
 'compressor': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),
 'filters': None,
 '_FillValue': nan,
 'dtype': dtype('float64')}

In [30]:
dict(ds.dims)

{'lat': 2000, 'lon': 4000, 'time': 5}

In [31]:
ds.time.values

array([                  0, 1514937600000000000, 1515110400000000000,
       1515283200000000000, 1515456000000000000])

In [32]:
ds_single.time.values

array([                  0, 1515024000000000000])

In [33]:
ds.close()
ds_single.close()

In [34]:
ds.time.shape[0]

5

In [35]:
ds_single.time.shape[0]

2

The first time stamp of both datasets is the same, so that both stores share the same time encoding. In this case we are therefore interested in the second time stamp of `ds_single`.

In [None]:
np.greater(ds.time.values, ds_single.time[1].values)

In [36]:
ds_single.close()

In [37]:
ds.close()

In [38]:
def find_nearest_above(array, target):
    """Return the index of the smallest element strictly greater than ``target``.

    Parameters
    ----------
    array : np.ndarray
        Values to search; they do not need to be sorted.
    target : scalar
        Reference value.

    Returns
    -------
    int or None
        Index into ``array`` of the nearest value above ``target``, or
        ``None`` when no element is greater than ``target``.
    """
    distance = array - target
    # Elements at or below the target (distance <= 0) are not candidates.
    at_or_below = np.ma.less_equal(distance, 0)
    if not np.all(at_or_below):
        candidates = np.ma.masked_array(distance, at_or_below)
        return candidates.argmin()
    return None

In [39]:
def rename_directory(path_to_ds, old_index, new_time_i):
    """Rename the chunk files of one time index to another time index.

    For every variable of the store except ``lat`` and ``lon``, each chunk
    file whose leading (time) index equals ``old_index`` is renamed so that
    its leading index becomes ``new_time_i``. Data chunks are named
    ``"<time>.<lat>.<lon>"``; chunks of the ``time`` array are named
    ``"<time>"``.

    Parameters
    ----------
    path_to_ds : str
        Path of the zarr directory store.
    old_index : int
        Current time index of the chunks to rename.
    new_time_i : int
        Time index the chunks should carry afterwards.
    """
    ds = xr.open_zarr(path_to_ds, decode_times=False)
    try:
        variable_names = [v for v in ds.variables if v not in ('lat', 'lon')]
    finally:
        # The dataset is only needed for its variable names.
        ds.close()

    old_prefix = str(old_index)
    new_prefix = str(new_time_i)
    if old_prefix == new_prefix:
        return  # nothing to rename

    for v in variable_names:
        path = os.path.join(path_to_ds, v)
        # Only the files directly inside the variable directory are chunks.
        files = next(os.walk(path), (path, [], []))[2]
        for filename in files:
            # Compare the complete leading index, not just the first
            # character, so that e.g. index 1 does not match "10.0.0".
            # Metadata files (".zarray", ".zattrs") have an empty head and
            # are therefore never touched.
            head, sep, tail = filename.partition('.')
            if head != old_prefix:
                continue
            new_name = new_prefix + sep + tail
            os.rename(os.path.join(path, filename), os.path.join(path, new_name))
                

In [40]:
def adjust_zarray(dst_path, variable, line_to_adjust):
    """Increment the integer on one line of a variable's ``.zarray`` file.

    The file ``<dst_path>/<variable>/.zarray`` is read line by line, the
    integer at the start of line ``line_to_adjust`` (after its indentation)
    is increased by one, and the file is written back. Indentation and any
    trailing text (comma, newline) are preserved, and the integer may have
    any number of digits.

    Parameters
    ----------
    dst_path : str
        Path of the zarr directory store.
    variable : str
        Name of the array whose metadata is adjusted.
    line_to_adjust : int
        Zero-based index of the line that holds the number to increment.

    Raises
    ------
    ValueError
        If the selected line does not start with an integer.
    """
    zarray_path = os.path.join(dst_path, variable, '.zarray')
    with open(zarray_path, 'r') as zarray:
        data = zarray.readlines()

    line = data[line_to_adjust]
    body = line.lstrip()
    indent = line[:len(line) - len(body)]
    n_digits = len(body) - len(body.lstrip('0123456789'))
    if n_digits == 0:
        raise ValueError(
            "line {} of {} does not start with an integer: {!r}".format(
                line_to_adjust, zarray_path, line))
    data[line_to_adjust] = indent + str(int(body[:n_digits]) + 1) + body[n_digits:]

    with open(zarray_path, 'w') as zarray:
        zarray.writelines(data)

In [41]:
def copy_into_target(src_path, dst_path, src_index):
    """Copy the chunks of one time index from a source store into a target store.

    For every variable of the source store except ``lat`` and ``lon``, all
    chunk files whose leading (time) index equals ``src_index`` are copied
    under the same name into the matching variable directory of the target
    store. If at least one chunk was copied for a variable, that variable's
    ``.zarray`` in the target is adjusted exactly once, so its time length
    grows by one no matter how many spatial chunks the time step has.

    Parameters
    ----------
    src_path : str
        Path of the source zarr directory store.
    dst_path : str
        Path of the target zarr directory store.
    src_index : int
        Time index of the chunks to copy.
    """
    ds = xr.open_zarr(src_path, decode_times=False)
    try:
        variable_names = [v for v in ds.variables if v not in ('lat', 'lon')]
    finally:
        # The dataset is only needed for its variable names.
        ds.close()

    index_prefix = str(src_index)
    for v in variable_names:
        src_dir = os.path.join(src_path, v)
        # Only the files directly inside the variable directory are chunks.
        files = next(os.walk(src_dir), (src_dir, [], []))[2]
        copied_any = False
        for filename in files:
            # Compare the complete leading index, not just the first character.
            if filename.partition('.')[0] != index_prefix:
                continue
            copyfile(os.path.join(src_dir, filename), os.path.join(dst_path, v, filename))
            copied_any = True
        if copied_any:
            # Line of .zarray holding the first "shape" entry: index 16 for
            # the 1-D time array, index 18 for the 3-D data variables.
            line_to_adjust = 16 if v == "time" else 18
            adjust_zarray(dst_path, v, line_to_adjust)

In [42]:
def merge_single_zarr_into_destination_zarr(src_path, dst_path):
    """Insert the new time step of a source store into a destination store.

    The source store is expected to hold the new time step at time index 1.
    Its position in the destination is the index of the first destination
    time stamp above it; if the new time stamp is later than all existing
    ones, it goes at the end.

    Parameters
    ----------
    src_path : str
        Path of the zarr store holding the single new time step.
    dst_path : str
        Path of the zarr store the time step is inserted into.
    """
    ds_single = xr.open_zarr(src_path, decode_times=False)
    ds = xr.open_zarr(dst_path, decode_times=False)
    try:
        dst_times = ds.time.values
        new_time = ds_single.time[1].values
    finally:
        ds.close()
        ds_single.close()

    n_times = dst_times.shape[0]
    new_idx = find_nearest_above(dst_times, new_time)
    if new_idx is None:
        # No existing time stamp is later: the new one goes at the end.
        new_idx = n_times
    # TODO: needs to be adjusted in order to handle source data sets with
    # more than one new time stamp.
    old_idx = 1

    # Prepare the source store: the chunk files of the new time step
    # (all variables except "lat" and "lon") get their target time index.
    rename_directory(src_path, old_idx, new_idx)

    # Prepare the destination store: shift every time step from the
    # insertion position onwards up by one. This must run from the last
    # index downwards so that no chunk file is overwritten.
    for i in reversed(range(new_idx, n_times)):
        rename_directory(dst_path, i, i + 1)

    # Final step: copy the new time step's chunk files into the destination
    # store and adjust its .zarray metadata accordingly.
    copy_into_target(src_path, dst_path, new_idx)

    # Remove the store containing the single time stamp:
    #     os.remove(src_path)

In [43]:
merge_single_zarr_into_destination_zarr(DATASET_SINGLE_PATH,DATASET_PATH)

In [None]:
variable

In [None]:
with open((os.path.join(DATASET_PATH, "temperature", '.zarray')), 'r') as zarray:
          data = zarray.readlines()
data[18]

In [None]:
(str(int(data[18][8])+1) + data[18][9:]).rjust(len(data[18][8])+10)

In [None]:
(data[16][len(data[line_to_adjust])-2])

In [None]:
len(data[16])-2

In [None]:
data[18][9:]


In [None]:
line_to_adjust = 18
position = 8
white_space = 10

In [None]:
(str(int(data[line_to_adjust][position])+1) + data[line_to_adjust][9:]).rjust(len(data[line_to_adjust][position])+white_space)