## Test notebook for inserting time stamps non-chronologically into a Zarr store

In [15]:
import xarray as xr
import pandas as pd
import numpy as np
import zarr
import os
import shutil
from shutil import copyfile
import sys

In [16]:
DATASET_PATH = "test.zarr"

In [14]:
DATASET_SINGLE_PATH = "test_single.zarr"

In [13]:
def create_dataset(datetime):
    """Build a random, single-time-step dataset on a fixed lat/lon grid.

    The grid has 2000 latitudes spanning 50..52 and 4000 longitudes spanning
    0..4. ``datetime`` is parsed with ``pandas.to_datetime`` and becomes the
    only entry of the ``time`` axis. The returned ``xarray.Dataset`` holds two
    variables, ``precipitation`` and ``temperature``, each filled with
    independent uniform random values of shape ``(1, 2000, 4000)``.
    """
    n_lon, n_lat = 4000, 2000
    axes = ("time", "lat", "lon")
    coords = (
        [pd.to_datetime(datetime)],
        np.linspace(50, 52, n_lat),
        np.linspace(0, 4, n_lon),
    )

    def random_field():
        # One fresh random cube per variable, all sharing the same coordinates.
        return xr.DataArray(np.random.rand(1, n_lat, n_lon), coords=coords, dims=axes)

    precipitation = random_field()
    temperature = random_field()
    return xr.Dataset({"precipitation": precipitation, "temperature": temperature})
    

In [5]:
def check_merge_or_append(src_path, dst_path):
    """Decide whether the source time stamps can be appended or must be merged.

    Both zarr stores are opened with undecoded times. If every time stamp of
    the source store is strictly greater than the last time stamp of the
    destination store, "append" mode is reported (nothing is written in that
    case, only a message is printed). Otherwise ``check_if_unique`` is called,
    which merges the source time stamps into the destination store, and
    "merge" mode is reported.

    Parameters
    ----------
    src_path : path of the zarr store holding the time stamps to add.
    dst_path : path of the zarr store (data cube) to add them to.
    """
    # Take the decision while the stores are open, then close them again:
    # check_if_unique() renames and copies files inside both directories, which
    # should not happen while this function still holds them open.
    with xr.open_zarr(src_path, decode_times=False) as ds_single, \
            xr.open_zarr(dst_path, decode_times=False) as ds:
        can_append = bool(np.greater(ds_single.time, ds.time[-1]).all())

    if can_append:
        # append modus
        print("Append modus is chosen.")
    else:
        check_if_unique(src_path, dst_path)
        print("Merge modus is chosen.")

In [6]:
def check_if_unique(src_path, dst_path):
    """Merge every source time stamp that is not yet present in the destination.

    For each time stamp of the store at ``src_path`` the destination time axis
    is searched for an equal value. A time stamp that does not occur there is
    handed to ``merge_single_zarr_into_destination_zarr``; one that already
    exists is skipped with a message.

    Parameters
    ----------
    src_path : path of the zarr store holding the time stamps to add.
    dst_path : path of the zarr store (data cube) to merge into.
    """
    # Snapshot both time axes and close the stores before any merge touches
    # the files on disk.
    with xr.open_zarr(src_path, decode_times=False) as ds_single, \
            xr.open_zarr(dst_path, decode_times=False) as ds:
        src_times = ds_single.time.values
        dst_times = ds.time.values

    # NOTE(review): each merge renames chunk files inside the source store, so
    # with several not-yet-present source time stamps the later indices may no
    # longer line up with the files on disk -- confirm before relying on that.
    for src_idx, src_time in enumerate(src_times):
        # The time stamp is a duplicate as soon as ANY destination entry
        # equals it (requiring ALL entries to match would merge duplicates).
        already_present = np.equal(dst_times, src_time).any()
        if not already_present:
            merge_single_zarr_into_destination_zarr(src_path, dst_path, src_idx)
        else:
            print("All timestamps to be merged are aleady in destination data set, and are skipped.")
        

In [7]:
def find_nearest_above(array, target):
    """Return the index of the smallest value in ``array`` strictly above ``target``.

    This is the position at which a new time stamp has to be inserted.
    ``None`` is returned when no element of ``array`` is greater than
    ``target`` (this includes an empty ``array``).
    """
    distance = array - target
    # Elements at or below the target are no candidates, so they get masked
    # out before looking for the minimum distance.
    not_above = np.ma.less_equal(distance, 0)
    if not np.all(not_above):
        return np.ma.masked_array(distance, not_above).argmin()
    return None  # target is greater than or equal to every value

In [13]:
def rename_file(path_to_ds, old_index, new_time_i):
    """Rename the chunk files of one time index to a new time index.

    For every variable of the store at ``path_to_ds`` except ``lat`` and
    ``lon``, all files below the variable's directory whose leading chunk
    index equals ``old_index`` are renamed so that this leading index becomes
    ``new_time_i``; the remaining parts of the file name are kept.

    Assumes the flat chunk naming ``<i0>.<i1>...`` (or just ``<i0>`` for
    one-dimensional arrays such as ``time``) with the time axis first.
    Metadata files such as ``.zarray`` have an empty leading component and are
    therefore never renamed.

    Parameters
    ----------
    path_to_ds : path of the zarr store directory.
    old_index : current time index of the chunk files to rename.
    new_time_i : time index the chunk files should carry afterwards.
    """
    # Only the variable names are needed; close the store before renaming.
    with xr.open_zarr(path_to_ds, decode_times=False) as ds:
        variables = [v for v in ds.variables if (v != 'lat') and (v != 'lon')]

    old_prefix = str(old_index)
    new_prefix = str(new_time_i)
    for v in variables:
        path = os.path.join(path_to_ds, v)
        for root, _dirs, files in os.walk(path):
            for filename in files:
                # Compare the complete leading index, not just its first
                # character: index 1 must not match chunk "10.0.0", and
                # indices >= 10 must still be found.
                leading, sep, remainder = filename.partition('.')
                if leading != old_prefix:
                    continue
                new_name = new_prefix + sep + remainder
                if new_name != filename:
                    os.rename(os.path.join(root, filename), os.path.join(root, new_name))
                

In [9]:
def adjust_zarray(dst_path, variable, line_to_adjust):
    """Increment the integer on one line of a variable's ``.zarray`` file by one.

    The file ``<dst_path>/<variable>/.zarray`` is read line by line, the first
    integer found on line ``line_to_adjust`` (0-based) is replaced by its
    value plus one, and the file is written back. Callers use this to grow the
    time extent of the array shape after one time step has been added.

    Parameters
    ----------
    dst_path : path of the zarr store directory.
    variable : name of the variable (sub-directory) whose metadata is changed.
    line_to_adjust : 0-based index of the line holding the number to increment.

    Raises
    ------
    ValueError
        If the selected line does not contain an integer.
    """
    import re

    zarray_path = os.path.join(dst_path, variable, '.zarray')
    with open(zarray_path, 'r') as zarray:
        data = zarray.readlines()

    # Replace the whole number, not only its first digit, so that extents of
    # two or more digits are handled (10 -> 11 rather than 10 -> 20). The
    # indentation and any trailing comma of the line are left untouched.
    data[line_to_adjust], replaced = re.subn(
        r'\d+',
        lambda match: str(int(match.group()) + 1),
        data[line_to_adjust],
        count=1,
    )
    if not replaced:
        raise ValueError(
            "line {} of {} does not contain an integer".format(line_to_adjust, zarray_path)
        )

    with open(zarray_path, 'w') as zarray:
        zarray.writelines(data)

In [10]:
def copy_into_target(src_path, dst_path, src_index):
    """Copy the chunk files of one time index into the destination store.

    For every variable of the source store except ``lat`` and ``lon``, all
    files whose leading chunk index equals ``src_index`` are copied into the
    directory of the same variable in the destination store. Afterwards the
    variable's ``.zarray`` in the destination is adjusted via
    ``adjust_zarray`` so that its extent grows by one.

    Parameters
    ----------
    src_path : path of the zarr store the chunk files are taken from.
    dst_path : path of the zarr store the chunk files are copied into.
    src_index : time index (leading chunk index) of the files to copy.
    """
    # Only the variable names are needed; close the store before copying.
    with xr.open_zarr(src_path, decode_times=False) as ds:
        variables = [v for v in ds.variables if (v != 'lat') and (v != 'lon')]

    index_prefix = str(src_index)
    for v in variables:
        path = os.path.join(src_path, v)
        for root, _dirs, files in os.walk(path):
            for filename in files:
                # Match the complete leading index; checking only the first
                # character would confuse e.g. index 1 with chunk "10.0.0".
                if filename.partition('.')[0] == index_prefix:
                    copyfile(os.path.join(root, filename), os.path.join(dst_path, v, filename))
        # Presumably the 0-based line of the first "shape" entry in the
        # pretty-printed .zarray: 16 for the 1-D time array, 18 for the 3-D
        # data variables -- TODO confirm for the zarr version in use.
        line_to_adjust = 16 if v == "time" else 18
        adjust_zarray(dst_path, v, line_to_adjust)

In [11]:
def merge_single_zarr_into_destination_zarr(src_path, dst_path, old_idx):
    """Insert one time step of the source store into the destination store.

    The insertion position is the index of the nearest destination time stamp
    above the source time stamp at ``old_idx``. If no destination time stamp
    is above it, the time step is placed after the last existing one.

    Steps:
    1. rename the source chunk files from ``old_idx`` to the insertion index,
    2. shift the destination chunk files from the insertion index onwards up
       by one, going backwards so that no file is overwritten,
    3. copy the source chunk files into the destination store and adjust its
       ``.zarray`` metadata.

    Parameters
    ----------
    src_path : path of the zarr store holding the time step to insert.
    dst_path : path of the zarr store (data cube) to insert into.
    old_idx : index of the time step within the source store.
    """
    # Read everything needed while the stores are open and close them before
    # any file is renamed or copied.
    with xr.open_zarr(src_path, decode_times=False) as ds_single, \
            xr.open_zarr(dst_path, decode_times=False) as ds:
        n_dst_times = ds.time.shape[0]
        new_idx = find_nearest_above(ds.time, ds_single.time[old_idx])

    # find_nearest_above() yields None when nothing in the destination lies
    # above the new time stamp; the new step then belongs at the very end.
    new_idx = n_dst_times if new_idx is None else int(new_idx)

    # Preparing the source directory with the single time stamp to be ready for merging
    # --> files of variables, excluding "lat" and "lon", need to be renamed
    rename_file(src_path, old_idx, new_idx)

    # Preparing the destination directory to be ready for the single time stamp to be merged
    # --> files of variables, excluding "lat" and "lon", need to be renamed.
    # The renaming needs to happen in reversed order, starting at the index of
    # the nearest above value.
    for i in reversed(range(new_idx, n_dst_times)):
        rename_file(dst_path, i, (i + 1))

    # Final step: copy the single time stamp files into the destination zarr
    # and adjust .zarray to the change.
    copy_into_target(src_path, dst_path, new_idx)

-------------------------------------

Testing the performance of `check_merge_or_append` for different insertion positions

In [None]:
# Benchmark: for every even day i in 12..28, build a fresh destination cube and
# a fresh two-step source store, time check_merge_or_append() on them and
# delete both stores again.
for i in range(12, 30, 2):
    # Create the zarr which will be the cube into which data is merged:
    # it starts with 2018-01-10 and gets the odd days 11..29 appended.
    ds = create_dataset("2018-01-10")
    ds.to_zarr(DATASET_PATH, mode="w")
    ds.close()
    # NOTE(review): "test.zarr" repeats the value of DATASET_PATH as a literal.
    root_group = zarr.open("test.zarr", mode='a')
    for j in range(11, 30, 2):
#     print(i)
        ds = create_dataset(f"2018-01-{j}")
        # Append the new step directly to every zarr array that has a time
        # axis; arrays without one (lat, lon) are left alone.
        for var_name, var_array in root_group.arrays():
            var = ds[var_name]
            if 'time' in var.dims:            
                time_axis = var.dims.index('time')
                var_array.append(var, axis=time_axis)
    ds.close()
    
    # Create the source store: 2018-01-10 plus the day i under test.
    ds = create_dataset("2018-01-10")
    ds.to_zarr(DATASET_SINGLE_PATH, mode="w")
    ds.close()
    # NOTE(review): "test_single.zarr" repeats DATASET_SINGLE_PATH as a literal.
    root_group = zarr.open("test_single.zarr", mode='a')
    ds = create_dataset(f"2018-01-{i}")
    for var_name, var_array in root_group.arrays():
        var = ds[var_name]
        if 'time' in var.dims:            
            time_axis = var.dims.index('time')
            var_array.append(var, axis=time_axis)
    ds.close()                  
    # NOTE(review): %timeit executes the statement repeatedly, and a merge
    # modifies both stores, so only the first run sees the freshly built
    # stores -- confirm that this is what should be measured.
    %timeit check_merge_or_append(DATASET_SINGLE_PATH,DATASET_PATH)
    # Remove both stores so the next iteration starts from scratch.
    shutil.rmtree(DATASET_PATH)
    shutil.rmtree(DATASET_SINGLE_PATH)

In [None]:
%%time
check_merge_or_append(DATASET_SINGLE_PATH,DATASET_PATH)

In [1]:
import numpy as np


In [7]:
a = np.linspace(0, 1, 27).reshape((3,3,3))

In [8]:
a

array([[[0.        , 0.03846154, 0.07692308],
        [0.11538462, 0.15384615, 0.19230769],
        [0.23076923, 0.26923077, 0.30769231]],

       [[0.34615385, 0.38461538, 0.42307692],
        [0.46153846, 0.5       , 0.53846154],
        [0.57692308, 0.61538462, 0.65384615]],

       [[0.69230769, 0.73076923, 0.76923077],
        [0.80769231, 0.84615385, 0.88461538],
        [0.92307692, 0.96153846, 1.        ]]])

In [9]:
a[1:,:,:] = a[0, :,:]


In [10]:
a[0, :,:] = 999

In [11]:
a

array([[[9.99000000e+02, 9.99000000e+02, 9.99000000e+02],
        [9.99000000e+02, 9.99000000e+02, 9.99000000e+02],
        [9.99000000e+02, 9.99000000e+02, 9.99000000e+02]],

       [[0.00000000e+00, 3.84615385e-02, 7.69230769e-02],
        [1.15384615e-01, 1.53846154e-01, 1.92307692e-01],
        [2.30769231e-01, 2.69230769e-01, 3.07692308e-01]],

       [[0.00000000e+00, 3.84615385e-02, 7.69230769e-02],
        [1.15384615e-01, 1.53846154e-01, 1.92307692e-01],
        [2.30769231e-01, 2.69230769e-01, 3.07692308e-01]]])

In [18]:

# Create the zarr which will be the cube into which data is merged:
# it starts with 2018-01-10 and gets the odd days 11..29 appended.
ds = create_dataset("2018-01-10")
ds.to_zarr(DATASET_PATH, mode="w")
ds.close()
# NOTE(review): "test.zarr" repeats the value of DATASET_PATH as a literal.
root_group = zarr.open("test.zarr", mode='a')
for j in range(11, 30, 2):
#     print(i)
    ds = create_dataset(f"2018-01-{j}")
    # Append the new step directly to every zarr array that has a time axis;
    # arrays without one (lat, lon) are left alone.
    for var_name, var_array in root_group.arrays():
        var = ds[var_name]
        if 'time' in var.dims:            
            time_axis = var.dims.index('time')
            var_array.append(var, axis=time_axis)
ds.close()

# Create the source store: 2018-01-10 plus day i (2018-01-12), a date that
# lies between two time stamps of the destination cube.
i=12
ds = create_dataset("2018-01-10")
ds.to_zarr(DATASET_SINGLE_PATH, mode="w")
ds.close()
# NOTE(review): "test_single.zarr" repeats DATASET_SINGLE_PATH as a literal.
root_group = zarr.open("test_single.zarr", mode='a')
ds = create_dataset(f"2018-01-{i}")
for var_name, var_array in root_group.arrays():
    var = ds[var_name]
    if 'time' in var.dims:            
        time_axis = var.dims.index('time')
        var_array.append(var, axis=time_axis)
ds.close()  

In [19]:
ds_single = xr.open_zarr(DATASET_SINGLE_PATH, decode_times=False)
ds = xr.open_zarr(DATASET_PATH, decode_times=False)

In [20]:
import zarr


In [45]:
ds = zarr.open(DATASET_PATH)

In [22]:
ds

<zarr.hierarchy.Group '/'>

In [23]:
ds.attrs

<zarr.attrs.Attributes at 0x7f4b282463c8>

In [46]:
ds_single = zarr.open(DATASET_SINGLE_PATH)

In [26]:
ds = zarr.array(ds)

In [27]:
ds_single = zarr.array(ds_single)

In [31]:
ds[0:1]

array(['lat'], dtype='<U13')

In [32]:
ds.shape

(5,)

In [None]:
ds[]

In [35]:
# Structured dtype with an integer field "c1" and a float field "c2".
# The builtins int/float are used because the aliases np.int / np.float
# were removed from NumPy (1.24); they map to the same dtypes.
t = [('c1', int), ('c2', float)]
a = np.array([(1, 2.0), (2, 3.0), (3, 4.0)], dtype=t)


In [34]:
a[0] = (4, 5.0)
a[0:1] = [(5, 6.0)]


In [36]:
a

array([(1, 2.), (2, 3.), (3, 4.)], dtype=[('c1', '<i8'), ('c2', '<f8')])

In [37]:
a[0] = (4, 5.0)


In [38]:
a

array([(4, 5.), (2, 3.), (3, 4.)], dtype=[('c1', '<i8'), ('c2', '<f8')])

In [39]:
a[0:1] = [(5, 6.0)]

In [40]:
a

array([(5, 6.), (2, 3.), (3, 4.)], dtype=[('c1', '<i8'), ('c2', '<f8')])

In [41]:
a[0:2] = [(6, 7.0), (7, 8.0)]

In [42]:
a

array([(6, 7.), (7, 8.), (3, 4.)], dtype=[('c1', '<i8'), ('c2', '<f8')])

In [47]:
ds.time.con

<zarr.core.Array '/time' (11,) int64>