## Test notebook: inserting time stamps out of chronological order into a Zarr store

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import zarr
import os
import shutil
from shutil import copyfile
import sys
import json

In [2]:
# Destination data cube (Zarr directory). It is passed as `dst_path` to
# check_merge_or_append in the last cell. Machine-specific absolute path.
DATASET_PATH = "/home/alicja/Desktop/DCS4COP/CMEMS_TEST/dcs4cop-bc-sst-sns-l2c-v1-2017.zarr"

In [3]:
# Source Zarr directory holding the time stamp(s) to insert. It is passed as
# `src_path` to check_merge_or_append in the last cell. Machine-specific
# absolute path.
DATASET_SINGLE_PATH = "/home/alicja/Desktop/DCS4COP/CMEMS_TEST/test_force_time_chron/dcs4cop-bc-sst-sns-l2c-v1-2017.zarr"

In [4]:
def check_merge_or_append(src_path, dst_path):
    """Decide whether the source time stamps are appended or merged.

    The time stamps of the Zarr dataset at ``src_path`` are compared with the
    last time stamp of the data cube at ``dst_path``. If every source time
    stamp is later than the last destination time stamp (or the destination
    has no time stamps at all), append mode is reported. Otherwise the work
    is handed to ``check_if_unique``.

    Args:
        src_path: Path of the Zarr dataset holding the new time stamp(s).
        dst_path: Path of the existing Zarr data cube.
    """
    print('Check merge or append')
    # Read both time axes into memory and close the stores before anything
    # downstream starts renaming or copying files inside them.
    with xr.open_zarr(src_path) as ds_single, xr.open_zarr(dst_path) as ds:
        src_times = ds_single.time.values
        dst_times = ds.time.values

    # An empty destination has no "last" time stamp; everything is an append.
    if dst_times.size == 0 or bool(np.all(src_times > dst_times[-1])):
        # Only the decision is reported here; no append is performed.
        print("Append modus is chosen.")
    else:
        check_if_unique(src_path, dst_path)

In [5]:
def check_if_unique(src_path, dst_path):
    """Merge every source time stamp that is not yet in the destination.

    Each time stamp of the dataset at ``src_path`` is compared with the time
    stamps of the data cube at ``dst_path``. A time stamp that does not occur
    in the destination is passed to
    ``merge_single_zarr_into_destination_zarr``; one that already occurs is
    skipped with a message.

    Args:
        src_path: Path of the Zarr dataset holding the new time stamp(s).
        dst_path: Path of the existing Zarr data cube.
    """
    print('check unique')
    # Snapshot both time axes and close the stores: the merge below renames
    # and copies files inside these directories.
    with xr.open_zarr(src_path) as ds_single, xr.open_zarr(dst_path) as ds:
        src_times = ds_single.time.values
        dst_times = ds.time.values

    for src_idx, src_time in enumerate(src_times):
        # A time stamp is a duplicate as soon as ANY destination entry equals
        # it. Requiring all entries to match would treat nearly every
        # duplicate as new and merge it a second time.
        if not np.any(dst_times == src_time):
            merge_single_zarr_into_destination_zarr(src_path, dst_path, src_idx)
        else:
            print("All timestamps to be merged are aleady in destination data set, and are skipped.")
        

In [6]:
def rename_file(path_to_ds, old_index, new_time_i):
    """Rename the chunk files of one time index to a new time index.

    For every variable of the Zarr dataset at ``path_to_ds`` except the
    spatial ones (``lat``, ``lon``, ``lat_bnds``, ``lon_bnds``), each file in
    the variable's directory whose leading index equals ``old_index`` is
    renamed so that its leading index becomes ``new_time_i``. A file named
    ``"3.0.0"`` becomes ``"7.0.0"``, and a file named ``"3"`` (as used for
    the ``time`` variable) becomes ``"7"``.

    Assumes chunk files are named ``<time index>[.<further indices>]`` and
    lie directly inside the variable directory.

    Args:
        path_to_ds: Path of the Zarr dataset whose files are renamed.
        old_index: Current time index of the files to rename.
        new_time_i: Time index the files should carry afterwards.
    """
    print('renaming file')
    old_key = str(old_index)
    new_key = str(new_time_i)
    # Same exclusion list as copy_into_target: these variables are not
    # indexed by time, so their chunk names must not be touched.
    skipped = ('lat', 'lon', 'lat_bnds', 'lon_bnds')

    with xr.open_zarr(path_to_ds) as ds:
        variables = [v for v in ds.variables if v not in skipped]

    for v in variables:
        var_dir = os.path.join(path_to_ds, v)
        if not os.path.isdir(var_dir):
            continue
        for filename in os.listdir(var_dir):
            if not os.path.isfile(os.path.join(var_dir, filename)):
                continue
            # Compare the whole leading index, not just the first character:
            # index 1 must not match "10.0.0", and index 12 must match
            # "12.0.0". Metadata files such as ".zarray" have an empty
            # leading part and never match.
            time_key, sep, rest = filename.partition('.')
            if time_key != old_key:
                continue
            new_name = new_key + sep + rest
            if new_name != filename:
                os.rename(os.path.join(var_dir, filename),
                          os.path.join(var_dir, new_name))
                

In [7]:
def adjust_zarray(dst_path, variable):
    """Grow the first dimension recorded in a variable's ``.zarray`` by one.

    Reads ``<dst_path>/<variable>/.zarray`` as JSON, increments the first
    entry of its ``"shape"`` list and writes the file back with an indent of
    four spaces. All other entries are left untouched.

    Args:
        dst_path: Path of the Zarr dataset that contains the variable.
        variable: Name of the variable directory inside ``dst_path``.

    Raises:
        IndexError: If the recorded shape is empty.
    """
    print('adjusting zarray')
    zarray_path = os.path.join(dst_path, variable, '.zarray')

    with open(zarray_path, 'r') as json_file:
        metadata = json.load(json_file)

    # The first shape entry is the one incremented; callers use it for the
    # variable that just received one more time step.
    metadata["shape"][0] += 1

    with open(zarray_path, 'w') as json_file:
        json.dump(metadata, json_file, indent=4)

In [8]:
def copy_into_target(src_path, dst_path, src_index):
    """Copy the files of one time index from the source into the destination.

    For every variable of the dataset at ``src_path`` except ``lat``,
    ``lon``, ``lat_bnds`` and ``lon_bnds``, each file in the variable's
    source directory whose leading index equals ``src_index`` is copied under
    the same name into the matching variable directory of ``dst_path``.
    Afterwards ``adjust_zarray`` is called for that variable in the
    destination, whether or not a file was copied.

    Assumes files are named ``<time index>[.<further indices>]`` and lie
    directly inside the variable directory.

    Args:
        src_path: Path of the Zarr dataset the files are copied from.
        dst_path: Path of the Zarr data cube the files are copied into.
        src_index: Time index (in the source) of the files to copy.
    """
    print('copy into target')
    index_key = str(src_index)
    skipped = ('lat', 'lon', 'lat_bnds', 'lon_bnds')

    with xr.open_zarr(src_path) as ds:
        variables = [v for v in ds.variables if v not in skipped]

    for v in variables:
        src_dir = os.path.join(src_path, v)
        if os.path.isdir(src_dir):
            for filename in os.listdir(src_dir):
                src_file = os.path.join(src_dir, filename)
                # Match the whole leading index rather than the first
                # character, so indices >= 10 are found and index 1 does not
                # also pick up "10.*", "11.*", ...
                if os.path.isfile(src_file) and filename.partition('.')[0] == index_key:
                    copyfile(src_file, os.path.join(dst_path, v, filename))
        adjust_zarray(dst_path, v)

In [9]:
def merge_single_zarr_into_destination_zarr(src_path, dst_path, src_idx):
    """Insert one source time stamp into the destination at its time position.

    The insertion index is the index of the first destination time stamp that
    is later than the source time stamp; if there is none, it is the current
    number of destination time stamps. The merge then runs in three steps:

    1. Rename the source files from ``src_idx`` to the insertion index.
    2. Shift the destination files from the insertion index onwards up by
       one, highest index first.
    3. Copy the source files into the destination, which also updates the
       ``.zarray`` shapes via ``copy_into_target``.

    Note that step 1 renames files inside the source dataset itself.

    Args:
        src_path: Path of the Zarr dataset holding the new time stamp.
        dst_path: Path of the existing Zarr data cube.
        src_idx: Index of the time stamp within the source dataset.
    """
    # Take everything needed from the datasets while they are open; the
    # files underneath are renamed and extended below.
    with xr.open_zarr(src_path) as ds_single, xr.open_zarr(dst_path) as ds:
        src_time = ds_single.time.values[src_idx]
        dst_times = ds.time.values
    n_dst = dst_times.shape[0]

    later = np.flatnonzero(dst_times > src_time)
    # With no later destination time stamp the new one goes at the end,
    # instead of failing on the minimum of an empty index array.
    new_idx = int(later[0]) if later.size else n_dst

    # Prepare the source: its files must carry the index they will have in
    # the destination.
    rename_file(src_path, src_idx, new_idx)

    # Make room in the destination. Going from the highest index downwards
    # ensures no file is renamed onto one that still has to be moved.
    for i in reversed(range(new_idx, n_dst)):
        rename_file(dst_path, i, i + 1)

    # Copy the prepared source files and adjust the .zarray metadata.
    copy_into_target(src_path, dst_path, new_idx)

In [10]:
# ds = xr.open_zarr(DATASET_PATH)
# ds_single = xr.open_zarr(DATASET_SINGLE_PATH)

In [11]:
%time
check_merge_or_append(DATASET_SINGLE_PATH,DATASET_PATH)

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 12.6 µs
Check merge or append
check unique
renaming file
renaming file
copy into target
adjusting zarray
adjusting zarray
adjusting zarray


-------------------------------------