In [1]:
import hashlib
import netCDF4 as nc4
from pathlib import Path
import shutil
import tempfile

In [2]:
def attribute_filter(attributes):
    "Filter for elements which do not start with '__NCH'."
    return filter(lambda a: not a.startswith("__NCH"), attributes)

In [3]:
def update_hash_attr(name, value, hash_obj=None):
    """Update `hash_obj` with the UTF8 encoded string version of first `name` and then `value`."""
    if hash_obj is None:
        hash_obj = hashlib.sha256()
    hash_obj.update(str(name).encode('utf8'))
    hash_obj.update(str(value).encode('utf8'))
    return hash_obj

In [4]:
def update_hash_var(var_obj, hash_obj=None):
    """Update `hash_obj` from a variable."""
    if hash_obj is None:
        hash_obj = hashlib.sha256()
    hash_obj.update(str(var_obj.name).encode('utf8'))
    for attr in attribute_filter(sorted(var_obj.ncattrs())):
        update_hash_attr(attr, hash_obj)

In [5]:
def calculate_file_hash(file_name):
    """Calculate hash for a given netCDF file."""
    with nc4.Dataset(str(file_name)) as data_set:
        hash_obj = None
        for key, var in sorted(data_set.variables.items()):
            hash_obj = update_hash_var(var, hash_obj)
        for name in attribute_filter(data_set.ncattrs()):
            value = data_set.getncattr(name)
            hash_obj = update_hash_attr(name, value, hash_obj)
        return hash_obj.hexdigest()

In [6]:
def calc_and_append_file_hash(file_name):
    file_hash = calculate_file_hash(file_name)
    with nc4.Dataset(file_name, mode="a") as ds:
        ds.__NCH_file_hash = str(file_hash)
        ds.sync()
    return file_hash

In [7]:
def verify_file_hash(file_name, take_hash_from=None):
    if take_hash_from is not None:
        with nc4.Dataset(take_hash_from, mode="r") as ds:
            ref_hash = ds.__NCH_file_hash
    else:
        with nc4.Dataset(file_name, mode="r") as ds:
            ref_hash = ds.__NCH_file_hash
    file_hash = calculate_file_hash(file_name)
    return file_hash == ref_hash

In [8]:
files_to_process = ["example_data/madis-sao.nc" + knd
                    for knd in ["", ".3", ".4", ".6", ".7"]]
with tempfile.TemporaryDirectory() as tmpdir:
    for ftop in files_to_process:
        tmpfile = Path(str(tmpdir)) / Path(ftop).name
        shutil.copy(str(ftop), str(tmpfile))

        print("nc-hashes:\n")
        
        print(calculate_file_hash(ftop), ftop)
        print(calc_and_append_file_hash(tmpfile), tmpfile, "first iteration")

        print("verification:", verify_file_hash(tmpfile), tmpfile)
        print("verification:", verify_file_hash(ftop, tmpfile), ftop, tmpfile)
        
        print("\nfull-file hashes:\n")

        for fn in [ftop, tmpfile]:
            fn = str(fn)
            with open(fn, 'rb') as f:
                print(hashlib.sha256(f.read()).hexdigest(), fn)

        print()

nc-hashes:

d03d576afef618549f44a49bb7a988ff9801670cf0944ad5254e26a2dd6397d8 example_data/madis-sao.nc
d03d576afef618549f44a49bb7a988ff9801670cf0944ad5254e26a2dd6397d8 /tmp/tmpma6nmrcl/madis-sao.nc first iteration
verification: True /tmp/tmpma6nmrcl/madis-sao.nc
verification: True example_data/madis-sao.nc /tmp/tmpma6nmrcl/madis-sao.nc

full-file hashes:

393705135758f8d35cf9078e0e9e34731c81f36ee2bfd369bc765ecfcdaf8abe example_data/madis-sao.nc
9a6a2bdf3276cdebe1faf774a945ced4e952c51ee489f43cf20405eae66c2574 /tmp/tmpma6nmrcl/madis-sao.nc

nc-hashes:

d03d576afef618549f44a49bb7a988ff9801670cf0944ad5254e26a2dd6397d8 example_data/madis-sao.nc.3
d03d576afef618549f44a49bb7a988ff9801670cf0944ad5254e26a2dd6397d8 /tmp/tmpma6nmrcl/madis-sao.nc.3 first iteration
verification: True /tmp/tmpma6nmrcl/madis-sao.nc.3
verification: True example_data/madis-sao.nc.3 /tmp/tmpma6nmrcl/madis-sao.nc.3

full-file hashes:

fbcdea56616f365f8ee62ce7c468e8b72fc4b418bbcb70f0b458e258981e736c example_data/madis-sao