In [None]:
# Fallback git refs for the comparison. Each one is used when the matching
# REF1_HASH / REF2_HASH environment variable is missing or empty.
REF1_HASH_DEFAULT = 'b4ebd7576eab41248b4e0de0b92af558b144f5f6'  # full 40-character hash
REF2_HASH_DEFAULT = 'upstream/main'  # presumably the main branch of an `upstream` remote -- confirm

In [None]:
import os
import shutil
import tempfile
import subprocess
import pandas as pd
from filecmp import dircmp
from pathlib import Path
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode
init_notebook_mode()

In [None]:
# Resolve the two refs to compare. An environment variable wins when it is set
# to a non-empty string; a missing or empty variable falls back to the default.
REF1_HASH = os.environ.get('REF1_HASH') or REF1_HASH_DEFAULT
REF2_HASH = os.environ.get('REF2_HASH') or REF2_HASH_DEFAULT

In [None]:
def highlight_missing(val):
    """Return a CSS background rule for a presence flag.

    Parameters
    ----------
    val : object
        Value compared against ``True`` with ``==``.

    Returns
    -------
    str
        A light-green background rule when ``val == True``, otherwise a
        light-red one.
    """
    present_style = 'background-color: #BCF5A9'
    missing_style = 'background-color: #F5A9A9'
    # Equality (not identity) is deliberate: values such as 1 also compare
    # equal to True and take the "present" colour.
    return present_style if val == True else missing_style
    
def highlight_relative_difference(val):
    """Return a CSS background rule graded by the size of ``val``.

    Parameters
    ----------
    val : float or None
        The value to grade. ``None`` is treated as "no difference".

    Returns
    -------
    str
        ``'background-color: <hex>'`` where the colour is

        * light red    for ``val > 1``
        * light orange for ``val > 1e-1``
        * light yellow for ``val > 1e-2``
        * light green  otherwise (including ``None`` and NaN, for which
          every comparison is false).
    """
    if val is None:
        return 'background-color: #BCF5A9'
    # Thresholds must be tested from the largest down: testing `> 1e-2` first
    # would also match every larger value and make the other branches
    # unreachable.
    if val > 1:
        return 'background-color: #F5A9A9'
    if val > 1e-1:
        return 'background-color: #F5D0A9'
    if val > 1e-2:
        return 'background-color: #F2F5A9'
    return 'background-color: #BCF5A9'

In [None]:
# Echo the resolved refs as the cell output so the report records what was compared.
REF1_HASH, REF2_HASH

In [None]:
class ReferenceComparer(object):
    """Compare the ``tardis`` directory tree between two reference states.

    Each reference is either a git ref, checked out into a temporary
    directory, or (when its hash is ``None``) a copy of the local
    ``compare_path`` directory. Constructing the object materialises both
    trees, compares them with ``filecmp.dircmp`` and prints a report; every
    differing ``.h5`` file is additionally summarised key by key.

    Parameters
    ----------
    ref1_hash, ref2_hash : str or None
        Git refs to compare. At most one of them may be ``None``.

    Notes
    -----
    The temporary directory is not removed automatically; call
    :meth:`teardown` when done.
    """

    def __init__(self, ref1_hash=None, ref2_hash=None):
        assert not ((ref1_hash is None) and (ref2_hash is None)), "One hash can not be None"
        # Per-file comparison summary, filled in by print_diff_files /
        # summarise_changes_hdf.
        # NOTE(review): entries are keyed by bare file name, so two differing
        # .h5 files with the same name in different sub-directories overwrite
        # each other.
        self.test_table_dict = {}
        self.ref1_hash = ref1_hash
        self.ref2_hash = ref2_hash
        self.compare_path = "tardis"
        self.tmp_dir = None
        self.setup()
        self.tmp_dir = Path(self.tmp_dir)
        self.ref1_path = self.tmp_dir / f"ref1_{self.compare_path}"
        self.ref2_path = self.tmp_dir / f"ref2_{self.compare_path}"
        self.dcmp = dircmp(self.ref1_path, self.ref2_path)
        self.print_diff_files(self.dcmp)

    def setup(self):
        """Create the temporary directory and place both reference trees in it.

        Sets ``self.tmp_dir`` plus ``self.ref1_fname`` / ``self.ref2_fname``
        (the paths of the two trees inside the temporary directory).
        """
        self.tmp_dir = tempfile.mkdtemp()
        print('Created temporary directory at {0}. Delete after use with .teardown'.format(self.tmp_dir))

        for ref_id, ref_hash in enumerate([self.ref1_hash, self.ref2_hash], start=1):
            target = os.path.join(self.tmp_dir,
                                  'ref{0}_{1}'.format(ref_id, self.compare_path))
            if ref_hash is not None:
                self._copy_data_from_hash(ref_hash, 'ref{0}_'.format(ref_id))
            else:
                # Copy the local tree synchronously and recursively. A
                # fire-and-forget `cp` without -r cannot copy a directory and
                # may not have finished by the time the trees are compared.
                shutil.copytree(self.compare_path, target)
            setattr(self, 'ref{0}_fname'.format(ref_id), target)

    def teardown(self):
        """Delete the temporary directory created by :meth:`setup`."""
        shutil.rmtree(self.tmp_dir)

    def _copy_data_from_hash(self, ref_hash, prefix):
        """Check out ``compare_path`` at ``ref_hash`` into the temporary directory.

        The tree is checked out as ``<tmp_dir>/<compare_path>`` and then
        renamed to ``<tmp_dir>/<prefix><compare_path>``.

        Raises
        ------
        subprocess.CalledProcessError
            If ``git checkout`` exits with a non-zero status.
        """
        # NOTE(review): `git checkout <ref> -- <path>` also updates the index
        # of the repository it is run from -- confirm this is acceptable.
        git_cmd = [
            'git',
            '--work-tree={0}'.format(self.tmp_dir),
            'checkout',
            ref_hash,
            '--',  # keep the path from being parsed as a ref or an option
            self.compare_path,
        ]
        # check=True surfaces a failed checkout here rather than as a
        # confusing missing-directory error from the move below.
        subprocess.run(git_cmd, check=True)
        shutil.move(os.path.join(self.tmp_dir, self.compare_path),
                    os.path.join(self.tmp_dir, prefix + self.compare_path))

    def print_diff_files(self, dcmp):
        """Recursively report files that were added, removed or modified.

        Parameters
        ----------
        dcmp : filecmp.dircmp
            Comparison whose left side is below ``self.ref1_path`` and whose
            right side is below ``self.ref2_path``.

        For every modified ``.h5`` file an entry is created in
        ``self.test_table_dict`` and :meth:`summarise_changes_hdf` is called.
        """
        # The top-level dircmp holds Path objects while nested ones hold
        # plain strings; normalise so both can be joined the same way.
        left_dir = os.fspath(dcmp.left)
        right_dir = os.fspath(dcmp.right)

        # Entries that exist only on the right-hand side live in ref2.
        for item in dcmp.right_only:
            item_path = os.path.join(right_dir, item)
            if os.path.isfile(item_path):
                print(f"New file detected inside ref2: {item}")
                print(f"Path: {item_path}")
                print()
        # Entries that exist only on the left-hand side live in ref1.
        for item in dcmp.left_only:
            item_path = os.path.join(left_dir, item)
            if os.path.isfile(item_path):
                print(f"New file detected inside ref1: {item}")
                print(f"Path: {item_path}")
                print()

        # Location of the compared directory relative to each reference
        # root ('.' for the root itself).
        left = os.path.relpath(left_dir, os.fspath(self.ref1_path))
        right = os.path.relpath(right_dir, os.fspath(self.ref2_path))
        for name in dcmp.diff_files:
            print(f"Modified file found {name}")
            if left == right:
                print(f"Path: {left}")
            if name.endswith(".h5"):
                self.test_table_dict[name] = {
                    "path": left
                }
                self.summarise_changes_hdf(name, left_dir, right_dir)
            print()

        for sub_dcmp in dcmp.subdirs.values():
            self.print_diff_files(sub_dcmp)

    def summarise_changes_hdf(self, name, path1, path2):
        """Compare two HDF stores key by key and record the result.

        Parameters
        ----------
        name : str
            File name of the HDF store; it must exist in both directories.
        path1, path2 : str
            Directories holding the ref1 and ref2 version of the file.

        Prints key counts, displays a mean/max summary of ``ref2 - ref1`` for
        every shared key whose data differs, and stores the counts and the
        summaries in ``self.test_table_dict[name]``.
        """
        identical_items = []
        identical_name_different_data = []
        identical_name_different_data_dfs = {}

        # Open read-only and close deterministically so the comparison can
        # neither modify the reference files nor leak file handles.
        with pd.HDFStore(os.path.join(path1, name), mode="r") as ref1, \
                pd.HDFStore(os.path.join(path2, name), mode="r") as ref2:
            k1 = set(ref1.keys())
            k2 = set(ref2.keys())
            print(f"Total number of keys- in ref1: {len(k1)}, in ref2: {len(k2)}")
            different_keys = len(k1 ^ k2)
            print(f"Number of keys with different names in ref1 and ref2: {different_keys}")

            for item in k1 & k2:
                try:
                    # Read each object once; it is used for both the equality
                    # check and the difference.
                    data1 = ref1[item]
                    data2 = ref2[item]
                    if data2.equals(data1):
                        identical_items.append(item)
                        continue

                    diff = data2 - data1
                    print(f"Displaying heatmap for key {item} in file {name}")
                    if isinstance(diff, pd.Series):
                        diff = pd.DataFrame([diff.mean(), diff.max()], index=['mean', 'max'])
                        display(diff)
                    else:
                        with pd.option_context('display.max_rows', 100, 'display.max_columns', 10):
                            if isinstance(diff.index, pd.MultiIndex):
                                diff = diff.reset_index(drop=True)
                            diff = pd.DataFrame([diff.mean(), diff.max()], index=['mean', 'max'])
                            display(diff.style.background_gradient(cmap='Reds'))
                    identical_name_different_data.append(item)
                    identical_name_different_data_dfs[item] = diff
                    print()
                except Exception as e:
                    # Best effort: a key that cannot be compared is reported
                    # and skipped so the remaining keys are still processed.
                    print("Facing error comparing item: ", item)
                    print(e)

        print(f"Number of keys with same name but different data in ref1 and ref2: {len(identical_name_different_data)}")
        print(f"Number of totally same keys: {len(identical_items)}")
        print()
        # setdefault keeps a direct call (without a prior print_diff_files
        # entry) from failing with KeyError.
        self.test_table_dict.setdefault(name, {}).update({
            "different_keys": different_keys,
            "identical_keys": len(identical_items),
            "identical_keys_diff_data": len(identical_name_different_data),
            "identical_name_different_data_dfs": identical_name_different_data_dfs
        })
        


In [None]:
# Constructing the comparer places both references in a temporary directory
# and immediately prints the comparison report as a side effect.
rc = ReferenceComparer(REF1_HASH, REF2_HASH)
# Teardown is disabled here, so the temporary directory is kept after the run;
# uncomment the next line to delete it.
# rc.teardown()

In [None]:
# One record per compared HDF file, holding the three key counts gathered by
# the comparer.
data = [
    dict(
        file=fname,
        different_keys=stats["different_keys"],
        identical_keys=stats["identical_keys"],
        identical_keys_diff_data=stats["identical_keys_diff_data"],
    )
    for fname, stats in rc.test_table_dict.items()
]

# One stacked-bar series per metric: (record key, legend label, bar colour).
bar_traces = [
    dict(y=[row[metric] for row in data], name=label, marker_color=colour)
    for metric, label, colour in (
        ("different_keys", "Different Keys", "steelblue"),
        ("identical_keys", "Identical Keys", "#2e7514"),
        ("identical_keys_diff_data", "Identical Keys (Diff Data)", "firebrick"),
    )
]

fig = go.Figure()
x_values = [row["file"] for row in data]

for trace in bar_traces:
    fig.add_trace(go.Bar(x=x_values, **trace))

# Stack the three series per file and label the axes.
fig.update_layout(
    barmode="stack",
    title="File Comparison Metrics",
    xaxis_title="File",
    yaxis_title="Value",
    xaxis_tickangle=-45,
)

# Tick labels are switched off on the x axis.
fig.update_xaxes(showticklabels=False)
fig.show()
