In [90]:
"""
Example of a Merkle tree. Proving data presence (Merkle proofs) is not implemented here.
"""

import hashlib
from typing import Optional

class Node:
    """
    Merkle tree node.

    A node stores a string payload plus optional left/right children. Its
    ``hash`` is the MD5 hex digest of the payload, and two nodes compare equal
    when their hashes match. For an internal node the payload is expected to be
    the concatenation of its children's hashes (that is how ``MerkleTree``
    builds them), so comparing hashes compares whole subtrees.
    """

    # Width of one rendered node: 32 hex digits of the digest plus "<" and ">".
    # Each tree level is indented by this many columns in ``__repr__``.
    _REPR_WIDTH = 34

    def __init__(self, data: str, left: Optional['Node'] = None, right: Optional['Node'] = None) -> None:
        """
        Create a node.

        :param data: payload whose digest becomes this node's hash.
        :param left: left child, or ``None`` for a leaf.
        :param right: right child, or ``None`` for a leaf.
        """
        self._data = data
        self.left = left
        self.right = right

    @property
    def hash(self) -> str:
        """Return the MD5 hex digest of the UTF-8 encoded payload."""
        # NOTE(review): MD5 is kept so existing digests stay stable; it is not
        # collision resistant, so prefer SHA-256 for real integrity checks.
        return hashlib.md5(self._data.encode("utf8")).hexdigest()

    def clone(self) -> 'Node':
        """Return a deep copy of this node and everything below it."""
        left = self.left.clone() if self.left is not None else None
        right = self.right.clone() if self.right is not None else None
        return Node(self._data, left, right)

    def difference(self, other: 'Node') -> list['Node']:
        """
        Return the nodes of this subtree that differ from ``other``.

        Equal hashes mean the subtrees are identical, so nothing is reported.
        Otherwise the comparison descends only while BOTH nodes have two
        children; as soon as either side cannot be descended further, this
        node itself is reported as the difference.

        :param other: root of the subtree to compare against.
        :return: differing nodes taken from this subtree, left to right.
        """
        if self == other:
            return []
        self_has_both = self.left is not None and self.right is not None
        other_has_both = other.left is not None and other.right is not None
        if not (self_has_both and other_has_both):
            # Leaf, half-populated node, or mismatched shapes: cannot descend in lockstep.
            return [self]
        return self.left.difference(other.left) + self.right.difference(other.right)

    def __eq__(self, other: object) -> bool:
        """Nodes are equal when their hashes are equal."""
        if not isinstance(other, Node):
            # Let Python try the reflected comparison / fall back to identity.
            return NotImplemented
        return self.hash == other.hash

    def __hash__(self) -> int:
        """Hash consistently with ``__eq__`` so nodes can live in sets and dict keys."""
        return hash(self.hash)

    def __repr__(self, level: int = 0) -> str:
        """
        Render the subtree sideways: left subtree, this node, right subtree.

        :param level: depth of this node; each level indents by one node width.
            Defaults to 0 so the built-in ``repr()`` works.
        :return: one line per node, each terminated by a newline.
        """
        this_node_representation = f"{' ' * level * self._REPR_WIDTH}<{self.hash}>\n"
        # A missing child simply contributes nothing to the rendering.
        left = self.left.__repr__(level + 1) if self.left is not None else ""
        right = self.right.__repr__(level + 1) if self.right is not None else ""
        return left + this_node_representation + right

    def __str__(self) -> str:
        """Return the same rendering as ``__repr__`` starting at depth 0."""
        return self.__repr__(0)


class MerkleTree:
    """
    Merkle tree stored as a classical linked binary tree.

    Another possible representation is a flat, heap-like list of nodes; this
    class uses linked ``Node`` objects instead.
    """
    def __init__(self, data: list[str]) -> None:
        """
        Build the tree bottom-up from the given data chunks.

        Notes on the number of chunks:
        1. Every level needs an even number of nodes so they can be paired.
        2. The tree does not have to be balanced: it is not used for search
           but to ensure cryptographic integrity.
        3. Instead of padding with dummy values, an odd last node is
           duplicated and combined with its own copy.
        4. A duplicated node could in principle be confused with genuine data
           that happens to be identical; this example accepts that.

        :param data: data chunks, one leaf per chunk, in order.
        :raises ValueError: if ``data`` is empty (a tree needs at least one leaf).
        """
        if not data:
            raise ValueError("MerkleTree requires at least one data chunk")

        level_nodes: list[Node] = [Node(chunk) for chunk in data]
        while len(level_nodes) > 1:
            if len(level_nodes) % 2 == 1:
                # Pad an odd level by duplicating its last node.
                level_nodes.append(level_nodes[-1].clone())
            # Pair neighbours (0,1), (2,3), ...; a parent's payload is the
            # concatenation of its children's hashes.
            level_nodes = [
                Node(left.hash + right.hash, left, right)
                for left, right in zip(level_nodes[::2], level_nodes[1::2])
            ]
        self._root = level_nodes[0]

    def difference(self, other: 'MerkleTree') -> list[str]:
        """
        Show which data chunks differ between two trees.

        :param other: tree to compare against.
        :return: hashes of the nodes of this tree that differ from ``other``.
            A last chunk that was duplicated as padding is reported once per
            copy, so its hash can appear twice.
        """
        return [node.hash for node in self._root.difference(other._root)]

    def __repr__(self) -> str:
        """Return the sideways rendering of the whole tree, starting at the root."""
        return str(self._root)


In [91]:
# Two trees over three chunks each; they differ only in the last chunk ('7' vs '6').
m1 = MerkleTree(['1', '2', '7'])
m2 = MerkleTree(['1', '2', '6'])

In [92]:
# Display the tree sideways: the root is in the leftmost column, each deeper level is indented further right.
m1

                                                                    <c4ca4238a0b923820dcc509a6f75849b>
                                  <302cbafc0dfbc97f30d576a6f394dad3>
                                                                    <c81e728d9d4c2f636f067f89cc14862c>
<ee4098ef46a821d2229bb5652a59e62b>
                                                                    <8f14e45fceea167a5a36dedd4bea2543>
                                  <e1c3878224311a0ce6f28df8a654a7b5>
                                                                    <8f14e45fceea167a5a36dedd4bea2543>

In [93]:
# Same rendering for the second tree; the upper (first two chunks) subtree matches m1, the lower one does not.
m2

                                                                    <c4ca4238a0b923820dcc509a6f75849b>
                                  <302cbafc0dfbc97f30d576a6f394dad3>
                                                                    <c81e728d9d4c2f636f067f89cc14862c>
<18ab82e86a54faf786644c72bc203ea5>
                                                                    <1679091c5a880faf6fb5e6087eb1b2dc>
                                  <bd5c907b0fcf5d2ffbc516e5dd090276>
                                                                    <1679091c5a880faf6fb5e6087eb1b2dc>

In [94]:
# The odd third leaf is duplicated during construction, so the one differing chunk is reported twice.
m1.difference(m2)

['8f14e45fceea167a5a36dedd4bea2543', '8f14e45fceea167a5a36dedd4bea2543']