In [39]:
"""
It is an overly simplistic example of a HYPERLOGLOG implementation. All the elements here are not production-ready
and serve exclusively for the learning purpose and to explain the basic concept behind the data structure and algorithm.
"""

import hashlib
from math import log, log2

class HyperLogLog:
    def __init__(self, size: int = 1024) -> None:
        self._registers = [0] * size
    
    # preparing data for the prediction part

    def _hash_function(self, value: str) -> int:
        hash_value: str = hashlib.sha256(value.encode("utf8")).hexdigest()
        return int(hash_value, 16)

    def _leftmost_1_bit_position(self, hash_value: int, start_position: int) -> int:
        bin_hash: str = bin(hash_value)[2:]  # the first 2 symbols are '0b'
        return bin_hash.find("1", start_position)

    def process_element(self, element: str) -> None:
        hash_value = self._hash_function(element)
        p = int(log2(len(self._registers)))
        #
        remaining_hash = hash_value >> p  # shift the first 'p' bits
        position = self._leftmost_1_bit_position(remaining_hash, p)
        register_index = hash_value & (len(self._registers) - 1)  # first 'p' (?) bits
        self._registers[register_index] = max(self._registers[register_index], position)
        #

    # making a prediction part
    def _harmonic_mean(self) -> float:
        sum_of_inverses = sum(2 ** -reg for reg in self._registers)
        return len(self._registers) / sum_of_inverses

    def _bias_correction(self, raw_estimate: float) -> float:
        m: int = len(self._registers)
        if raw_estimate <= 2.5 * m:  # small values correction
            v = self._registers.count(0)
            if v > 0:
                return m * log(m / v)
        elif raw_estimate > (2 ** 32) / 30:  # large values correction
            return -(2 ** 32) * log(1 - raw_estimate / (2 ** 32))
        return raw_estimate

    def estimate_cardinality(self) -> float:
        alpha_m: float = 0.7213 / (1 + 1.079 / len(self._registers))
        raw_estimate = alpha_m * len(self._registers) ** 2 * self._harmonic_mean()
        return self._bias_correction(raw_estimate)

In [55]:
h = HyperLogLog()
elements = ['1', '2', '2', '3', '4', '5', '6', '7', '7', '7', '7', '8', '9', '10', '10', '11', '12', '13']
for el in elements:
    h.process_element(el)
h.estimate_cardinality()

765251.9209149492