In [2]:
import math
import struct
from hashlib import sha1

In [22]:
# Inclusive bounds on the hash_map_size values that get_alpfa accepts.
hashmap_min_size = 4
hashmap_max_size = 16

def get_alpfa(hash_map_size):
    """
    Return the alpha constant for a table of 2**hash_map_size registers.

    Sizes 4, 5 and 6 map to fixed constants; every larger size uses the
    closed form 0.7213 / (1 + 1.079 / m) with m = 2**hash_map_size.

    Raises AssertionError when hash_map_size lies outside
    [hashmap_min_size, hashmap_max_size].
    """
    assert hashmap_min_size <= hash_map_size <= hashmap_max_size

    # Constants for the three smallest supported tables.
    fixed_alpha = {4: 0.673, 5: 0.697, 6: 0.709}
    tabulated = fixed_alpha.get(hash_map_size)
    if tabulated is not None:
        return tabulated

    register_count = 2**hash_map_size
    return 0.7213/(1 + 1.079/register_count)

# aka rho in the algorithm
def get_leading_zeros(number, max_bits):
    """
    Return rho for `number` treated as a word of `max_bits` bits.

    The result is the number of unused high-order bits plus one, so a word
    whose top bit is set gives 1 and zero gives max_bits + 1.
    """
    used_bits = number.bit_length()
    unused_high_bits = max_bits - used_bits
    return unused_high_bits + 1
    


In [52]:
class HLL(object):
    """
    HyperLogLog cardinality counter.

    Every value is hashed to an unsigned 64-bit integer. The low ``p`` bits
    pick one of ``m = 2 ** p`` registers; the remaining ``64 - p`` bits are
    passed to get_leading_zeros (rho) and each register keeps the largest rho
    it has seen.
    """

    # Width in bits of the integer returned by getHash ('!Q' is 64-bit).
    HASH_BITS = 64

    def __init__(self, hashFunc = sha1, debug = False, p = 6):
        """
        hashFunc -- hashlib-style constructor: called with bytes, it must
                    return an object whose digest() has at least 8 bytes.
        debug    -- when True, log() prints diagnostic lines.
        p        -- number of hash bits used as the register index; the
                    default 6 gives 64 registers. get_alpfa asserts that it
                    is within the supported range.
        """
        self.debug = debug

        self.p = p
        self.m = 2 ** self.p
        self.alpha = get_alpfa(self.p)
        self.M = [0] * self.m
        self.hashFunc = hashFunc


    def log(self, text):
        """
        Prints text when the counter was created with debug=True.
        """
        if self.debug:
            print(text)

    def getHash(self, value):
        """
        Returns the hashed value: the first 8 digest bytes read as a
        big-endian unsigned 64-bit integer.

        str is hashed as UTF-8, bytes-like objects as their raw bytes and any
        other object through its repr(), so a non-text value hashes like the
        string spelling its repr.
        """
        if isinstance(value, str):
            payload = value.encode('utf-8')
        elif isinstance(value, (bytes, bytearray, memoryview)):
            payload = bytes(value)
        else:
            # bytes(n) on an int allocates n zero bytes and rejects negative
            # numbers, and floats cannot be converted at all, so hash the
            # textual representation instead.
            payload = repr(value).encode('utf-8')
        return struct.unpack('!Q', self.hashFunc(payload).digest()[:8])[0]

    def add(self, value):
        """
        Adds new value to HLL register
        """
        x = self.getHash(value)
        j = x & (self.m - 1)   # register index: low p bits
        w = x >> self.p        # remaining HASH_BITS - p bits
        # The word width is the hash width minus the index bits; m - p only
        # equals that when p == 6.
        self.M[j] = max(self.M[j], get_leading_zeros(w, self.HASH_BITS - self.p))
        self.log(f'j={j} w={w} M[j]={self.M[j]}')

    def count(self):
        """
        Returns the estimated cardinality.

        Computes the raw estimate alpha * m**2 / sum(2**-M[j]). While that
        estimate is at most 2.5 * m and at least one register is still zero,
        the linear-counting value m * ln(m / zeros) is returned instead
        (small-range correction). No large-range correction is applied.
        """
        registers_with_zeros = self.M.count(0)
        self.log(f'registers_with_zeros={registers_with_zeros}')

        raw_estimate = self.alpha * self.m ** 2 / sum(2.0 ** -r for r in self.M)
        # Guarding on registers_with_zeros avoids dividing by zero once every
        # register has been touched.
        if raw_estimate <= 2.5 * self.m and registers_with_zeros:
            return self.m * math.log(self.m / float(registers_with_zeros))
        return raw_estimate

In [53]:
# Smoke test: with debug=True every add() logs the register index j, the
# remaining hash bits w and the register value M[j] after the update.
hll = HLL(debug=True)
# The same value added three times logs the same j and w each time, so the
# repeats leave the register unchanged.
hll.add("Ivan")
hll.add("Ivan")
hll.add("Ivan")
hll.add("Ivan 1")

# Last expression of the cell: the estimated number of distinct values.
hll.count()

j=44 w=101385018082514226 M[j]=2
j=44 w=101385018082514226 M[j]=2
j=44 w=101385018082514226 M[j]=2
j=52 w=20859980513863653 M[j]=4
registers_with_zeros=62


2.0319166921331373

In [24]:
# Register-index check: masking with m - 1 = 63 keeps the low 6 bits of a hash.
# NOTE(review): the literal appears to be getHash("Ivan") -- it equals
# (w << 6) | j for the j=44 line logged by the demo cell; confirm.
6488641157280910508 & 63

44

In [20]:
# Scratch check of int.bit_length(), the primitive get_leading_zeros relies on.
# NOTE(review): the origin of this literal is not shown in the notebook.
a = 4439016261869266368
a.bit_length()

62