In [81]:
import math
import struct
import numpy as np 
from hashlib import sha1
import pandas as pd

In [22]:
hashmap_min_size = 4
hashmap_max_size = 16

def get_alpfa(hash_map_size):
    assert hash_map_size >= hashmap_min_size
    assert hash_map_size <= hashmap_max_size

    if hash_map_size == 4:
        return 0.673
    if hash_map_size == 5:
        return 0.697
    if hash_map_size == 6:
        return 0.709

    m = 2**hash_map_size
    return 0.7213/(1 + 1.079/m)

# aka rho in the algorythm
def get_leading_zeros(number, max_bits):
    return max_bits - number.bit_length() + 1 
    


In [103]:
class HLL(object):
    """
    HLL Cardinality counter
    """
    def __init__(self, hashFunc = sha1, debug = False):
        # hardcoded inputs for now 64 registers =  2 ^ p
        p = 6

        self.debug = debug
        
        self.p = p
        self.m = 2 ** self.p;
        self.alpha = get_alpfa(self.p)
        self.M = [0 for i in range(self.m)]
        self.hashFunc = hashFunc


    def log(self, text):
        if self.debug:
            print(text)
    
    def getHash(self, value):
        """
        returns hased value
        """
        corected_value = value
        if isinstance(value, str):
            corected_value = value.encode('utf-8')
        elif isinstance(value, float):
            corected_value = str(value).encode('utf-8')
        elif not isinstance(value, bytes):
            corected_value = bytes(value)
            
        return struct.unpack('!Q', self.hashFunc(corected_value).digest()[:8])[0]
    
    def add(self, value):
        """
        Adds new value to HLL register
        """
        x = self.getHash(value)
        j = x & (self.m - 1)
        w = x >> self.p
        self.M[j] = max(self.M[j], get_leading_zeros(w, self.m - self.p))

    def count(self):
        """
        returns estimated cardinality, no bias correction for now
        """
        
        E = self.alpha * (self.m**2) * math.sqrt(np.sum([ 2 ** (-x) for x in self.M ]))

        if E <= 5*self.m/2:
            registers_wtih_no_data = self.M.count(0)
            if registers_wtih_no_data > 0:
                self.log('linear counting')
                return int(self.m * math.log(self.m / float(registers_with_zeros)))
            else:
                self.log('we cannot do correction')
                return int(E)
        elif E <= (2**32)/30:
            self.log('nuber is within range')
            return int(E)
        else:
            self.log('We may need some more calculation')
            return int(E)
        
        # registers_wtih_no_data = self.M.count(0)
        # if registers_wtih_no_data != 0
        # result = self.m * math.log(self.m / float(registers_with_zeros))
        # self.log(f'registers_with_zeros={registers_with_zeros}')
        return result

    def getM(self):
        return self.M

In [104]:
steam_games = pd.read_csv('data/steam-games.csv')

hll = HLL(debug=True)

for item in steam_games['developer']:
    hll.add(item)


print(hll.count())

real_set = set(steam_games['developer'])
print(len(real_set))

# hll = HLL(debug=True)
# hll.add("Ivan")
# hll.add("Ivan")
# hll.add("Ivan")
# hll.add("Ivan 1")

# hll.count()

nuber is within range
941
25127


In [24]:
6488641157280910508 & 63

44

In [20]:
a = 4439016261869266368
a.bit_length()

62

In [82]:
np.sum([-1, 5])

4

In [88]:
2 ** (-2)

0.25