In [17]:
import sys
!{sys.executable} -m pip install mmh3

Collecting mmh3
  Downloading mmh3-4.1.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Downloading mmh3-4.1.0-cp311-cp311-win_amd64.whl (31 kB)
Installing collected packages: mmh3
Successfully installed mmh3-4.1.0


In [18]:
import math
import struct
import numpy as np 
from hashlib import sha1
import mmh3
import pandas as pd

In [55]:
hashmap_min_size = 4
hashmap_max_size = 16

def get_alpfa(hash_map_size):
    assert hash_map_size >= hashmap_min_size
    assert hash_map_size <= hashmap_max_size

    if hash_map_size == 4:
        return 0.673
    if hash_map_size == 5:
        return 0.697
    if hash_map_size == 6:
        return 0.709

    return 0.7213/((1 + 1.079)/(1 << hash_map_size))

# aka rho in the algorythm
def get_leading_zeros(number, max_bits):
    return max_bits - number.bit_length() + 1 
    


In [78]:
class HLL(object):
    """
    HLL Cardinality counter
    """
    _hash_range_bit = 64
    def __init__(self, p = 8, debug = False):
        self.debug = debug
        
        self.p = p
        self.m = 1 << p;
        self.alpha = get_alpfa(self.p)
        self.M = np.zeros((self.m,), dtype=np.int8)
        self.max_rank = self._hash_range_bit - self.p


    def log(self, text):
        if self.debug:
            print(text)
    
    def getHash(self, value):
        """
        returns hased value
        """
        corected_value = value
        if isinstance(value, str):
            corected_value = value.encode('utf-8')
        elif isinstance(value, float):
            corected_value = str(value).encode('utf-8')
        elif not isinstance(value, bytes):
            corected_value = bytes(value)
        return mmh3.hash64(corected_value)[0]
    
    def add(self, value):
        """
        Adds new value to HLL register
        """
        #hashed value
        x = self.getHash(value)
        self.log(f'hash of {value} is {x}')
        
        # registry index using first p bits of the hash
        j = x & (self.m - 1)
        self.log(f'index = {j}')
        # get the rest bits
        w = x >> self.p
        self.log(f'w = {w}')
        leading_zeroes = get_leading_zeros(w, self.max_rank)
        self.log(f'leading zeores = {leading_zeroes}')
        self.M[j] = max(self.M[j], leading_zeroes)

    def count(self):
        """
        returns estimated cardinality, no bias correction for now
        """
        znamenatel = np.sum(2.0 ** (-self.M))
        self.log(f'znamenatel = {znamenatel}')
        chislitel = float((self.m**2))
        self.log(f'chislitel = {chislitel}')
        
        E = self.alpha * float((self.m**2)) / np.sum(2.0 ** (-self.M))

        self.log(f'E={E}')
        
        if E <= 5*self.m/2:
            registers_wtih_no_data = self.m - np.count_nonzero(self.M)
            if registers_wtih_no_data > 0:
                self.log('linear counting')
                return int(self.m * np.log(self.m / float(registers_with_zeros)))
            else:
                self.log('we cannot do correction')
                return int(E)
        elif E <= (2**32)/30:
            self.log('number is within range')
            return int(E)
        else:
            self.log('We may need some more calculation')
            return int(E)

    def getRegistry(self):
        return self.M

In [79]:
steam_games = pd.read_csv('data/steam-games.csv')

hll = HLL(debug=False, p = 8)

for item in steam_games['developer']:
    hll.add(item)

hll.debug = True
print(hll.count())

real_set = set(steam_games['developer'])
print(len(real_set))

# hll = HLL(debug=True)
# hll.add("Ivan")
# hll.add("Ivan")
# hll.add("Ivan")
# hll.add("Ivan 1")

# hll.count()

znamenatel = 0.9483871459960938
chislitel = 65536.0
E=6137558.995438805
number is within range
6137558
25127


In [31]:
6.7 <= 5*256/2

True

In [30]:
a = 6593149
a.bit_length()

23

In [54]:
2.0**(-np.array([1, 5]))

array([0.5    , 0.03125])

In [88]:
2 ** (-2)

0.25

In [7]:
q = 101385018082514226
q.bit_length()

57

In [25]:
1 << 8

256

In [58]:
2 ** 8

256