# svpcom/hyperloglog forked from goncalvesnelson/Log-Log-Sketch

Switch branches/tags
Nothing to show
f02e6a5 May 8, 2018
3 contributors

### Users who have contributed to this file

157 lines (107 sloc) 3.75 KB
 """ This module implements probabilistic data structure which is able to calculate the cardinality of large multisets in a single pass using little auxiliary memory """ import math from hashlib import sha1 from .const import rawEstimateData, biasData, tresholdData from .compat import * def bit_length(w): return w.bit_length() def bit_length_emu(w): return len(bin(w)) - 2 if w > 0 else 0 # Workaround for python < 2.7 if not hasattr(int, 'bit_length'): bit_length = bit_length_emu def get_treshold(p): return tresholdData[p - 4] def estimate_bias(E, p): bias_vector = biasData[p - 4] nearest_neighbors = get_nearest_neighbors(E, rawEstimateData[p - 4]) return sum([float(bias_vector[i]) for i in nearest_neighbors]) / len(nearest_neighbors) def get_nearest_neighbors(E, estimate_vector): distance_map = [((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector)] distance_map.sort() return [idx for dist, idx in distance_map[:6]] def get_alpha(p): if not (4 <= p <= 16): raise ValueError("p=%d should be in range [4 : 16]" % p) if p == 4: return 0.673 if p == 5: return 0.697 if p == 6: return 0.709 return 0.7213 / (1.0 + 1.079 / (1 << p)) def get_rho(w, max_width): rho = max_width - bit_length(w) + 1 if rho <= 0: raise ValueError('w overflow') return rho class HyperLogLog(object): """ HyperLogLog cardinality counter """ __slots__ = ('alpha', 'p', 'm', 'M') def __init__(self, error_rate): """ Implementes a HyperLogLog error_rate = abs_err / cardinality """ if not (0 < error_rate < 1): raise ValueError("Error_Rate must be between 0 and 1.") # error_rate = 1.04 / sqrt(m) # m = 2 ** p # M(1)... M(m) = 0 p = int(math.ceil(math.log((1.04 / error_rate) ** 2, 2))) self.alpha = get_alpha(p) self.p = p self.m = 1 << p self.M = [ 0 for i in range(self.m) ] def __getstate__(self): return dict([x, getattr(self, x)] for x in self.__slots__) def __setstate__(self, d): for key in d: setattr(self, key, d[key]) def add(self, value): """ Adds the item to the HyperLogLog """ # h: D -> {0,1} ** 64 # x = h(v) # j = # w = # M[j] = max(M[j], rho(w)) x = long(sha1(bytes(value.encode('utf8') if isinstance(value, unicode) else value)).hexdigest()[:16], 16) j = x & (self.m - 1) w = x >> self.p self.M[j] = max(self.M[j], get_rho(w, 64 - self.p)) def update(self, *others): """ Merge other counters """ for item in others: if self.m != item.m: raise ValueError('Counters precisions should be equal') self.M = [max(*items) for items in zip(*([ item.M for item in others ] + [ self.M ]))] def __eq__(self, other): if self.m != other.m: raise ValueError('Counters precisions should be equal') return self.M == other.M def __ne__(self, other): return not self.__eq__(other) def __len__(self): return round(self.card()) def _Ep(self): E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in self.M) return (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E def card(self): """ Returns the estimate of the cardinality """ #count number or registers equal to 0 V = self.M.count(0) if V > 0: H = self.m * math.log(self.m / float(V)) return H if H <= get_treshold(self.p) else self._Ep() else: return self._Ep()