Skip to content

Commit

Permalink
Speed opt for PyPy
Browse files Browse the repository at this point in the history
  • Loading branch information
svpcom committed Sep 21, 2013
1 parent aba725b commit 9a85082
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 26 deletions.
26 changes: 15 additions & 11 deletions hyperloglog/hll.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,23 @@

import math
from hashlib import sha1
from functools32 import lru_cache
from const import rawEstimateData, biasData, tresholdData


def get_treshold(p):
return tresholdData[p - 4]


@lru_cache(1000)
def estimate_bias(E, p):
bias_vector = biasData[p - 4]
nearest_neighbors = get_nearest_neighbors(E, rawEstimateData[p - 4])
return sum(float(bias_vector[i]) for i in nearest_neighbors) / len(nearest_neighbors)
return sum([float(bias_vector[i]) for i in nearest_neighbors]) / len(nearest_neighbors)


def get_nearest_neighbors(E, estimate_vector):
distance_map = sorted(((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector))
return list(idx for dist, idx in distance_map)[:6]
distance_map = [((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector)]
distance_map.sort()
return [idx for dist, idx in distance_map[:6]]


def get_alpha(p):
Expand Down Expand Up @@ -102,7 +101,7 @@ def update(self, *others):
if self.m != item.m:
raise ValueError('Counters precisions should be equal')

self.M = list(max(*items) for items in zip(*([ item.M for item in others ] + [ self.M ])))
self.M = [max(*items) for items in zip(*([ item.M for item in others ] + [ self.M ]))]

def __eq__(self, other):
if self.m != other.m:
Expand All @@ -116,16 +115,21 @@ def __ne__(self, other):
def __len__(self):
return round(self.card())

def _Ep(self):
E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in self.M)
return (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E

def card(self):
"""
Returns the estimate of the cardinality
"""

E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in self.M)
Ep = (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E

# count the number of registers equal to 0
V = self.M.count(0)
H = self.m * math.log(self.m / float(V)) if V > 0 else Ep
return H if H <= get_treshold(self.p) else Ep

if V > 0:
H = self.m * math.log(self.m / float(V))
return H if H <= get_treshold(self.p) else self._Ep()
else:
return self._Ep()

36 changes: 21 additions & 15 deletions hyperloglog/shll.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from hashlib import sha1
from hll import get_treshold, estimate_bias, get_alpha, get_rho


class SlidingHyperLogLog(object):
"""
Sliding HyperLogLog: Estimating cardinality in a data stream (Telecom ParisTech)
Expand Down Expand Up @@ -125,6 +126,10 @@ def __ne__(self, other):
def __len__(self):
raise NotImplemented

def _Ep(self, M):
E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in M)
return (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E

def card(self, timestamp, window=None):
"""
Returns the estimate of the cardinality at 'timestamp' using 'window'
Expand All @@ -138,15 +143,15 @@ def card(self, timestamp, window=None):
def max_r(l):
return max(l) if l else 0

M = tuple(max_r([R for ts, R in lpfm if ts >= (timestamp - window)]) if lpfm else 0 for lpfm in self.LPFM)

E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in M)
Ep = (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E
M = [max_r([R for ts, R in lpfm if ts >= (timestamp - window)]) if lpfm else 0 for lpfm in self.LPFM]

# count the number of registers equal to 0
V = M.count(0)
H = self.m * math.log(self.m / float(V)) if V > 0 else Ep
return H if H <= get_treshold(self.p) else Ep
if V > 0:
H = self.m * math.log(self.m / float(V))
return H if H <= get_treshold(self.p) else self._Ep(M)
else:
return self._Ep(M)

def card_wlist(self, timestamp, window_list):
"""
Expand All @@ -156,10 +161,12 @@ def card_wlist(self, timestamp, window_list):
if not 0 < window <= self.window:
raise ValueError('0 < window <= W')

tsl = sorted((timestamp - window, idx) for idx, window in enumerate(window_list))
M_list = list([] for _ in window_list)
tsl = [(timestamp - window, idx) for idx, window in enumerate(window_list)]
tsl.sort()

M_list = [[] for _ in window_list]

# Highly optimized code
# Highly optimized code (PyPy), but may be slow in CPython
for lpfm in self.LPFM:
R_max = 0
_p = len(tsl) - 1
Expand All @@ -181,12 +188,11 @@ def card_wlist(self, timestamp, window_list):

res = []
for M in M_list:
E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in M)
Ep = (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E

# count the number of registers equal to 0
V = M.count(0)
H = self.m * math.log(self.m / float(V)) if V > 0 else Ep
res.append(H if H <= get_treshold(self.p) else Ep)
if V > 0:
H = self.m * math.log(self.m / float(V))
res.append(H if H <= get_treshold(self.p) else self._Ep(M))
else:
res.append(self._Ep(M))
return res

0 comments on commit 9a85082

Please sign in to comment.