Speed opt for PyPy

svpcom · Sep 21, 2013 · 9a85082 · 9a85082
1 parent aba725b
commit 9a85082
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 26 deletions.
diff --git a/hyperloglog/hll.py b/hyperloglog/hll.py
@@ -4,24 +4,23 @@
 
 import math
 from hashlib import sha1
-from functools32 import lru_cache
 from const import rawEstimateData, biasData, tresholdData
 
 
 def get_treshold(p):
     return tresholdData[p - 4]
 
 
-@lru_cache(1000)
 def estimate_bias(E, p):
     bias_vector = biasData[p - 4]
     nearest_neighbors = get_nearest_neighbors(E, rawEstimateData[p - 4])
-    return sum(float(bias_vector[i]) for i in nearest_neighbors) / len(nearest_neighbors)
+    return sum([float(bias_vector[i]) for i in nearest_neighbors]) / len(nearest_neighbors)
 
 
 def get_nearest_neighbors(E, estimate_vector):
-    distance_map = sorted(((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector))
-    return list(idx for dist, idx in distance_map)[:6]
+    distance_map = [((E - float(val)) ** 2, idx) for idx, val in enumerate(estimate_vector)]
+    distance_map.sort()
+    return [idx for dist, idx in distance_map[:6]]
 
 
 def get_alpha(p):
@@ -102,7 +101,7 @@ def update(self, *others):
             if self.m != item.m:
                 raise ValueError('Counters precisions should be equal')
 
-        self.M = list(max(*items) for items in zip(*([ item.M for item in others ] + [ self.M ])))
+        self.M = [max(*items) for items in zip(*([ item.M for item in others ] + [ self.M ]))]
 
     def __eq__(self, other):
         if self.m != other.m:
@@ -116,16 +115,21 @@ def __ne__(self, other):
     def __len__(self):
         return round(self.card())
 
+    def _Ep(self):
+        E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in self.M)
+        return (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E
+
     def card(self):
         """
         Returns the estimate of the cardinality
         """
 
-        E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in self.M)
-        Ep = (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E
-
         # count number of registers equal to 0
         V = self.M.count(0)
-        H = self.m * math.log(self.m / float(V)) if V > 0 else Ep
-        return H if H <= get_treshold(self.p) else Ep
+
+        if V > 0:
+            H = self.m * math.log(self.m / float(V))
+            return H if H <= get_treshold(self.p) else self._Ep()
+        else:
+            return self._Ep()
 
diff --git a/hyperloglog/shll.py b/hyperloglog/shll.py
@@ -7,6 +7,7 @@
 from hashlib import sha1
 from hll import get_treshold, estimate_bias, get_alpha, get_rho
 
+
 class SlidingHyperLogLog(object):
     """
     Sliding HyperLogLog: Estimating cardinality in a data stream (Telecom ParisTech)
@@ -125,6 +126,10 @@ def __ne__(self, other):
     def __len__(self):
         raise NotImplemented
 
+    def _Ep(self, M):
+        E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in M)
+        return (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E
+
     def card(self, timestamp, window=None):
         """
         Returns the estimate of the cardinality at 'timestamp' using 'window'
@@ -138,15 +143,15 @@ def card(self, timestamp, window=None):
         def max_r(l):
             return max(l) if l else 0
 
-        M = tuple(max_r([R for ts, R in lpfm if ts >= (timestamp - window)]) if lpfm else 0 for lpfm in self.LPFM)
-
-        E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in M)
-        Ep = (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E
+        M = [max_r([R for ts, R in lpfm if ts >= (timestamp - window)]) if lpfm else 0 for lpfm in self.LPFM]
 
         # count number of registers equal to 0
         V = M.count(0)
-        H = self.m * math.log(self.m / float(V)) if V > 0 else Ep
-        return H if H <= get_treshold(self.p) else Ep
+        if V > 0:
+            H = self.m * math.log(self.m / float(V))
+            return H if H <= get_treshold(self.p) else self._Ep(M)
+        else:
+            return self._Ep(M)
 
     def card_wlist(self, timestamp, window_list):
         """
@@ -156,10 +161,12 @@ def card_wlist(self, timestamp, window_list):
             if not 0 < window <= self.window:
                 raise ValueError('0 < window <= W')
 
-        tsl = sorted((timestamp - window, idx) for idx, window in enumerate(window_list))
-        M_list = list([] for _ in window_list)
+        tsl = [(timestamp - window, idx) for idx, window in enumerate(window_list)]
+        tsl.sort()
+
+        M_list = [[] for _ in window_list]
 
-        # Highly optimized code
+        # Highly optimized code (PyPy), but may be slow in CPython
         for lpfm in self.LPFM:
             R_max = 0
             _p = len(tsl) - 1
@@ -181,12 +188,11 @@ def card_wlist(self, timestamp, window_list):
 
         res = []
         for M in M_list:
-            E = self.alpha * float(self.m ** 2) / sum(math.pow(2.0, -x) for x in M)
-            Ep = (E - estimate_bias(E, self.p)) if E <= 5 * self.m else E
-
             # count number of registers equal to 0
             V = M.count(0)
-            H = self.m * math.log(self.m / float(V)) if V > 0 else Ep
-            res.append(H if H <= get_treshold(self.p) else Ep)
+            if V > 0:
+                H = self.m * math.log(self.m / float(V))
+                res.append(H if H <= get_treshold(self.p) else self._Ep(M))
+            else:
+                res.append(self._Ep(M))
         return res
-