Commit 30c3dd2: first commit
worldveil committed Jan 1, 2014
Showing 4 changed files with 293 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
*.pyc
.DS_Store

73 changes: 73 additions & 0 deletions README.md
@@ -0,0 +1,73 @@
bloompy
====

BloomPy is a minimalist bloom filter implemented in Python using the Murmur hash algorithm.

It currently works only with strings and numeric types. I built this on a car ride.

## Dependencies
* [mmh3](https://github.com/hajimes/mmh3) (C++ Murmur hash Python wrapper)

## Usage

To create a bloom filter, choose an approximate capacity and an acceptable error rate.

```python
>>> from bloompy import BloomFilter
>>> import random, math
>>> bf = BloomFilter(capacity=10000, error_rate=0.0001)
```

Based on these preferences, `bloompy` will choose the optimal number of hash functions (`k`) and bit vector size (`m`):

```python
>>> print bf
<BloomFilter n=10000, k=13, m=191701, p=0.0001>
```

Congrats! You now have an empty bloom filter. You can treat it like a Python `set`, with the exception that you cannot retrieve the keys.

```python
>>> print "apple" in bf
False
>>> bf.add("apple")
>>> bf.add(9)
>>> bf.add("orange")
>>> print "apple" in bf and 9 in bf and "orange" in bf
True
```
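
The filter can also be emptied with `clear()` (defined in `bloompy/__init__.py` below):

```python
>>> bf.clear()
>>> print "apple" in bf
False
```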

## Implementation

For a given approximate number of elements (`n`) and a desired false positive probability (`p`), there exists an [optimal setting](http://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions) of:

* length of bit vector (`m`)
* number of hash functions (`k`)

that minimizes the space required while keeping the false positive rate at or below `p` for up to `n` elements. These optimal settings follow from the equation for `m`:

    m = -n * ln(p) / ln(2)^2

and for `k`:

    k = ln(2) * m / n
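
For example, plugging in the `n = 10000`, `p = 0.0001` filter from the Usage section reproduces the values shown earlier:

```python
>>> import math
>>> n, p = 10000, 0.0001
>>> m = int(-n * math.log(p) / math.log(2)**2)
>>> k = int(math.log(2) * m / n)
>>> print m, k
191701 13
```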

Instead of using a family of hash functions, I simply use `k` random salts (created upon instantiation) for the Murmur hash algorithm, which has phenomenal speed and key distribution. Each item to be added is hashed `k` times, once with each salt. Each 128-bit hash output is reduced modulo the size of the bit vector (`m`), and the corresponding bit in the vector is set to 1.
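
A minimal sketch of one such salted hash (the salt bytes here are made up for illustration):

```python
>>> import mmh3
>>> m = 191701                      # bit vector size from the example above
>>> salt = "\x8f\x02\x1d\xa4\x33"   # one of the k random 5-byte salts
>>> bit = mmh3.hash128(salt + str("apple")) % m
>>> bloom = 1 << bit                # a plain Python int serves as the bit vector
```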

Testing for membership hashes the item the same way, producing a bit vector of size `m` with at most `k` set bits. This vector is ANDed with the bloom filter's vector, and the result is compared for equality with the item's hash vector: if they are equal, the item is possibly in the set; otherwise it is definitely not.
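
To illustrate the test with toy 5-bit values (not real filter output):

```python
>>> filter_bits = 0b10110                     # bits accumulated in the filter
>>> item_bits = 0b00110                       # bits from hashing a candidate item
>>> (item_bits & filter_bits) == item_bits    # True: possibly in the set
True
>>> other_bits = 0b01001                      # bits from hashing an absent item
>>> (other_bits & filter_bits) == other_bits  # False: definitely not in the set
False
```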

## Performance

Speed: not so great, error adherence: right on!

```
[*] Now testing with 100000 unique strings and desired error rate of 0.001
[*] Performance results:
pybloom: 1.711986 seconds with error rate = 0.001050
pybloomfilter: 0.303201 seconds with error rate = 0.000360
bloompy: 62.798033 seconds with error rate = 0.000990
```

As the implementation is only about 50 lines and uses the built-in Python `int` as the bit vector, the slow speed is not too surprising.

As you can see, the math works and the error rate is maintained quite well. `pybloomfilter` is masterfully done: it is fast and keeps the error rate below the target.
69 changes: 69 additions & 0 deletions bloompy/__init__.py
@@ -0,0 +1,69 @@
import mmh3 # murmur hash algorithm
import random
import math

class BloomFilter(object):
    """
    A set of strings or integers implemented with a bloom filter.
    Given an approximate number of elements to hold and a desired
    false positive probability, creates a set using a bloom filter.
    The hash family is built from different salts to the Murmur
    hash (mmh3) and uses the optimal settings for the number of
    hash functions (k) and the size of the bit vector (m).
    """

    SALT_SIZE = 5  # salt length in bytes

    def __init__(self, capacity, error_rate):
        assert 0 < error_rate < 1
        assert capacity > 1
        self.p = error_rate
        self.n = int(capacity)
        # optimal bit vector size: m = -n * ln(p) / ln(2)^2
        self.m = int(-self.n * math.log(self.p) / math.log(2)**2)
        # optimal number of hash functions: k = ln(2) * m / n
        self.k = int(math.log(2) * self.m / self.n)
        self.vector = 0  # a plain Python int serves as the bit vector

        # create k distinct random salts
        self.salts = set()
        while len(self.salts) < self.k:
            salt = "".join(chr(random.randint(0, 255))
                           for _ in range(BloomFilter.SALT_SIZE))
            self.salts.add(salt)

    def _hash(self, item):
        """
        Hashes item once per salt and returns the resulting bit vector.
        """
        if not isinstance(item, (basestring, int, long, float, complex)):
            raise TypeError("Item is of unsupported type.")
        bloom = 0
        for salt in self.salts:
            # reduce the 128-bit hash modulo m and set that bit
            h = mmh3.hash128(salt + str(item)) % self.m
            bloom |= (1L << h)
        return bloom

    def add(self, item):
        """
        Adds item to the set.
        """
        self.vector |= self._hash(item)

    def __contains__(self, item):
        """
        Tests for membership in the set.
        """
        h = self._hash(item)
        return (h & self.vector) == h

    def clear(self):
        """
        Empties the set.
        """
        self.vector = 0

    def __repr__(self):
        return "<BloomFilter n=%d, k=%d, m=%d, p=%f>" % (
            self.n, self.k, self.m, self.p)
148 changes: 148 additions & 0 deletions tests.py
@@ -0,0 +1,148 @@
from bloompy import BloomFilter
import random
import math
import time

def test_error_rate():
    n = 10000
    p = 0.001
    b = BloomFilter(n, p)
    print "Creating BloomFilter for %d elements and false positive probability = %f ..." % (n, p)
    print "Optimal values are m = %d, k = %d" % (b.m, b.k)
    elt = 'apple'

    print "Testing..."
    assert elt not in b

    print "After adding '%s'..." % elt
    b.add(elt)

    print "Testing..."
    assert elt in b

    # create random strings to insert
    strings = set()
    string_size = 20
    for i in range(n):
        string = ""
        for j in range(string_size):
            string += chr(random.randint(0, 255))
        strings.add(string)

    # create other random strings to probe for false positives
    other_strings = set()
    for i in range(n):
        string = ""
        for j in range(string_size):
            string += chr(random.randint(0, 255))
        other_strings.add(string)

    # add all to the set
    for s in strings:
        b.add(s)

    # test for collisions among strings that were never added
    other_strings = other_strings - strings
    collisions = 0
    for s in other_strings:
        if s in b:
            collisions += 1

    print "False positive rate was %d / %d = %f" % (
        collisions, len(other_strings),
        float(collisions) / float(len(other_strings)))

def test_speed():
    n = 10000
    p = 0.0001
    b = BloomFilter(n, p)
    print b

    strings = set()
    string_size = 20
    for i in range(n):
        string = ""
        for j in range(string_size):
            string += chr(random.randint(0, 255))
        strings.add(string)

    # time the insertion of every string
    starttime = time.time()
    for string in strings:
        b.add(string)
    total_time = time.time() - starttime

    ns = float(len(strings))
    k = float(b.k)

    print "Number of hash functions: %d" % b.k
    print "Speed per hash: %f seconds" % (total_time / ns / k)
    print "Speed per add: %f seconds" % (total_time / ns)

def test_performance():
    n = 100000
    p = 0.001

    # create set of strings to use
    strings = set()
    string_size = 50  # make this number higher if the
                      # performance test is taking too long
    while len(strings) < n:
        string = ""
        for j in range(string_size):
            string += chr(random.randint(0, 255))
        strings.add(string)

    # create another set, disjoint from the first
    otherstrings = set()
    while len(otherstrings) < n:
        string = ""
        for j in range(string_size):
            string += chr(random.randint(0, 255))
        if string not in strings:
            otherstrings.add(string)

    print "[*] Strings created."

    ### 1) pybloom
    import pybloom
    bf1 = pybloom.BloomFilter(capacity=n, error_rate=p)

    ### 2) pybloomfilter
    import pybloomfilter
    bf2 = pybloomfilter.BloomFilter(n, p)

    ### 3) bloompy
    import bloompy
    bf3 = bloompy.BloomFilter(capacity=n, error_rate=p)

    bfs = [("pybloom", bf1), ("pybloomfilter", bf2), ("bloompy", bf3)]
    print "[*] Bloom filters to compare performance:\n %s\n\n" % bfs

    # add all strings to each filter (once)
    for _, bf in bfs:
        for string in strings:
            bf.add(string)

    # then test for collisions
    print "[*] Now testing with %d unique strings and desired error rate of %f" % (n, p)
    print "[*] Performance results: "
    for name, bf in bfs:
        collisions = 0
        starttime = time.time()
        for string in otherstrings:
            if string in bf:
                collisions += 1
        elapsed = time.time() - starttime
        error_rate = float(collisions) / float(n)
        print "%s: %f seconds with error rate = %f" % (name, elapsed, error_rate)
