From 30c3dd2b2aca8e694aa932315fbbcb4dd2b46366 Mon Sep 17 00:00:00 2001
From: worldveil
Date: Tue, 31 Dec 2013 19:42:25 -0600
Subject: [PATCH] first commit

---
 .gitignore          |   3 +
 README.md           |  73 ++++++++++++++++++++++
 bloompy/__init__.py |  69 +++++++++++++++++++++
 tests.py            | 148 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 293 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 bloompy/__init__.py
 create mode 100644 tests.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ca2b0af
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+.DS_Store
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..288111c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,73 @@
+bloompy
+====
+
+BloomPy is a minimalist bloom filter implemented in Python using the Murmur hash algorithm.
+
+It currently works only with strings and numeric types. I built this on a car ride.
+
+## Dependencies
+* [mmh3](https://github.com/hajimes/mmh3) (C++ Murmur hash Python wrapper)
+
+## Usage
+
+To create a bloom filter, choose an approximate capacity and an acceptable error rate.
+
+```python
+>>> from bloompy import BloomFilter
+>>> bf = BloomFilter(capacity=10000, error_rate=0.0001)
+```
+
+Based on these preferences, `bloompy` will choose the optimal number of hash functions (`k`) and bit vector size (`m`):
+
+```python
+>>> print bf
+<BloomFilter n=10000, k=13, m=191701, p=0.000100>
+```
+
+Congrats! You now have an empty bloom filter. You can treat it like a Python `set`, except that you cannot retrieve the keys.
+
+```python
+>>> print "apple" in bf
+False
+>>> bf.add("apple")
+>>> bf.add(9)
+>>> bf.add("orange")
+>>> print "apple" in bf and 9 in bf and "orange" in bf
+True
+```
+
+## Implementation
+
+For a given approximate number of elements (`n`) and a desired false positive probability (`p`), there exists an [optimal setting](http://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions) of:
+
+* length of bit vector (`m`)
+* number of hash functions (`k`)
+
+that minimizes the space required while keeping the false positive rate at or below `p` for up to `n` elements. These optimal settings are given, for `m`, by:
+
+    m = -n * ln(p) / ln(2)^2
+
+and for `k`:
+
+    k = ln(2) * m / n
+
+Instead of a true family of hash functions, I simply use `k` random salts (created upon instantiation) as prefixes to the input of the Murmur hash algorithm, which has excellent speed and key distribution. Each added item is hashed `k` times, once per salt. Each 128-bit hash output is reduced modulo the size of the bit vector (`m`), and the corresponding bit is set to 1.
+
+Testing for membership hashes the item the same way into a bit vector of size `m` with at most `k` set bits. That vector is `AND`ed with the bloom filter's vector, and the result is compared for equality with the original vector; if they match, every bit for the item was set and the item is (probably) in the set.
+
+## Performance
+
+Speed: not so great; error adherence: right on!
+
+```
+[*] Now testing with 100000 unique strings and desired error rate of 0.001
+[*] Performance results:
+pybloom: 1.711986 seconds with error rate = 0.001050
+pybloomfilter: 0.303201 seconds with error rate = 0.000360
+bloompy: 62.798033 seconds with error rate = 0.000990
+```
+
+Since my implementation is only about 50 lines and uses the built-in Python `int` as the bitstring, the slow speed is not too surprising.
+
+As you can see, the math works and the error rate is maintained quite well. `pybloomfilter` is really quite masterfully done, being quite fast and keeping the error rate lower than desired.
\ No newline at end of file
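As a quick sanity check of the sizing math in the README above (a standalone sketch, not part of the patch), the formulas for `m` and `k` can be evaluated directly and compared against the standard bloom filter false positive approximation `(1 - e^(-k*n/m))^k`:

```python
import math

n = 100000  # expected number of elements
p = 0.001   # desired false positive rate

# optimal settings from the README's formulas
m = int(-n * math.log(p) / math.log(2) ** 2)
k = int(math.log(2) * m / n)

# theoretical false positive rate at capacity: (1 - e^(-k*n/m))^k
theoretical_p = (1 - math.exp(-float(k) * n / m)) ** k

print "m = %d bits, k = %d hashes" % (m, k)
print "theoretical error rate = %f" % theoretical_p
```

For `n = 100000` and `p = 0.001`, the predicted rate lands close to the 0.000990 that `bloompy` measured above.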
diff --git a/bloompy/__init__.py b/bloompy/__init__.py
new file mode 100644
index 0000000..aa1c69b
--- /dev/null
+++ b/bloompy/__init__.py
@@ -0,0 +1,69 @@
+import mmh3  # Murmur hash algorithm
+import random
+import math
+
+
+class BloomFilter(object):
+    """
+    A set of strings or numeric types implemented with a bloom filter.
+
+    Given an approximate number of elements to hold and a desired
+    false positive probability, creates a set using a bloom filter.
+
+    The hash family is built by applying different salts to the
+    Murmur hash (mmh3), using the optimal settings for the number
+    of hash functions (k) and the size of the bit vector (m).
+    """
+
+    SALT_SIZE = 5
+
+    def __init__(self, capacity, error_rate):
+        assert 0 < error_rate < 1
+        assert capacity > 1
+        self.p = error_rate
+        self.n = int(capacity)
+        self.m = int(-self.n * math.log(self.p) / math.log(2) ** 2)
+        self.k = int(math.log(2) * self.m / self.n)
+        self.vector = 0
+
+        # create k distinct random salts
+        self.salts = set()
+        while len(self.salts) < self.k:
+            salt = ""
+            for j in range(BloomFilter.SALT_SIZE):
+                salt += chr(random.randint(0, 255))
+            self.salts.add(salt)
+
+    def _hash(self, item):
+        """
+        Hashes item k times and returns the resulting bit vector.
+        """
+        if not isinstance(item, (basestring, int, long, float, complex)):
+            raise TypeError("Item is of unsupported type.")
+        bloom = 0
+        for salt in self.salts:
+            h = mmh3.hash128(salt + str(item)) % self.m
+            bloom |= (1L << h)
+        return bloom
+
+    def add(self, item):
+        """
+        Adds item to the set.
+        """
+        self.vector |= self._hash(item)
+
+    def __contains__(self, item):
+        """
+        Tests for membership in the set.
+        """
+        h = self._hash(item)
+        return (h & self.vector) == h
+
+    def clear(self):
+        """
+        Empties the set.
+        """
+        self.vector = 0
+
+    def __repr__(self):
+        return "<BloomFilter n=%d, k=%d, m=%d, p=%f>" % (
+            self.n, self.k, self.m, self.p)
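To make the salted hashing and the `AND`-based membership test concrete, here is a toy walkthrough of the same mechanics outside the class (an illustrative sketch, not part of the patch: the tiny `m`, the fixed salts, and the helper `to_bits` are invented for the example):

```python
import mmh3

m = 1000                    # toy bit vector size
salts = ["aaaaa", "bbbbb"]  # stand-ins for the k random 5-byte salts

def to_bits(item):
    # mirrors BloomFilter._hash: one bit per salted Murmur hash
    bloom = 0
    for salt in salts:
        bloom |= 1 << (mmh3.hash128(salt + str(item)) % m)
    return bloom

vector = 0
vector |= to_bits("apple")  # what add() does

h = to_bits("apple")
print (h & vector) == h     # True: every bit for "apple" is set

h = to_bits("pear")
print (h & vector) == h     # almost surely False at this load
```

Because a query only checks that its own bits are present, unrelated items can collide on all `k` bits, which is exactly the false positive rate the sizing formulas control.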
diff --git a/tests.py b/tests.py
new file mode 100644
index 0000000..8f1fcfb
--- /dev/null
+++ b/tests.py
@@ -0,0 +1,148 @@
+from bloompy import BloomFilter
+import random
+import time
+
+
+def test_error_rate():
+    n = 10000
+    p = 0.001
+    b = BloomFilter(n, p)
+    print "Creating BloomFilter for %d elements and false positive probability = %f ..." % (n, p)
+    print "Optimal values are m = %d, k = %d" % (b.m, b.k)
+    elt = 'apple'
+
+    print "Testing..."
+    assert elt not in b
+
+    print "After adding '%s'..." % elt
+    b.add(elt)
+
+    print "Testing..."
+    assert elt in b
+
+    # create random strings
+    strings = set()
+    string_size = 20
+    for i in range(n):
+        string = ""
+        for j in range(string_size):
+            string += chr(random.randint(0, 255))
+        strings.add(string)
+
+    # other strings
+    other_strings = set()
+    for i in range(n):
+        string = ""
+        for j in range(string_size):
+            string += chr(random.randint(0, 255))
+        other_strings.add(string)
+
+    # add all to the set
+    for s in strings:
+        b.add(s)
+
+    # test for collisions, ignoring any overlap between the two pools
+    other_strings = other_strings - strings
+    collisions = 0
+    for s in other_strings:
+        if s in b:
+            collisions += 1
+
+    print "False positive rate was %d / %d = %f" % (
+        collisions, len(other_strings),
+        float(collisions) / float(len(other_strings)))
+
+
+def test_speed():
+    n = 10000
+    p = 0.0001
+    b = BloomFilter(n, p)
+    print b
+
+    strings = set()
+    string_size = 20
+    for i in range(n):
+        string = ""
+        for j in range(string_size):
+            string += chr(random.randint(0, 255))
+        strings.add(string)
+
+    starttime = time.time()
+    for string in strings:
+        b.add(string)
+    total_time = time.time() - starttime
+
+    ns = float(len(strings))
+    k = float(b.k)
+
+    print "Number of hash functions: %d" % b.k
+    print "Speed per hash: %f seconds" % (total_time / ns / k)
+    print "Speed per add: %f seconds" % (total_time / ns)
+
+
+def test_performance():
+    n = 100000
+    p = 0.001
+
+    # create the set of strings to add
+    strings = set()
+    string_size = 50  # make this number higher if the
+                      # performance test is taking too long
+    while len(strings) < n:
+        string = ""
+        for j in range(string_size):
+            string += chr(random.randint(0, 255))
+        strings.add(string)
+
+    # create another, disjoint set for testing false positives
+    otherstrings = set()
+    while len(otherstrings) < n:
+        string = ""
+        for j in range(string_size):
+            string += chr(random.randint(0, 255))
+        if string not in strings:
+            otherstrings.add(string)
+
+    print "[*] Strings created."
+
+    ### 1) pybloom
+    import pybloom
+    bf1 = pybloom.BloomFilter(capacity=n, error_rate=p)
+
+    ### 2) pybloomfilter
+    import pybloomfilter
+    bf2 = pybloomfilter.BloomFilter(n, p)
+
+    ### 3) bloompy
+    import bloompy
+    bf3 = bloompy.BloomFilter(capacity=n, error_rate=p)
+
+    bfs = [("pybloom", bf1), ("pybloomfilter", bf2), ("bloompy", bf3)]
+    print "[*] Bloom filters to compare performance:\n %s\n\n" % bfs
+
+    # add all strings to each filter
+    for _, bf in bfs:
+        for string in strings:
+            bf.add(string)
+
+    # then time membership tests and count collisions (false positives)
+    print "[*] Now testing with %d unique strings and desired error rate of %f" % (n, p)
+    print "[*] Performance results: "
+    for name, bf in bfs:
+        collisions = 0
+        starttime = time.time()
+        for string in otherstrings:
+            if string in bf:
+                collisions += 1
+        elapsed = time.time() - starttime
+        error_rate = float(collisions) / float(n)
+        print "%s: %f seconds with error rate = %f" % (name, elapsed, error_rate)
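tests.py defines its three test functions but no `__main__` entry point, so executing the file directly runs nothing. A minimal driver (hypothetical, not part of the patch) might look like:

```python
# run_tests.py -- hypothetical driver script for the tests above
from tests import test_error_rate, test_speed, test_performance

test_error_rate()
test_speed()
test_performance()  # requires pybloom and pybloomfilter to be installed
```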