Permalink
Browse files

first commit

  • Loading branch information...
willf committed May 21, 2011
0 parents commit a59ab96b7151b6a81b48d2daafbdae206ca29238
Showing with 304 additions and 0 deletions.
  1. +7 −0 Makefile
  2. +53 −0 README.md
  3. +158 −0 bloom.go
  4. +86 −0 bloom_test.go
@@ -0,0 +1,7 @@
+include $(GOROOT)/src/Make.inc
+
+TARG=bloom
+GOFILES=\
+ bloom.go
+
+include $(GOROOT)/src/Make.pkg
@@ -0,0 +1,53 @@
+Bloom filters
+-------------
+
+A Bloom filter is a representation of a set of _n_ items, where the main
+requirement is to make membership queries; _i.e._, whether an item is a
+member of a set.
+
+A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
+multiple of the cardinality of the set to represent) and _k_, the number of hashing
+functions on elements of the set. (The actual hashing functions are important, too,
+but this is not a parameter for this implementation). A Bloom filter is backed by
+a BitSet; a key is represented in the filter by setting the bits at each value of the
+hashing functions (modulo _m_). Set membership is done by _testing_ whether the
+bits at each value of the hashing functions (again, modulo _m_) are set. If so,
+the item is in the set. If the item is actually in the set, a Bloom filter will
+never fail (the true positive rate is 1.0); but it is susceptible to false
+positives. The art is to choose _k_ and _m_ correctly.
+
+In this implementation, the hashing function used is FNV, a non-cryptographic
+hashing function which is part of the Go standard library (hash/fnv). For an item, the
+64-bit FNV hash is computed, and upper and lower 32 bit numbers, call them h1 and
+h2, are used. Then, the _i_th hashing function is:
+
+ h1 + h2*i
+
+Thus, the underlying hash function, FNV, is only called once per key.
+
+This implementation accepts keys for setting and testing as []byte. Thus, to
+add a string item, "Love":
+
+ n := uint(1000)
+ filter := bloom.New(20*n, 5) // load of 20, 5 hash functions
+ filter.Add([]byte("Love"))
+
+Similarly, to test if "Love" is in bloom:
+
+ if filter.Test([]byte("Love"))
+
+For numeric data, I recommend that you look into the encoding/binary package. But,
+for example, to add a uint32 to the filter:
+
+ i := uint32(100)
+ n1 := make([]byte,4)
+ binary.BigEndian.PutUint32(n1,i)
+ filter.Add(n1)
+
+Finally, there is a method to estimate the false positive rate of a particular
+bloom filter for a set of size _n_:
+
+ if filter.EstimateFalsePositiveRate(1000) > 0.001
+
+Given the particular hashing scheme, it's best to be empirical about this. Note
+that estimating the FP rate will clear the Bloom filter.
158 bloom.go
@@ -0,0 +1,158 @@
+package bloom
+
+/*
+A Bloom filter is a representation of a set of _n_ items, where the main
+requirement is to make membership queries; _i.e._, whether an item is a
+member of a set.
+
+A Bloom filter has two parameters: _m_, a maximum size (typically a reasonably large
+multiple of the cardinality of the set to represent) and _k_, the number of hashing
+functions on elements of the set. (The actual hashing functions are important, too,
+but this is not a parameter for this implementation). A Bloom filter is backed by
+a BitSet; a key is represented in the filter by setting the bits at each value of the
+hashing functions (modulo _m_). Set membership is done by _testing_ whether the
+bits at each value of the hashing functions (again, modulo _m_) are set. If so,
+the item is in the set. If the item is actually in the set, a Bloom filter will
+never fail (the true positive rate is 1.0); but it is susceptible to false
+positives. The art is to choose _k_ and _m_ correctly.
+
+In this implementation, the hashing function used is FNV, a non-cryptographic
+hashing function which is part of the Go standard library (hash/fnv). For an item, the
+64-bit FNV hash is computed, and upper and lower 32 bit numbers, call them h1 and
+h2, are used. Then, the _i_th hashing function is:
+
+ h1 + h2*i
+
+Thus, the underlying hash function, FNV, is only called once per key.
+
+This implementation accepts keys for setting and testing as []byte. Thus, to
+add a string item, "Love":
+
+ n := uint(1000)
+ filter := bloom.New(20*n, 5) // load of 20, 5 hash functions
+ filter.Add([]byte("Love"))
+
+Similarly, to test if "Love" is in bloom:
+
+ if filter.Test([]byte("Love"))
+
+For numeric data, I recommend that you look into the encoding/binary package. But,
+for example, to add a uint32 to the filter:
+
+ i := uint32(100)
+ n1 := make([]byte,4)
+ binary.BigEndian.PutUint32(n1,i)
+ filter.Add(n1)
+
+Finally, there is a method to estimate the false positive rate of a particular
+bloom filter for a set of size _n_:
+
+ if filter.EstimateFalsePositiveRate(1000) > 0.001
+
+Given the particular hashing scheme, it's best to be empirical about this. Note
+that estimating the FP rate will clear the Bloom filter.
+*/
+
+import (
+ "bitset"
+ "hash"
+ "hash/fnv"
+ "encoding/binary"
+ //"fmt"
+)
+
+// BloomFilter is a Bloom filter over []byte keys, backed by a bitset.
+type BloomFilter struct {
+	length uint           // m: number of bits; bit locations are reduced modulo this
+	k      uint           // number of bit locations derived for each key
+	b      *bitset.BitSet // the bits that Add sets and Test reads
+	hasher hash.Hash64    // reset and reused for every key that is hashed
+}
+
+// New creates a Bloom filter with m bits and k hashing functions.
+//
+// m and k are each raised to 1 when given as 0. Bit locations are reduced
+// modulo the bit count, so m == 0 would panic with a division by zero; with
+// k == 0 no bit is ever set or tested, so Test would report every key as
+// present.
+func New(m uint, k uint) *BloomFilter {
+	if m == 0 {
+		m = 1
+	}
+	if k == 0 {
+		k = 1
+	}
+	return &BloomFilter{
+		length: m,
+		k:      k,
+		b:      bitset.New(m),
+		hasher: fnv.New64(),
+	}
+}
+
+// Cap returns the size of the filter in bits (_m_), not the number of keys
+// (_n_) it is expected to hold.
+func (f *BloomFilter) Cap() uint {
+	return f.length
+}
+
+// K returns the number of hash functions (_k_) used for each key.
+func (f *BloomFilter) K() uint {
+	return f.k
+}
+
+// base_hashes computes the two 32-bit values from which every bit location
+// for data is derived. The filter's 64-bit hasher is run once over data;
+// a is the low 32 bits of the result and b is the high 32 bits.
+//
+// The hasher stored on the filter is reset and reused here, so calls on the
+// same filter must not overlap.
+func (f *BloomFilter) base_hashes(data []byte) (a uint32, b uint32) {
+	f.hasher.Reset()
+	// hash.Hash documents that Write never returns an error.
+	f.hasher.Write(data)
+	// Sum64 is the value whose big-endian bytes Sum produces, so splitting it
+	// gives the same halves without allocating a slice and decoding it again.
+	sum := f.hasher.Sum64()
+	a = uint32(sum)
+	b = uint32(sum >> 32)
+	return a, b
+}
+
+// locations derives the _k_ bit indexes for data in the underlying bitset:
+// index i is (a + b*i) modulo the filter length, where a and b are the two
+// values returned by base_hashes.
+func (f *BloomFilter) locations(data []byte) (locs []uint) {
+	h1, h2 := f.base_hashes(data)
+	base, step := uint(h1), uint(h2)
+	locs = make([]uint, f.k)
+	for i := range locs {
+		locs[i] = (base + step*uint(i)) % f.length
+	}
+	return locs
+}
+
+// Add inserts data into the filter by setting each of its bit locations.
+// The receiver is returned so that calls can be chained.
+func (f *BloomFilter) Add(data []byte) *BloomFilter {
+	locs := f.locations(data)
+	for i := 0; i < len(locs); i++ {
+		f.b.Set(locs[i])
+	}
+	return f
+}
+
+// Test reports whether data may be in the filter. It returns true only when
+// every bit location for data is set, and false as soon as one is not.
+func (f *BloomFilter) Test(data []byte) bool {
+	locs := f.locations(data)
+	for i := 0; i < len(locs); i++ {
+		if f.b.Test(locs[i]) {
+			continue
+		}
+		return false
+	}
+	return true
+}
+
+// ClearAll calls ClearAll on the underlying bitset, removing every key from
+// the filter. The receiver is returned so that calls can be chained.
+func (f *BloomFilter) ClearAll() *BloomFilter {
+	bits := f.b
+	bits.ClearAll()
+	return f
+}
+
+// Estimate, for a BloomFilter with a limit of m bytes
+// and k hash functions, what the false positive rate will be
+// whilst storing n entries; runs 10k tests
+func (f *BloomFilter) EstimateFalsePositiveRate(n uint) (fp_rate float64) {
+ f.ClearAll()
+ n1 := make([]byte,4)
+ for i := uint32(0); i<uint32(n);i++ {
+ binary.BigEndian.PutUint32(n1,i)
+ f.Add(n1)
+ }
+ fp := 0
+ // test 10k numbers
+ for i := uint32(0); i<uint32(10000);i++ {
+ binary.BigEndian.PutUint32(n1,i+uint32(n)+1)
+ if f.Test(n1) {
+ fp++
+ }
+ }
+ fp_rate = float64(fp)/float64(100)
+ f.ClearAll()
+ return
+}
+
@@ -0,0 +1,86 @@
+package bloom
+
+import (
+ "testing"
+ "encoding/binary"
+ "fmt"
+)
+
+// TestBasic adds one string key and checks that it is reported as present
+// while a key that was never added is not.
+func TestBasic(t *testing.T) {
+	filter := New(1000, 4)
+	added := []byte("Bess")
+	absent := []byte("Jane")
+	filter.Add(added)
+	if !filter.Test(added) {
+		t.Errorf("%v should be in.", added)
+	}
+	if filter.Test(absent) {
+		t.Errorf("%v should not be in.", absent)
+	}
+}
+
+// TestBasicUint32 repeats the basic membership check with big-endian encoded
+// uint32 keys: 100 is added and must be found, 101 must not be.
+func TestBasicUint32(t *testing.T) {
+	encode := func(v uint32) []byte {
+		buf := make([]byte, 4)
+		binary.BigEndian.PutUint32(buf, v)
+		return buf
+	}
+	filter := New(1000, 4)
+	added, absent, extra := encode(100), encode(101), encode(102)
+	filter.Add(added)
+	foundAdded := filter.Test(added)
+	foundAbsent := filter.Test(absent)
+	// 102 is queried as well, but its result is not checked.
+	filter.Test(extra)
+	if !foundAdded {
+		t.Errorf("%v should be in.", added)
+	}
+	if foundAbsent {
+		t.Errorf("%v should not be in.", absent)
+	}
+}
+
+
+func TestEstimate20_5(t *testing.T) {
+ n := uint(10000)
+ k := uint(5)
+ load := uint(20)
+ f := New(n*load,k)
+ fp_rate := f.EstimateFalsePositiveRate(n)
+ if fp_rate > 0.0001 {
+ t.Errorf("False positive rate too high: load=%v, k=%v, %f", load, k, fp_rate)
+ }
+}
+
+func TestEstimate15_10(t *testing.T) {
+ n := uint(10000)
+ k := uint(10)
+ load := uint(15)
+ f := New(n*load,k)
+ fp_rate := f.EstimateFalsePositiveRate(n)
+ if fp_rate > 0.0001 {
+ t.Errorf("False positive rate too high: load=%v, k=%v, %f", load, k, fp_rate)
+ }
+}
+
+func BenchmarkEstimates(t *testing.B) {
+ n := uint(10000)
+ max_k := uint(10)
+ max_load := uint(20)
+ fmt.Printf("m/n")
+ for k := uint(2); k <= max_k; k++ {
+ fmt.Printf("\tk=%v",k)
+ }
+ fmt.Println()
+ for load := uint(2); load <= max_load; load++ {
+ fmt.Print(load)
+ for k := uint(2); k <= max_k; k++ {
+ f := New(n * load, k)
+ fp_rate := f.EstimateFalsePositiveRate(n)
+ fmt.Printf("\t%f",fp_rate)
+ }
+ fmt.Println()
+ }
+}
+

0 comments on commit a59ab96

Please sign in to comment.