forked from algorand/go-algorand
/
bloom.go
158 lines (135 loc) · 3.8 KB
/
bloom.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
// Copyright 2016 David Lazar. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package bloom implements Bloom filters.
package bloom
import (
"encoding/binary"
"encoding/json"
"errors"
"math"
"github.com/dchest/siphash"
)
const maxHashes = uint32(32)
// Filter represents the state of the Bloom filter
type Filter struct {
numHashes uint32
data []byte
prefix [4]byte
}
// New creates a new Bloom filter
func New(sizeBits int, numHashes uint32, prefix uint32) *Filter {
m := (sizeBits + 7) / 8
filter := Filter{
numHashes: numHashes,
data: make([]byte, m),
}
binary.BigEndian.PutUint32(filter.prefix[:], prefix)
return &filter
}
// Optimal computes optimal Bloom filter parameters.
// These parameters are optimal for small bloom filters as
// described in section 4.1 of this paper:
//
// https://web.stanford.edu/~ashishg/papers/inverted.pdf
func Optimal(numElements int, falsePositiveRate float64) (sizeBits int, numHashes uint32) {
n := float64(numElements)
p := falsePositiveRate
m := -(n+0.5)*math.Log(p)/math.Pow(math.Log(2), 2) + 1
k := -math.Log(p) / math.Log(2)
numHashes = uint32(math.Ceil(k))
if numHashes > maxHashes {
numHashes = maxHashes
}
return int(math.Ceil(m)), numHashes
}
// Set marks x as present in the filter
func (f *Filter) Set(x []byte) {
withPrefix := append(f.prefix[:], x...)
hs := hash(withPrefix, f.numHashes)
n := uint32(len(f.data) * 8)
for _, h := range hs {
f.set(h % n)
}
}
// Test checks whether x is present in the filter
func (f *Filter) Test(x []byte) bool {
withPrefix := append(f.prefix[:], x...)
hs := hash(withPrefix, f.numHashes)
n := uint32(len(f.data) * 8)
for _, h := range hs {
if !f.test(h % n) {
return false
}
}
return true
}
// Len returns the size of the filter in bytes
func (f *Filter) Len() int {
return len(f.data)
}
// NumHashes returns the number of hash functions used in the filter
func (f *Filter) NumHashes() uint32 {
return f.numHashes
}
// MarshalBinary defines how this filter should be encoded to binary
func (f *Filter) MarshalBinary() ([]byte, error) {
data := make([]byte, len(f.data)+8)
n := uint32(f.numHashes)
binary.BigEndian.PutUint32(data[0:4], n)
copy(data[4:8], f.prefix[:])
copy(data[8:], f.data)
return data, nil
}
// UnmarshalBinary restores the state of the filter from raw data
func UnmarshalBinary(data []byte) (*Filter, error) {
f := &Filter{}
if len(data) <= 8 {
return nil, errors.New("short data")
}
f.numHashes = binary.BigEndian.Uint32(data[0:4])
if f.numHashes > maxHashes {
return nil, errors.New("too many hashes")
}
copy(f.prefix[:], data[4:8])
f.data = data[8:]
return f, nil
}
// MarshalJSON defines how this filter should be encoded to JSON
func (f *Filter) MarshalJSON() ([]byte, error) {
data, err := f.MarshalBinary()
if err != nil {
return nil, err
}
return json.Marshal(data)
}
// UnmarshalJSON defines how this filter should be decoded from JSON
func UnmarshalJSON(data []byte) (*Filter, error) {
var bs []byte
if err := json.Unmarshal(data, &bs); err != nil {
return nil, err
}
return UnmarshalBinary(bs)
}
// Previously, we used the hashing method described in this paper:
// http://www.eecs.harvard.edu/~michaelm/postscripts/rsa2008.pdf
// but this gave us bad false positive rates for small bloom filters.
func hash(x []byte, nhash uint32) []uint32 {
res := make([]uint32, nhash+3)
for i := uint32(0); i < (nhash+3)/4; i++ {
h1, h2 := siphash.Hash128(uint64(i), 666666, x)
res[i*4] = uint32(h1)
res[i*4+1] = uint32(h1 >> 32)
res[i*4+2] = uint32(h2)
res[i*4+3] = uint32(h2 >> 32)
}
return res[:nhash]
}
func (f *Filter) test(bit uint32) bool {
i := bit / 8
return f.data[i]&(1<<(bit%8)) != 0
}
func (f *Filter) set(bit uint32) {
i := bit / 8
f.data[i] |= 1 << (bit % 8)
}