Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
datatypes: add Bloom filter (#18327)
- Loading branch information
Showing
3 changed files
with
212 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
module datatypes | ||
|
||
// Bloom filter is used to test whether a given element is part of a set. Lookups will occasionally generate false positives, but never false negatives. | ||
|
||
[heap]
struct BloomFilter[T] {
	// hash_func turns an element of type T into a 32-bit base hash.
	hash_func fn (T) u32
	// table_size is the number of one-bit entries; they are packed into `table`.
	table_size int
	// num_functions is the number of salted hashes used per element (1~16).
	num_functions int
mut:
	// table stores the filter's bits, eight entries per byte.
	table []u8
}
|
||
// salts holds 16 fixed random values. Each one is XORed with the output of the
// hash function, so a single base hash yields several distinct sub-hashes.
const salts = [
	// vfmt off
	u32(0xefd8c55b), 0xa1c57493, 0x174c3763, 0xc26e60d4,
	0x9ec387fe, 0xdcdc9e97, 0xfc495ddc, 0x6a1fa748,
	0x8d82a03b, 0x38dc692a, 0x97d0f42d, 0x048a2be3,
	0x9b5d83aa, 0x2380d32f, 0x2437552f, 0xcc622295,
	// vfmt on
]
|
||
// free releases the memory that backs the filter's bit table.
// NOTE(review): the table is left empty afterwards, so the filter should not
// be used again once this has been called.
fn (b &BloomFilter[T]) free() {
	unsafe {
		// `table` is a V array, not a raw pointer, so it is released through
		// the array's own `free()` method rather than the pointer-taking
		// builtin `free`.
		b.table.free()
	}
}
|
||
// new_bloom_filter_fast creates a new bloom filter with preset parameters:
// `table_size` is 16384 and `num_functions` is 4.
pub fn new_bloom_filter_fast[T](hash_func fn (T) u32) &BloomFilter[T] {
	bits := 16384
	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: bits
		num_functions: 4
		// one bit per entry, rounded up to whole bytes
		table: []u8{len: (bits + 7) / 8}
	}
}
|
||
// new_bloom_filter creates a new bloom filter that uses `hash_func` as its base hash.
// `table_size` is the number of one-bit entries and must be greater than 0.
// `num_functions` is the number of salted hashes per element and must be
// between 1 and the number of available salts (16).
// An error is returned when either argument is out of range.
pub fn new_bloom_filter[T](hash_func fn (T) u32, table_size int, num_functions int) !&BloomFilter[T] {
	if table_size <= 0 {
		return error('table_size should be greater than 0')
	}
	if num_functions < 1 || num_functions > datatypes.salts.len {
		return error('num_functions should be between 1~${datatypes.salts.len}')
	}

	// Round the bit count up to whole bytes. table_size >= 1 here, so this is
	// equal to (table_size + 7) / 8 but cannot overflow when table_size is
	// close to the largest int value.
	table_bytes := (table_size - 1) / 8 + 1
	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: table_size
		num_functions: num_functions
		table: []u8{len: table_bytes}
	}
}
|
||
// add inserts `element` into the bloom filter by setting one bit of the table
// for each of the `num_functions` salted hashes.
pub fn (mut b BloomFilter[T]) add(element T) {
	base := b.hash_func(element)
	for n in 0 .. b.num_functions {
		// XOR with the n-th salt to derive an independent sub-hash,
		// then map it onto a bit position inside the table.
		pos := int((base ^ datatypes.salts[n]) % u32(b.table_size))
		b.table[pos / 8] |= u8(1 << (pos % 8))
	}
}
|
||
// exists checks whether `element` may be in the bloom filter.
// It returns false as soon as one of the element's bits is not set;
// a result of true can be a false positive.
pub fn (b &BloomFilter[T]) exists(element T) bool {
	base := b.hash_func(element)
	for n in 0 .. b.num_functions {
		pos := int((base ^ datatypes.salts[n]) % u32(b.table_size))
		mask := 1 << (pos % 8)
		if b.table[pos / 8] & mask == 0 {
			return false
		}
	}

	return true
}
|
||
// @union returns a new bloom filter whose table is the bitwise OR of the
// tables of `l` and `r`. It returns an error unless both filters share the
// same table size, number of functions and hash function.
pub fn (l &BloomFilter[T]) @union(r &BloomFilter[T]) !&BloomFilter[T] {
	mismatch := l.table_size != r.table_size || l.num_functions != r.num_functions
		|| l.hash_func != r.hash_func
	if mismatch {
		return error('Both filters must be created with the same values.')
	}

	mut merged := BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: (l.table_size + 7) / 8}
	}
	for idx, left_byte in l.table {
		merged.table[idx] = left_byte | r.table[idx]
	}

	return &merged
}
|
||
// intersection returns a new bloom filter whose table is the bitwise AND of
// the tables of `l` and `r`. It returns an error unless both filters share
// the same table size, number of functions and hash function.
pub fn (l &BloomFilter[T]) intersection(r &BloomFilter[T]) !&BloomFilter[T] {
	mismatch := l.table_size != r.table_size || l.num_functions != r.num_functions
		|| l.hash_func != r.hash_func
	if mismatch {
		return error('Both filters must be created with the same values.')
	}

	mut common := BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: (l.table_size + 7) / 8}
	}
	for idx, left_byte in l.table {
		common.table[idx] = left_byte & r.table[idx]
	}

	return &common
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
module datatypes | ||
|
||
import hash | ||
|
||
// hash_func is the base hash used by the tests: a seeded 64-bit string hash
// converted to u32.
fn hash_func(s string) u32 {
	return u32(hash.sum64_string(s, 0x12345678))
}
|
||
// test_bloom_filter_fast checks add/exists on a filter built with the preset parameters.
fn test_bloom_filter_fast() {
	mut filter := new_bloom_filter_fast[string](hash_func)
	filter.add('hello world')
	filter.add('v is awsome')
	filter.add('power by v')

	// everything that was added must be reported as present
	assert filter.exists('hello world')
	assert filter.exists('v is awsome')
	assert filter.exists('power by v')
	// an element that was never added is expected to be absent
	assert !filter.exists('my world')
}
|
||
// test_bloom_filter_fast_normal checks add/exists on a filter created with
// explicit parameters: 65536 bits and 16 hash functions.
fn test_bloom_filter_fast_normal() {
	mut filter := new_bloom_filter[string](hash_func, 65536, 16) or { panic(err) }
	filter.add('hello world')
	filter.add('v is awsome')
	filter.add('power by v')

	assert filter.exists('hello world')
	assert filter.exists('v is awsome')
	assert filter.exists('power by v')
	assert !filter.exists('my world')
}
|
||
// test_bloom_filter_false_positive shows that a tiny, saturated table reports
// elements that were never added.
fn test_bloom_filter_false_positive() {
	// The table has only 16 bits and every `add` sets up to 8 of them,
	// so the table fills up very quickly.
	mut filter := new_bloom_filter[string](hash_func, 16, 8) or { panic(err) }
	filter.add('hello world')
	filter.add('v is awsome')
	filter.add('power by v')

	assert filter.exists('hello world')
	assert filter.exists('v is awsome')
	assert filter.exists('power by v')
	// never added, but still reported as present: a false positive
	assert filter.exists('my world')
}
|
||
// test_bloom_filter_fast_union_intersection builds two filters that share one
// element and checks membership in each filter, in their union and in their
// intersection.
fn test_bloom_filter_fast_union_intersection() {
	mut first := new_bloom_filter_fast[string](hash_func)
	mut second := new_bloom_filter_fast[string](hash_func)

	first.add('power by v')
	first.add('silly c')
	first.add('super rust')

	second.add('hello world')
	second.add('v is awsome')
	second.add('power by v')

	// each filter contains exactly what was added to it
	assert first.exists('power by v')
	assert first.exists('silly c')
	assert first.exists('super rust')
	assert !first.exists('power c++')

	assert second.exists('hello world')
	assert second.exists('v is awsome')
	assert second.exists('power by v')
	assert !second.exists('my world')

	// union: elements of either filter are present
	mut either := first.@union(second) or { panic(err) }
	assert either.exists('silly c')
	assert either.exists('super rust')
	assert !either.exists('power c++')
	assert either.exists('hello world')
	assert either.exists('v is awsome')
	assert either.exists('power by v')
	assert !either.exists('my world')

	// intersection: only the element added to both filters is present
	mut both := first.intersection(second) or { panic(err) }
	assert !both.exists('silly c')
	assert !both.exists('super rust')
	assert !both.exists('power c++')
	assert !both.exists('hello world')
	assert !both.exists('v is awsome')
	assert both.exists('power by v')
	assert !both.exists('my world')
}