Skip to content

Commit

Permalink
datatypes: add Bloom filter (#18327)
Browse files Browse the repository at this point in the history
  • Loading branch information
kbkpbot committed Jun 2, 2023
1 parent 9764342 commit 0fc33c6
Show file tree
Hide file tree
Showing 3 changed files with 212 additions and 1 deletion.
3 changes: 2 additions & 1 deletion vlib/datatypes/README.md
Expand Up @@ -6,7 +6,7 @@ data types.
V's `builtin` module is imported implicitly, and has implementations for arrays,
maps and strings. These are good for many applications, but there are a plethora
of other useful data structures/containers, like linked lists, priority queues,
tries, etc, that allow for algorithms with different time complexities, which may
trees, etc, that allow for algorithms with different time complexities, which may
be more suitable for your specific application.

It is implemented using generics, that you have to specialise for the type of
Expand All @@ -28,4 +28,5 @@ println(stack)
- [x] Min heap (priority queue)
- [x] Set
- [x] Quadtree
- [x] Bloom filter
- [ ] ...
125 changes: 125 additions & 0 deletions vlib/datatypes/bloom_filter.v
@@ -0,0 +1,125 @@
module datatypes

// A Bloom filter is used to test whether a given element is a member of a set. Lookups may occasionally produce false positives, but never false negatives.

// BloomFilter stores the bit table and hashing parameters of a Bloom filter
// over elements of type T. Only `table` is declared mutable; the other
// fields are fixed when the filter is constructed.
[heap]
struct BloomFilter[T] {
hash_func fn (T) u32 // hash function: maps an element of type T to a u32 base hash
table_size int // number of one-bit entries, packed into `table`
num_functions int // number of salted hashes used per element, 1~16
mut:
table []u8 // bit table; the constructors allocate (table_size + 7) / 8 bytes
}

const (
// Salt values (random values). Each salt is XORed with the output of the hash function,
// so a single base hash yields several distinct sub-hashes.
// The number of entries (16) is the upper bound accepted for `num_functions`.
salts = [
// vfmt off
u32(0xefd8c55b),0xa1c57493,0x174c3763,0xc26e60d4,
0x9ec387fe,0xdcdc9e97,0xfc495ddc,0x6a1fa748,
0x8d82a03b,0x38dc692a,0x97d0f42d,0x048a2be3,
0x9b5d83aa,0x2380d32f,0x2437552f,0xcc622295,
// vfmt on
]
)

// free passes the filter's bit table to the builtin `free` inside an `unsafe` block.
// NOTE(review): presumably the filter must not be used after this call - confirm.
fn (b &BloomFilter[T]) free() {
unsafe {
free(b.table)
}
}

// new_bloom_filter_fast creates a new bloom filter with preset parameters:
// `table_size` is 16384 and `num_functions` is 4.
pub fn new_bloom_filter_fast[T](hash_func fn (T) u32) &BloomFilter[T] {
	bits := 16384
	filter := &BloomFilter[T]{
		hash_func: hash_func
		table_size: bits
		num_functions: 4
		// one bit per entry, rounded up to whole bytes
		table: []u8{len: (bits + 7) / 8}
	}
	return filter
}

// new_bloom_filter creates a new bloom filter.
// `table_size` is the number of one-bit entries in the table and must be greater than 0.
// `num_functions` is the number of salted hashes applied per element and must be in 1~16
// (bounded by the number of available salts).
// An error is returned when either argument is out of range.
pub fn new_bloom_filter[T](hash_func fn (T) u32, table_size int, num_functions int) !&BloomFilter[T] {
	if table_size <= 0 {
		return error('table_size should be greater than 0')
	}
	if num_functions < 1 || num_functions > datatypes.salts.len {
		return error('num_functions should be between 1~${datatypes.salts.len}')
	}

	return &BloomFilter[T]{
		hash_func: hash_func
		table_size: table_size
		num_functions: num_functions
		// one bit per entry, rounded up to whole bytes
		table: []u8{len: (table_size + 7) / 8}
	}
}

// add inserts `element` into the bloom filter.
// The element is hashed once; each of the first `num_functions` salts is XORed
// with that hash to pick one bit of the table, which is then set.
pub fn (mut b BloomFilter[T]) add(element T) {
	base := b.hash_func(element)
	bits := u32(b.table_size)

	for n in 0 .. b.num_functions {
		pos := int((base ^ datatypes.salts[n]) % bits)
		// `pos / 8` selects the byte, `pos % 8` the bit inside it
		mask := u8((1 << (pos % 8)))
		b.table[pos / 8] |= mask
	}
}

// exists reports whether `element` may have been added to the filter.
// It returns false as soon as one of the element's salted bits is unset,
// and true only when all of them are set (which can be a false positive).
pub fn (b &BloomFilter[T]) exists(element T) bool {
	base := b.hash_func(element)
	bits := u32(b.table_size)
	for n in 0 .. b.num_functions {
		pos := int((base ^ datatypes.salts[n]) % bits)
		cell := b.table[pos / 8]
		mask := 1 << (pos % 8)
		if cell & mask == 0 {
			// a single cleared bit proves the element was never added
			return false
		}
	}

	return true
}

// @union returns a new bloom filter whose table is the bitwise OR of both tables.
// Both filters must have the same `table_size`, `num_functions` and `hash_func`;
// otherwise an error is returned.
pub fn (l &BloomFilter[T]) @union(r &BloomFilter[T]) !&BloomFilter[T] {
	if l.table_size != r.table_size || l.num_functions != r.num_functions
		|| l.hash_func != r.hash_func {
		return error('Both filters must be created with the same values.')
	}

	mut merged := &BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: (l.table_size + 7) / 8}
	}
	for idx, left_byte in l.table {
		merged.table[idx] = left_byte | r.table[idx]
	}

	return merged
}

// intersection returns a new bloom filter whose table is the bitwise AND of both tables.
// Both filters must have the same `table_size`, `num_functions` and `hash_func`;
// otherwise an error is returned.
pub fn (l &BloomFilter[T]) intersection(r &BloomFilter[T]) !&BloomFilter[T] {
	if l.table_size != r.table_size || l.num_functions != r.num_functions
		|| l.hash_func != r.hash_func {
		return error('Both filters must be created with the same values.')
	}

	mut common := &BloomFilter[T]{
		hash_func: l.hash_func
		table_size: l.table_size
		num_functions: l.num_functions
		table: []u8{len: (l.table_size + 7) / 8}
	}
	for idx, left_byte in l.table {
		common.table[idx] = left_byte & r.table[idx]
	}

	return common
}
85 changes: 85 additions & 0 deletions vlib/datatypes/bloom_filter_test.v
@@ -0,0 +1,85 @@
module datatypes

import hash

// hash_func is the test hash: a seeded 64-bit hash of `s`, cast down to u32.
fn hash_func(s string) u32 {
	return u32(hash.sum64_string(s, 0x12345678))
}

// test_bloom_filter_fast checks that every added string is reported as present
// by the preset filter, and that a string never added is reported as absent.
fn test_bloom_filter_fast() {
	mut filter := new_bloom_filter_fast[string](hash_func)
	members := ['hello world', 'v is awsome', 'power by v']
	for word in members {
		filter.add(word)
	}
	for word in members {
		assert filter.exists(word)
	}
	assert !filter.exists('my world')
}

// test_bloom_filter_fast_normal runs the same membership checks against a filter
// built with explicit parameters (65536 bits, 16 hash functions).
fn test_bloom_filter_fast_normal() {
	mut filter := new_bloom_filter[string](hash_func, 65536, 16) or { panic(err) }
	members := ['hello world', 'v is awsome', 'power by v']
	for word in members {
		filter.add(word)
	}
	for word in members {
		assert filter.exists(word)
	}
	assert !filter.exists('my world')
}

// test_bloom_filter_false_positive demonstrates a false positive on a tiny table.
fn test_bloom_filter_false_positive() {
	// the table has only 16 bits and every `add` sets up to 8 of them, so it saturates very quickly
	mut filter := new_bloom_filter[string](hash_func, 16, 8) or { panic(err) }
	members := ['hello world', 'v is awsome', 'power by v']
	for word in members {
		filter.add(word)
	}
	for word in members {
		assert filter.exists(word)
	}
	// never added, but all of its bits are already set: a false positive
	assert filter.exists('my world')
}

// test_bloom_filter_fast_union_intersection builds two preset filters that share
// exactly one element ('power by v') and verifies membership in each filter,
// in their union and in their intersection.
fn test_bloom_filter_fast_union_intersection() {
	mut a := new_bloom_filter_fast[string](hash_func)
	mut b := new_bloom_filter_fast[string](hash_func)

	a_words := ['power by v', 'silly c', 'super rust']
	b_words := ['hello world', 'v is awsome', 'power by v']
	for word in a_words {
		a.add(word)
	}
	for word in b_words {
		b.add(word)
	}

	for word in a_words {
		assert a.exists(word)
	}
	assert !a.exists('power c++')

	for word in b_words {
		assert b.exists(word)
	}
	assert !b.exists('my world')

	// a || b: everything added to either filter is present, outsiders are not
	both := a.@union(b) or { panic(err) }
	for word in a_words {
		assert both.exists(word)
	}
	for word in b_words {
		assert both.exists(word)
	}
	for word in ['power c++', 'my world'] {
		assert !both.exists(word)
	}

	// a && b: only the element added to both filters is present
	shared := a.intersection(b) or { panic(err) }
	assert shared.exists('power by v')
	for word in ['silly c', 'super rust', 'power c++', 'hello world', 'v is awsome', 'my world'] {
		assert !shared.exists(word)
	}
}

0 comments on commit 0fc33c6

Please sign in to comment.