Skip to content

Commit

Permalink
Merge pull request #11 from t0k4rt/feature/estimate_set_cardinality
Browse files Browse the repository at this point in the history
Feature/estimate set cardinality
  • Loading branch information
yankun1992 committed Oct 7, 2023
2 parents 913bc64 + ff3d6dc commit 5e685f3
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 0 deletions.
6 changes: 6 additions & 0 deletions fastbloom-rs/src/bloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,12 @@ impl BloomFilter {
self.bit_set.is_empty()
}

/// Returns estimated cardinality of the set
/// see [Scalable and Efficient Privacy Preserving Global Itemset Support Approximation Using Bloom Filters](https://inria.hal.science/hal-01284874/document) as reference
pub fn estimate_set_cardinality(&self) -> f64 {
(self.bit_set.count_zeros() as f64 / self.config.size as f64).ln() / (self.hashes() as f64 * (1.0 - 1.0/self.config.size as f64).ln())
}

pub(crate) fn set_bit_vec(&mut self, bit_vec: BloomBitVec) {
assert_eq!(self.config.size, bit_vec.nbits as u64);
self.bit_set = bit_vec
Expand Down
16 changes: 16 additions & 0 deletions fastbloom-rs/src/vec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ impl BloomBitVec {
}
}

/// Returns the total number of unset bits, summed over every element of the backing storage.
///
/// The total is accumulated in a `u32`, so it can overflow if the storage holds more unset
/// bits than `u32::MAX`.
pub fn count_zeros(&self) -> u32 {
    self.storage.iter().map(|word| word.count_zeros()).sum()
}

pub fn clear(&mut self) {
self.storage.fill(0);
Expand Down Expand Up @@ -222,4 +225,17 @@ fn test_count_vec() {
vec.increment(7);

assert_eq!(1, vec.get(7))
}

#[test]
fn test_count_zeros() {
    let mut vec = BloomBitVec::new(4);
    vec.set(37);
    vec.set(30);
    vec.set(38);

    // `BloomBitVec::new(4)` is expected to hold four machine words (256 bits on 64-bit targets,
    // 128 bits on 32-bit targets). Deriving the total from the word size keeps the assertion
    // active on every pointer width instead of only the two that were cfg-gated.
    // The cast cannot truncate: the value is at most a few hundred.
    let total_bits = (4 * std::mem::size_of::<usize>() * 8) as u32;

    // Three distinct bits were set above, so exactly three fewer zeros remain.
    assert_eq!(vec.count_zeros(), total_bits - 3);
}
Binary file added fastbloom_rs/fastbloom_rs.abi3.so
Binary file not shown.
3 changes: 3 additions & 0 deletions fastbloom_rs/fastbloom_rs.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ class PyBloomFilter(object):
def clear(self):
...

def estimate_set_cardinality(self) -> float:
    """Return the estimated cardinality of the set held by the filter."""
    ...

def get_hash_indices(self, element: bytes) -> Sequence[int]:
...

Expand Down
8 changes: 8 additions & 0 deletions fastbloom_rs/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,14 @@ def is_empty(self) -> bool:
:return:
"""
return self._py_bloom.is_empty()

def estimate_set_cardinality(self) -> float:
    """
    Estimate the cardinality of the set stored in this filter.

    :return: the estimate reported by the underlying native bloom filter
    """
    estimate = self._py_bloom.estimate_set_cardinality()
    return estimate

def union(self, other: "BloomFilter") -> bool:
"""
Expand Down
8 changes: 8 additions & 0 deletions py_tests/test_bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ def test_bloom_add():
assert not ('hello' in bloom)


def test_bloom_estimate_set_cardinality():
    # Insert ten million distinct integers; the estimate must land within 1% of that count.
    bloom = BloomFilter(100_000_000, 0.01)
    for value in range(10_000_000):
        bloom.add_int(value)

    estimate = bloom.estimate_set_cardinality()
    assert 9_900_000 < estimate < 10_100_000


def test_bloom_op():
bloom = BloomFilter(100_000_000, 0.001)
bloom.add_bytes(b'hello')
Expand Down
4 changes: 4 additions & 0 deletions src/pybloom.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,10 @@ impl PyBloomFilter {
Ok(self.bloomfilter.is_empty())
}

// Exposes the wrapped filter's cardinality estimate to Python; the call itself cannot fail,
// so the value is always returned inside `Ok`.
pub fn estimate_set_cardinality(&self) -> PyResult<f64> {
    let estimate = self.bloomfilter.estimate_set_cardinality();
    Ok(estimate)
}

pub fn union(&mut self, other: &PyBloomFilter) -> PyResult<bool> {
Ok(self.bloomfilter.union(&other.bloomfilter))
}
Expand Down

0 comments on commit 5e685f3

Please sign in to comment.