In [1]:
import time
from uuid import uuid4
from random import random

class BloomFilter:
    """Naive single-hash Bloom filter; deleting elements is not supported.

    Each added item sets one flag in an internal array, at the position given by
    Python's built-in ``hash`` modulo the array length. Lookups check that same
    flag, so an answer of "present" may be a false positive (another item hashed
    to the same slot), while an answer of "absent" is always correct.

    Note: ``hash`` of a ``str`` is salted per interpreter process unless
    ``PYTHONHASHSEED`` is fixed, so slot positions are not stable across runs.
    """

    def __init__(self, size: int, bits_per_item: int = 1000) -> None:
        """Allocate the flag array.

        Args:
            size: expected number of items to store; must be positive.
            bits_per_item: number of slots reserved per expected item. The
                default of 1000 can vary, see https://hur.st/bloomfilter

        Raises:
            ValueError: if ``size`` or ``bits_per_item`` is not positive. An
                empty array would otherwise make every ``add``/``exists`` call
                fail with ``ZeroDivisionError`` in the modulo step.
        """
        if size <= 0:
            raise ValueError(f"size must be a positive integer, got {size!r}")
        if bits_per_item <= 0:
            raise ValueError(f"bits_per_item must be a positive integer, got {bits_per_item!r}")
        # bytearray(n) is n zero bytes: same indexing semantics as a list of
        # zeros, but one byte per slot instead of one object pointer per slot.
        self._data = bytearray(size * bits_per_item)

    def _get_item_position(self, data: str) -> int:
        """Applies hash-function to calculate item's place in the internal array."""
        return hash(data) % len(self._data)

    def add(self, data: str) -> None:
        """Mark the slot that ``data`` hashes to as occupied."""
        self._data[self._get_item_position(data)] = 1

    def exists(self, data: str) -> bool:
        """
        True -> item is possibly(!) in the array; this is due to hash-collisions with other items.
        False -> item is DEFINITELY not in the array.
        """
        return self._data[self._get_item_position(data)] == 1

In [2]:
# This is a toy showcase rather than a rigorous test: it compares Bloom-filter
# lookups against a linear scan over the same probes.
number_of_elements = 10000
samples: list[str] = [str(uuid4()) for _ in range(number_of_elements)]
bloom = BloomFilter(number_of_elements)
test_set: list[str] = []
for sample in samples:
    bloom.add(sample)
    if random() < 0.1:
        # For roughly 10% of the members, probe one known member and one
        # freshly generated UUID that was never added to the filter.
        test_set.append(sample)
        test_set.append(str(uuid4()))

# One membership answer per probe, in test_set order.
bloom_start = time.perf_counter()
bloom_answers: list[bool] = [bloom.exists(sample) for sample in test_set]
bloom_end = time.perf_counter()

linear_start = time.perf_counter()
linear_answers: list[bool] = [sample in samples for sample in test_set]
linear_end = time.perf_counter()

# A Bloom filter may report false positives but never false negatives, so exact
# equality with the linear scan is not guaranteed. The invariant that must hold
# is: every probe that is really in `samples` is also reported by the filter.
assert all(
    in_bloom
    for in_bloom, in_list in zip(bloom_answers, linear_answers)
    if in_list
), "Bloom filter returned a false negative"

print(f"bloom time: {bloom_end - bloom_start}")
print(f"linear time: {linear_end - linear_start}")

bloom time: 0.001406948002113495
linear time: 0.12742764200083911
