In [98]:
!pip install mmh3

You should consider upgrading via the 'c:\users\utkarsh priyadarshi\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.




# Bloom Filter
This notebook is part of the series **Mastering Data Structures for Databases**.

You can find the accompanying article at https://medium.com/@utkarshpriyadarshi5026/mastering-data-structures-for-databases-part-3-bloom-filters-f92f3bff7dcc

## Hash functions used in the Bloom Filter

Let's create a hash function type that will be used in the Bloom Filter.

Our hash function will take two arguments:
- a string to be hashed
- an integer that will be used as a seed for the hash function

In [99]:
from typing import Callable

# Common signature of every hash function in this notebook:
# (string to hash, integer seed) -> integer hash value.
HashFunction = Callable[[str, int], int]


### Cryptographic Hash Functions

Different types of hash functions can be used in the Bloom Filter.


In [100]:
import hashlib

def sha256_hash(item: str, seed: int = 0) -> int:
    """Hash ``item`` with SHA-256, salted by the seed.

    The seed is rendered as decimal text and appended to ``item``; the
    combined string is encoded with the default codec and digested.

    Args:
        item: The string to hash.
        seed: Integer salt appended to ``item`` before hashing.

    Returns:
        The digest interpreted as a non-negative integer.
    """
    salted_bytes = (item + str(seed)).encode()
    hex_digest = hashlib.new("sha256", salted_bytes).hexdigest()
    return int(hex_digest, 16)

def md5_hash(item: str, seed: int = 0) -> int:
    """Hash ``item`` with MD5, salted by the seed.

    Args:
        item: The string to hash.
        seed: Integer whose decimal text is appended to ``item`` before hashing.

    Returns:
        The 128-bit digest as a non-negative integer (big-endian).
    """
    payload = (item + str(seed)).encode()
    digest_bytes = hashlib.md5(payload).digest()
    # Big-endian interpretation of the raw digest equals int(hexdigest, 16).
    return int.from_bytes(digest_bytes, byteorder="big")

def sha1_hash(item: str, seed: int = 0) -> int:
    """Hash ``item`` with SHA-1, salted by the seed.

    Args:
        item: The string to hash.
        seed: Integer whose decimal text is appended to ``item`` before hashing.

    Returns:
        The 160-bit digest as a non-negative integer.
    """
    hasher = hashlib.sha1()
    hasher.update((item + str(seed)).encode())
    return int(hasher.hexdigest(), 16)

### Non-Cryptographic Hash Functions

#### Murmur Hash
MurmurHash processes the input data in blocks, mixing the bits in each block to produce a final hash value. It uses a combination of multiplication and bitwise operations to achieve a good distribution of hash values.
#### DJB2 Hash
DJB2 starts with an initial hash value (often 5381) and iterates over each character in the input string. For each character, it multiplies the current hash value by 33 and adds the ASCII value of the character. This process is repeated for all characters in the input string.
#### FNV-1a Hash
FNV-1a starts with an initial hash value (FNV offset basis) and iterates over each character in the input string. For each character, it XORs the current hash value with the ASCII value of the character and then multiplies the result by the FNV prime. This process is repeated for all characters in the input string.


In [101]:
import mmh3

def murmur_hash(item: str, seed: int = 0) -> int:
    """Hash ``item`` with MurmurHash3 via the ``mmh3`` package.

    Args:
        item: The string to hash.
        seed: Seed forwarded unchanged to ``mmh3.hash``.

    Returns:
        Whatever ``mmh3.hash`` returns for ``(item, seed)``.
        NOTE(review): per the mmh3 docs the default result is a *signed*
        32-bit integer, so it may be negative -- confirm against the
        installed version. Callers that reduce it with ``% size`` still get
        a non-negative index in Python.
    """
    return mmh3.hash(item, seed)

def djb2_hash(item: str, seed: int = 0) -> int:
    """Compute a 32-bit DJB2 hash of ``item``.

    The state starts at the DJB2 basis 5381 offset by ``seed`` (mirroring how
    ``fnv1a_hash`` offsets its basis), and for every character is multiplied
    by 33 and increased by the character's code point.

    Args:
        item: The string to hash.
        seed: Offset added to the initial value so that different seeds
            yield different hash functions.

    Returns:
        The hash reduced to an unsigned 32-bit integer.
    """
    mask = 0xFFFFFFFF
    hash_value = (5381 + seed) & mask
    for char in item:
        # (h << 5) + h == h * 33. Masking every round keeps the state at
        # 32 bits instead of letting it grow into an ever larger big int.
        hash_value = (((hash_value << 5) + hash_value) + ord(char)) & mask
    return hash_value

def fnv1a_hash(item: str, seed: int = 0) -> int:
    """Compute a 32-bit FNV-1a hash of ``item``.

    The string is encoded as UTF-8 and processed one octet at a time: the
    state is XORed with the octet and then multiplied by the 32-bit FNV
    prime. For pure-ASCII input the octets equal the character codes.

    Args:
        item: The string to hash.
        seed: Offset added to the FNV offset basis so that different seeds
            yield different hash functions.

    Returns:
        The hash as an unsigned 32-bit integer.
    """
    mask = 0xFFFFFFFF
    fnv_prime = 0x01000193  # 32 bit FNV prime
    hash_value = (0x811c9dc5 + seed) & mask  # FNV offset basis, shifted by seed
    for byte in item.encode("utf-8"):
        hash_value ^= byte  # XOR
        # Multiplication, reduced every round so the state stays at 32 bits
        # rather than growing by ~24 bits per input byte.
        hash_value = (hash_value * fnv_prime) & mask
    return hash_value

## Structure of the Bloom Filter
A Bloom filter is a space-efficient probabilistic data structure used to test whether an element is a member of a set. The structure of a Bloom filter consists of the following components:

1. Bit Array:  
    - A fixed-size bit array (or bit vector) initialized to all zeros. The size of the bit array determines the accuracy and space efficiency of the Bloom filter.
2. Hash Functions:  
    - Multiple independent hash functions that map elements to positions in the bit array. Each hash function should uniformly distribute the input elements across the bit array.

## Operations on the Bloom Filter

1. Add:
    - To add an element to the Bloom filter, each hash function is applied to the element to get multiple hash values. The corresponding positions in the bit array are then set to 1.
2. Check:
    - To check if an element is in the Bloom filter, each hash function is applied to the element to get multiple hash values. If all the corresponding positions in the bit array are set to 1, the element is likely in the set. If any position is 0, the element is definitely not in the set.

In [102]:
from typing import List

class BloomFilter:
    """
    A fixed-size Bloom filter backed by a list of 0/1 flags.

    Attributes:
        size (int): Number of slots in the bit array.
        hash_functions (List[HashFunction]): Hash functions used to map items to slots.
        bit_array (List[int]): The slots; 1 means some added item hashed there.
        item_count (int): Number of ``add`` calls performed (duplicates are counted).
    """

    def __init__(self, size: int, hash_functions: List[HashFunction]) -> None:
        """
        Initializes the Bloom Filter with a given size and list of hash functions.

        Args:
            size (int): The size of the bit array. Must be positive.
            hash_functions (List[HashFunction]): A non-empty list of hash functions.

        Raises:
            ValueError: If ``size`` is not positive or no hash function is given.
        """
        # A non-positive size would make `% self.size` / indexing fail on the
        # first add instead of here, where the mistake was made.
        if size <= 0:
            raise ValueError("Bloom Filter size must be a positive integer.")
        # With no hash functions `check` would report every item as present.
        if not hash_functions:
            raise ValueError("At least one hash function is required.")

        self.size = size
        self.hash_functions = hash_functions
        self.bit_array = [0] * size
        self.item_count = 0
        print(f"Initialized Bloom Filter with size {size} and {len(hash_functions)} hash functions.")

    def _hashes(self, item: str) -> List[int]:
        """
        Computes the slot indices for a given item using the hash functions.

        Each hash function is called with its position in the list as the seed,
        and the result is reduced modulo the filter size.

        Args:
            item (str): The item to be hashed.

        Returns:
            List[int]: One slot index per hash function, in list order.
        """
        return [
            hash_func(item, seed) % self.size
            for seed, hash_func in enumerate(self.hash_functions)
        ]

    def add(self, item: str) -> None:
        """
        Adds an item to the Bloom Filter.

        Args:
            item (str): The item to be added.
        """
        for position in self._hashes(item):
            self.bit_array[position] = 1

        self.item_count += 1

    def check(self, item: str) -> bool:
        """
        Checks if an item is present in the Bloom Filter.

        Args:
            item (str): The item to be checked.

        Returns:
            bool: True if the item is likely present, False if the item is definitely not present.
        """
        return all(self.bit_array[position] == 1 for position in self._hashes(item))

    def show(self) -> None:
        """Prints the current state of the bit array."""
        print(self.bit_array)

In [103]:
import random

def get_random_hash_functions(num_functions: int) -> List[HashFunction]:
    """Pick ``num_functions`` distinct hash functions at random.

    The choice is drawn without replacement from the six hash functions
    defined in this notebook, and each pick is printed.

    Args:
        num_functions: How many hash functions to select.

    Returns:
        The selected hash functions, in the order they were drawn.

    Raises:
        ValueError: If more functions are requested than are available.
    """
    pool: List[HashFunction] = [
        sha256_hash,
        md5_hash,
        sha1_hash,
        murmur_hash,
        djb2_hash,
        fnv1a_hash,
    ]
    if len(pool) < num_functions:
        raise ValueError("Number of hash functions exceeds the available functions.")

    chosen = random.sample(pool, num_functions)

    # Report the picks using 1-based numbering.
    for position, func in enumerate(chosen, start=1):
        print(f"Selected Hash Function {position}: {func.__name__}")

    print("\n")
    return chosen

In [104]:
# Draw 3 of the available hash functions at random and build a 10-slot filter with them.
funcs = get_random_hash_functions(3)
bloom_filter = BloomFilter(10, funcs)

Selected Hash Function 1: murmur_hash
Selected Hash Function 2: fnv1a_hash
Selected Hash Function 3: sha256_hash


Initialized Bloom Filter with size 10 and 3 hash functions.


In [105]:
# Insert one item, then query it; the result of the last expression is shown as the cell output.
bloom_filter.add("apple")
bloom_filter.check("apple")

True

## Counting Bloom Filter

A Counting Bloom Filter (CBF) is an extension of the standard Bloom Filter that allows for the removal of elements. It achieves this by using an array of counters instead of a simple bit array. Each counter keeps track of the number of times a particular position has been set, enabling both addition and deletion of elements.

#### Key Components:
1. **Count Array**:
   - An array of integers (counters) initialized to all zeros. Each counter represents the number of times a particular bit position has been set.

2. **Hash Functions**:
   - Multiple independent hash functions that map elements to positions in the count array. Each hash function should uniformly distribute the input elements across the count array.

#### Basic Operations:
1. **Add Operation**:
   - To add an element, each hash function is applied to the element to get multiple hash values. The corresponding positions in the count array are incremented by 1.

2. **Remove Operation**:
   - To remove an element, each hash function is applied to the element to get multiple hash values. The corresponding positions in the count array are decremented by 1, ensuring the count does not go below zero.

3. **Check Operation**:
   - To check if an element is in the filter, each hash function is applied to the element to get multiple hash values. If all the corresponding positions in the count array are greater than zero, the element is likely in the set. If any position is zero, the element is definitely not in the set.

In [106]:
class CountingBloomFilter(BloomFilter):
    """
    A Counting Bloom Filter (CBF) is an extension of the standard Bloom Filter that allows for the removal of elements.
    It uses an array of counters instead of a simple bit array to keep track of the number of times a particular position has been set.

    Attributes:
        size (int): The size of the count array.
        hash_functions (List[HashFunction]): A list of hash functions used to map elements to positions in the count array.
        count_array (List[int]): The count array used to store the presence of elements.
        item_count (int): Number of items currently accounted for (adds minus successful removals).
    """

    def __init__(self, size: int, hash_functions: List[HashFunction]) -> None:
        """
        Initializes the Counting Bloom Filter with a given size and list of hash functions.

        Args:
            size (int): The size of the count array.
            hash_functions (List[HashFunction]): A list of hash functions.
        """
        super().__init__(size, hash_functions)
        # Counters replace the base class' bit array; the inherited
        # `bit_array` is not read or written by any method of this class.
        self.count_array = [0] * size
        print(f"Initialized Counting Bloom Filter with size {size}.")

    def add(self, item: str) -> None:
        """
        Adds an item to the Counting Bloom Filter.

        Args:
            item (str): The item to be added.
        """
        for position in self._hashes(item):
            self.count_array[position] += 1

        # Keep the inherited counter in step, as BloomFilter.add does.
        self.item_count += 1

    def remove(self, item: str) -> None:
        """
        Removes an item from the Counting Bloom Filter.

        If any of the item's counters is already zero the item was definitely
        never added, so nothing is changed. Decrementing in that case would
        lower counters that belong to other items and could make them
        disappear from the filter.

        Args:
            item (str): The item to be removed.
        """
        positions = self._hashes(item)
        if any(self.count_array[position] <= 0 for position in positions):
            return

        for position in positions:
            # Still guard each decrement: two hash functions may map to the
            # same position, and a counter must never go below zero.
            if self.count_array[position] > 0:
                self.count_array[position] -= 1

        if self.item_count > 0:
            self.item_count -= 1

    def check(self, item: str) -> bool:
        """
        Checks if an item is present in the Counting Bloom Filter.

        The outcome is also printed.

        Args:
            item (str): The item to be checked.

        Returns:
            bool: True if the item is likely present, False if the item is definitely not present.
        """
        hashes = self._hashes(item)
        result = all(self.count_array[hash_value] > 0 for hash_value in hashes)
        print(f"Checking '{item}': {'Present' if result else 'Not Present'} in Counting Bloom Filter.")
        return result

    def show(self) -> None:
        """
        Displays the current state of the count array.
        """
        print(self.count_array)

In [107]:
# Build a 10-slot counting filter from 3 randomly chosen hash functions.
funcs = get_random_hash_functions(3)
count_bloom_filter = CountingBloomFilter(size=10, hash_functions=funcs)

print("Adding 'apple' to the filter.")
count_bloom_filter.add("apple")

# Query once after the insert; remove the item only if it was reported present.
found_after_add = count_bloom_filter.check("apple")
if found_after_add:
    print("Found apple in the filter.")
    print("Removing 'apple' from the filter.")
    count_bloom_filter.remove("apple")

# Query again to show that the removal took effect.
print("Checking 'apple' after removal.")
found_after_remove = count_bloom_filter.check("apple")
if not found_after_remove:
    print("Apple not found in the filter.")

Selected Hash Function 1: md5_hash
Selected Hash Function 2: fnv1a_hash
Selected Hash Function 3: murmur_hash


Initialized Bloom Filter with size 10 and 3 hash functions.
Initialized Counting Bloom Filter with size 10.
Adding 'apple' to the filter.
Checking 'apple': Present in Counting Bloom Filter.
Found apple in the filter.
Removing 'apple' from the filter.
Checking 'apple' after removal.
Checking 'apple': Not Present in Counting Bloom Filter.
Apple not found in the filter.


## Scalable Bloom Filter

A Scalable Bloom Filter (SBF) is an extension of the standard Bloom Filter that dynamically grows to accommodate more elements while maintaining a specified error rate.

#### Key Components:
1. **Initial Size**:
   - The initial size of the Bloom Filter.
2. **Error Rate**:
   - The desired false positive rate.
3. **Hash Functions**:
   - A list of hash functions used to map elements to positions in the bit array.
4. **Growth Factor**:
   - The factor by which the Bloom Filter size grows when it needs to expand.
5. **Capacity Factor**:
   - The factor that determines when the Bloom Filter needs to expand based on its fill ratio.
6. **Current Filter Size**:
   - The current size of the Bloom Filter.
7. **Filters**:
   - A list of Bloom Filters used to store elements.

#### Basic Operations:
1. **Add Operation**:
   - Adds an item to the Scalable Bloom Filter. If the current filter is full, a new filter is created and the item is added to it.
2. **Check Operation**:
   - Checks if an item is present in the Scalable Bloom Filter. It checks if the item is present in any of the filters.
3. **Expansion Check**:
   - Determines if the Bloom Filter needs to be expanded based on its fill ratio and capacity factor.

In [113]:
import math

class ScalableBloomFilter:
    """
    A Scalable Bloom Filter (SBF) is an extension of the standard Bloom Filter that dynamically grows to accommodate more elements.

    Items are always added to the newest sub-filter; once that sub-filter has
    received enough items a larger one is appended. Lookups consult every
    sub-filter.

    NOTE(review): ``error_rate`` is stored and a per-filter rate is derived in
    ``add``, but neither value is used to size or configure the sub-filters;
    their size is driven only by ``initial_size`` and ``growth_factor``.

    Attributes:
        initial_size (int): The initial size of the Bloom Filter.
        error_rate (float): The desired false positive rate (currently informational only).
        hash_functions (list[HashFunction]): A list of hash functions used to map elements to positions in the bit array.
        growth_factor (float): The factor by which the Bloom Filter size grows when it needs to expand.
        capacity_factor (float): The factor that determines when the Bloom Filter needs to expand based on its fill ratio.
        current_filter_size (int): The size of the newest Bloom Filter.
        filters (List[BloomFilter]): A list of Bloom Filters used to store elements.
    """
    def __init__(self, initial_size: int, error_rate: float, hash_functions: list[HashFunction], growth_factor: float = 2.0, capacity_factor: float = 0.75) -> None:
        """
        Initializes the Scalable Bloom Filter with the given parameters.

        Args:
            initial_size (int): The initial size of the Bloom Filter. Must be at least 1.
            error_rate (float): The desired false positive rate.
            hash_functions (list[HashFunction]): A list of hash functions.
            growth_factor (float): The factor by which the Bloom Filter size grows when it needs to expand. Must be at least 1.
            capacity_factor (float): The factor that determines when the Bloom Filter needs to expand based on its fill ratio.

        Raises:
            ValueError: If ``initial_size`` is below 1 or ``growth_factor`` is below 1.
        """
        # A sub-filter of size 0 cannot take items (`hash % 0`). Reject the
        # parameters that would create one now or after a few expansions.
        if initial_size < 1:
            raise ValueError("initial_size must be at least 1.")
        if growth_factor < 1:
            raise ValueError("growth_factor must be at least 1.")

        self.filters: List[BloomFilter] = []
        self.initial_size = initial_size
        self.error_rate = error_rate
        self.hash_functions = hash_functions
        self.growth_factor = growth_factor
        self.capacity_factor = capacity_factor
        self.current_filter_size = initial_size
        self._create_new_filter(initial_size, error_rate)

    def _create_new_filter(self, size: int, error_rate: float) -> None:
        """
        Creates a new Bloom Filter and adds it to the list of filters.

        Args:
            size (int): The size of the new Bloom Filter.
            error_rate (float): The desired false positive rate for the new Bloom Filter.
                Accepted for interface compatibility; not used by this method.
        """
        self.filters.append(BloomFilter(size, self.hash_functions))

        print(f"Created new Bloom Filter with size {size}.\n")

    def add(self, item: str) -> None:
        """
        Adds an item to the Scalable Bloom Filter.

        The item goes into the newest filter. If that filter has then reached
        its capacity threshold, a larger empty filter is appended for
        subsequent items.

        Args:
            item (str): The item to be added.
        """
        last_filter = self.filters[-1]
        last_filter.add(item)

        if self._is_expansion_needed(last_filter):
            self.current_filter_size = int(self.current_filter_size * self.growth_factor)

            # Derived per-filter rate; passed along but currently unused.
            new_error_rate = self.error_rate / len(self.filters)
            self._create_new_filter(self.current_filter_size, new_error_rate)

    def check(self, item: str) -> bool:
        """
        Checks if an item is present in the Scalable Bloom Filter, i.e. whether any of the filters reports it.

        Args:
            item (str): The item to be checked.

        Returns:
            bool: True if the item is likely present, False if the item is definitely not present.
        """
        return any(bloom.check(item) for bloom in self.filters)

    def _is_expansion_needed(self, bl_filter: BloomFilter) -> bool:
        """
        Determines if the Bloom Filter needs to be expanded based on its item count and the capacity factor.

        Args:
            bl_filter (BloomFilter): The Bloom Filter to be checked.

        Returns:
            bool: True if expansion is needed, False otherwise.
        """
        # Item budget for this filter: size * ln(2) / number of hash functions,
        # truncated to an integer and then scaled by the capacity factor.
        num_hashes = len(bl_filter.hash_functions)
        optimal_items = int((bl_filter.size * math.log(2)) / num_hashes)
        return bl_filter.item_count >= (optimal_items * self.capacity_factor)

    def __str__(self) -> str:
        """Returns a one-line summary: number of filters and size of the newest one."""
        return f"ScalableBloomFilter with {len(self.filters)} filters, current size: {self.current_filter_size}"
    


In [115]:
# Example usage
funcs = get_random_hash_functions(3)
# This phase inserts items, so label it as such (it was labelled "Checking").
print("Adding items:")
sbf = ScalableBloomFilter(initial_size=10, error_rate=0.01, hash_functions=funcs)
for i in range(1000):
    sbf.add(f"item_{i}")
    # Report the filter's growth every 100 insertions.
    if i % 100 == 0:
        print(sbf)

# Check for items
print("\nChecking for items:")
# Query the 1000 inserted items plus 100 (item_1000..item_1099) that were
# never added; only items reported as definitely absent are printed.
for i in range(1100):
    if not sbf.check(f"item_{i}"):
        print(f"item_{i} not found")


Selected Hash Function 1: djb2_hash
Selected Hash Function 2: md5_hash
Selected Hash Function 3: sha1_hash


Checking for items:
Initialized Bloom Filter with size 10 and 3 hash functions.
Created new Bloom Filter with size 10.

ScalableBloomFilter with 1 filters, current size: 10
Initialized Bloom Filter with size 20 and 3 hash functions.
Created new Bloom Filter with size 20.

Initialized Bloom Filter with size 40 and 3 hash functions.
Created new Bloom Filter with size 40.

Initialized Bloom Filter with size 80 and 3 hash functions.
Created new Bloom Filter with size 80.

Initialized Bloom Filter with size 160 and 3 hash functions.
Created new Bloom Filter with size 160.

Initialized Bloom Filter with size 320 and 3 hash functions.
Created new Bloom Filter with size 320.

ScalableBloomFilter with 6 filters, current size: 320
Initialized Bloom Filter with size 640 and 3 hash functions.
Created new Bloom Filter with size 640.

ScalableBloomFilter with 7 filters, current size: 640
Init