# In [1]:
import numpy as np
import math, mmh3

class CountingBloomFilter:
    """Counting Bloom filter: approximate set membership with deletion.

    Each slot holds a small integer counter instead of a single bit, so
    elements can be removed again.  ``query`` may return false positives
    but never false negatives for elements that are currently stored
    (provided ``delete`` is only called for elements that were inserted).

    Args:
        num_stored: (n) the filter must be able to hold at least this
            many elements while maintaining the false-positive rate.
        error_rate: (p) the theoretically expected probability of a
            false positive; defaults to 1%.
        hash_fn: optional callable ``hash_fn(elem, seed) -> int`` used to
            derive counter positions.  Defaults to ``mmh3.hash``.

    Attributes:
        m: number of counters, ``ceil(n * |ln p| / (ln 2)^2)``.
        k: number of hash seeds used per element, ``ceil(log2(1 / p))``.
        p: the requested error rate.
        array: NumPy array of ``m`` signed 64-bit integer counters.

    Raises:
        ValueError: if ``error_rate`` is not strictly between 0 and 1, or
            ``num_stored`` is not positive.
    """

    def __init__(self, num_stored, error_rate=0.01, *, hash_fn=None):
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not num_stored > 0:
            raise ValueError("Number of elements stored must be > 0")
        # Optimal memory size for n elements at false-positive rate p.
        self.m = int(math.ceil(
            (num_stored * abs(math.log(error_rate))) /
            (math.log(2) ** 2)))
        # Optimal number of hash functions (here: seeds) for that size.
        self.k = int(math.ceil(math.log(1.0 / error_rate, 2)))
        self.p = error_rate
        # mmh3 is only looked up when no custom hash function is supplied.
        self._hash = mmh3.hash if hash_fn is None else hash_fn
        # Integer counters: float slots are wasteful and semantically wrong.
        self.array = np.zeros(self.m, dtype=np.int64)

    def _indexes(self, elem):
        """Return the k counter positions for *elem*, one per seed.

        Positions may repeat when different seeds collide.  Python's ``%``
        with a positive modulus is never negative, so a hash function
        that returns negative integers is fine.
        """
        return [self._hash(elem, seed) % self.m for seed in range(self.k)]

    def insert(self, elem):
        """Add *elem* by incrementing each of its k counters."""
        for index in self._indexes(elem):
            self.array[index] += 1

    def query(self, elem):
        """Return True if *elem* may be stored, False if it is definitely not."""
        return all(self.array[index] != 0 for index in self._indexes(elem))

    def __contains__(self, elem):
        """Support ``elem in cbf`` as a synonym for ``query(elem)``."""
        return self.query(elem)

    def delete(self, elem):
        """Remove one occurrence of *elem* if the counters allow it.

        A slot hit by several seeds must hold at least that many counts;
        otherwise *elem* cannot have been inserted and the filter is left
        untouched.  This keeps counters from going negative (a negative
        counter is non-zero and would read as "present" forever).
        """
        slots, hits = np.unique(self._indexes(elem), return_counts=True)
        if np.all(self.array[slots] >= hits):
            # slots are unique, so the fancy-indexed update is exact.
            self.array[slots] -= hits