https://micvog.com/wp-content/uploads/2015/06/approximate_freq_count_over_data_streams_vldb_2002.pdf

In [8]:
from math import ceil

class LossyCount:
    """Approximate per-element frequency counter for a stream (Lossy Counting).

    The stream is split into consecutive buckets of ``ceil(1 / error)``
    elements. Every tracked element maps to ``(frequency, possible_error)``:
    ``frequency`` is the number of occurrences counted since the entry was
    created, and ``possible_error`` is the bucket number minus one at creation
    time, i.e. an upper bound on occurrences that may have been dropped
    before the entry existed. Each time a bucket is completed, entries whose
    ``frequency + possible_error`` does not exceed the current bucket number
    are discarded.
    """

    def __init__(self, error: float) -> None:
        """Create an empty counter.

        Args:
            error: Tolerated error as a fraction of the stream length. It
                fixes the bucket size at ``ceil(1 / error)`` elements.

        Raises:
            ValueError: If ``error`` is not a positive, finite number.
        """
        # Zero, negative, NaN and infinite values would otherwise produce a
        # zero or negative bucket size and fail (or misbehave) much later.
        if not 0 < error < float("inf"):
            raise ValueError(
                f"error must be a positive finite number, got {error!r}"
            )
        self._error = error
        self._bucket_size: int = ceil(1 / error)
        self._processed_total = 0
        # element -> (frequency, possible_error)
        self._data: dict[int, tuple[int, int]] = {}

    def process_element(self, element: int) -> None:
        """Count one occurrence of ``element`` and prune at bucket boundaries.

        Args:
            element: The stream item that was just observed.
        """
        self._processed_total += 1
        # Integer ceiling division: exact for arbitrarily long streams,
        # unlike ceil() applied to a float quotient.
        current_bucket = (
            self._processed_total + self._bucket_size - 1
        ) // self._bucket_size

        if element in self._data:
            frequency, possible_error = self._data[element]
            self._data[element] = (frequency + 1, possible_error)
        else:
            # A new entry may have been seen (and dropped) in earlier buckets.
            self._data[element] = (1, current_bucket - 1)

        # Prune only when a whole bucket has just been completed.
        if self._processed_total % self._bucket_size != 0:
            return

        # Collect first, delete afterwards: a dict must not change size
        # while it is being iterated.
        stale = [
            key
            for key, (frequency, possible_error) in self._data.items()
            if frequency + possible_error <= current_bucket
        ]
        for key in stale:
            del self._data[key]

    def get_frequencies(self, threshold: float) -> list[tuple[int, int]]:
        """Return the tracked elements that pass the frequency threshold.

        Args:
            threshold: Required share of the stream, as a fraction.

        Returns:
            ``(element, frequency)`` pairs, in the dict's iteration order,
            for every tracked element whose counted frequency is at least
            ``(threshold - error) * processed_total``.
        """
        minimum_count = (threshold - self._error) * self._processed_total
        return [
            (key, frequency)
            for key, (frequency, _) in self._data.items()
            if frequency >= minimum_count
        ]

    def __repr__(self) -> str:
        """Return the string form of the internal element table."""
        return str(self._data)

In [35]:
from random import randint

# Build a counter with a 0.1% error bound, then stream 10,000 random
# integers drawn from 1..10 (inclusive) through it.
lc = LossyCount(error=0.001)

for _ in range(10_000):
    lc.process_element(element=randint(1, 10))

In [41]:
# Report the elements whose counted frequency reaches
# (threshold - error) * processed_total for a threshold of 9.6%.
lc.get_frequencies(threshold=0.096)

[(7, 1003),
 (4, 975),
 (2, 1047),
 (9, 1043),
 (3, 1018),
 (10, 978),
 (6, 1037),
 (5, 967),
 (1, 991)]