In [1]:
import hashlib
import math
import random
from collections import defaultdict

class CountMinSketch:
    """Approximate frequency counter backed by a ``d x w`` table of integers.

    Every item is mapped to one cell per row (via a seeded MD5 hash); ``add``
    increments those cells and ``estimate`` returns the smallest of them.
    As long as only non-negative counts are added, a cell can only be
    inflated by colliding items, so the estimate never falls below the true
    count.
    """

    def __init__(self, epsilon, delta, seed=None):
        """
        - epsilon: Error factor (ε). The error in the count is at most ε * total_count.
        - delta: Probability factor (δ). The probability of the error exceeding the bound is at most δ.
        - seed: Optional seed for the per-row hash seeds. When given, two
          sketches built with the same (epsilon, delta, seed) hash items
          identically. When omitted, the hash seeds are drawn from the
          global ``random`` module, exactly as before.

        Raises ValueError if epsilon is not a positive finite number or if
        delta is not strictly between 0 and 1.
        """
        # epsilon <= 0 would divide by zero or yield a non-positive width.
        if not 0 < epsilon < math.inf:
            raise ValueError(f"epsilon must be a positive finite number, got {epsilon!r}")
        # delta >= 1 would yield depth <= 0 (estimate() would return inf);
        # delta <= 0 makes 1 / delta undefined or log() fail.
        if not 0 < delta < 1:
            raise ValueError(f"delta must be strictly between 0 and 1, got {delta!r}")

        self.epsilon = epsilon
        self.delta = delta
        self.w = math.ceil(math.e / epsilon)  # Width of the sketch
        self.d = math.ceil(math.log(1 / delta))  # Depth of the sketch
        self.total = 0  # Sum of all counts passed to add()
        self.table = [[0] * self.w for _ in range(self.d)]

        # A private generator keeps a user-supplied seed from disturbing the
        # global random state; without a seed, fall back to the module itself.
        rng = random if seed is None else random.Random(seed)
        self.hash_seeds = [rng.randint(1, 1 << 30) for _ in range(self.d)]

    def _hash(self, item, seed):
        """Return the MD5 digest of ``"<item>_<seed>"`` as a non-negative int.

        The item is stringified first, so values with the same ``str()``
        (for example ``1`` and ``"1"``) are treated as the same item.
        """
        item = str(item)
        hash_input = f"{item}_{seed}".encode('utf-8')
        return int(hashlib.md5(hash_input).hexdigest(), 16)

    def _cells(self, item):
        """Yield the ``(row, column)`` cell that ``item`` occupies in each row."""
        for row, row_seed in enumerate(self.hash_seeds):
            yield row, self._hash(item, row_seed) % self.w

    def add(self, item, count=1):
        """Record ``count`` occurrences of ``item`` (one cell per row is bumped)."""
        self.total += count
        for row, col in self._cells(item):
            self.table[row][col] += count

    def estimate(self, item):
        """Return the estimated count of ``item``: the minimum over its cells."""
        return min(self.table[row][col] for row, col in self._cells(item))


In [2]:
import random

total_profiles = 20
total_views = 1000000
epsilon = 0.1
delta = 0.1
SEED = 42  # fixed so a fresh Restart & Run All reproduces the same numbers

# Seed the global generator *before* building the sketch: both the sketch's
# hash seeds and the simulated stream below are drawn from `random`.
random.seed(SEED)

cms = CountMinSketch(epsilon=epsilon, delta=delta)

profile_ids = [f"profile_{i}" for i in range(total_profiles)]

# Simulate adding profile page views with varying frequencies
# For simplicity, we'll randomly assign each of the 1000000 views to one of the 20 profiles
# Exact per-profile tally, kept alongside the sketch as ground truth.
actual_frequencies = defaultdict(int)

for _ in range(total_views):
    pid = random.choice(profile_ids)
    cms.add(pid, 1)
    actual_frequencies[pid] += 1

def get_view_count(profile_id):
    """Return ``(estimated, actual)`` view counts for ``profile_id``.

    ``estimated`` comes from the module-level sketch ``cms``; ``actual`` is
    the exact tally in ``actual_frequencies`` (0 if the profile was never seen).
    """
    return cms.estimate(profile_id), actual_frequencies.get(profile_id, 0)

# Report: a header, then one four-line record per profile comparing the
# sketch's estimate with the exact count.
print("\nEstimated vs Actual View Counts:")
for pid in profile_ids:
    estimated, actual = get_view_count(pid)
    print(
        f"Profile ID: {pid}",
        f"  Estimated Views: {estimated}",
        f"  Actual Views:    {actual}",
        "-" * 30,
        sep="\n",
    )



Estimated vs Actual View Counts:
Profile ID: profile_0
  Estimated Views: 49977
  Actual Views:    49977
------------------------------
Profile ID: profile_1
  Estimated Views: 50116
  Actual Views:    50116
------------------------------
Profile ID: profile_2
  Estimated Views: 50222
  Actual Views:    50222
------------------------------
Profile ID: profile_3
  Estimated Views: 50084
  Actual Views:    50084
------------------------------
Profile ID: profile_4
  Estimated Views: 49843
  Actual Views:    49843
------------------------------
Profile ID: profile_5
  Estimated Views: 50056
  Actual Views:    50056
------------------------------
Profile ID: profile_6
  Estimated Views: 49997
  Actual Views:    49997
------------------------------
Profile ID: profile_7
  Estimated Views: 49825
  Actual Views:    49825
------------------------------
Profile ID: profile_8
  Estimated Views: 50233
  Actual Views:    50233
------------------------------
Profile ID: profile_9
  Estimated Views