In [None]:
import math
from hashlib import sha1

class HyperLogLog:
    """Approximate distinct-element counter using the HyperLogLog sketch.

    Each item is hashed with SHA-1 (160 bits). The top ``p`` bits of the hash
    pick one of ``m = 2 ** p`` registers. The register keeps the largest
    *rank* seen for it, where the rank is the number of leading zero bits in
    the remaining ``160 - p`` bits, plus one.

    Attributes:
        p: number of hash bits used as the register index.
        m: number of registers (``2 ** p``).
        registers: list of ``m`` integer ranks; all zeros for an empty sketch.
        alpha: bias-correction constant returned by ``get_alpha``.
    """

    # Width in bits of the SHA-1 digest consumed by ``add``.
    HASH_BITS = 160

    def __init__(self, p):
        """Create an empty sketch with ``2 ** p`` registers.

        Args:
            p: number of leading hash bits used to select a register.

        Raises:
            ValueError: if ``p`` is outside ``[1, HASH_BITS - 1]``. With
                ``p == 0`` there are no index bits to read, so ``add`` could
                never succeed.
        """
        if not 1 <= p < self.HASH_BITS:
            raise ValueError(
                f"p must be between 1 and {self.HASH_BITS - 1}, got {p!r}"
            )
        self.p = p
        self.m = 2 ** p
        self.registers = [0] * self.m
        self.alpha = self.get_alpha()

    def get_alpha(self):
        """Return the bias-correction constant for ``self.m`` registers.

        Fixed constants are used for 16, 32 and 64 registers; every other
        size uses the closed-form approximation ``0.7213 / (1 + 1.079 / m)``.
        """
        if self.m == 16:
            return 0.673
        elif self.m == 32:
            return 0.697
        elif self.m == 64:
            return 0.709
        else:
            return 0.7213 / (1 + 1.079 / self.m)

    def add(self, item):
        """Record ``item`` in the sketch.

        The item is converted with ``str()``, UTF-8 encoded and hashed with
        SHA-1. Adding the same item again leaves the sketch unchanged.

        Args:
            item: any object; its ``str()`` form is what gets hashed.
        """
        digest = sha1(str(item).encode('utf-8')).digest()
        hash_value = int.from_bytes(digest, 'big')

        # The top p bits select the register.
        tail_width = self.HASH_BITS - self.p
        register_index = hash_value >> tail_width

        # Rank = leading zeros of the remaining bits + 1. bit_length() is the
        # position of the highest set bit, so an all-zero tail yields
        # tail_width + 1.
        tail = hash_value & ((1 << tail_width) - 1)
        rank = tail_width - tail.bit_length() + 1

        # Each register remembers the largest rank it has observed.
        if rank > self.registers[register_index]:
            self.registers[register_index] = rank

    def estimate(self):
        """Return the estimated number of distinct items added so far.

        The raw HyperLogLog estimate is ``alpha * m**2 / sum(2**-register)``.
        That value is heavily biased while many registers are still empty
        (an empty sketch would report about ``alpha * m``). So when the raw
        estimate is at most ``2.5 * m`` and at least one register is zero,
        the linear-counting estimate ``m * ln(m / empty_registers)`` is
        returned instead.

        Returns:
            float: the cardinality estimate; ``0.0`` for an empty sketch.
        """
        indicator = sum(2.0 ** -rank for rank in self.registers)
        raw_estimate = self.alpha * self.m ** 2 / indicator

        # Small-range correction (linear counting).
        if raw_estimate <= 2.5 * self.m:
            empty_registers = self.registers.count(0)
            if empty_registers:
                return self.m * math.log(self.m / empty_registers)

        return raw_estimate

In [None]:
import random

# Size of the synthetic population; every ID in it is distinct.
total_profiles = 1_000_000

# A random permutation of 0 .. total_profiles - 1 (sampling without replacement).
profile_ids = random.sample(range(total_profiles), total_profiles)

# p=10 gives the sketch 2 ** 10 = 1024 registers.
hll = HyperLogLog(p=10)
for profile_id in profile_ids:
    hll.add(profile_id)

# The sketch's estimate and the exact answer, for comparison.
hll_estimate = hll.estimate()
actual_unique = len(set(profile_ids))

print(f"Total Profiles: {total_profiles}")
print(f"Actual Unique Profiles: {actual_unique}")
print(f"HyperLogLog Estimate of Unique Profiles: {hll_estimate:.2f}")


Total Profiles: 1000000
Actual Unique Profiles: 1000000
HyperLogLog Estimate of Unique Profiles: 945912.59


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.functions import approx_count_distinct

# Start (or reuse) a Spark session for the comparison run.
spark = (
    SparkSession.builder
    .appName("HyperLogLogExample")
    .getOrCreate()
)

# One non-nullable integer column holding the profile IDs.
schema = StructType([
    StructField("profile_id", IntegerType(), False)
])

# createDataFrame expects row tuples, so wrap every ID in a 1-tuple.
df = spark.createDataFrame([(num,) for num in profile_ids], schema)

# Exact answer: deduplicate the column, then count the remaining rows.
exact_count = df.select("profile_id").distinct().count()
print(f"Exact Unique Profiles: {exact_count}")

# Approximate answer from Spark's built-in sketch-based aggregate; the
# aggregate yields a single row with a single column.
approx_count = df.select(approx_count_distinct("profile_id")).collect()[0][0]
print(f"Spark HLL Estimate of Unique Profiles: {approx_count:.2f}")

Exact Unique Profiles: 1000000
Spark HLL Estimate of Unique Profiles: 943039.00
