# Getting Started with Sketches

This notebook introduces probabilistic data structures (sketches) and demonstrates their usage.

In [ ]:
import sketches
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
%matplotlib inline

## 1. What are Probabilistic Data Structures?

Probabilistic data structures trade perfect accuracy for massive space savings. They're useful when:
- Exact counts aren't critical
- Data is too large to store entirely
- Real-time processing is required

## 2. HyperLogLog - Counting Unique Elements

In [ ]:
# Create a HyperLogLog sketch
hll = sketches.HllSketch(12)  # lg_k parameter

# Single source of truth for the true cardinality: the loop bound, the printed
# label, and the error formula below all read from this one value.
n_unique = 10_000

# Add elements (every id is distinct, so the true count is exactly n_unique)
for i in range(n_unique):
    hll.update(f"user_{i}")

# Query the sketch once and reuse the value for both the estimate and the error
hll_estimate = hll.estimate()
error_pct = abs(hll_estimate - n_unique) / n_unique * 100

print(f"Actual unique elements: {n_unique:,}")
print(f"HLL estimate: {hll_estimate:.0f}")
print(f"Error: {error_pct:.2f}%")

## 3. Memory Efficiency Demonstration

In [ ]:
# Compare memory usage: exact counting (Python set) vs an HLL sketch
import sys

n_items = 100_000

# Exact counting with a set: every distinct string has to be kept alive
exact_set = set()
for i in range(n_items):
    exact_set.add(f"item_{i}")

# sys.getsizeof(exact_set) reports only the set's own hash table, not the
# string objects it references. Add each element's size so the figure reflects
# what exact counting really costs.
exact_bytes = sys.getsizeof(exact_set) + sum(sys.getsizeof(item) for item in exact_set)

# HLL counting
hll = sketches.HllSketch(14)  # lg_k parameter
for i in range(n_items):
    hll.update(f"item_{i}")

# Serialize once and reuse the length.
# NOTE(review): the serialized size is used as a proxy for the sketch's
# footprint; the in-memory size may differ — confirm against the library docs.
hll_bytes = len(hll.to_bytes())

print(f"Exact set memory: ~{exact_bytes:,} bytes")
print(f"HLL memory: ~{hll_bytes:,} bytes")
print(f"Memory savings: {(1 - hll_bytes / exact_bytes) * 100:.1f}%")

## 4. Theta Sketch - Set Operations

In [ ]:
# Two audiences that share users 4000-5999.
# Each site gets its own sketch, filled immediately after it is created.

# Website A sees users 0-5999
website_a = sketches.ThetaSketch(4096)  # k parameter
for i in range(0, 6000):
    website_a.update(f"user_{i}")

# Website B sees users 4000-9999
website_b = sketches.ThetaSketch(4096)  # k parameter
for i in range(4000, 10000):
    website_b.update(f"user_{i}")

# Combine the two sketches: everyone, the overlap, and A-exclusive users
union = website_a.union(website_b)
intersection = website_a.intersect(website_b)
a_only = website_a.difference(website_b)

# Report per-site estimates first, then the derived set-operation estimates
print(f"Website A users: ~{website_a.estimate():.0f}")
print(f"Website B users: ~{website_b.estimate():.0f}")
print(f"Total unique users: ~{union.estimate():.0f}")
print(f"Users on both sites: ~{intersection.estimate():.0f}")
print(f"Users only on A: ~{a_only.estimate():.0f}")

## 5. Precision Trade-off: Accuracy vs Memory

In [ ]:
# Sweep several precision (lg_k) settings against a fixed number of distinct inputs
n_elements = 50000
precisions = [8, 10, 12, 14, 16]

results = []
for p in precisions:
    # A new sketch is built for every precision setting
    hll = sketches.HllSketch(p)  # lg_k parameter
    for i in range(n_elements):
        hll.update(f"element_{i}")

    # Record the estimate, the serialized size, and the relative error in percent
    estimate = hll.estimate()
    memory = len(hll.to_bytes())
    error = abs(estimate - n_elements) / n_elements * 100

    results.append({
        'precision': p,
        'estimate': estimate,
        'error_pct': error,
        'memory_bytes': memory,
    })

# Tabulate the sweep
df = pl.DataFrame(results)
print(df)

# Pull the plotted columns out as NumPy arrays
precisions_data = df['precision'].to_numpy()
errors_data = df['error_pct'].to_numpy()
memory_data = df['memory_bytes'].to_numpy()

# Side-by-side panels: error on the left, memory on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

ax1.plot(precisions_data, errors_data, 'b-o')
ax1.set(xlabel='Precision', ylabel='Error %', title='Precision vs Accuracy')
ax1.grid(True)

ax2.plot(precisions_data, memory_data, 'r-o')
ax2.set(xlabel='Precision', ylabel='Memory (bytes)', title='Precision vs Memory Usage')
ax2.grid(True)

plt.tight_layout()
plt.show()