# Top-K elements



In [1]:
import datasketching.cms as cms
import heapq

# The candidate queue may grow to k * MAX_OVERHEAD entries before it is trimmed
# back to half of that size.
MAX_OVERHEAD = 10


class TopK(object):
    """Approximate top-k tracker built on a ``cms.CMS`` sketch.

    Every inserted object is counted in the sketch.  A bounded heap of
    ``(-count, obj)`` candidates is kept alongside it so the most frequent
    objects can be reported without remembering every distinct object.
    """

    def __init__(self, k, width, hashes):
        """Create an empty summary.

        k      -- number of elements reported by ``topk``
        width  -- forwarded unchanged to ``cms.CMS``
        hashes -- forwarded unchanged to ``cms.CMS``
        """
        self.width = width
        self.hashes = hashes
        self.cms = cms.CMS(width, hashes)
        self.k = k
        # Heap of (-count, obj); counts are negated because heapq is a min-heap.
        self.queue = []
        # Objects that currently have an entry in `self.queue`.
        self.seen = set()

    def _trimmed_size(self):
        """Return how many candidates survive a trim or a merge (at least one)."""
        return max(int(self.k * MAX_OVERHEAD / 2), 1)

    def insert(self, obj):
        """Count one occurrence of ``obj`` and refresh its candidate entry.

        The count comes from the underlying ``cms.CMS`` sketch, queried right
        after the insertion.  The candidate queue holds at most one entry per
        object and is cut down to its best half whenever it exceeds
        ``k * MAX_OVERHEAD`` entries.
        """
        self.cms.insert(obj)
        count = self.cms.lookup(obj)

        # Negating the count makes the most frequent object the smallest entry.
        entry = (-count, obj)

        if obj in self.seen:
            # Overwrite the stale entry in place, then restore the heap
            # invariant; deleting from the middle of a heap list would break it.
            for index, (_, candidate) in enumerate(self.queue):
                if candidate == obj:
                    self.queue[index] = entry
                    break
            else:
                # `seen` and `queue` disagreed; recover by adding the entry.
                self.queue.append(entry)
            heapq.heapify(self.queue)
        else:
            self.seen.add(obj)
            heapq.heappush(self.queue, entry)

        # Keep the queue bounded: drop the least frequent half of the allowance.
        if len(self.queue) > (self.k * MAX_OVERHEAD):
            # A sorted list satisfies the heap invariant, so no heapify needed.
            self.queue = heapq.nsmallest(self._trimmed_size(), self.queue)
            self.seen = {value for _, value in self.queue}

    def topk(self):
        """ Returns a list of 2-tuples (value, count) for the top k elements in this structure """
        return [(value, -count) for count, value in heapq.nsmallest(self.k, self.queue)]

    def merge(self, other):
        """Return a new TopK summarising both ``self`` and ``other``.

        The result uses this summary's ``k``, ``width`` and ``hashes``.  Both
        sketches are folded into the result's sketch with ``merge_from``, and
        every candidate tracked by either side is re-estimated from that
        merged sketch, so an object known to both sides yields one entry.
        Neither input is modified by the code in this method.

        NOTE(review): assumes ``merge_from`` accumulates the other sketch's
        counts so that ``lookup`` on the result reflects both inputs -- confirm
        against ``datasketching.cms``.
        """
        result = TopK(self.k, self.width, self.hashes)
        result.cms.merge_from(self.cms)
        result.cms.merge_from(other.cms)

        candidates = {value for _, value in self.queue}
        candidates.update(value for _, value in other.queue)
        entries = [(-result.cms.lookup(value), value) for value in candidates]

        # nsmallest returns a sorted list, which is already a valid heap.
        result.queue = heapq.nsmallest(self._trimmed_size(), entries)
        result.seen = {value for _, value in result.queue}
        return result

In [5]:
def topk_experiment(sample_count, size, hashes, k=10, seed=0x15300625):
    """Feed a seeded pseudo-random stream into a TopK and return its top k.

    sample_count -- number of distinct 64-bit values drawn
    size         -- passed to TopK as its width
    hashes       -- passed to TopK unchanged
    k            -- number of elements to report
    seed         -- seed for the pseudo-random stream

    Each drawn value is inserted ``(value % 8) + 1`` times, except every
    hundredth draw (starting with the first), which is a heavy hitter
    inserted ``(value % 512) + 1`` times.  Returns ``TopK.topk()`` for the
    resulting summary.
    """
    import random

    # A private generator produces the same sequence as seeding the module-level
    # one, without disturbing the process-wide random state.
    rng = random.Random(seed)
    topk = TopK(k, size, hashes)

    for index in range(sample_count):
        bits = rng.getrandbits(64)
        if index % 100 == 0:
            # every hundredth entry is a heavy hitter
            insert_count = (bits % 512) + 1
        else:
            insert_count = (bits % 8) + 1

        for _ in range(insert_count):
            topk.insert(bits)

    return topk.topk()

In [8]:
from datasketching.hashing import hashes_for
# Run the experiment over 40000 drawn values with a sketch width of 16384 and
# the hashes returned by hashes_for(3,8); the call evaluates to the 20 most
# frequent (value, count) pairs.
# NOTE(review): the meaning of hashes_for's two arguments is not visible here.
topk_experiment(40000, 16384, hashes_for(3,8), k=20)

[(14364086215630792700, 517),
 (2991287392996463102, 516),
 (9676343778858885118, 511),
 (5115259641344439293, 510),
 (14529685338139490810, 507),
 (766813825201687544, 505),
 (4701884162691338739, 505),
 (5631968365117192696, 505),
 (14240245373571445239, 504),
 (2362392399177886197, 502),
 (7310823104820324852, 501),
 (4853657629704797682, 499),
 (7209717596912719346, 499),
 (5367578118589810665, 496),
 (5853828888560029671, 496),
 (2743391102434662886, 492),
 (17900334999078540778, 491),
 (5414524836017999845, 486),
 (486034635062679522, 483),
 (15682754308008826338, 483)]