In [24]:
import glob
import os
import sys
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualizations

Count Min Sketch Ideas:

1) Show Frequency Count efficiency (Naive -> CMS implementation)
    - Basic Dictionary Count
    - Binning applications (When we can say "close enough" and clump instances together)

2) Show application to specific problems (Anomaly Detection)

In [1]:
# naively count frequencies

class dictionary():
    """Exact frequency counter backed by a plain Python dict.

    This is the naive baseline: every distinct token gets its own entry,
    so counts are exact but the storage grows with the number of distinct
    tokens seen.
    """

    def __init__(self):
        # Maps token -> number of times it has been added.
        self.dictionary = {}

    @property
    def nbytes(self):
        """Shallow size of the backing dict in bytes.

        ``sys.getsizeof`` reports only the dict object itself (its hash
        table), not the memory held by the keys and values it references.
        """
        return sys.getsizeof(self.dictionary)

    def getsize(self):
        """Print the current (shallow) size of the counter."""
        print("Dictionary is Size: {} Bytes\n".format(self.nbytes))

    def add(self, token):
        """Increment the count of ``token``, starting from zero if unseen.

        ``token`` must be hashable, since it is used as a dict key.
        """
        self.dictionary[token] = self.dictionary.get(token, 0) + 1

    def timed_update(self, tokenlist):
        """Add every token in ``tokenlist`` and print time and size deltas.

        Prints the elapsed wall-clock seconds and the change in ``nbytes``
        caused by the update.
        """
        startsize = self.nbytes
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring short durations.
        start = time.perf_counter()
        for token in tokenlist:
            self.add(token)
        end = time.perf_counter() - start
        dsize = self.nbytes - startsize
        print("Time Elapsed: {} Seconds \n".format(end))
        print("Change In Memory: {} Bytes\n".format(dsize))

    def estimate(self, token):
        """Return the exact count recorded for ``token``.

        If the token was never added, a message is printed and ``None`` is
        returned.
        """
        try:
            return self.dictionary[token]
        except KeyError:
            # Only a missing key is expected here; anything else should
            # surface as a real error instead of being swallowed.
            print("Error: Token Not Found \n")
            return None

In [27]:

class CountMinSketch:
    """Fixed-size approximate frequency counter (Count-Min Sketch).

    The counter table is indexed as ``table[hash_function, vector_index]``:
    one row per hash function (one per entry of ``PRIMES``) and
    ``vec_length`` buckets per row. Adding a value increments one bucket in
    every row; a point query returns the minimum of those buckets, so the
    estimate can over-count (on collisions) but never under-count.

    Values must be integers, because the hash is ``(val * multiplier) % N``
    and its result is used directly as an array index.
    """

    # One multiplier per hash row.
    # NOTE(review): despite the name, not every entry is prime
    # (77381235 is divisible by 5).
    # NOTE(review): when a multiplier is coprime with N, two values collide
    # in that row exactly when they are congruent mod N. With the default
    # N=16 and these (all odd) multipliers every row has the same collision
    # sets, so taking the min over rows cannot reduce over-counting.
    # Consider a ((a*x + b) mod P) mod N hash family instead.
    PRIMES = [121021, 121151, 150151, 151051, 151121, 180181, 180811, 181081, 2976221,
              3021377, 6972593, 13466917, 20996011, 24036583, 25964951, 30402457, 33923743, 33492811, 77381235]

    def __init__(self, vec_length=2**4):
        """Create an empty sketch with ``vec_length`` buckets per hash row.

        Raises:
            ValueError: if ``vec_length`` is smaller than 1.
        """
        if vec_length < 1:
            raise ValueError("vec_length must be at least 1")
        self.N = vec_length
        # Rows are hash functions, columns are buckets.
        self.table = np.zeros((len(self.PRIMES), self.N))
        # The modulus must equal the row length so that every hash value is
        # a valid column index.
        self.hashes = [self._genhash(self.N, prime) for prime in self.PRIMES]

    @staticmethod
    def _genhash(max_n, prime):
        """Build a hash function mapping an integer into ``range(max_n)``."""
        def hash_fn(val):
            return (val * prime) % max_n
        return hash_fn

    @property
    def nbytes(self):
        """Size in bytes of the counter table."""
        return self.table.nbytes

    def getsize(self):
        """Print the size of the counter table."""
        print("Sketch is Size: {} Bytes\n".format(self.nbytes))

    def add(self, val):
        """Record one occurrence of the integer ``val``.

        Increments the bucket selected by each hash function in that
        function's row.
        """
        for row, hash_fn in enumerate(self.hashes):
            self.table[row, hash_fn(val)] += 1

    def timed_update(self, valuelist):
        """Add every value in ``valuelist`` and print time and size deltas.

        The table is allocated once in ``__init__`` and never resized, so
        the reported change in memory is zero.
        """
        startsize = self.nbytes
        start = time.perf_counter()
        for value in valuelist:
            self.add(value)
        end = time.perf_counter() - start
        dsize = self.nbytes - startsize
        print("Time Elapsed: {} Seconds \n".format(end))
        print("Change In Memory: {} Bytes\n".format(dsize))

    def count(self, val):
        """Return the list of per-row bucket counts that ``val`` hashes to."""
        return [self.table[row, hash_fn(val)]
                for row, hash_fn in enumerate(self.hashes)]

    def estimate(self, value):
        """Estimate the frequency of ``value`` via point query.

        Returns the smallest of the per-row bucket counts.
        """
        return min(self.count(value))

In [None]:
class changeDetection():
    """Sliding-window change detector over a sequence of count sketches.

    Keeps at most ``w`` sketches and their element-wise moving average
    (``MA``). Each sketch is expected to expose ``table`` (a 2-D array
    indexed ``[hash_function, vector_index]``), ``hashes`` (one callable per
    table row) and ``count(value)`` (per-row bucket counts for a value).
    All sketches in the window must share the same table shape.
    """

    def __init__(self, sketchset, w):
        """Start a detector from an initial list of sketches.

        Args:
            sketchset: initial sketches, oldest first (may be empty).
            w: maximum number of sketches kept in the window.
        """
        # Copy so that window maintenance never mutates the caller's list.
        self.sketchset = list(sketchset)
        self.w = w
        self.MA = None
        if self.sketchset:
            self.calcmovingAv()

    def update(self, sketch):
        """Append ``sketch``, drop the oldest one if the window overflows,
        and refresh the moving average."""
        self.sketchset.append(sketch)
        if len(self.sketchset) > self.w:
            self.sketchset.pop(0)
        self.calcmovingAv()

    def calcmovingAv(self):
        """Recompute ``MA`` as the element-wise mean of all window tables.

        ``MA`` is set to ``None`` when the window is empty.
        """
        if not self.sketchset:
            self.MA = None
            return
        self.MA = np.mean([sketch.table for sketch in self.sketchset], axis=0)

    def indmovingAv(self, val):
        """Return the moving-average bucket values that ``val`` hashes to.

        Uses the hash functions of the most recent sketch; the result has
        one entry per hash row.

        Raises:
            ValueError: if no moving average has been computed yet.
        """
        if self.MA is None:
            raise ValueError("moving average is not available: no sketches in the window")
        hashes = self.sketchset[-1].hashes
        return [self.MA[row][hash_fn(val)] for row, hash_fn in enumerate(hashes)]

    def bucketDist(self, value, idx):
        """Measure how far ``value``'s counts in one sketch sit from the
        moving average.

        Args:
            value: the value to query.
            idx: index into the window of the sketch to inspect.

        Returns:
            ``(counts, v_a)`` where ``counts`` is what ``sketch.count(value)``
            returned and ``v_a`` is the median over hash rows of
            ``(count - moving_average) / (1 - 1/K)``, with ``K`` the number
            of non-zero cells in the sketch table.
        """
        sketch = self.sketchset[idx]
        K = np.sum(sketch.table > 0)
        counts = sketch.count(value)
        residuals = (np.asarray(counts, dtype=float)
                     - np.asarray(self.indmovingAv(value), dtype=float))
        # With K <= 1 the correction factor is zero or undefined; fall back
        # to the uncorrected residuals instead of dividing by zero.
        scale = 1 - 1 / K if K > 1 else 1.0
        v_a = np.median(residuals / scale)
        return counts, v_a

    def isAttack(self, value, beta):
        """Decide whether ``value`` is anomalous in the most recent sketch.

        For every hash row the count is compared against
        ``moving_average + beta * v_a`` for the bucket ``value`` hashes to
        in that row, where ``v_a`` comes from ``bucketDist``. The value is
        flagged as soon as one row exceeds its threshold.

        NOTE(review): ``v_a`` can be negative when counts fall below the
        moving average, which lowers the threshold; confirm whether drops
        should ever be flagged.

        Returns:
            ``True`` if any row exceeds its threshold, otherwise ``False``.
        """
        counts, variance = self.bucketDist(value, -1)
        means = self.indmovingAv(value)
        for count, mean in zip(counts, means):
            crit = mean + beta * variance
            if count > crit:
                print("Found Anomoly: {},{}".format(count, crit))
                return True
        return False
          