In [1]:
import numpy as np
import matplotlib as plt
import time
from collections import Counter

In [2]:
# Universe size: every stream item is an integer in [0, n)
n = 10
# Length of the data stream
m = 10000
# Order k of the frequency moment F_k
k = 2
# Two user-chosen parameters, lambda (relative accuracy) and epsilon (failure
# probability), such that the estimate Y satisfies
# Pr(|Y - F_k| > lambda * F_k) < epsilon
lambd = 0.1
eplison = 0.1
# Sketch dimensions s1 and s2 from the paper.
# NOTE(review): the leading constant of s1 is 1 here; the bound in the paper
# presumably uses a larger constant -- confirm this trade-off is intentional.
S1 = int(np.ceil(k * n ** (1 - 1 / k) / lambd ** 2))
S2 = int(np.ceil(2 * np.log(1 / eplison)))

In [3]:
# TODO: online update

class AMS_offline(object):
    '''
    AMS sketch for offline estimation of frequency moments.

    The whole stream is kept in memory and its frequency vector is precomputed
    in the constructor. Online (one-pass) update is not implemented yet.
    '''

    def __init__(self, stream, s1, s2):
        '''
        Store the stream and build its frequency vector.

        :param stream: list or 1-D array of non-negative integers
        :param s1: number of basic estimators averaged inside each group
        :param s2: number of groups whose averages are combined by a median
        '''
        counter = Counter(stream)
        # Counts of the distinct items, ordered by item value. Items that never
        # occur contribute nothing to any F_k, so they are simply left out.
        self.frequency_vector = np.array([counter[item] for item in sorted(counter)])

        self.F_1 = np.sum(self.frequency_vector)
        self.s1 = s1
        self.s2 = s2

        # Keep the stream as an array so that `stream == value` is always an
        # element-wise comparison, even when a plain list was passed in.
        self.stream = np.asarray(stream)

    def estimate_F2(self):
        '''
        Estimate F_2 with random +/-1 projections of the frequency vector.

        Uniformly random signs are used in place of the four-wise independent
        hashing of the original paper; this is the common practical variant.
        The sign tensor of the last call is kept in ``self.hashing_matrix``.

        :return: estimated F_2 (median over s2 groups of the mean of s1 estimators)
        '''
        # Size the sign tensor from the frequency vector itself rather than from
        # a notebook-level global, so the shapes agree for any stream.
        distinct = self.frequency_vector.shape[0]
        self.hashing_matrix = np.random.choice(
            a=[1, -1], size=(self.s1, self.s2, distinct), replace=True
        )
        # (s1, s2, distinct) @ (distinct,) -> (s1, s2): one projection per estimator
        Z = np.matmul(self.hashing_matrix, self.frequency_vector)
        X = np.square(Z)
        Y = np.mean(X, axis=0)
        estimation = np.median(Y)

        return estimation

    def estimate_Fk(self, k):
        '''
        Estimate F_k by sampling stream positions.

        For each basic estimator a position is drawn uniformly at random and r,
        the number of occurrences of that item from the sampled position to the
        end of the stream (inclusive), is counted. The basic estimator is
        F_1 * (r**k - (r - 1)**k). The s1 x s2 table of basic estimators of the
        last call is kept in ``self.X``.

        :param k: order of the frequency moment
        :return: estimated F_k (median over s2 groups of the mean of s1
                 estimators); 0.0 for an empty stream
        '''
        length = len(self.stream)
        self.X = np.zeros((self.s1, self.s2))
        if length == 0:
            # Every frequency moment of an empty stream is 0 and there is no
            # position to sample from.
            return np.float64(0.0)

        total = float(self.F_1)
        # Draw all sample positions at once, using the real stream length.
        positions = np.random.randint(low=0, high=length, size=(self.s1, self.s2))

        for i in range(self.s1):
            for j in range(self.s2):
                idx = positions[i, j]
                # The suffix must run to the very end of the stream: slicing
                # with [idx:-1] would drop the last element and could give r = 0.
                r = int(np.sum(self.stream[idx:] == self.stream[idx]))
                self.X[i, j] = total * (r**k - (r - 1)**k)

        Y = np.mean(self.X, axis=0)
        estimation = np.median(Y)

        return estimation

In [4]:
# Draw a synthetic stream: m integers sampled uniformly from [0, n)
a = np.random.randint(0, n, size=m)

In [5]:
# Exact F_2 of the stream (sum of squared item counts), used as the reference
# value for the sketch estimates
a_counter = Counter(a)
frequency_vector = np.array([a_counter[value] for value in sorted(a_counter)])
ground_true = np.sum(np.square(frequency_vector))

In [6]:
# Build the sketch, then time each estimator separately.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock, which can be adjusted
# and may be too coarse for millisecond-scale runs.
A = AMS_offline(a, S1, S2)
time1 = time.perf_counter()
AMS1 = A.estimate_F2()
time2 = time.perf_counter()
AMS2 = A.estimate_Fk(2)
time3 = time.perf_counter()

print("The ground true value is:",ground_true)
print("F_2 function return AMS sketch:",AMS1.astype("float32"), "using",(time2-time1),"second")
print("F_k function return AMS sketch:",AMS2.astype("float32"), "using",(time3-time2),"second")

The ground true value is: 10007340
F_2 function return AMS sketch: 10314008.0 using 0.003206491470336914 second
F_k function return AMS sketch: 9954423.0 using 0.2903439998626709 second


In [7]:
# Check whether each estimate is within a relative error of lambd of the exact value
print("The F_2 function return meet the accuracy:", lambd * ground_true > abs(AMS1 - ground_true))
print("The F_k function return meet the accuracy:", lambd * ground_true > abs(AMS2 - ground_true))

The F_2 function return meet the accuracy: True
The F_k function return meet the accuracy: True
