# Find mean of streaming data
Suppose you have a streaming of large numbers, and you don't know how many numbers will be there. Find the mean of these numbers. 


### Method 1: compute mean after each number arrives. However, there could be accuracy issue when there are lots of numbers.

In [62]:
class streamMean:
    def __init__(self):
        self.mean = 0
        self.count = 0
        
    def getMean(self):
        return self.mean
    
    def addData(self, num):
        self.count += 1
        self.mean = self.mean / self.count * (self.count-1) + num/self.count

In [63]:
data = streamMean()

In [64]:
for i in range(1000000):
    data.addData(1000000000)

In [65]:
data.mean

999999999.9999974

In [66]:
data = streamMean()
for i in range(1000000):
    data.addData(1000000000+i)
abs(data.getMean() - 1000500000)

0.5000604391098022

### Method 2: keep track of means every 2^k numbers. Increase accuracy but it can take log(n) time to compute mean and add new numbers

In [67]:
class streamMean:
    def __init__(self):
        self.means = []
        self.count = 0
        
    def getMean(self):
        res = 0
        for x in self.means[::-1]:
            res += x[0]/self.count*x[1]
        return res
        
    def addData(self, num):
        self.count += 1
        self.means.append([num,1])
        p = len(self.means)-1
        while p > 0:
            if self.means[p][1] == self.means[p-1][1]:
                self.means[p-1][0] = (self.means[p][0] + self.means[p-1][0])/2
                self.means[p-1][1] *= 2
                self.means.pop()
            else:
                break
            p -= 1

In [68]:
data = streamMean()

In [69]:
for i in range(1000000):
    data.addData(1000000000)

In [70]:
data.getMean()

1000000000.0

In [71]:
data = streamMean()
for i in range(1000000):
    data.addData(1000000000+i)
abs(data.getMean() - 1000500000)

0.5

### Method 3: Resevior sampling. Sample K numbers and compute the mean as an approximate to overall mean

In [72]:
import random, math
class streamMean:
    def __init__(self, k:int):
        self.k = k
        self.nums = []
        self.count = 0

    def getMean(self):
        res = 0
        for i in range(len(self.nums)):
            res = res/(i+1)*i + self.nums[i]/(i+1)
        return res
        
    def addData(self, num):
        self.count += 1
        if len(self.nums) < self.k:
            self.nums.append(num)
        else:
            randnum = random.random() * self.count
            if randnum < self.k:
                self.nums[math.floor(randnum)] = num
                

In [73]:
data = streamMean(100)
for i in range(1000000):
    data.addData(1000000000)

In [74]:
data.getMean()

999999999.9999994

In [77]:
data = streamMean(10)
for i in range(1000000):
    data.addData(1000000000+i)
abs(data.getMean() - 1000500000)

147362.60000014305