In [2]:
# ref http://notmatthancock.github.io/2017/03/23/simple-batch-stat-updates.html
import numpy as np

In [90]:
class StatsRecorder:
    """Running per-column mean and standard deviation over batches of data.

    Batches are folded in one at a time with `update`, so the complete data
    set never has to be held in memory at once.

    Attributes:
        nobservations: number of rows folded in so far (0 before any data).
        ndimensions:   number of columns per row, or None before any data.
        mean:          per-column mean of all rows seen, or None before any data.
        std:           per-column standard deviation of all rows seen (numpy's
                       default `std`, i.e. ddof=0), or None before any data.
    """

    def __init__(self, data=None):
        """
        data: optional ndarray, shape (nobservations, ndimensions)
              If given, it is used as the first batch.
        """
        # Define every attribute up front so that a recorder created without
        # data can be inspected without raising AttributeError.
        self.nobservations = 0
        self.ndimensions = None
        self.mean = None
        self.std = None

        if data is not None:
            self.update(data)

    def update(self, data):
        """
        Fold one batch of observations into the running statistics.

        data: ndarray, shape (nobservations, ndimensions)
              Passed through np.atleast_2d, so a 1-D input is treated as a
              single observation.

        Raises ValueError if the number of columns differs from the batches
        already recorded. A batch with zero rows leaves the statistics
        unchanged.
        """
        data = np.atleast_2d(data)
        n = data.shape[0]

        if self.nobservations == 0:
            # Nothing recorded yet: the first non-empty batch defines the state.
            if n == 0:
                return
            self.mean = data.mean(axis=0)
            self.std = data.std(axis=0)
            self.nobservations = n
            self.ndimensions = data.shape[1]
            return

        if data.shape[1] != self.ndimensions:
            raise ValueError("Data dims don't match prev observations.")

        # The mean/std of zero rows is NaN, and 0 * NaN is still NaN, so an
        # empty batch must be skipped or it would poison the running values.
        if n == 0:
            return

        newmean = data.mean(axis=0)
        newstd = data.std(axis=0)

        m = float(self.nobservations)  # float so the weights never truncate
        total = m + n

        # Keep the previous mean: the variance update below needs it after
        # self.mean has been overwritten.
        oldmean = self.mean

        self.mean = m/total*oldmean + n/total*newmean

        # Pooled variance: weighted within-batch variances plus a term for
        # the distance between the two batch means.
        variance = m/total*self.std**2 + n/total*newstd**2 +\
                   m*n/total**2 * (oldmean - newmean)**2
        self.std = np.sqrt(variance)

        self.nobservations += n

In [91]:
# Seeded generator so the reference sample (and the mean shown below) is
# reproducible from a fresh kernel.
rs = np.random.RandomState(seed=1)
data = rs.randn(1000000, 3)

# Per-column mean computed in one shot over the whole array; this is the
# reference value for the batched computation.
data_mean = np.mean(data, axis=0)
data_mean

array([-0.00018205,  0.00027317, -0.00049592])

In [92]:
# Feed the data to the recorder in 10000 equal batches, then compare the
# running statistics with the values computed over the whole array at once.
data_split = np.split(data, 10000)
recorder = StatsRecorder()

for c in data_split:
    recorder.update(c)

# Check the result in code rather than by eye, for the std as well as the mean.
# The absolute tolerance covers the near-zero column means.
np.testing.assert_allclose(recorder.mean, data_mean, rtol=1e-7, atol=1e-10)
np.testing.assert_allclose(recorder.std, data.std(axis=0), rtol=1e-7, atol=1e-10)
assert recorder.nobservations == data.shape[0]

recorder.mean


array([-0.00018205,  0.00027317, -0.00049592])