# Exercise 5

In [72]:
# Add imports for the packages and Dataset you need
import numpy as np
import pandas as pd
import random

## Implementing online calculations

In this exercise you will create three algorithms that calculate the mean, the variance and finally the covariance matrix. Usually these are calculated in batch, meaning that all the data is accessible at once and is used to calculate these measures.  
It is, however, also possible to calculate them online: the values (or, in the last case, the matrix) are updated continuously as new information (e.g. new rows of a table or new sensor readings) becomes available.

#### Important note
Please write some documentation and use comments to explain your code.  
Documentation of your code accounts for **1 point**  
Furthermore, proper documentation helps us understand what your code does and award points fairly if something does not work.

### Online calculation of the mean (2 points)

First you have to create a function that updates the mean of a one-dimensional dataset. The old dataset itself must not be passed to the function as an input.  
The output should be the updated mean of all data points (old and new).

In [73]:
def online_mean(new_dataset, prev_mean, prev_length):
    """
    Fold a batch of new values into an already known running mean.

    The previous data itself is not needed; only its mean and its length
    are used. Each new value is absorbed one at a time with the
    incremental mean recurrence.

    Parameters
    ----------
    new_dataset : one-dimensional iterable of numbers
        the values that arrived since the last update
    prev_mean : number
        mean of the data seen before this call
    prev_length : number
        how many values were seen before this call

    Returns
    -------
    dict : { "mean": number, "length": number }
        mean over old and new values together, and the total number of values
    """
    running_mean = prev_mean
    count = prev_length

    for value in new_dataset:
        count += 1
        if count == 1:
            # The very first observation overall is the mean by itself;
            # whatever was passed as prev_mean is discarded.
            running_mean = value
            continue
        # mean_n = mean_(n-1) + (x_n - mean_(n-1)) / n
        running_mean = running_mean + (value - running_mean) / count

    return {"mean": running_mean, "length": count}

### Online calculation of the variance (2 points)

Next you have to create a function that updates the variance of a one-dimensional dataset. The old dataset itself must not be passed to the function as an input.  
The output should be the updated variance of all data points (old and new).

In [74]:
def online_variance(new_dataset, prev_mean, prev_variance, prev_length):
    """
    Fold a batch of new values into an already known sample variance.

    Internally the sample variance is turned back into the sum of squared
    deviations from the mean, which is then updated value by value with
    Welford's recurrence and finally divided by (n - 1) again.

    Parameters
    ----------
    new_dataset : one-dimensional iterable of numbers
        the values that arrived since the last update
    prev_mean : number
        mean of the data seen before this call
    prev_variance : number
        sample variance of the data seen before this call
    prev_length : number
        how many values were seen before this call

    Returns
    -------
    dict : { "variance": number, "length": number }
        sample variance over old and new values together, and the total
        number of values
    """
    running_mean = prev_mean
    count = prev_length
    # Undo the (n - 1) normalisation to get the sum of squared deviations.
    squared_dev_sum = prev_variance * (prev_length - 1)

    for value in new_dataset:
        # online_mean must receive the count from *before* this value.
        updated_mean = online_mean([value], running_mean, count)["mean"]
        count += 1

        if count == 1:
            # A single observation has no spread.
            squared_dev_sum = 0
        else:
            # S_n = S_(n-1) + (x_n - mean_(n-1)) * (x_n - mean_n)
            squared_dev_sum = squared_dev_sum + (value - running_mean) * (value - updated_mean)

        running_mean = updated_mean

    if squared_dev_sum == 0:
        # Also covers count <= 1, where dividing by (count - 1) is not possible.
        variance = 0
    else:
        variance = squared_dev_sum / (count - 1)

    return {"variance": variance, "length": count}

### Online calculation of the covariance matrix (5 points)

Finally you have to create a function that updates the covariance matrix of a two-dimensional dataset. The old dataset itself must not be passed to the function as an input.  
The output should be the updated covariance matrix.

In [75]:
def online_cov_matrix(new_dataset, prev_covariance, prev_first_dim_mean, prev_second_dim_mean, prev_length):
    """
    Update a 2x2 sample covariance matrix with a batch of paired observations.

    The previous covariance matrix is converted once into raw moments (sums
    of products of deviations from the means). Every new (x, y) pair is then
    folded into those moments with Welford-style recurrences, and the moments
    are converted back into a sample covariance (divisor ``n - 1``) once at
    the end. Converting only once per call avoids the rounding error of a
    multiply/divide round trip on every single sample.

    Parameters
    ----------
    new_dataset : sequence of two equally long sequences
        new data. `new_dataset[0]` is first dimension data, `new_dataset[1]`
        is second dimension data; element i of both belongs to the same
        observation
    prev_covariance : two-dimensional list
        sample covariance matrix of the previous data. Only the entries
        `[0][0]`, `[0][1]` and `[1][1]` are read. It contributes nothing to
        the update when `prev_length` is smaller than 2
    prev_first_dim_mean : number
        mean of first dimension of previous data
    prev_second_dim_mean : number
        mean of second dimension of previous data
    prev_length : number
        number of observations in the previous data

    Returns
    -------
    dict : { "covariance": list, "first_dim_mean": number, "second_dim_mean": number, "length": number }
        the new sample covariance matrix, the means of the first and second
        dimension, and the total number of observations. If `new_dataset`
        holds no observations the previous values are returned unchanged.

    Raises
    ------
    ValueError
        if the two dimensions of `new_dataset` differ in length
    """
    first_dim_new_dataset = new_dataset[0]
    second_dim_new_dataset = new_dataset[1]

    # Observations are pairs; a ragged batch would either crash half-way
    # through or silently drop values, so reject it up front.
    if len(first_dim_new_dataset) != len(second_dim_new_dataset):
        raise ValueError(
            "both dimensions of new_dataset must have the same length, got "
            f"{len(first_dim_new_dataset)} and {len(second_dim_new_dataset)}"
        )

    # Nothing new: hand back the previous state untouched.
    if len(first_dim_new_dataset) == 0:
        return {
            "covariance": prev_covariance,
            "first_dim_mean": prev_first_dim_mean,
            "second_dim_mean": prev_second_dim_mean,
            "length": prev_length,
        }

    count = prev_length
    mean_x = prev_first_dim_mean
    mean_y = prev_second_dim_mean

    # Sample covariance = moment / (n - 1), so moment = covariance * (n - 1).
    # With fewer than two previous observations there is no spread yet.
    prior_divisor = prev_length - 1 if prev_length > 1 else 0
    moment_xx = prev_covariance[0][0] * prior_divisor
    moment_yy = prev_covariance[1][1] * prior_divisor
    moment_xy = prev_covariance[0][1] * prior_divisor

    for x, y in zip(first_dim_new_dataset, second_dim_new_dataset):
        count += 1

        if count == 1:
            # First observation overall: it defines the means, no spread yet.
            mean_x, mean_y = x, y
            moment_xx = moment_yy = moment_xy = 0
            continue

        # Deviations from the means *before* this observation.
        dx = x - mean_x
        dy = y - mean_y

        # mean_n = mean_(n-1) + (value - mean_(n-1)) / n
        mean_x = mean_x + dx / count
        mean_y = mean_y + dy / count

        # M_n = M_(n-1) + (old deviation) * (deviation from the updated mean)
        moment_xx += dx * (x - mean_x)
        moment_yy += dy * (y - mean_y)
        moment_xy += dx * (y - mean_y)

    if count < 2:
        # The sample covariance of a single observation is reported as 0.
        cov_xx = cov_yy = cov_xy = 0
    else:
        divisor = count - 1
        cov_xx = moment_xx / divisor
        cov_yy = moment_yy / divisor
        cov_xy = moment_xy / divisor

    return {
        "covariance": [[cov_xx, cov_xy], [cov_xy, cov_yy]],
        "first_dim_mean": mean_x,
        "second_dim_mean": mean_y,
        "length": count,
    }

In [76]:
def test():
    """
    Compare the streaming results with NumPy's batch results.

    Ten random batches (5 to 15 distinct integers from 0..999 each) are fed
    one after another into online_mean() and online_variance(). The final
    values are printed next to numpy.mean() and numpy.var(ddof=1) computed
    over all batches concatenated.
    """
    print("------------Test online mean & variance------------")

    batches = [
        random.sample(range(0, 1000), random.randint(5, 15))
        for _ in range(10)
    ]

    # Running state carried from one batch to the next.
    running_mean, running_variance, seen = 0, 0, 0
    everything = []

    for batch in batches:
        # Both updates must see the state from *before* this batch.
        mean_state = online_mean(batch, running_mean, seen)
        variance_state = online_variance(batch, running_mean, running_variance, seen)

        running_mean = mean_state["mean"]
        seen = mean_state["length"]
        running_variance = variance_state["variance"]

        # Keep the full data only to let NumPy compute the reference values.
        everything.extend(batch)

    print("mean:", mean_state["mean"])
    print("mean by numpy", np.mean(everything))
    print("variance:", variance_state["variance"])
    print("variance by numpy", np.var(everything, ddof=1))
# Run the mean/variance comparison so its printed results appear in the cell output.
test()

def test_cov():
    """
    Compare the streaming covariance matrix with NumPy's batch result.

    Ten random chunks, each holding 10 paired observations (distinct
    integers from 0..999 per dimension), are fed one after another into
    online_cov_matrix(). The final matrix is printed next to
    numpy.cov(ddof=1) computed over all chunks concatenated.
    """
    print("------------Test online covariance------------")

    stream = [
        [random.sample(range(0, 1000), 10), random.sample(range(0, 1000), 10)]
        for _ in range(10)
    ]

    # Start from "nothing seen yet" and carry the returned state forward.
    state = {
        "covariance": [[0, 0], [0, 0]],
        "first_dim_mean": 0,
        "second_dim_mean": 0,
        "length": 0,
    }
    xs_seen = []
    ys_seen = []

    for chunk in stream:
        state = online_cov_matrix(
            chunk,
            state["covariance"],
            state["first_dim_mean"],
            state["second_dim_mean"],
            state["length"],
        )

        # Keep the full data only to let NumPy compute the reference matrix.
        xs_seen.extend(chunk[0])
        ys_seen.extend(chunk[1])

    print("covariance:", state["covariance"])
    print("covariance by numpy:", np.cov(xs_seen, ys_seen, ddof=1))

# Run the covariance comparison so its printed results appear in the cell output.
test_cov()

------------Test online mean & variance------------
mean: 484.3669724770642
mean by numpy 484.3669724770642
variance: 86608.3640842678
variance by numpy 86608.36408426777
------------Test online covariance------------
covariance: [[79070.47838383842, -5512.853737373736], [-5512.853737373736, 85368.80191919189]]
covariance by numpy: [[79070.47838384 -5512.85373737]
 [-5512.85373737 85368.80191919]]
