### <span style="color:red">IMPORTANT: Only modify cells which have the following comment:</span>
```python
# Modify this cell
```
##### <span style="color:red">Do not add any new cells when you submit the homework</span>

# Setting Up Notebook

In [1]:
import findspark
# Initialise findspark before pyspark is imported in the next cell.
# NOTE(review): presumably this locates the local Spark install and makes pyspark importable - confirm.
findspark.init()

In [2]:
from pyspark import SparkContext
# SparkContext shared by the notebook; it is handed to pca.exercise further down.
# NOTE(review): master="local[4]" presumably means local mode with 4 worker threads - confirm.
sc = SparkContext(master="local[4]")

In [3]:
import numpy as np
import math
from numpy import linalg as LA

# Exercise:
The function **computeCov** computes the covariance matrix using RDDs. The code tolerates undefined (`NaN`) entries and calculates the covariance without bias.

Your homework is to complete the missing parts in **computeCov** (marked with `...`) so that it calculates the covariance correctly.

    Note: The functions and libraries in the cell below will be useful to you

In [4]:
def outerProduct(X):
    """Return the outer product of X with itself plus a 0/1 matrix of defined entries.

    The second element of the returned pair holds 1 where the outer product
    is a number and 0 where it is NaN.
    """
    product = np.outer(X, X)
    defined = (~np.isnan(product)).astype(int)
    return (product, defined)

def sumWithNan(M1,M2):
    """Add two pairs of (matrix,count)"""
    (X1,N1)=M1
    (X2,N2)=M2
    N=N1+N2
    X=np.nansum(np.dstack((X1,X2)),axis=2)
    return (X,N)

In [9]:
# Modify this cell

def computeCov(RDDin):
    """Compute the mean vector and covariance matrix of an RDD of vectors.

    NaN entries are treated as undefined: sumWithNan skips them when adding
    and the indicator matrix produced by outerProduct keeps them out of the
    counts, so each statistic is averaged over its own number of defined
    entries.

    Parameters
    ----------
    RDDin : RDD of np arrays, all of the same length.

    Returns
    -------
    dict with the keys
        'E'    -- first row of the summed outer products (the sums of 1*x)
        'NE'   -- matching counts of defined entries, as float64
        'O'    -- summed outer products of the vectors themselves
        'NO'   -- matching counts of defined entries, as float64
        'Cov'  -- O/NO - outer(Mean, Mean); NaN wherever NO is 0
        'Mean' -- E/NE
        'Var'  -- the diagonal of 'Cov'
    """
    # we insert 1 at the beginning of each vector so the calculation also yields the mean vector
    RDD = RDDin.map(lambda v: np.array(np.insert(v, 0, 1), dtype=np.float64))

    # separating map and reduce does not matter, since Spark uses lazy execution.
    OuterRDD = RDD.map(outerProduct)        # (outer product, defined-indicator) per vector
    (S, N) = OuterRDD.reduce(sumWithNan)    # NaN-aware sums and element-wise counts

    # Row 0 comes from the inserted 1, so it carries the plain sums and counts.
    E = S[0, 1:]
    NE = N[0, 1:].astype(np.float64)
    # A single-argument print with %-formatting emits the same text on Python 2 and 3.
    print('shape of E= %s shape of NE= %s' % (E.shape, NE.shape))
    Mean = E / NE

    O = S[1:, 1:]
    NO = N[1:, 1:].astype(np.float64)
    # Vectorised form of the element-wise rule
    #   Cov[i,j] = O[i,j]/NO[i,j] - Mean[i]*Mean[j]   if NO[i,j] != 0 else NaN
    # errstate silences the divide warnings for the NO == 0 entries, which
    # np.where overwrites with NaN anyway.
    with np.errstate(divide='ignore', invalid='ignore'):
        Cov = np.where(NO != 0, O / NO - np.outer(Mean, Mean), np.nan)

    # Output also the diagonal, which is the variance of each coordinate.
    # diagonal() may return a read-only view, hence the copy.
    Var = Cov.diagonal().copy()
    return {'E':E,'NE':NE,'O':O,'NO':NO,'Cov':Cov,'Mean':Mean,'Var':Var}


In [10]:
import Tester.SmallPCA as pca
# Run the provided exercise checker on computeCov, using the SparkContext `sc` created earlier.
pca.exercise(computeCov, sc)

Checking data_list of length 3 with length 10 vectors each having 2 np.NaN values
shape of E= (10,) shape of NE= (10,)
[[  7.91839805e-01  -1.56895646e-02  -3.94775805e-01   4.51249590e-01
   -5.60284020e-01  -6.30514812e-02  -1.68899783e-01  -2.91810336e-01
   -5.64239542e-01   1.05594273e+00]
 [ -1.56895646e-02   3.10874037e-04   4.89021944e-03   1.34167502e-01
   -5.12897658e-03  -3.17390134e-01   6.56611101e-03   6.75567824e-03
   -2.85890101e-01   1.43786709e-01]
 [ -3.94775805e-01   4.89021944e-03   1.29514086e-01  -8.27040078e-02
    1.80407303e-01  -5.13288358e-02   5.66731359e-02   9.66203363e-02
    7.70841528e-02  -2.24940912e-01]
 [  4.51249590e-01   1.34167502e-01  -8.27040078e-02   4.46724353e-02
   -1.25950151e-01  -9.94657259e-02  -3.22060779e-02  -5.89021614e-02
   -3.66669765e-01   2.67964568e-02]
 [ -5.60284020e-01  -5.12897658e-03   1.80407303e-01  -1.25950151e-01
    2.51943088e-01  -5.67817967e-02   7.87044799e-02   1.34420240e-01
    1.20014752e-01  -3.12629585e-