### <span style="color:red">IMPORTANT: Only modify cells which have the following comment:</span>
```python
# Modify this cell
```
##### <span style="color:red">Do not add any new cells when you submit the homework</span>

# Setting Up Notebook

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
sc = SparkContext(master="local[4]")

In [3]:
import numpy as np
import math
from numpy import linalg as LA

# Exercise:
The function **computeCov** computes the covariance matrix using RDDs. The code allows undefined entries and calculates the covariance without bias. 

Your homework is to complete the missing parts in **computeCov** (Marked with `...`) so that it calculates the covariance correctly.

    Note: The functions and libraries in the cell below will be useful to you

In [4]:
def outerProduct(X):
    """Computer outer product and indicate which locations in matrix are undefined"""
    O=np.outer(X,X)
    N=1-np.isnan(O)
    return (O,N)

def sumWithNan(M1,M2):
    """Add two pairs of (matrix,count)"""
    (X1,N1)=M1
    (X2,N2)=M2
    N=N1+N2
    X=np.nansum(np.dstack((X1,X2)),axis=2)
    return (X,N)

In [5]:
# Modify this cell

def computeCov(RDDin):
    # input: RDDin is an RDD of np arrays, all of the same length
    
    # we insert 1 at the beginning of each vector so the calculation also yields the mean vector
    RDD=RDDin.map(lambda v:np.array(np.insert(v,0,1),dtype=np.float64)) 
    
    # separating map and reduce does not matter, since Spark uses lazy execution.
    OuterRDD=RDD.map(outerProduct)    #<-- do mapping here
    (S,N)=OuterRDD.reduce(sumWithNan)  #<-- do reducing here
    
    E=S[0,1:]
    NE=np.float64(N[0,1:])
    print 'shape of E=',E.shape,'shape of NE=',NE.shape
    Mean=E/NE
    O=S[1:,1:]
    NO=np.float64(N[1:,1:])
    print 'shape of O=',O.shape,'shape of NO=',NO.shape,'shape of Mean=',Mean.shape
    print np.outer(np.transpose(Mean),Mean)
    Cov=  O/NO - np.outer(np.transpose(Mean),Mean) # This is the covariance matrix
    
    # Output also the diagnal which is the variance for each day
    Var=np.array([Cov[i,i] for i in range(Cov.shape[0])])
    return {'E':E,'NE':NE,'O':O,'NO':NO,'Cov':Cov,'Mean':Mean,'Var':Var}


In [6]:
import Tester.SmallPCA as pca
pca.exercise(computeCov, sc)

Checking data_list of length 3 with length 10 vectors each having 2 np.NaN values
shape of E= (10,) shape of NE= (10,)
shape of O= (10, 10) shape of NO= (10, 10) shape of Mean= (10,)
[[ 0.3657792   0.40225383 -0.13383224 -0.34764459 -0.07040832  0.13402857
  -0.10156681 -0.13003311  0.06221052  0.47056363]
 [ 0.40225383  0.44236562 -0.14717767 -0.38231088 -0.07742927  0.14739357
  -0.11169481 -0.1429997   0.068414    0.5174871 ]
 [-0.13383224 -0.14717767  0.04896688  0.1271971   0.02576118 -0.04903872
   0.03716153  0.04757685 -0.02276174 -0.17217103]
 [-0.34764459 -0.38231088  0.1271971   0.33040906  0.06691762 -0.12738369
   0.09653133  0.12358632 -0.05912624 -0.44723401]
 [-0.07040832 -0.07742927  0.02576118  0.06691762  0.0135528  -0.02579897
   0.01955045  0.02502989 -0.01197482 -0.09057813]
 [ 0.13402857  0.14739357 -0.04903872 -0.12738369 -0.02579897  0.04911066
  -0.03721604 -0.04764664  0.02279514  0.1724236 ]
 [-0.10156681 -0.11169481  0.03716153  0.09653133  0.01955045 -0.03