In [1]:
import numpy as np

def pca(X = np.array([]), no_dims = 50):
    """
    Runs PCA on the N x D array X in order to reduce its dimensionality to 
     no_dims dimensions.

    The principal directions are the eigenvectors of X^T X with the largest
    eigenvalues. X is used exactly as given; no mean is subtracted inside
    this function.

    Inputs:
    - X: A matrix with shape N x D where N is the number of examples and D is 
         the dimensionality of original data.
    - no_dims: A scalar indicates the output dimension of examples after 
         performing PCA. Values larger than D keep all D directions.
    Returns:
    - Y: A matrix of reduced size with shape N x no_dims where N is the number
         of examples  and no_dims is the dimensionality of output examples. 
    - M: A matrix of eigenvectors with shape D x no_dims where D is the 
         dimensionality of the original data. Columns are ordered by 
         decreasing eigenvalue.
    """
    X = np.asarray(X)
    XtX = np.dot(np.transpose(X), X)

    # X^T X is symmetric, so use the symmetric solver: eigh always returns
    # real eigenvalues and orthonormal eigenvectors, whereas the general
    # solver eig may produce complex pairs or non-orthogonal vectors when the
    # matrix is rank deficient or has repeated eigenvalues.
    eig_val, eig_vec = np.linalg.eigh(XtX)

    # Column indices of the no_dims largest eigenvalues, largest first.
    eig_val_idx = np.argsort(eig_val)[::-1][:no_dims]
    M = eig_vec[:, eig_val_idx]

    # Coordinates of every example along the kept directions.
    Y = np.dot(X, M)

    return Y, M

def decompress(Y = np.array([]), M = np.array([])):
    """
    Map compressed examples back to the original feature space.
    Inputs:
    - Y: Compressed data with shape N x no_dims (N examples, no_dims 
         coordinates per example).
    - M: Eigenvector matrix with shape D x no_dims, D being the 
         dimensionality of the data before compression.
    Returns:
    - X_hat: Reconstruction with shape N x D, computed as Y times the 
         transpose of M.
    """
    # no_dims x D: each row is one retained eigenvector.
    components = np.transpose(M)
    return np.dot(Y, components)

def reconstruction_error(orig = np.array([]), decompressed = np.array([])):
    """
    Computes reconstruction error (pixel-wise mean squared error) for original
     image and reconstructed image
    Inputs:
    - orig: Original flattened image, either a length-D vector or a 1 x D 
         array.
    - decompressed: Decompressed version of the image, same layout as orig.
    Returns:
    - error: Sum of squared pixel differences divided by the number of 
         pixels.
    """
    # Work in floating point so unsigned-integer pixel data cannot wrap
    # around when the two images are subtracted.
    orig = np.asarray(orig, dtype=float)
    decompressed = np.asarray(decompressed, dtype=float)
    diff = orig - decompressed

    # Divide by the element count rather than len(): len() of a 1 x D array
    # is 1, which would leave the squared error un-averaged.
    error = np.sum(diff * diff) / diff.size

    return error

def load_data(dataset='mnist_subset.json'):
    """
    Read the MNIST JSON file and stack its splits into a single array.
    Inputs:
    - dataset: Path of a JSON file with 'train', 'valid' and 'test' entries.
    Returns:
    - The row-wise stack of element 0 of the 'train', 'valid' and 'test' 
         entries, in that order. No other element of the entries is read.
    """
    import json

    with open(dataset, 'r') as handle:
        splits = json.load(handle)

    parts = [np.asarray(splits[name][0]) for name in ('train', 'valid', 'test')]
    return np.vstack(parts)

# Disabled driver script. It is wrapped in a string literal, so none of it
# runs; as the last expression of the cell the string is only displayed.
# The script inside runs pca/decompress for several numbers of components and
# writes one reconstruction error per image to 'pca_output.txt'.
'''
if __name__ == '__main__':
    
    import argparse
    import sys


    mnist = load_data()
    compression_rates = [2, 10, 50, 100, 250, 500]
    with open('pca_output.txt', 'w') as f:
        for cr in compression_rates:
            Y, M = pca(mnist - np.mean(mnist, axis=0), cr)
            
            decompressed_mnist = decompress(Y, M)
            decompressed_mnist += np.mean(mnist, axis=0)
            
            total_error = 0.
            for mi, di in zip(mnist, decompressed_mnist):
                error = reconstruction_error(mi, di)
                f.write(str(error))
                f.write('\n')
                total_error += error
            print('Total reconstruction error after compression with %d principal '\
                'components is %f' % (cr, total_error))

'''



"\nif __name__ == '__main__':\n    \n    import argparse\n    import sys\n\n\n    mnist = load_data()\n    compression_rates = [2, 10, 50, 100, 250, 500]\n    with open('pca_output.txt', 'w') as f:\n        for cr in compression_rates:\n            Y, M = pca(mnist - np.mean(mnist, axis=0), cr)\n            \n            decompressed_mnist = decompress(Y, M)\n            decompressed_mnist += np.mean(mnist, axis=0)\n            \n            total_error = 0.\n            for mi, di in zip(mnist, decompressed_mnist):\n                error = reconstruction_error(mi, di)\n                f.write(str(error))\n                f.write('\n')\n                total_error += error\n            print('Total reconstruction error after compression with %d principal '                'components is %f' % (cr, total_error))\n\n"

In [2]:
# Load the stacked data set using load_data()'s default file name.
mnist = load_data()

In [3]:
# Check which container type load_data() returned.
type(mnist)

numpy.ndarray

In [4]:
# Dimensions of the loaded array (rows, columns).
mnist.shape

(7000, 784)

In [5]:
# Subtract the per-column mean so every column of X averages to zero.
X = mnist - np.mean(mnist, axis=0)
# Number of eigenvectors kept by the selection in the cells below.
no_dims = 50

In [11]:
# Step-by-step PCA projection, kept at cell level so the intermediate values
# can be inspected in the following cells.
# Square matrix X^T X (one row/column per column of X).
XtX = np.dot(np.transpose(X), X)
# Eigenvalues and eigenvectors; column i of eig_vec belongs to eig_val[i].
eig_val, eig_vec = np.linalg.eig(XtX)
# Indices of the no_dims largest eigenvalues, largest first.
eig_val_idx = np.argsort(eig_val)[::-1][:no_dims]
eig_val_k = eig_val[eig_val_idx]
eig_vec_k = eig_vec[:,eig_val_idx]

# Project X onto the selected eigenvectors; M keeps those eigenvectors.
Y = np.dot(X, eig_vec_k)
M = eig_vec_k

In [12]:
# Shapes of the projected data and of the selected eigenvector matrix.
print(Y.shape)
print(M.shape)

(7000, 50)
(784, 50)


In [13]:
# Map the projections back with the transpose of M. The column means that
# were subtracted to form X are not added back in this cell.
X_hat = np.dot(Y, np.transpose(M))
print(X_hat.shape)

(7000, 784)


In [7]:
# Shape of the product of X with the selected eigenvectors.
np.dot(X, eig_vec_k).shape

(7000, 50)

In [8]:
# Number of selected eigenvalues.
eig_val_k.shape

(50,)

In [9]:
# Shapes of the selected and of the full eigenvector matrix, followed by the
# element-wise difference between their first columns.
print(eig_vec_k.shape)
print(eig_vec.shape)
print(eig_vec[:,0]-eig_vec_k[:,0])

(784, 50)
(784, 784)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [10]:
# Column indices picked by the argsort-based selection.
eig_val_idx

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
      dtype=int64)

In [80]:
# Residual XtX @ v - lambda * v for the first selected eigenpair: first summed
# over all entries, then printed entry by entry.
print(np.sum(np.dot(XtX, eig_vec_k[:,0]) - eig_val_k[0] * eig_vec_k[:,0]))
print(np.dot(XtX, eig_vec_k[:,0]) - eig_val_k[0] * eig_vec_k[:,0])

-1.3934490887546058e-10
[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  1.04083409e-17
 -2.22044605e-16 -2.99760217e-15 -4.44089210e-15 -1.11022302e-14
 -1.66811009e-14 -1.37667655e-14 -3.01425551e-14 -1.52100554e-14
 -6.53643806e-15 -1.64313008e-14 -1.77635684e-14 -7.77156117e-15
 -2.20656826e-15  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 

In [60]:
# Display the selected column indices once more.
eig_val_idx

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
      dtype=int64)

In [82]:
# Shape of the selected eigenvector matrix.
eig_vec_k.shape

(784, 50)

In [10]:
# Shape of the X^T X matrix.
XtX.shape

(784, 784)

In [34]:
# Small symmetric 2 x 2 example for checking what np.linalg.eig returns:
# w receives the eigenvalues, v the eigenvectors.
A = np.array([[0.5, 0.5],[0.5,.5]])
w, v =np.linalg.eig(A)

In [35]:
# Eigenvalues of A.
w

array([1.00000000e+00, 1.11022302e-16])

In [36]:
# Eigenvectors of A, stored as columns.
v

array([[ 0.70710678, -0.70710678],
       [ 0.70710678,  0.70710678]])

In [39]:
# Left-hand side of A v = w v for the first eigenpair.
np.dot(A, v[:,0]) 

array([0.70710678, 0.70710678])

In [40]:
# Right-hand side of A v = w v for the first eigenpair.
w[0]*v[:,0]

array([0.70710678, 0.70710678])

In [68]:
# argsort demo: ascending order of indices, the three smallest, and the three
# largest (reverse first, then slice). a_idx keeps the three largest.
a = np.array([1,5,3,10,2])
print(np.argsort(a))
print(np.argsort(a)[:3])
print(np.argsort(a)[::-1][:3])
a_idx = np.argsort(a)[::-1][:3]

[0 4 2 1 3]
[0 4 2]
[3 1 2]


In [66]:
# 3 x 5 matrix holding 0..14 row by row, used to test column selection.
b = np.array(range(3*5)).reshape(3,5)
print(b)

[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]]


In [69]:
# Pick the columns of b listed in a_idx, in that order.
b[:,a_idx]

array([[ 3,  1,  2],
       [ 8,  6,  7],
       [13, 11, 12]])

In [83]:
# a is one-dimensional.
a.shape

(5,)

In [92]:
# Squared norm of the difference of two 2 x 1 column vectors:
# (5 - 2)**2 + (7 - 3)**2.
dd = np.array([[5],[7]])
cc = np.array([[2],[3]])
np.linalg.norm(dd - cc)**2 

25.0