In [1]:
# author: Yuman Lin
# July 7 2020

In [2]:
import numpy as np
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8)
X_train = mnist["data"][:60000]
X_test  = mnist["data"][60000:]
y_train = mnist["target"][:60000]
y_test  = mnist["target"][60000:]

In [3]:
import timeit

In [5]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
start = timeit.default_timer()
X_reduced = pca.fit_transform(X_train)
stop = timeit.default_timer()
print("'time of PCA library: ", stop - start)

'time of PCA library:  3.377916488000011


In [6]:
X_reduced.shape

(60000, 3)

In [7]:
X_reduced

array([[ 123.92047302, -312.66391533,  -24.37945012],
       [1011.71638145, -294.85111707,  596.35784003],
       [ -51.85130577,  392.17594807, -188.49866045],
       ...,
       [-178.0578634 ,  160.07766406, -257.55249094],
       [ 130.60770887,   -5.60195446,  513.8437239 ],
       [-173.43537783,  -24.72386812,  556.0089233 ]])

In [8]:
type(X_train)

numpy.ndarray

In [9]:
def pca(dataMat, topNfeat=999999):

    
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals

    
    covmat = np.cov(meanRemoved, rowvar=0)
    print(covmat)
    print(type(covmat))

    
    eigVals, eigVects = np.linalg.eig(np.mat(covmat))
    eigValInd = np.argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat+1):-1]    
    redEigVects = eigVects[:, eigValInd]        

   
    lowDDataMat = meanRemoved * redEigVects     
    reconMat = (lowDDataMat * redEigVects.T) + meanVals 
    return np.array(lowDDataMat), np.array(reconMat)

In [10]:
start = timeit.default_timer()
lowDDataMat, reconMat = pca(X_train, 3)
end = timeit.default_timer()
print("time of Yuman's PCA: ", end - start)

# UW machine: doppio:
# 1.7455005068331957

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
<class 'numpy.ndarray'>
time of Yuman's PCA:  2.994223785000031


In [11]:
lowDDataMat

array([[ -123.93258866,  -312.67426202,   -24.51405176],
       [-1011.71837587,  -294.85703827,   596.33956104],
       [   51.84960805,   392.17315286,  -188.50974943],
       ...,
       [  178.0534496 ,   160.07821109,  -257.61308227],
       [ -130.60607208,    -5.59193642,   513.85867395],
       [  173.43595244,   -24.71880226,   556.01889393]])

In [12]:
X_reduced

array([[ 123.92047302, -312.66391533,  -24.37945012],
       [1011.71638145, -294.85111707,  596.35784003],
       [ -51.85130577,  392.17594807, -188.49866045],
       ...,
       [-178.0578634 ,  160.07766406, -257.55249094],
       [ 130.60770887,   -5.60195446,  513.8437239 ],
       [-173.43537783,  -24.72386812,  556.0089233 ]])

In [13]:
import cupy as cp

In [14]:
X_train_cp = cp.array(X_train)
type(X_train_cp)

cupy.core.core.ndarray

In [15]:
def pca_cp(dataMat, topNfeat=999999):

    
    meanVals = cp.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals

    
    covmat = cp.cov(meanRemoved, rowvar=0)
    # print(covmat)
    covmat = cp.asnumpy(covmat)
    eigVals, eigVects = np.linalg.eig(np.mat(covmat))
    eigVals = cp.array(eigVals)
    eigVects = cp.array(eigVects)
    
    eigValInd = cp.argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat+1):-1]    
    redEigVects = eigVects[:, eigValInd]    
    print(type(meanRemoved))  
    print(type(redEigVects))  

   
    lowDDataMat = meanRemoved.dot(redEigVects)     
    reconMat = (lowDDataMat.dot(redEigVects.T)) + meanVals 
    return cp.array(lowDDataMat), cp.array(reconMat)

In [18]:
#%time lowDDataMat, reconMat = pca_cp(X_train_cp, 3)

start = timeit.default_timer()
lowDDataMat, reconMat = pca_cp(X_train_cp, 3)
end = timeit.default_timer()
print("time of cupy PCA v1: ", end - start)

<class 'cupy.core.core.ndarray'>
<class 'cupy.core.core.ndarray'>
time of cupy PCA v1:  0.7230669209999974


In [None]:
lowDDataMat

array([[ -123.93258866,  -312.67426202,   -24.51405176],
       [-1011.71837587,  -294.85703827,   596.33956104],
       [   51.84960805,   392.17315286,  -188.50974943],
       ...,
       [  178.0534496 ,   160.07821109,  -257.61308227],
       [ -130.60607208,    -5.59193642,   513.85867395],
       [  173.43595244,   -24.71880226,   556.01889393]])

In [20]:
def pca_cp_v2(dataMat, topNfeat=999999):

    
    meanVals = cp.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals

    covmat = cp.cov(meanRemoved, rowvar=0)
    # print(covmat)
    #covmat = cp.asnumpy(covmat)

    eigVals, eigVects = cp.linalg.eigh(covmat)
    #eigVals = cp.array(eigVals)
    #eigVects = cp.array(eigVects)
    
    eigValInd = cp.argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat+1):-1]    
    redEigVects = eigVects[:, eigValInd]    
    print(type(meanRemoved))  
    print(type(redEigVects))  

   
    lowDDataMat = meanRemoved.dot(redEigVects)     
    reconMat = (lowDDataMat.dot(redEigVects.T)) + meanVals 
    return cp.array(lowDDataMat), cp.array(reconMat)

In [22]:
#%time lowDDataMat, reconMat = pca_cp_v2(X_train_cp, 3)
start = timeit.default_timer()
lowDDataMat, reconMat = pca_cp_v2(X_train_cp, 3)
end = timeit.default_timer()
print("time of cupy PCA v2: ", end - start)

<class 'cupy.core.core.ndarray'>
<class 'cupy.core.core.ndarray'>
time of cupy PCA v2:  0.11553216599997995


In [24]:
%time lowDDataMat, reconMat = pca_cp_v2(X_train_cp, 3)

<class 'cupy.core.core.ndarray'>
<class 'cupy.core.core.ndarray'>
CPU times: user 83.6 ms, sys: 24.1 ms, total: 108 ms
Wall time: 111 ms
