In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as skPCA
from cuml import PCA as cumlPCA
import cudf
import os

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start

In [3]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    train_rows = int(nrows*0.8) #We want to do an 80%/20% split for training and test data
    if os.path.exists(cached):
        print('use mortgage data')

        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity'
        # used as the label
        X = X[:,[i for i in range(X.shape[1]) if i!=4]]
        y = X[:,4:5]
        rindices = np.random.randint(0,X.shape[0]-1,nrows)
        X = X[rindices,:ncols]
        y = y[rindices]
        df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})
        df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})

        df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})
        df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})
        return df_X_train, df_X_test, df_y_train, df_y_test
    else:
        raise FileNotFoundError('Please download the required dataset or check the path')
   

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=2e-3,with_sign=True):
    a = to_nparray(a)
    b = to_nparray(b)
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    error = mean_squared_error(a,b)
    res = error<threshold
    return res

def to_nparray(x):
    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
        return np.array(x)
    elif isinstance(x,np.float64):
        return np.array([x])
    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
        return x.to_pandas().values
    return x    

# Run tests

In [5]:
%%time
nrows = 2**15
nrows = int(nrows * 1.5)
ncols = 400

X_train, X_test, y_train, y_test = load_data(nrows,ncols)
print('training data',X_train.shape)
print('training label',y_train.shape)
print('testing data',X_test.shape)
print('testing label',y_test.shape)

use mortgage data
training data (39321, 400)
training label (39321, 1)
testing data (9831, 400)
testing label (9831, 1)
CPU times: user 4 s, sys: 1.08 s, total: 5.08 s
Wall time: 5.07 s


In [6]:
n_components = 10
whiten = False
random_state = 42
svd_solver="full"

In [7]:
%%time
pca_sk = skPCA(n_components=n_components,svd_solver=svd_solver, 
            whiten=whiten, random_state=random_state)
result_sk = pca_sk.fit(X_train)

CPU times: user 2.59 s, sys: 311 ms, total: 2.91 s
Wall time: 704 ms


In [8]:
%%time
# Let's transform the test data
test_result_sk = pca_sk.transform(X_test)

CPU times: user 44.5 ms, sys: 15.3 ms, total: 59.8 ms
Wall time: 32.7 ms


In [9]:
print("We used SKL to dimensionally reduced our test data from ", X_test.shape, " to ", test_result_sk.shape)

We used SKL to dimensionally reduced our test data from  (9831, 400)  to  (9831, 10)


In [10]:
%%time
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
y_cudf = y_train.values
y_cudf = y_cudf[:,0]
y_cudf = cudf.Series(y_cudf)

CPU times: user 1.39 s, sys: 27.9 ms, total: 1.42 s
Wall time: 1.42 s


In [11]:
%%time
pca_cuml = cumlPCA(n_components=n_components,svd_solver=svd_solver, 
            whiten=whiten, random_state=random_state)
result_cuml = pca_cuml.fit(X_cudf)

CPU times: user 1.32 s, sys: 127 ms, total: 1.45 s
Wall time: 1.45 s


In [12]:
%%time
test_result_cuml = pca_cuml.transform(X_cudf_test)

CPU times: user 379 ms, sys: 3.89 ms, total: 383 ms
Wall time: 381 ms


In [13]:
print("We used cuML to dimensionally reduced our test data from ", X_cudf_test.shape, " to ", test_result_cuml.shape)

We used cuML to dimensionally reduced our test data from  (9831, 400)  to  (9831, 10)


In [14]:
pca_ft_cuml = cumlPCA(n_components=n_components,svd_solver=svd_solver, 
            whiten=whiten, random_state=random_state)
result_ft_cuml= pca_ft_cuml.fit_transform(X_cudf)

In [15]:
pca_ft_sk = skPCA(n_components=n_components,svd_solver=svd_solver, 
            whiten=whiten, random_state=random_state)
result_ft_sk = pca_ft_sk.fit_transform(X_train)

In [16]:
t_result_ft_sk = pca_ft_sk.transform(X_test)
t_result_ft_cuml = pca_ft_cuml.transform(X_cudf_test)

In [17]:
passed = array_equal(t_result_ft_sk,t_result_ft_cuml)
message = 'compare pca: cuml vs sklearn transformed results %s'%('equal'if passed else 'NOT equal')
print(message)

compare pca: cuml vs sklearn transformed results equal


In [18]:
for attr in ['singular_values_','components_','explained_variance_',
             'explained_variance_ratio_']:
    passed = array_equal(getattr(pca_ft_sk,attr),getattr(pca_ft_cuml,attr))
    message = 'compare pca: cuml vs sklearn {:>25} {}'.format(attr,'equal' if passed else 'NOT equal')
    print(message)

compare pca: cuml vs sklearn          singular_values_ equal
compare pca: cuml vs sklearn               components_ equal
compare pca: cuml vs sklearn       explained_variance_ equal
compare pca: cuml vs sklearn explained_variance_ratio_ equal


In [19]:
getattr(pca_sk,'components_')

array([[ 7.52989177e-04, -6.85964789e-02,  8.49092685e-02, ...,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00],
       [ 3.62532486e-03, -4.17704333e-03,  1.59672876e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.03034838e-01,  2.10700311e-01, -5.81722652e-01, ...,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00],
       ...,
       [-9.57776625e-04, -4.30351183e-03, -3.45077727e-02, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-4.26812822e-02, -2.44377389e-04, -4.00867204e-02, ...,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00],
       [ 8.38678953e-02, -8.81999305e-02,  2.76965285e-02, ...,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00]])

In [20]:
print(getattr(pca_cuml,'components_'))

               0               1            2              3               4              5               6 ...  399
0 -0.00075295987     0.068595394 -0.084901266    0.086457185      0.97473806 -0.00020333455   5.7124544e-07 ...  0.0
1    0.003624329    -0.004182243   0.15968357    -0.06845117      0.18894178   0.0013814784   2.0137912e-05 ...  0.0
2     0.10304268      0.21071112  -0.58175015      0.5541815    -0.102260865  5.3146563e-05  -4.2688607e-06 ...  0.0
3     0.07181677      0.10149359  -0.27035096      0.2889933    -0.033481438 -0.00016183848  -6.5566455e-06 ...  0.0
4   -0.023751965   -0.0015669416 -0.016504265    -0.04641609  -0.00075190654  0.00031794177  -1.2977891e-06 ...  0.0
5    0.030864557    -0.011696241  0.037474755    0.027293395   -0.0005224035  2.3726097e-05   1.6795184e-06 ...  0.0
6   -0.033160564     0.014325855  -0.03497077   -0.042391483   -0.0012810044  -6.548228e-05  -2.4134117e-06 ...  0.0
7  0.00067340815     0.004311801  0.034198597    0.024209697    

In [21]:
type(pca_sk.components_)

numpy.ndarray

In [22]:
type(pca_cuml.components_)

cudf.dataframe.dataframe.DataFrame

In [23]:
for attr in ['singular_values_','components_','explained_variance_',
             'explained_variance_ratio_','mean_','noise_varaince_']:
    passed = array_equal(getattr(pca_sk,attr),getattr(pca_cuml,attr))
    message = 'compare pca: cuml vs sklearn {:>25} {}'.format(attr,'equal' if passed else 'NOT equal')
    print(message)

compare pca: cuml vs sklearn          singular_values_ equal
compare pca: cuml vs sklearn               components_ NOT equal
compare pca: cuml vs sklearn       explained_variance_ equal
compare pca: cuml vs sklearn explained_variance_ratio_ equal
compare pca: cuml vs sklearn                     mean_ equal


AttributeError: 'PCA' object has no attribute 'noise_varaince_'

In [24]:
passed = array_equal(result_sk,result_cuml)
message = 'compare pca: cuml vs sklearn transformed results %s'%('equal'if passed else 'NOT equal')
print(message)

TypeError: Expected sequence or array-like, got estimator PCA(copy=True, iterated_power='auto', n_components=10, random_state=42,
  svd_solver='full', tol=0.0, whiten=False)

In [25]:
passed = array_equal(test_result_sk,test_result_cuml)
message = 'compare pca: cuml vs sklearn transformed results %s'%('equal'if passed else 'NOT equal')
print(message)

compare pca: cuml vs sklearn transformed results NOT equal


In [26]:
test_result_cuml

<cudf.DataFrame ncols=10 nrows=9831 >

In [29]:
# create the original data from the transformed data
recreated_data_sk = pca_sk.inverse_transform(test_result_sk)
recreated_data_cuml = pca_cuml.inverse_transform(test_result_cuml)

In [30]:
passed_inv = array_equal(recreated_data_sk,recreated_data_cuml,threshold=0.1)
message = 'compare pca: cuml vs sklearn inverted results %s'%('equal'if passed_inv else 'NOT equal')
print(message)

compare pca: cuml vs sklearn inverted results equal


In [31]:
passed_check = array_equal(X_cudf_test,recreated_data_cuml,threshold=0.1)
message = 'compare pca: original dataset vs reduced dimensional results %s'%('equal'if passed_check else 'NOT equal')
print(message)

compare pca: original dataset vs reduced dimensional results equal


In [28]:
from sklearn.utils.testing import (assert_array_almost_equal, assert_equal)
assert_array_almost_equal(test_result_sk, to_nparray(test_result_cuml), decimal=2)

AssertionError: 
Arrays are not almost equal to 2 decimals

(mismatch 29.560573695453158%)
 x: array([ 0.1 , -0.11,  0.13, ..., -0.  , -0.01, -0.  ])
 y: array([-0.1 , -0.11,  0.13, ...,  0.  , -0.01, -0.  ], dtype=float32)

In [None]:
!pip install matplotlib