In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as skPCA
from cuml import PCA as cumlPCA
import cudf
import os

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start

In [3]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    train_rows = int(nrows*0.8) #We want to do an 80%/20% split for training and test data
    if os.path.exists(cached):
        print('use mortgage data')

        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity'
        # used as the label
        X = X[:,[i for i in range(X.shape[1]) if i!=4]]
        y = X[:,4:5]
        rindices = np.random.randint(0,X.shape[0]-1,nrows)
        X = X[rindices,:ncols]
        y = y[rindices]
        df_y_train = pd.DataFrame({'fea%d'%i:y[0:train_rows,i] for i in range(y.shape[1])})
        df_y_test = pd.DataFrame({'fea%d'%i:y[train_rows:,i] for i in range(y.shape[1])})

        df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})
        df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})
        return df_X_train, df_X_test, df_y_train, df_y_test
    else:
        raise FileNotFoundError('Please download the required dataset or check the path')
   

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=2e-3,with_sign=True):
    a = to_nparray(a)
    b = to_nparray(b)
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    error = mean_squared_error(a,b)
    res = error<threshold
    return res

def to_nparray(x):
    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
        return np.array(x)
    elif isinstance(x,np.float64):
        return np.array([x])
    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
        return x.to_pandas().values
    return x    

# Run tests

In [6]:
%%time
nrows = 2**15
nrows = int(nrows * 1.5)
ncols = 400

X_train, X_test, y_train, y_test = load_data(nrows,ncols)
print('training data',X_train.shape)
print('training label',y_train.shape)
print('testing data',X_test.shape)
print('testing label',y_test.shape)

use mortgage data
training data (39321, 400)
training label (39321, 1)
testing data (9831, 400)
testing label (9831, 1)
CPU times: user 3.99 s, sys: 1.12 s, total: 5.1 s
Wall time: 5.1 s


In [7]:
n_components = 10
whiten = False
random_state = 42
svd_solver="full"

In [8]:
%%time
pca_sk = skPCA(n_components=n_components,svd_solver=svd_solver, 
            whiten=whiten, random_state=random_state)
result_sk = pca_sk.fit(X_train)

CPU times: user 2.68 s, sys: 293 ms, total: 2.98 s
Wall time: 714 ms


In [9]:
# Let's transform the test data
test_result_sk = pca_sk.transform(X_test)

In [14]:
print("We used SKL to dimensionally reduced our test data from ", X_test.shape, " to ", test_result_sk.shape)

We dimensionally reduced our test data from  (9831, 400)  to  (9831, 10)


In [15]:
%%time
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
y_cudf = y_train.values
y_cudf = y_cudf[:,0]
y_cudf = cudf.Series(y_cudf)

CPU times: user 1.46 s, sys: 28.5 ms, total: 1.49 s
Wall time: 1.5 s


In [16]:
%%time
pca_cuml = cumlPCA(n_components=n_components,svd_solver=svd_solver, 
            whiten=whiten, random_state=random_state)
result_cuml = pca_cuml.fit(X_cudf)

CPU times: user 1.37 s, sys: 160 ms, total: 1.53 s
Wall time: 1.53 s


In [17]:
%%time
test_result_cuml = pca_cuml.transform(X_cudf_test)

CPU times: user 386 ms, sys: 7.47 ms, total: 393 ms
Wall time: 391 ms


In [18]:
print("We used cuML to dimensionally reduced our test data from ", X_cudf_test.shape, " to ", test_result_cuml.shape)

We dimensionally reduced our test data from  (9831, 400)  to  (9831, 10)


In [22]:
getattr(pca_sk,'components_')

array([[ 0.00821718, -0.06117351,  0.14099757, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00923641,  0.04083196,  0.07532576, ..., -0.        ,
        -0.        , -0.        ],
       [ 0.09169033,  0.180816  , -0.52858433, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.05936958,  0.03559477, -0.04140643, ..., -0.        ,
        -0.        , -0.        ],
       [-0.04040538,  0.01181332, -0.03080275, ...,  0.        ,
         0.        ,  0.        ],
       [-0.05353305,  0.03764543, -0.05545315, ..., -0.        ,
        -0.        , -0.        ]])

In [24]:
print(getattr(pca_cuml,'components_'))

             0             1             2              3              4               5              6 ...  399
0 -0.008210497   0.061201617   -0.14096335     0.09916014      0.7642078   -0.0012773828 -4.1118496e-05 ...  0.0
1 -0.009240116  -0.040790435   -0.07543128   -0.011365796     -0.6295471   -0.0014375371 -4.8339625e-05 ...  0.0
2 -0.091713816   -0.18085569     0.5286859    -0.48931396    0.123012796  -0.00021641213  1.0599273e-05 ...  0.0
3  -0.08588919   -0.13999403     0.3784666     -0.3866326    0.037004605  -0.00029446953   5.616881e-06 ...  0.0
4 0.0056338655  -0.011295138  -0.010657858   -0.027682766   -0.002552322   -7.827068e-05   6.749615e-07 ...  0.0
5 -0.002175889   0.017884597  -0.016812105  -0.0035079569   0.0010034589    4.102837e-05 -2.9895273e-06 ...  0.0
6  0.039335795  -0.022694957   0.025151223    0.026568357   0.0033495026      8.3833e-06 -1.1302473e-06 ...  0.0
7  0.059595168   -0.03565905   0.041526083     0.04751735  -0.0027829488   0.00014503763   3.506

In [29]:
type(pca_sk.components_)

numpy.ndarray

In [30]:
type(pca_cuml.components_)

cudf.dataframe.dataframe.DataFrame

In [19]:
for attr in ['singular_values_','components_','explained_variance_',
             'explained_variance_ratio_']:
    passed = array_equal(getattr(pca_sk,attr),getattr(pca_cuml,attr))
    message = 'compare pca: cuml vs sklearn {:>25} {}'.format(attr,'equal' if passed else 'NOT equal')
    print(message)

compare pca: cuml vs sklearn          singular_values_ equal
compare pca: cuml vs sklearn               components_ NOT equal
compare pca: cuml vs sklearn       explained_variance_ equal
compare pca: cuml vs sklearn explained_variance_ratio_ equal


In [20]:
passed = array_equal(result_sk,result_cuml)
message = 'compare pca: cuml vs sklearn transformed results %s'%('equal'if passed else 'NOT equal')
print(message)

TypeError: Expected sequence or array-like, got estimator PCA(copy=True, iterated_power='auto', n_components=10, random_state=42,
  svd_solver='full', tol=0.0, whiten=False)

In [21]:
passed = array_equal(test_result_sk,test_result_cuml)
message = 'compare pca: cuml vs sklearn transformed results %s'%('equal'if passed else 'NOT equal')
print(message)

compare pca: cuml vs sklearn transformed results NOT equal
