In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD as skTSVD
from cuml import TruncatedSVD as cumlTSVD
import cudf
import os

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    def __init__(self):
        self._timer = default_timer
    
    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        self.start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds."""
        self.end = self._timer()
        self.interval = self.end - self.start

In [3]:
# check if mortgage dataset is present and then extract the data from it, else throw an error
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached = '/rapids/notebooks/wip/notebooks/cuml/data/mortgage.npy.gz'):
    train_rows = int(nrows*0.8)
    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # the 4th column is 'adj_remaining_months_to_maturity'
        # used as the label
        X = X[:,[i for i in range(X.shape[1]) if i!=4]]
        y = X[:,4:5]
        rindices = np.random.randint(0,X.shape[0]-1,nrows)
        X = X[rindices,:]
    else:
        # raise an exception if the dataset is not present
        raise FileNotFoundError('Please download the required dataset or check the path')
    df_X_train = pd.DataFrame({'fea%d'%i:X[0:train_rows,i] for i in range(X.shape[1])})
    df_X_test = pd.DataFrame({'fea%d'%i:X[train_rows:,i] for i in range(X.shape[1])})    
    return df_X_train, df_X_test

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=5e-3,with_sign=True):
    a = to_nparray(a)
    b = to_nparray(b)
    if with_sign == False:
        a,b = np.abs(a),np.abs(b)
    error = mean_squared_error(a,b)
    res = error<threshold
    return res

def to_nparray(x):
    if isinstance(x,np.ndarray) or isinstance(x,pd.DataFrame):
        return np.array(x)
    elif isinstance(x,np.float64):
        return np.array([x])
    elif isinstance(x,cudf.DataFrame) or isinstance(x,cudf.Series):
        return x.to_pandas().values
    return x

# Run tests

In [None]:
%%time
nrows = 2**22
ncols = 40
# split the data into 80/20 split where 80% of the data is set as the training data and the remaining 20% is set as the testing data
X_train, X_test = load_data(nrows,ncols)
print('training data',X_train.shape)
print('testing data',X_test.shape)

use mortgage data


In [None]:
n_components = 10
random_state = 42

In [None]:
%%time
algorithm='arpack'
tsvd_sk = skTSVD(n_components=n_components,algorithm=algorithm, 
            random_state=random_state)
ft_sk = tsvd_sk.fit(X_train)
result_sk = tsvd_sk.transform(X_test)

In [11]:
%%time
# convert the pandas dataframe to cudf format
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)

CPU times: user 2.88 s, sys: 725 ms, total: 3.61 s
Wall time: 3.64 s


In [12]:
%%time
algorithm='full'
tsvd_cuml = cumlTSVD(n_components=n_components,algorithm=algorithm, 
            random_state=random_state)
ft_cuml = tsvd_cuml.fit(X_cudf)
result_cuml = tsvd_cuml.transform(X_cudf_test)

CPU times: user 1.9 s, sys: 437 ms, total: 2.34 s
Wall time: 2.35 s


In [13]:
for attr in ['singular_values_','components_','explained_variance_','explained_variance_ratio_']:
    passed = array_equal(getattr(tsvd_sk,attr),getattr(tsvd_cuml,attr),threshold=0.1)
    # larger error margin due to different algorithms: arpack vs full
    message = 'compare tsvd: cuml vs sklearn {:>25} {}'.format(attr,'equal' if passed else 'NOT equal')
    print(message)

compare tsvd: cuml vs sklearn          singular_values_ NOT equal
compare tsvd: cuml vs sklearn               components_ equal
compare tsvd: cuml vs sklearn       explained_variance_ equal
compare tsvd: cuml vs sklearn explained_variance_ratio_ equal


In [14]:
passed = array_equal(ft_sk,ft_cuml,threshold=0.1)
# larger error margin due to different algorithms: arpack vs full
message = 'compare tsvd: cuml vs sklearn transformed results %s'%('equal'if passed else 'NOT equal')
print(message)

compare tsvd: cuml vs sklearn transformed results equal


In [15]:
tsvd_sk.singular_values_

array([5114.5073 ,  907.3044 ,  613.96014,  511.64966,  401.87973,
        384.82172,  319.4206 ,  277.6314 ,  276.33896,  274.1322 ],
      dtype=float32)

In [16]:
to_nparray(tsvd_cuml.singular_values_)


array([5115.269  ,  907.222  ,  613.4173 ,  510.58362,  401.65436,
        384.78455,  321.4075 ,  277.63147,  276.33966,  274.12833],
      dtype=float32)

In [17]:
tsvd_sk.components_

array([[ 2.19470486e-01,  7.78921172e-02,  5.55465557e-02, ...,
         3.99304181e-02,  2.96212675e-04,  3.17021132e-01],
       [ 9.43074655e-03, -5.59553243e-02,  3.31003904e-01, ...,
         6.66118443e-01,  2.62557575e-03, -6.00367069e-01],
       [ 9.11145732e-02,  5.09127900e-02,  9.90409479e-02, ...,
         1.13879502e-01,  1.94298682e-05,  8.83277506e-03],
       ...,
       [ 4.35403734e-03, -5.04888222e-03, -1.11680925e-02, ...,
         2.31409818e-03, -1.47589002e-04, -4.64289356e-03],
       [-8.47651716e-03, -8.18580389e-04, -1.90659761e-02, ...,
         5.16641140e-03, -2.78902473e-04, -7.70121813e-03],
       [ 1.75419003e-02, -2.04888731e-02, -4.00457419e-02, ...,
         7.91560858e-03, -6.20336505e-04, -1.93423331e-02]], dtype=float32)

In [31]:
to_nparray(tsvd_cuml.components_)

array([[ 2.1936791e-01,  7.7806652e-02,  5.5559669e-02, ...,
         4.0003870e-02,  2.9826164e-04,  3.1691140e-01],
       [ 9.2484560e-03, -5.5741414e-02,  3.3129090e-01, ...,
         6.6603416e-01,  2.6420206e-03, -6.0014218e-01],
       [ 9.1495402e-02,  5.1946219e-02,  9.9573754e-02, ...,
         1.1555344e-01,  1.0178168e-04,  8.6729219e-03],
       ...,
       [-7.3127989e-03,  2.5826369e-03, -1.8156429e-03, ...,
         1.3518544e-03, -4.1741019e-05, -4.7096619e-04],
       [-2.8143590e-03, -5.9185773e-03, -2.5262371e-02, ...,
         6.0600182e-03, -3.9165979e-04, -1.0809118e-02],
       [ 2.2940703e-02, -2.2529401e-02, -3.8742151e-02, ...,
         6.5671760e-03, -6.2616915e-04, -1.8944427e-02]], dtype=float32)

In [49]:
from sklearn.utils.testing import (assert_array_almost_equal, assert_equal)
assert_array_almost_equal(tsvd_sk.singular_values_, to_nparray(tsvd_cuml.singular_values_), decimal=1)

AssertionError: 
Arrays are not almost equal to 1 decimals

(mismatch 50.0%)
 x: array([5424.6,  962.2,  653.1,  542.7,  426.7,  407.8,  339.1,  294.9,
        293.8,  291.2], dtype=float32)
 y: array([5425.3,  962. ,  652.4,  541.9,  426.3,  407.7,  340.9,  294.9,
        293.8,  291.2], dtype=float32)