In [1]:
import os
# Index of the GPU this notebook should run on. It is exported through
# CUDA_VISIBLE_DEVICES in this first cell, i.e. before the cell that imports
# cudf / cuml.
GPU_id = 2
os.environ.update({'CUDA_VISIBLE_DEVICES': '%d' % GPU_id})

In [2]:
import numpy as np
import pandas as pd
import cudf

from sklearn.neighbors import NearestNeighbors as skKNN
from cuml.neighbors.nearest_neighbors import NearestNeighbors as cumlKNN

In [3]:
import gzip
# change the path of the mortgage dataset if you have saved it in a different directory
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz',source='mortgage'):
    """Return a pandas DataFrame of `nrows` samples with up to `ncols` features.

    If `cached` exists on disk and `source == 'mortgage'`, the gzip-compressed
    .npy array stored there is loaded and `nrows` rows are drawn from it at
    random, with replacement; only the first `ncols` columns are kept.
    Otherwise a uniform random float32 matrix of shape (nrows, ncols) is
    generated.

    Parameters
    ----------
    nrows : int
        Number of rows (samples) in the returned frame.
    ncols : int
        Number of feature columns requested. When the cached array has fewer
        columns, all of its columns are returned.
    cached : str
        Path to the gzip-compressed .npy file.
    source : str
        The cached file is used only when this equals 'mortgage'.

    Returns
    -------
    pandas.DataFrame
        Columns are named 'fea0', 'fea1', ...; missing values are replaced by 0.
    """
    if os.path.exists(cached) and source == 'mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive: passing X.shape[0] (not
        # X.shape[0]-1) makes the last row selectable and lets a one-row
        # array be sampled instead of raising ValueError.
        row_ids = np.random.randint(0, X.shape[0], nrows)
        X = X[row_ids, :ncols]
    else:
        # create a random dataset
        print('use random data')
        X = np.random.random((nrows, ncols)).astype('float32')
    df = pd.DataFrame({'fea%d' % i: X[:, i] for i in range(X.shape[1])}).fillna(0)
    return df

In [4]:
from sklearn.metrics import mean_squared_error

In [5]:
def array_equal(a,b,threshold=1e-3,with_sign=True,metric='mse'):
    """Check whether `a` and `b` agree within `threshold` under `metric`.

    Both inputs are first passed through `to_nparray`.

    Parameters
    ----------
    a, b : array-like
        Values to compare; they must have matching shapes.
    threshold : float
        Tolerance; its meaning depends on `metric` (see below).
    with_sign : bool
        When False, the absolute values of `a` and `b` are compared instead.
    metric : {'mse', 'abs', 'acc'}
        'mse' -- mean squared error between a and b must be below threshold.
        'abs' -- no element may differ by more than threshold in magnitude.
        'acc' -- the fraction of unequal elements must be below threshold.

    Returns
    -------
    bool
        True when the arrays are considered equal.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported names.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if not with_sign:
        a, b = np.abs(a), np.abs(b)
    if metric == 'mse':
        error = mean_squared_error(a, b)
        return error < threshold
    if metric == 'abs':
        # Compare the magnitude of the difference: with the signed difference,
        # elements where a is far below b would slip through the check.
        return not np.any(np.abs(a - b) > threshold)
    if metric == 'acc':
        # a.size equals shape[0]*shape[1] for 2-D input and also covers 1-D.
        error = np.sum(a != b) / a.size
        return error < threshold
    raise ValueError("unknown metric %r; expected 'mse', 'abs' or 'acc'" % (metric,))

def accuracy(a,b, threshold=1e-4):
    """Return True when `a` and `b` disagree on fewer than `threshold` of their elements.

    Both inputs are converted with `to_nparray`. An element counts as a
    mismatch when the two values differ by more than 1 in either direction.

    NOTE(review): the tolerance of 1 is carried over from the original
    implementation; if the inputs are neighbor indices, a strict `a != b`
    comparison may be what is intended -- confirm.

    Parameters
    ----------
    a, b : array-like
        Values to compare; they must have matching shapes.
    threshold : float
        Maximum allowed fraction of mismatching elements (exclusive).

    Returns
    -------
    bool
    """
    a = to_nparray(a)
    b = to_nparray(b)
    # Use the magnitude so that a mismatch is counted whether a > b or a < b;
    # the signed test `a - b > 1` silently ignored every case where a < b.
    mismatched = np.abs(a - b) > 1
    # .size works for any dimensionality, not just 2-D input.
    return np.count_nonzero(mismatched) / mismatched.size < threshold

def to_nparray(x):
    """Convert `x` to a NumPy array when its type is one of the handled ones.

    * numpy.ndarray / pandas.DataFrame -> a new array built with np.array(x)
    * numpy.float64                    -> a one-element array holding x
    * cudf.DataFrame / cudf.Series     -> the values of x.to_pandas()

    Anything else is handed back unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x

In [6]:
%%time
# nrows = number of samples
# ncols = number of features of each sample

nrows = 2**14
ncols = 40

X = load_data(nrows,ncols)
print('data',X.shape)
print(type(X))

use random data
data (16384, 40)
<class 'pandas.core.frame.DataFrame'>
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 16.1 ms


In [7]:
# number of nearest neighbors requested from each kneighbors() call below
# (used for both the sklearn and the cuml model)
n_neighbors = 10

In [8]:
%%time
# use the sklearn KNN model to fit the dataset 
knn_sk = skKNN(metric = 'euclidean')
knn_sk.fit(X)
D_sk,I_sk = knn_sk.kneighbors(X, n_neighbors)

CPU times: user 21.9 s, sys: 0 ns, total: 21.9 s
Wall time: 21.9 s


In [9]:
%%time
# convert the pandas dataframe to cudf dataframe
X = cudf.DataFrame.from_pandas(X)

CPU times: user 584 ms, sys: 716 ms, total: 1.3 s
Wall time: 2.43 s


In [10]:
%%time
# use cuml's KNN model to fit the dataset
# (X is the cudf dataframe produced by the conversion cell above)
# NOTE(review): no metric is passed here, while the sklearn model is built
# with metric='euclidean' -- confirm that cuml's default is comparable.
knn_cuml = cumlKNN()
knn_cuml.fit(X)

# calculate the distance and the indices of the samples present in the dataset
# query X against itself for its n_neighbors nearest neighbors:
# D_cuml receives the distances, I_cuml the neighbor indices
D_cuml,I_cuml = knn_cuml.kneighbors(X, n_neighbors)

CPU times: user 3.16 s, sys: 596 ms, total: 3.75 s
Wall time: 3.57 s


In [14]:
# Compare the distance matrices of the two models with the 'abs' metric
# (array_equal also supports 'acc' and 'mse').
passed = array_equal(D_sk, D_cuml, metric='abs')
verdict = 'equal' if passed else 'NOT equal'
message = 'compare knn: cuml vs sklearn distances %s' % verdict
print(message)

compare knn: cuml vs sklearn distances equal


In [15]:
# Compare the neighbor indices returned by the sklearn and cuml models.
# threshold=1e-1: the check passes while fewer than 10% of the entries are
# counted as mismatches by accuracy().
passed = accuracy(I_sk, I_cuml, threshold=1e-1)
verdict = 'equal' if passed else 'NOT equal'
message = 'compare knn: cuml vs sklearn indexes %s' % verdict
print(message)

compare knn: cuml vs sklearn indexes equal
