## Training Machine Learning Algorithms using NVIDIA RAPIDS cuML

In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA versions, memory use).
!nvidia-smi

Sat May 22 08:08:34 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os, shutil

sys.path.append('/usr/local/lib/python3.7/site-packages/')
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
os.environ["CONDA_PREFIX"] = "/usr/local"
for so in ['cudf', 'rmm', 'nccl', 'cuml', 'cugraph', 'xgboost', 'cuspatial']:
  fn = 'lib'+so+'.so'
  source_fn = '/usr/local/lib/'+fn
  dest_fn = '/usr/lib/'+fn
  if os.path.exists(source_fn):
    print(f'Copying {source_fn} to {dest_fn}')
    shutil.copyfile(source_fn, dest_fn)
# fix for BlazingSQL import issue
# ImportError: /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.26' not found (required by /usr/local/lib/python3.7/site-packages/../../libblazingsql-engine.so)
if not os.path.exists('/usr/lib64'):
    os.makedirs('/usr/lib64')
for so_file in os.listdir('/usr/local/lib'):
  if 'libstdc' in so_file:
    shutil.copyfile('/usr/local/lib/'+so_file, '/usr/lib64/'+so_file)
    shutil.copyfile('/usr/local/lib/'+so_file, '/usr/lib/x86_64-linux-gnu/'+so_file)

In [None]:
import cudf
from cuml import make_regression, train_test_split
from cuml.linear_model import LinearRegression as cuLinearRegression
from cuml.metrics.regression import r2_score
from sklearn.linear_model import LinearRegression as skLinearRegression

In [None]:
# Problem size: 1024 samples (2**10) with 399 features each.
n_samples, n_features = 1 << 10, 399

# Seed passed to both make_regression and train_test_split below.
random_state = 23

In [None]:
## Lets generate some random regression data

%%time
X, y = make_regression(n_samples=n_samples, n_features=n_features, random_state=random_state)

X = cudf.DataFrame(X)
y = cudf.DataFrame(y)[0]

X_cudf, X_cudf_test, y_cudf, y_cudf_test = train_test_split(X, y, test_size = 0.2, random_state=random_state)

CPU times: user 2.78 s, sys: 904 ms, total: 3.69 s
Wall time: 6.71 s


In [None]:

# Mirror the GPU-resident (cuDF) splits as pandas objects in host memory,
# so the CPU model can be trained and compared against the GPU one.
X_train, X_test = X_cudf.to_pandas(), X_cudf_test.to_pandas()
y_train, y_test = y_cudf.to_pandas(), y_cudf_test.to_pandas()

In [None]:
# Display the host-side (pandas) training target to sanity-check the copy.
y_train

845     -0.171938
480     -5.376313
776      1.431383
46      29.999413
730    -18.161991
          ...    
278    -18.315147
167     43.998138
999     -8.689876
316     -5.297952
325   -103.086761
Name: 0, Length: 820, dtype: float32

## Scikit-learn Model
## Fit, predict and evaluate

In [None]:
%%time
# CPU baseline: scikit-learn linear regression, timed by the %%time magic above.
ols_sk = skLinearRegression(fit_intercept=True,
                            normalize=True,
                            n_jobs=-1)

# Fit on the pandas (host-memory) training split.
ols_sk.fit(X_train, y_train)

CPU times: user 108 ms, sys: 99.9 ms, total: 208 ms
Wall time: 98.3 ms


In [None]:
%%time
# Predict on the pandas (host-memory) test split with the scikit-learn model.
predict_sk = ols_sk.predict(X_test)

CPU times: user 2.94 ms, sys: 0 ns, total: 2.94 ms
Wall time: 6.16 ms


In [None]:
%%time
# Score the scikit-learn predictions with cuML's r2_score (imported from
# cuml.metrics.regression), against the cuDF test target rather than y_test.
r2_score_sk = r2_score(y_cudf_test, predict_sk)

CPU times: user 1.61 ms, sys: 219 µs, total: 1.83 ms
Wall time: 1.53 ms


## cuML Model
## Fit, predict and evaluate

In [None]:

%%time
# GPU model: cuML linear regression with the same fit_intercept/normalize
# settings as the scikit-learn baseline; algorithm='eig' picks the solver
# (see the cuML LinearRegression docs for the alternatives).
ols_cuml = cuLinearRegression(fit_intercept=True,
                              normalize=True,
                              algorithm='eig')

# Fit directly on the cuDF training split.
ols_cuml.fit(X_cudf, y_cudf)

CPU times: user 48.7 ms, sys: 4.15 ms, total: 52.8 ms
Wall time: 69.7 ms


In [None]:
%%time
# Predict on the cuDF test split with the cuML model.
predict_cuml = ols_cuml.predict(X_cudf_test)

CPU times: user 43.6 ms, sys: 0 ns, total: 43.6 ms
Wall time: 42.8 ms


In [None]:

%%time
# Score the cuML predictions against the same cuDF test target used for the
# scikit-learn score, so the two R^2 values are directly comparable.
r2_score_cuml = r2_score(y_cudf_test, predict_cuml)

CPU times: user 1.3 ms, sys: 0 ns, total: 1.3 ms
Wall time: 1.31 ms


### Let's Compare Results

In [None]:

# Report both R^2 scores, one per line, for a side-by-side comparison.
print(
    "R^2 score (SKL):  %s" % r2_score_sk,
    "R^2 score (cuML): %s" % r2_score_cuml,
    sep="\n",
)

R^2 score (SKL):  1.0
R^2 score (cuML): 1.0


In [None]:
# Alias both implementations, mirroring the sk*/cu* naming used for
# LinearRegression earlier in this notebook. Importing both under the same
# name lets the second import silently shadow the first, which makes the
# scikit-learn class unreachable.
from sklearn.ensemble import RandomForestRegressor as skRandomForestRegressor
from cuml.ensemble import RandomForestRegressor as cuRandomForestRegressor

# Keep the unaliased name bound to the cuML class, exactly as it ended up
# before, so any code that already uses it behaves the same.
RandomForestRegressor = cuRandomForestRegressor