In [1]:
# Load the watermark IPython extension; it provides the %watermark magic used in the next cells.
%load_ext watermark

In [2]:
# Print a run timestamp plus interpreter, OS and CPU details so the results can be tied to an environment.
%watermark

Last updated: 2023-05-31T11:22:17.871032-04:00

Python implementation: CPython
Python version       : 3.7.11
IPython version      : 7.10.1

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 4.15.0-161-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 40
Architecture: 64bit



In [3]:
# List the GPUs visible on this host.
%watermark --gpu

GPU Info: 
  GPU 0: Tesla V100-DGXS-32GB
  GPU 1: Tesla V100-DGXS-32GB
  GPU 2: Tesla V100-DGXS-32GB
  GPU 3: Tesla V100-DGXS-32GB



In [4]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask import dataframe as dd
import pandas as pd
import xgboost as xgb

In [5]:
# Show the installed XGBoost version that the rest of the notebook runs against.
xgb.__version__

'1.6.2'

In [6]:
%%time
fname = '../input/HIGGS.csv'
colnames = ['label'] + ['feature-%02d' % i for i in range(1, 29)]
# By default dask dataframe uses pandas as data handling backend
dask_df = dd.read_csv(fname, low_memory=False)

CPU times: user 25.8 ms, sys: 966 µs, total: 26.8 ms
Wall time: 25.3 ms


In [7]:
# Preview the first rows of the dask dataframe.
dask_df.head()

Unnamed: 0,label,feature-01,feature-02,feature-03,feature-04,feature-05,feature-06,feature-07,feature-08,feature-09,...,feature-19,feature-20,feature-21,feature-22,feature-23,feature-24,feature-25,feature-26,feature-27,feature-28
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [8]:
%%time
# Read the same CSV with plain pandas and preview it.
# Presumably kept as a timing baseline against the dask read of the same file;
# `pd_df` is not referenced by the later cells shown in this notebook.
pd_df = pd.read_csv(fname, low_memory=False)
pd_df.head()

CPU times: user 1min 12s, sys: 5.08 s, total: 1min 17s
Wall time: 1min 17s


Unnamed: 0,label,feature-01,feature-02,feature-03,feature-04,feature-05,feature-06,feature-07,feature-08,feature-09,...,feature-19,feature-20,feature-21,feature-22,feature-23,feature-24,feature-25,feature-26,feature-27,feature-28
0,1.0,0.869293,-0.635082,0.22569,0.32747,-0.689993,0.754202,-0.248573,-1.092064,0.0,...,-0.010455,-0.045767,3.101961,1.35376,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678
1,1.0,0.907542,0.329147,0.359412,1.49797,-0.31301,1.095531,-0.557525,-1.58823,2.173076,...,-1.13893,-0.000819,0.0,0.30222,0.833048,0.9857,0.978098,0.779732,0.992356,0.798343
2,1.0,0.798835,1.470639,-1.635975,0.453773,0.425629,1.104875,1.282322,1.381664,0.0,...,1.128848,0.900461,0.0,0.909753,1.10833,0.985692,0.951331,0.803252,0.865924,0.780118
3,0.0,1.344385,-0.876626,0.935913,1.99205,0.882454,1.786066,-1.646778,-0.942383,0.0,...,-0.678379,-1.360356,0.0,0.946652,1.028704,0.998656,0.728281,0.8692,1.026736,0.957904
4,1.0,1.105009,0.321356,1.522401,0.882808,-1.205349,0.681466,-1.070464,-0.921871,0.0,...,-0.373566,0.113041,0.0,0.755856,1.361057,0.98661,0.838085,1.133295,0.872245,0.808487


In [9]:
# Split the dask dataframe into the target column and the feature matrix.
y = dask_df['label']
# drop(columns=...) keeps the feature columns in their original file order;
# columns.difference() would hand them back re-sorted by name instead.
# NOTE(review): every non-label column is used as a feature. If the CSV was
# saved with an index column, it would be included here too - confirm against the file.
X = dask_df.drop(columns=['label'])

In [10]:
# Start a local dask-cuda cluster with four workers, matching the four GPUs
# reported by `%watermark --gpu` earlier in the notebook.
cluster = LocalCUDACluster(n_workers=4)

In [11]:
# Connect a dask.distributed client to the cluster; the xgb.dask calls below all take this client.
client = Client(cluster)

In [12]:
# Wrap the features and labels in a DaskDMatrix, the data object handed to
# xgb.dask.train and xgb.dask.predict in the following cells.
dtrain = xgb.dask.DaskDMatrix(client, X, y)

In [13]:
%%time
output = xgb.dask.train(client,
                            # Use GPU training algorithm
                            {'verbosity':1, 'tree_method': 'gpu_hist', 
                             'objective':'binary:logistic'},
                            dtrain,
                            num_boost_round=10000)

  client.wait_for_workers(n_workers)


CPU times: user 13.2 s, sys: 4.31 s, total: 17.5 s
Wall time: 2min 42s


In [14]:
%%time
booster = output['booster']  # booster is the trained model
booster.set_param({'predictor': 'gpu_predictor'})

predictions = xgb.dask.predict(client, booster, dtrain)

CPU times: user 1.65 s, sys: 405 ms, total: 2.05 s
Wall time: 11 s


In [15]:
# Run prediction straight from the feature dataframe `X` rather than from the DaskDMatrix,
# passing the whole training `output` as the model argument.
# NOTE(review): the result is stored in `prediction` (singular), which no later cell reads;
# the next cell computes `predictions` from the earlier xgb.dask.predict call - confirm which was intended.
prediction = xgb.dask.inplace_predict(client, output, X)

In [16]:
# Evaluate `predictions` (from the xgb.dask.predict cell) and display the resulting array.
predictions.compute()

array([0.69795555, 0.8019834 , 0.983204  , ..., 0.22334789, 0.32207283,
       0.15013506], dtype=float32)