In [20]:
import os
import time

import cupy as cp
import dask
import dask.array as da
import xgboost
from dask_hip import LocalHIPCluster
from distributed import Client, wait

In [21]:
def main(client, *, n_features=1_000, n_samples=1_600_000, partition_size=10_000):
    """Train and evaluate a distributed GPU XGBoost regressor on random data.

    A random ``(n_samples, n_features)`` feature matrix and a random target
    vector are created as dask arrays with the cupy array backend, a
    ``DaskXGBRegressor`` is fitted on them (the training data doubles as the
    evaluation set), and predictions for the same data are computed on the
    cluster.  Wall-clock timings for both phases and the evaluation history
    are printed.

    Parameters
    ----------
    client : distributed.Client
        Client connected to the Dask cluster that runs the work.
    n_features : int, optional
        Number of feature columns of the generated data (default 1_000).
    n_samples : int, optional
        Number of rows of the generated data (default 1_600_000).
    partition_size : int, optional
        Chunk size handed to dask for both arrays (default 10_000).

    Returns
    -------
    xgboost.Booster
        The trained booster.
    """
    # generate some random data for demonstration; inside this config context
    # dask creates the arrays with the cupy backend
    with dask.config.set({"array.backend": "cupy"}):
        X = da.random.random(size=(n_samples, n_features), chunks=partition_size)
        y = da.random.random(size=n_samples, chunks=partition_size)

    regressor = xgboost.dask.DaskXGBRegressor(verbosity=1)
    regressor.set_params(tree_method='hist', device='gpu')
    # assigning client here is optional
    regressor.client = client

    # perf_counter is monotonic, so the measured intervals cannot be skewed
    # by system clock adjustments
    start = time.perf_counter()
    regressor.fit(X, y, eval_set=[(X, y)])
    end = time.perf_counter()
    print(f'Fit time: {end - start:.4f} secs')

    start = time.perf_counter()
    # predict() on a dask array returns a lazy dask array; persist it and wait
    # for completion so the timing below covers the actual inference work.
    # The result stays on the workers instead of being gathered to the client.
    prediction = client.persist(regressor.predict(X))
    wait(prediction)
    bst = regressor.get_booster()
    history = regressor.evals_result()
    end = time.perf_counter()
    print(f'Eval time: {end - start:.4f} secs')
    print('Evaluation history:', history)
    return bst                  # returning the trained model

In [28]:
# Set up Dask cluster & client, get dask dashboard.
# `cluster` and `client` are module-level names that later cells rely on.

# NOTE(review): HIP_VISIBLE_DEVICES presumably restricts the cluster to GPUs 0-3;
# the recorded output below lists four workers, one MI210 each -- confirm.
cluster = LocalHIPCluster(HIP_VISIBLE_DEVICES='0,1,2,3')
client = Client(cluster)
# Bare last expression: the notebook renders the cluster summary as cell output.
cluster

INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:41611
INFO:distributed.scheduler:  dashboard at:            127.0.0.1:8787
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:37091'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:35993'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:37927'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:41521'
2023-11-09 03:49:45,836 - distributed.preloading - INFO - Creating preload: dask_hip.initialize
2023-11-09 03:49:45,836 - distributed.preloading - INFO - Import preload module: dask_hip.initialize
2023-11-09 03:49:45,889 - distributed.preloading - INFO - Creating preload: dask_hip.initialize
2023-11-09 03:49:45,890 - distributed.preloading - INFO - Import preload module: dask_hip.initialize
2023-11-09 03:49:45,890 - distributed.preloading - INFO - Creating preload: dask_hip.initialize
2023-11-09 03:49:45,890 - distributed.preloading 

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 503.72 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41611,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 503.72 GiB

0,1
Comm: tcp://127.0.0.1:41625,Total threads: 1
Dashboard: http://127.0.0.1:42827/status,Memory: 125.93 GiB
Nanny: tcp://127.0.0.1:37091,
Local directory: /tmp/dask-worker-space/worker-3l1ikqr_,Local directory: /tmp/dask-worker-space/worker-3l1ikqr_
GPU: Instinct MI210,GPU memory: 63.98 GiB

0,1
Comm: tcp://127.0.0.1:41183,Total threads: 1
Dashboard: http://127.0.0.1:37275/status,Memory: 125.93 GiB
Nanny: tcp://127.0.0.1:35993,
Local directory: /tmp/dask-worker-space/worker-oo30pffu,Local directory: /tmp/dask-worker-space/worker-oo30pffu
GPU: Instinct MI210,GPU memory: 63.98 GiB

0,1
Comm: tcp://127.0.0.1:34059,Total threads: 1
Dashboard: http://127.0.0.1:40021/status,Memory: 125.93 GiB
Nanny: tcp://127.0.0.1:37927,
Local directory: /tmp/dask-worker-space/worker-sqef5q0m,Local directory: /tmp/dask-worker-space/worker-sqef5q0m
GPU: Instinct MI210,GPU memory: 63.98 GiB

0,1
Comm: tcp://127.0.0.1:46681,Total threads: 1
Dashboard: http://127.0.0.1:43079/status,Memory: 125.93 GiB
Nanny: tcp://127.0.0.1:41521,
Local directory: /tmp/dask-worker-space/worker-zlahj1am,Local directory: /tmp/dask-worker-space/worker-zlahj1am
GPU: Instinct MI210,GPU memory: 63.98 GiB


In [29]:

# Run the training/evaluation demo on the cluster through `client`.
# main() returns the trained Booster, which is shown as this cell's output.
main(client)


INFO:distributed.worker:Run out-of-band function '_start_tracker'
INFO:distributed.scheduler:Receive client connection: Client-worker-0b2e432d-7eb3-11ee-94c6-368d33f8b47c
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:59792
INFO:distributed.scheduler:Receive client connection: Client-worker-0b2ed32e-7eb3-11ee-94c4-368d33f8b47c
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:59794
INFO:distributed.scheduler:Receive client connection: Client-worker-0b2f2565-7eb3-11ee-94cc-368d33f8b47c
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:59798
INFO:distributed.scheduler:Receive client connection: Client-worker-0b2fa0b3-7eb3-11ee-94ca-368d33f8b47c
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:59812
[03:49:54] task [xgboost.dask-0]:tcp://127.0.0.1:41625 got new rank 0
[03:49:54] task [xgboost.dask-1]:tcp://127.0.0.1:41183 got new rank 1
[03:49:54] task [xgboost.dask-2]:tcp://127.0.0.1:34059 got

[0]	validation_0-rmse:0.28866
[1]	validation_0-rmse:0.28861
[2]	validation_0-rmse:0.28856
[3]	validation_0-rmse:0.28851
[4]	validation_0-rmse:0.28846
[5]	validation_0-rmse:0.28840
[6]	validation_0-rmse:0.28835
[7]	validation_0-rmse:0.28830
[8]	validation_0-rmse:0.28825
[9]	validation_0-rmse:0.28820
[10]	validation_0-rmse:0.28816
[11]	validation_0-rmse:0.28811
[12]	validation_0-rmse:0.28806
[13]	validation_0-rmse:0.28801
[14]	validation_0-rmse:0.28796
[15]	validation_0-rmse:0.28792
[16]	validation_0-rmse:0.28787
[17]	validation_0-rmse:0.28782
[18]	validation_0-rmse:0.28777
[19]	validation_0-rmse:0.28773
[20]	validation_0-rmse:0.28768
[21]	validation_0-rmse:0.28763
[22]	validation_0-rmse:0.28758
[23]	validation_0-rmse:0.28753
[24]	validation_0-rmse:0.28748
[25]	validation_0-rmse:0.28743
[26]	validation_0-rmse:0.28738
[27]	validation_0-rmse:0.28733
[28]	validation_0-rmse:0.28728
[29]	validation_0-rmse:0.28723
[30]	validation_0-rmse:0.28719
[31]	validation_0-rmse:0.28714
[32]	validation_0-

<xgboost.core.Booster at 0x7fb36fb2ec40>

In [None]:
# Don't forget to clean up after the run.
# Order matters: shut down through the client first, then close the local
# cluster object created in the setup cell.

client.shutdown()
cluster.close()


INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:37091'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:35993'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:37927'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.nanny:Closing Nanny at 'tcp://127.0.0.1:41521'. Reason: nanny-close
INFO:distributed.nanny:Nanny asking worker to close. Reason: nanny-close
INFO:distributed.scheduler:Remove client Client-worker-0b2ed32e-7eb3-11ee-94c4-368d33f8b47c
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:59794; closing.
INFO:distributed.scheduler:Remove client Client-worker-0b2e432d-7eb3-11ee-94c6-368d33f8b47c
INFO:distributed.core:Received 'close-stream' from tcp://127.0.0.1:59792; closing.
INFO:distributed.s