In [5]:
from xgboost import dask as dxgb
import dask.array as da
import dask.distributed
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import dask.dataframe as dd
import dask.array as da
import numpy as np

import time

import os
import sys
# Point stderr at the null device so that anything later written to
# sys.stderr in this process is discarded.
# NOTE(review): this is process-wide, the handle is never closed, and it will
# also hide genuine error output written to stderr -- confirm that is intended.
sys.stderr = open(os.devnull, mode='w')

# Start a local Dask cluster with default settings, then connect a client to
# it; `client` is what the xgboost.dask calls below are given.
cluster = dask.distributed.LocalCluster()
client = dask.distributed.Client(cluster)

# Generate a synthetic classification dataset in memory with scikit-learn:
# 1,000,000 rows by 20 feature columns, seeded so reruns produce the same data.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_np, y_np = make_classification(n_samples=1_000_000, n_features=20, random_state=42)

# Hold out 20% of the rows as a test set; the split is seeded as well.
(X_train_np, X_test_np, y_train_np, y_test_np) = train_test_split(
    X_np,
    y_np,
    random_state=42,
    test_size=0.2,
)

# Wrap the NumPy splits as Dask arrays chunked into blocks of 10,000 rows.
# For the 2-D feature matrices, -1 keeps all columns together in one chunk.
X_train, X_test = (
    da.from_array(features, chunks=(10000, -1))
    for features in (X_train_np, X_test_np)
)
y_train, y_test = (
    da.from_array(labels, chunks=(10000,))
    for labels in (y_train_np, y_test_np)
)

# Build the DaskDMatrix inputs (features + labels) used for training and
# evaluation below.
dtrain = dxgb.DaskDMatrix(client, data=X_train, label=y_train)
dtest = dxgb.DaskDMatrix(client, data=X_test, label=y_test)

# Training parameters passed to xgboost.dask.train.
# NOTE(review): despite the name, nothing here selects a GPU -- there is no
# 'device' entry and tree_method is plain 'hist', so this presumably trains on
# CPU (check the XGBoost parameter docs for the installed version). The name
# is kept so any other cell referring to `params_gpu` keeps working.
params_gpu = {
    'tree_method': 'hist',
    'objective': 'binary:logistic',
    'verbosity': 1,
    'eta': 0.3,
}

# Time only the training call. perf_counter() is used instead of time.time()
# because it is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted. Only the
# difference `end - start` is meaningful.
start = time.perf_counter()
output = dxgb.train(
    client,
    params_gpu,
    dtrain,
    num_boost_round=10,
    # Evaluate on both splits each round.
    evals=[(dtrain, "train"), (dtest, "test")],
)
end = time.perf_counter()

# Score the held-out set with the trained model; the result is thresholded
# lazily and only materialised by the .compute() calls below.
y_pred_proba = dxgb.predict(client, model=output, data=dtest)

# Materialise the true labels, then the predicted labels: a prediction is
# class 1 when its score is strictly greater than 0.5, otherwise class 0.
y_true = y_test.compute()
y_pred = (y_pred_proba > 0.5).astype(int).compute()

# Report accuracy on the test split and the elapsed time between the
# `start` and `end` timestamps recorded around training.
acc = accuracy_score(y_true=y_true, y_pred=y_pred)
print(f"Accuracy: {acc:.4f}")
print(f"Dask Time: {end - start:.2f} seconds")


Accuracy: 0.9798
Dask Time: 1.71 seconds
