In [None]:
import dask
import dask.array as da
from dask_ml.model_selection import KFold
import xgboost as xgb
from dask.distributed import LocalCluster

# Cross-validated XGBoost training on a local dask cluster.
# `X` (features) and `y` (labels) are expected to be defined by an earlier cell.

# Wrap the inputs as dask arrays and chunk along rows only: 1000 rows per
# chunk, with every column kept together inside each chunk.
X = da.asarray(X)
y = da.asarray(y)

X = X.rechunk((1000, X.shape[1]))
y = y.rechunk((1000,))

# Booster parameters for a 7-class softmax classifier.
params = {
    'objective': 'multi:softmax',
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5,
    'num_class': 7
}

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=94)

# 'fold_<i>' -> predictions for that fold's held-out rows, already computed
# (in-memory), so they remain usable after the cluster below is shut down.
predictions = {}

with LocalCluster(n_workers=8) as cluster:
    with cluster.get_client() as client:
        for i, (train, test) in enumerate(cv.split(X, y)):
            X_train = X[train, :]
            X_test = X[test, :]
            y_train = y[train]
            y_test = y[test]  # not used in this cell; kept so the name stays defined

            d_train = xgb.dask.DaskDMatrix(client, X_train, y_train, enable_categorical=True)
            # num_boost_round is not passed, so the library default applies.
            # NOTE(review): with eta=0.01 a small default round count may underfit — confirm.
            model = xgb.dask.train(client, params=params, dtrain=d_train)

            # xgb.dask.predict returns a lazy dask array. Evaluate it here,
            # while the cluster that holds the trained model is still alive;
            # once the `with` blocks exit, the lazy result can no longer be
            # computed against this cluster.
            predictions[f'fold_{i}'] = xgb.dask.predict(client, model, X_test).compute()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 64386 instead
  next(self.gen)

+---------+--------+-----------+---------+
| Package | Client | Scheduler | Workers |
+---------+--------+-----------+---------+
| numpy   | 2.1.1  | 2.1.1     | 2.0.2   |
+---------+--------+-----------+---------+
  p = blockwise(
  p = blockwise(
Windows is not officially supported for dask/xgboost, contribution are welcomed.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
Windows is not officially 

In [None]:
import dask
import dask.array as da
from dask_ml.model_selection import KFold
import xgboost as xgb
from dask.distributed import LocalCluster
import joblib
from sklearn.metrics import confusion_matrix, classification_report

# Cross-validated XGBoost training with per-fold model dumps and reports.
# `X` (features) and `y` (labels) are expected to be defined by an earlier cell.

# Wrap the inputs as dask arrays chunked along rows only (1000 rows per chunk).
X = da.asarray(X).rechunk((1000, X.shape[1]))
y = da.asarray(y).rechunk((1000,))

# Booster parameters for a 7-class softmax classifier.
params = {
    'objective': 'multi:softmax',
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5,
    'num_class': 7
}

n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True, random_state=94)

with LocalCluster(n_workers=8) as cluster:
    with cluster.get_client() as client:
        for i, (train, test) in enumerate(cv.split(X, y)):
            X_train = X[train, :]
            X_test = X[test, :]
            y_train = y[train]
            y_test = y[test]

            d_train = xgb.dask.DaskDMatrix(client, X_train, y_train, enable_categorical=True)
            model = xgb.dask.train(client, params=params, dtrain=d_train)

            # Persist whatever xgb.dask.train returned for this fold.
            joblib.dump(model, f'model_fold_{i}.pkl')

            # Evaluate the held-out labels and the predictions together, once.
            # Both are lazy dask arrays; a single dask.compute call avoids
            # re-running the label graph for each metric.
            y_true, predictions_fold = dask.compute(
                y_test,
                xgb.dask.predict(client, model, X_test),
            )

            cm = confusion_matrix(y_true, predictions_fold)
            print(f"Macierz klasyfikacji dla fold {i}:\n", cm)
            print(classification_report(y_true, predictions_fold))

In [None]:
import dask.array as da
from dask_ml.model_selection import GridSearchCV
import xgboost as xgb
from dask.distributed import Client

# Hyper-parameter grid search for the XGBoost classifier.
# `X` (features) and `y` (labels) are expected to be defined by an earlier cell.

# Wrap the inputs as dask arrays chunked along rows only (1000 rows per chunk).
X = da.asarray(X).rechunk((1000, X.shape[1]))
y = da.asarray(y).rechunk((1000,))

# 3 * 3 * 3 * 3 * 1 = 81 parameter combinations, each evaluated with cv=5.
param_grid = {
    'max_depth': [3, 4, 5],
    'eta': [0.01, 0.1, 0.2],
    'subsample': [0.5, 0.75, 1.0],
    'min_child_weight': [1, 2, 3],
    'num_class': [7]
}

model = xgb.dask.DaskXGBClassifier(objective='multi:softmax', random_state=94)

# Scope the distributed client to the search. A bare `Client()` that is never
# closed leaves its implicitly created local cluster running, so the next
# cluster started in this kernel finds the default dashboard port taken.
# NOTE(review): confirm that dask_ml's GridSearchCV hands dask collections
# (rather than materialised arrays) to DaskXGBClassifier.fit; if it does not,
# a plain xgb.XGBClassifier is the estimator to search over here.
with Client() as client:
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X, y)

best_params = grid_search.best_params_
print("Najlepsze parametry:", best_params)

# Hand-picked starting values, printed next to the search result for comparison.
initial_params = {
    'max_depth': 4,
    'eta': 0.01,
    'subsample': 0.5,
    'min_child_weight': 0.5,
    'num_class': 7
}

print("Początkowe parametry:", initial_params)