In [1]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask import dataframe as dd
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import optuna
import gc

In [2]:
# Read the three competition CSVs into pandas DataFrames in a single pass.
sample_submission, train, test = map(
    pd.read_csv,
    (
        '../input/sample_submission.csv',
        '../input/train.csv',
        '../input/test.csv',
    ),
)

In [3]:
# Display the column index of the training frame (bare last expression -> rich output).
train.columns

Index(['id', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8',
       ...
       'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99',
       'target'],
      dtype='object', length=102)

In [4]:
# Feature column names: every column of `test` except the first one.
# NOTE(review): presumably the skipped column is `id` (as in train.columns) - confirm for test.csv.
columns = test.columns[1:]

In [5]:
# Restrict the training frame to the feature columns selected from `test`.
data = train[columns]

In [6]:
# Labels as a plain NumPy array (via .values), aligned positionally with `data`.
target = train['target'].values

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Re-attach the labels as a `target` column so each split can be written to one CSV.
train_x = train_x.assign(target=train_y)
test_x = test_x.assign(target=test_y)

In [9]:
# Persist both splits (features + target, no index column) so Dask can read them back.
test_x.to_csv(path_or_buf='../input/test_x.csv', index=False)
train_x.to_csv(path_or_buf='../input/train_x.csv', index=False)

In [10]:
# Reload the two splits as Dask DataFrames; this rebinds `train_x` / `test_x`
# from pandas objects to Dask collections.
train_x, test_x = (
    dd.read_csv('../input/train_x.csv'),
    dd.read_csv('../input/test_x.csv'),
)

In [11]:
# Separate labels from features for each split. The right-hand side is fully
# evaluated before rebinding, so `train_x['target']` still sees the original frame.
# NOTE: Index.difference sorts its result by default, so the feature columns end
# up in sorted name order; the same selection is applied to both splits.
train_x, train_y = train_x[train_x.columns.difference(['target'])], train_x['target']

test_x, test_y = test_x[test_x.columns.difference(['target'])], test_x['target']

In [12]:
# Start a local dask-cuda cluster with two workers and connect a client to it.
# NOTE(review): presumably one worker per GPU on this machine - confirm n_workers matches the hardware.
cluster = LocalCUDACluster(n_workers=2)
client = Client(cluster)

distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize


In [13]:
# Training matrix built from the Dask feature frame and label series.
dtrain = xgb.dask.DaskDMatrix(client, train_x, train_y)

In [14]:
# Hold-out matrix; the objective function predicts on it for every trial.
dtest = xgb.dask.DaskDMatrix(client, test_x, test_y)

In [15]:
# Number of boosting rounds passed to xgb.dask.train in `objective` (fixed, not tuned).
num_round = 1000

In [16]:
def objective(trial):
    """Optuna objective: train XGBoost on the Dask cluster and score it by ROC AUC.

    Samples one hyper-parameter configuration from ``trial``, trains a booster
    for ``num_round`` rounds on ``dtrain``, predicts on ``dtest`` and returns
    the ROC AUC against ``test_y``.

    Relies on notebook globals: ``client``, ``dtrain``, ``dtest``, ``test_y``
    and ``num_round``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the hold-out predictions (the study maximises this value).
    """
    params = {
        # Single-choice categoricals pin a constant while still recording it in
        # ``trial.params`` / ``study.best_trial.params``.
        'objective': trial.suggest_categorical('objective', ['binary:logistic']),
        'tree_method': trial.suggest_categorical('tree_method', ['gpu_hist']),  # 'gpu_hist','hist'
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
        # plain suggest_float replaces the deprecated suggest_uniform. Ranges unchanged.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_categorical('max_depth', [3, 5, 7, 9, 11, 13, 15, 17, 20]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        # NOTE(review): no `evals` are passed to train() below, so this metric is
        # presumably never reported; the returned score is ROC AUC regardless.
        'eval_metric': trial.suggest_categorical('eval_metric', ['logloss']),
    }

    output = xgb.dask.train(client, params, dtrain, num_round)

    booster = output['booster']  # booster is the trained model
    booster.set_param({'predictor': 'gpu_predictor'})

    # Distributed prediction returns a lazy collection; materialise it locally.
    predictions = xgb.dask.predict(client, booster, dtest).compute()

    # `test_y` is a Dask Series after the reload cell but a NumPy array right
    # after train_test_split; materialise explicitly so scikit-learn always
    # receives concrete labels, whichever cell ran last.
    labels = test_y.compute() if hasattr(test_y, 'compute') else test_y

    return roc_auc_score(labels, predictions)

In [17]:
# In-memory study; direction is 'maximize' because `objective` returns ROC AUC.
study = optuna.create_study(direction='maximize')


[32m[I 2021-11-09 16:56:56,190][0m A new study created in memory with name: no-name-7e901587-142e-499f-b5b2-c4173f494bf7[0m


In [18]:
%%time
# Short smoke run: two trials to check the pipeline end to end and gauge per-trial cost.
study.optimize(objective, n_trials=2)

[16:56:56] task [xgboost.dask]:tcp://192.168.1.200:42753 got new rank 0
[16:56:56] task [xgboost.dask]:tcp://192.168.1.200:33013 got new rank 1
[32m[I 2021-11-09 16:57:56,627][0m Trial 0 finished with value: 0.7352219007126887 and parameters: {'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'lambda': 0.12761069658740856, 'alpha': 0.5844807231320448, 'colsample_bytree': 0.9664757369461481, 'subsample': 0.6608467419110977, 'learning_rate': 0.00514714073942699, 'max_depth': 9, 'min_child_weight': 5, 'eval_metric': 'logloss'}. Best is trial 0 with value: 0.7352219007126887.[0m
[16:57:56] task [xgboost.dask]:tcp://192.168.1.200:33013 got new rank 0
[16:57:56] task [xgboost.dask]:tcp://192.168.1.200:42753 got new rank 1
[32m[I 2021-11-09 16:58:49,310][0m Trial 1 finished with value: 0.7324377000215796 and parameters: {'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'lambda': 0.018470840902838615, 'alpha': 0.3955118073558127, 'colsample_bytree': 0.4288592957361059, 

CPU times: user 7.35 s, sys: 1.85 s, total: 9.2 s
Wall time: 1min 53s


In [19]:
%%time
# Continue the same study with 50 more trials; trials accumulate across calls
# (the log for this cell starts at Trial 2).
study.optimize(objective, n_trials=50)

[16:58:59] task [xgboost.dask]:tcp://192.168.1.200:42753 got new rank 0
[16:58:59] task [xgboost.dask]:tcp://192.168.1.200:33013 got new rank 1
[32m[I 2021-11-09 16:59:32,274][0m Trial 2 finished with value: 0.7408484479645058 and parameters: {'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'lambda': 1.069455276982022, 'alpha': 0.006134905918959812, 'colsample_bytree': 0.7401074793315064, 'subsample': 0.7834687101649491, 'learning_rate': 0.059224784797189754, 'max_depth': 13, 'min_child_weight': 230, 'eval_metric': 'logloss'}. Best is trial 2 with value: 0.7408484479645058.[0m
[16:59:32] task [xgboost.dask]:tcp://192.168.1.200:33013 got new rank 0
[16:59:32] task [xgboost.dask]:tcp://192.168.1.200:42753 got new rank 1
[32m[I 2021-11-09 17:02:32,145][0m Trial 3 finished with value: 0.7252175579402744 and parameters: {'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'lambda': 7.889018867038529, 'alpha': 6.9149029803981525, 'colsample_bytree': 0.5019252330946251,

CPU times: user 3min 19s, sys: 51.4 s, total: 4min 10s
Wall time: 59min 6s


In [20]:
%%time
# Another 100 trials appended to the same study (the log for this cell starts at Trial 52).
study.optimize(objective, n_trials=100)

[18:03:08] task [xgboost.dask]:tcp://192.168.1.200:33013 got new rank 0
[18:03:08] task [xgboost.dask]:tcp://192.168.1.200:42753 got new rank 1
[32m[I 2021-11-09 18:05:08,066][0m Trial 52 finished with value: 0.7467434326748467 and parameters: {'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'lambda': 0.9860271941424774, 'alpha': 0.07703358075771041, 'colsample_bytree': 0.6912324428114551, 'subsample': 0.8472077556118394, 'learning_rate': 0.012476978800581542, 'max_depth': 20, 'min_child_weight': 89, 'eval_metric': 'logloss'}. Best is trial 22 with value: 0.7475449934082301.[0m
[18:05:08] task [xgboost.dask]:tcp://192.168.1.200:42753 got new rank 0
[18:05:08] task [xgboost.dask]:tcp://192.168.1.200:33013 got new rank 1
[32m[I 2021-11-09 18:06:39,898][0m Trial 53 finished with value: 0.745546441269834 and parameters: {'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'lambda': 7.135796529000328, 'alpha': 0.8741289612976583, 'colsample_bytree': 0.882699791882525,

CPU times: user 4min 11s, sys: 54 s, total: 5min 5s
Wall time: 1h 7min 9s


In [21]:
%%time
# A further 250 trials appended to the same study (the log for this cell starts at Trial 152).
study.optimize(objective, n_trials=250)

[22:02:17] task [xgboost.dask]:tcp://192.168.1.200:33013 got new rank 0
[22:02:17] task [xgboost.dask]:tcp://192.168.1.200:42753 got new rank 1
[32m[I 2021-11-09 22:02:49,366][0m Trial 152 finished with value: 0.7477377613579426 and parameters: {'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'lambda': 0.34505620571806894, 'alpha': 0.0012169353550819187, 'colsample_bytree': 0.38109597254884664, 'subsample': 0.880267502090314, 'learning_rate': 0.02532200006965027, 'max_depth': 13, 'min_child_weight': 250, 'eval_metric': 'logloss'}. Best is trial 102 with value: 0.7483121100487911.[0m
[22:02:49] task [xgboost.dask]:tcp://192.168.1.200:33013 got new rank 0
[22:02:49] task [xgboost.dask]:tcp://192.168.1.200:42753 got new rank 1
[32m[I 2021-11-09 22:03:16,864][0m Trial 153 finished with value: 0.7478639092499075 and parameters: {'objective': 'binary:logistic', 'tree_method': 'gpu_hist', 'lambda': 0.14292457448241044, 'alpha': 0.0018809941080713398, 'colsample_bytree': 0.3011

CPU times: user 7min 55s, sys: 1min 27s, total: 9min 22s
Wall time: 1h 53min 30s


In [22]:
# Hyper-parameters of the best-scoring trial seen so far in the study.
study.best_params

{'objective': 'binary:logistic',
 'tree_method': 'gpu_hist',
 'lambda': 0.04605965699636567,
 'alpha': 0.03842772979823298,
 'colsample_bytree': 0.3155289537736205,
 'subsample': 0.7947859156013958,
 'learning_rate': 0.026671327175212152,
 'max_depth': 13,
 'min_child_weight': 295,
 'eval_metric': 'logloss'}