In [1]:
%load_ext watermark

In [2]:
%watermark

Last updated: 2024-02-01T14:58:59.723282-08:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 8.21.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 5.15.0-1042-nvidia
Machine     : x86_64
Processor   : x86_64
CPU cores   : 224
Architecture: 64bit



In [3]:
%watermark --gpu

GPU Info: Install the gpu extra (pip install 'watermark[gpu]') to display GPU information for NVIDIA chipsets



In [4]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask import dataframe as dd
from dask.delayed import delayed
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import optuna
import gc
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
%watermark --iversions

xgboost: 2.0.3
numpy  : 1.26.3
optuna : 3.5.0
pandas : 1.5.3
logging: 0.5.1.2
dask   : 2023.11.0



In [6]:
!nvidia-smi -L

GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-5c583ee7-8fb1-b26a-4bea-16e45d984a32)
GPU 1: NVIDIA H100 80GB HBM3 (UUID: GPU-a6ab06f5-a6e2-18c1-9dec-0dcd29f44a46)
GPU 2: NVIDIA H100 80GB HBM3 (UUID: GPU-bb8d5098-3c56-c48d-a0a3-fcdfcec6d3f5)
GPU 3: NVIDIA H100 80GB HBM3 (UUID: GPU-cdbe686b-1611-999b-8e8d-a3c5f35b40c4)
GPU 4: NVIDIA H100 80GB HBM3 (UUID: GPU-0df1cef0-fc95-cc88-b5f4-239889b3acba)
GPU 5: NVIDIA H100 80GB HBM3 (UUID: GPU-9dca657c-dbe4-08b7-fe7c-5653f183f0b6)
GPU 6: NVIDIA H100 80GB HBM3 (UUID: GPU-33389782-e2ad-5022-997d-cf470313879c)
GPU 7: NVIDIA H100 80GB HBM3 (UUID: GPU-926eaa05-87fb-35e0-0a98-adb2ad41d0be)


In [7]:
cluster = LocalCUDACluster(n_workers=8)
client = Client(cluster)

In [8]:
!nvidia-smi

Thu Feb  1 14:59:05 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:1B:00.0 Off |                    0 |
| N/A   27C    P0              67W / 700W |      4MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  | 00000000:43:00.0 Off |  

In [78]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [10]:
train_folds = []
val_folds = []
train_ys = []
val_ys = []

for i in range(5):
    print(f'Loading fold {i}')
    train_fold_d = delayed(pd.read_csv)(f'../input/xgtrain_oho_fold_{i}.csv.gz')
    train_fold = dd.from_delayed(train_fold_d)
    
    val_fold_d = delayed(pd.read_csv)(f'../input/xgval_oho_fold_{i}.csv.gz')
    val_fold = dd.from_delayed(val_fold_d)
    
    
    train_y = train_fold['target']
    train_fold = train_fold[train_fold.columns.difference(['target'])]
    
    val_y = val_fold['target']
    val_fold = val_fold[val_fold.columns.difference(['target'])]
    
    train_folds.append(train_fold)
    val_folds.append(val_fold)
    
    train_ys.append(train_y)
    val_ys.append(val_y)

Loading fold 0
Loading fold 1
Loading fold 2
Loading fold 3
Loading fold 4


In [11]:
train = pd.read_csv('../input/train.csv.zip')

target = train['target'].values
target

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
train_oof = np.zeros((target.shape[0],))

num_round = 1000

def objective(trial):
        
    params = {
        'objective': trial.suggest_categorical('objective',['binary:logistic']),
        'tree_method': trial.suggest_categorical('tree_method',['hist']),  # 'gpu_hist','hist'
        'device': trial.suggest_categorical('device',['cuda']),
        'lambda': trial.suggest_float('lambda',1e-3,10.0, log=True),
        'alpha': trial.suggest_float('alpha',1e-3,10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3,1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,0.1, log=True),
        #'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        #'random_state': trial.suggest_categorical('random_state', [24,48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1,300),
        'eval_metric': trial.suggest_categorical('eval_metric',['logloss']),

    }

    kf = KFold(5, shuffle=True, random_state=137)

    for i, (train_index, val_index) in enumerate(kf.split(train,target)):
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        predictions = xgb.dask.predict(client, booster, dval)
        predictions = predictions.compute()
        train_oof[val_index] = predictions
        del dtrain, dval, output
        gc.collect()
        gc.collect()

    gini = gini_normalized(target, train_oof)
    
    return gini

In [13]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Setup the root logger.
logger.addHandler(logging.FileHandler("optuna_xgb_output_5.log", mode="w"))

optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

study = optuna.create_study(storage="sqlite:///xgb_optuna_porto_5.db", study_name="five_fold_optuna_xgb_5", direction='maximize')

In [14]:
%%time
logger.info("Start optimization.")
study.optimize(objective, n_trials=3)

[14:59:48] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:00:10] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:00:24] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:00:38] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:00:51] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:01:04] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:01:22] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:01:40] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:01:57] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" 

CPU times: user 39.7 s, sys: 15.4 s, total: 55.1 s
Wall time: 4min 8s


In [15]:
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.head()

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_device,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_objective,params_subsample,params_tree_method,state
0,0,0.261091,9.320316,0.477649,cuda,logloss,0.074662,0.00182,16,96,binary:logistic,0.524617,hist,COMPLETE
1,1,0.286779,0.62609,0.61729,cuda,logloss,0.014329,0.007502,19,64,binary:logistic,0.548255,hist,COMPLETE
2,2,0.271957,0.14741,0.453876,cuda,logloss,0.001651,0.002307,14,222,binary:logistic,0.770649,hist,COMPLETE


In [16]:
%%time
study.optimize(objective, n_trials=5)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_5.csv', index=False)
df.head(8)

[15:04:08] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:04:23] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:04:38] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:04:51] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:05:05] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:05:19] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:05:34] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:05:49] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:06:02] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" 

CPU times: user 1min 4s, sys: 16.6 s, total: 1min 20s
Wall time: 6min 14s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_device,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_objective,params_subsample,params_tree_method,state
0,0,0.261091,9.320316,0.477649,cuda,logloss,0.074662,0.00182,16,96,binary:logistic,0.524617,hist,COMPLETE
1,1,0.286779,0.62609,0.61729,cuda,logloss,0.014329,0.007502,19,64,binary:logistic,0.548255,hist,COMPLETE
2,2,0.271957,0.14741,0.453876,cuda,logloss,0.001651,0.002307,14,222,binary:logistic,0.770649,hist,COMPLETE
3,3,0.287178,0.186078,0.790183,cuda,logloss,0.141901,0.015706,10,229,binary:logistic,0.71829,hist,COMPLETE
4,4,0.274219,0.115748,0.739794,cuda,logloss,0.123462,0.047147,24,209,binary:logistic,0.444212,hist,COMPLETE
5,5,0.219778,0.152644,0.780991,cuda,logloss,0.795898,0.057804,23,99,binary:logistic,0.972187,hist,COMPLETE
6,6,0.273179,5.072753,0.46235,cuda,logloss,0.014858,0.002766,19,134,binary:logistic,0.69193,hist,COMPLETE
7,7,0.286466,0.003027,0.341739,cuda,logloss,2.955274,0.011005,17,185,binary:logistic,0.486281,hist,COMPLETE


In [17]:
study.best_trial.params

{'objective': 'binary:logistic',
 'tree_method': 'hist',
 'device': 'cuda',
 'lambda': 0.14190076165750204,
 'alpha': 0.1860780014774321,
 'colsample_bytree': 0.7901829038590573,
 'subsample': 0.7182899032077363,
 'learning_rate': 0.015706484870345004,
 'max_depth': 10,
 'min_child_weight': 229,
 'eval_metric': 'logloss'}

In [18]:
df.value.max()

0.2871776897327792

In [None]:
%%time
study.optimize(objective, n_trials=500)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_5.csv', index=False)
df.head(20)

[15:10:47] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:11:06] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:11:25] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:11:43] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:12:01] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:12:20] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:12:36] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:12:53] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.

[15:13:08] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" 

In [20]:
df.value.max()

0.2895797316411886

In [21]:
study.best_trial.params

{'objective': 'binary:logistic',
 'tree_method': 'hist',
 'device': 'cuda',
 'lambda': 0.016329246014877414,
 'alpha': 3.33029065395022,
 'colsample_bytree': 0.5456389940628341,
 'subsample': 0.7684743978358726,
 'learning_rate': 0.014586087705844453,
 'max_depth': 7,
 'min_child_weight': 135,
 'eval_metric': 'logloss'}

In [27]:
best_params = study.best_trial.params

In [31]:
test = delayed(pd.read_csv)(f'../input/X_test_df.csv.zip')
test = dd.from_delayed(test)
test = test[test.columns.difference(['id'])]

In [32]:
test.shape

(Delayed('int-54936ab0-f805-41aa-aaee-2750e5bf29f8'), 217)

In [33]:
test.head()

Unnamed: 0,col_0,col_1,col_10,col_100,col_101,col_102,col_103,col_104,col_105,col_106,...,col_90,col_91,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99
0,0.0,8.0,0.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,5.0,0.9,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,3.0,0.4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,6.0,0.1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,7.0,0.9,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
dtest = xgb.dask.DaskDMatrix(client, test.values, enable_categorical=True)

In [35]:
%%time
num_round = 1000
train_oof = np.zeros((target.shape[0],))
test_preds = 0

kf = KFold(5, shuffle=True, random_state=137)

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
        print(i)
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, best_params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        val_predictions = xgb.dask.predict(client, booster, dval)
        val_predictions = val_predictions.compute()

        test_predictions = xgb.dask.predict(client, booster, dtest)
        test_predictions = test_predictions.compute()
    
        train_oof[val_index] = val_predictions
        test_preds += test_predictions/5
    
        del dtrain, dval, output
        gc.collect()
        gc.collect()

gini = gini_normalized(target, train_oof)
print(gini)

0


[07:34:31] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



1


[07:34:49] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



2


[07:35:08] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



3


[07:35:26] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



4


[07:35:43] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



0.2895797316411886
CPU times: user 6.55 s, sys: 3.47 s, total: 10 s
Wall time: 1min 29s


In [36]:
test_preds

array([0.02825447, 0.02195566, 0.02305012, ..., 0.03104293, 0.02074758,
       0.0294082 ], dtype=float32)

In [38]:
test_preds.min()

0.006525852

In [40]:
train_oof.max()

0.27948254346847534

In [41]:
train_oof.min()

0.006003982853144407

In [42]:
submission = pd.read_csv('../input/sample_submission.csv.zip')

In [43]:
submission['target'] = test_preds
submission.head()

Unnamed: 0,id,target
0,0,0.028254
1,1,0.021956
2,2,0.02305
3,3,0.013815
4,4,0.035183


In [44]:
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_0.csv', index=False)

0.2849 public, 0.29105 private

In [45]:
%%time
num_round = 1000
n_seeds = 5
n_folds = 5

train_oof = np.zeros((target.shape[0],))
test_preds = 0

kf = KFold(n_folds, shuffle=True, random_state=137)

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    for jj in range(n_seeds):

        best_params['random_state'] = 3*jj**2+1777
    
        print(i)
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, best_params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        val_predictions = xgb.dask.predict(client, booster, dval)
        val_predictions = val_predictions.compute()

        test_predictions = xgb.dask.predict(client, booster, dtest)
        test_predictions = test_predictions.compute()
    
        train_oof[val_index] += val_predictions/n_seeds
        test_preds += test_predictions/(n_folds*n_seeds)
    
        del dtrain, dval, output
        gc.collect()
        gc.collect()

gini = gini_normalized(target, train_oof)
print(gini)

0


[07:46:49] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



0


[07:47:08] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



0


[07:47:27] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



0


[07:47:45] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



0


[07:48:05] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



1


[07:48:24] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



1


[07:48:43] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



1


[07:49:02] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



1


[07:49:21] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



1


[07:49:39] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



2


[07:49:58] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



2


[07:50:15] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



2


[07:50:33] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



2


[07:50:52] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



2


[07:51:10] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



3


[07:51:28] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



3


[07:51:46] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



3


[07:52:04] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



3


[07:52:21] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



3


[07:52:39] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



4


[07:52:57] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



4


[07:53:15] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



4


[07:53:34] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



4


[07:53:52] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



4


[07:54:10] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



TypeError: 'numpy.float64' object is not callable

In [48]:
train_oof.max()

0.2905692458152771

In [52]:
gini(target, train_oof)

TypeError: 'numpy.float64' object is not callable

In [55]:
train_oof

AttributeError: 'numpy.ndarray' object has no attribute 'type'

In [50]:
train_oof.astype(np.float32)

array([0.04382015, 0.03149953, 0.01924309, ..., 0.01722216, 0.02299856,
       0.01950087], dtype=float32)

In [58]:
gini_normalized(target, train_oof)

0.2892946965464846

In [59]:
submission['target'] = test_preds
submission.head()

Unnamed: 0,id,target
0,0,0.027935
1,1,0.02287
2,2,0.023015
3,3,0.013951
4,4,0.035421


In [60]:
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_1.csv', index=False)

0.28486 public, 0.29113 private

In [61]:
%%time
num_round = 1000
n_seeds = 10
n_folds = 5

train_oof = np.zeros((target.shape[0],))
test_preds = 0

kf = KFold(n_folds, shuffle=True, random_state=137)

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    for jj in range(n_seeds):

        best_params['random_state'] = 3*jj**2+1777
    
        print(jj)
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, best_params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        val_predictions = xgb.dask.predict(client, booster, dval)
        val_predictions = val_predictions.compute()

        test_predictions = xgb.dask.predict(client, booster, dtest)
        test_predictions = test_predictions.compute()
    
        train_oof[val_index] += val_predictions/n_seeds
        test_preds += test_predictions/(n_folds*n_seeds)
    
        del dtrain, dval, output
        gc.collect()
        gc.collect()

gini = gini_normalized(target, train_oof)
print(gini)

0
0


[08:05:02] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



1


[08:05:21] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



2


[08:05:40] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



3


[08:05:59] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



4


[08:06:18] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



5


[08:06:37] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



6


[08:06:57] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



7


[08:07:16] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



8


[08:07:35] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



9


[08:07:53] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



1
0


[08:08:12] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



1


[08:08:31] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



2


[08:08:50] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



3


[08:09:09] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



4


[08:09:27] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



5


[08:09:49] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



6


[08:10:08] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



7


[08:10:26] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



8


[08:10:45] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



9


[08:11:05] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



2
0


[08:11:24] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



1


[08:11:42] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



2


[08:12:00] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



3


[08:12:18] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



4


[08:12:36] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



5


[08:12:54] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



6


[08:13:12] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



7


[08:13:30] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



8


[08:13:48] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



9


[08:14:06] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



3
0


[08:14:25] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



1


[08:14:43] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



2


[08:15:02] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



3


[08:15:20] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



4


[08:15:38] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



5


[08:15:56] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



6


[08:16:14] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



7


[08:16:33] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



8


[08:16:51] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



9


[08:17:10] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



4
0


[08:17:28] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



1


[08:17:46] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



2


[08:18:04] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



3


[08:18:22] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



4


[08:18:41] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



5


[08:18:59] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



6


[08:19:18] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



7


[08:19:36] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



8


[08:19:54] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



9


[08:20:12] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



0.2893686664415629
CPU times: user 1min 24s, sys: 29.8 s, total: 1min 53s
Wall time: 15min 28s


In [62]:
submission['target'] = test_preds
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_2.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,0,0.027973
1,1,0.022714
2,2,0.023101
3,3,0.013864
4,4,0.035546


0.28491 public, 0.2911 private

In [64]:
best_params

{'objective': 'binary:logistic',
 'tree_method': 'exact',
 'device': 'cpu',
 'lambda': 0.016329246014877414,
 'alpha': 3.33029065395022,
 'colsample_bytree': 0.5456389940628341,
 'subsample': 0.7684743978358726,
 'learning_rate': 0.014586087705844453,
 'max_depth': 7,
 'min_child_weight': 135,
 'eval_metric': 'logloss',
 'random_state': 1777}

In [67]:
%%time
num_round = 2000
n_seeds = 5
n_folds = 5

best_params['learning_rate'] = 0.008
best_params['device'] = 'cuda'
best_params['tree_method'] = 'hist'


train_oof = np.zeros((target.shape[0],))
test_preds = 0

kf = KFold(n_folds, shuffle=True, random_state=137)

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    for jj in range(n_seeds):

        best_params['random_state'] = 3*jj**2+1777
    
        print(jj)
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, best_params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        val_predictions = xgb.dask.predict(client, booster, dval)
        val_predictions = val_predictions.compute()

        test_predictions = xgb.dask.predict(client, booster, dtest)
        test_predictions = test_predictions.compute()
    
        train_oof[val_index] += val_predictions/n_seeds
        test_preds += test_predictions/(n_folds*n_seeds)
    
        del dtrain, dval, output
        gc.collect()
        gc.collect()

gini = gini_normalized(target, train_oof)
print(gini)

0
0


[13:48:57] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



1


[13:49:22] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



2


[13:49:46] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



3


[13:50:10] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



4


[13:50:34] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



1
0


[13:50:59] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



1


[13:51:23] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



2


[13:51:47] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



3


[13:52:11] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



4


[13:52:35] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



2
0


[13:53:00] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



1


[13:53:23] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



2


[13:53:46] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



3


[13:54:09] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



4


[13:54:33] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



3
0


[13:54:56] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



1


[13:55:20] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



2


[13:55:43] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



3


[13:56:07] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



4


[13:56:30] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



4
0


[13:56:54] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



1


[13:57:17] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



2


[13:57:40] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



3


[13:58:04] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



4


[13:58:27] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



TypeError: 'numpy.float64' object is not callable

In [69]:
gini = gini_normalized(target, train_oof)
print(gini)

0.2894314359568208


In [70]:
submission['target'] = test_preds
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_3.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,0,0.028294
1,1,0.022451
2,2,0.022705
3,3,0.013813
4,4,0.035861


0.285 public, 0.2913 public

In [79]:
%%time
num_round = 4000
n_seeds = 5
n_folds = 5

best_params['learning_rate'] = 0.004
best_params['device'] = 'cuda'
best_params['tree_method'] = 'hist'


train_oof = np.zeros((target.shape[0],))
test_preds = 0

kf = KFold(n_folds, shuffle=True, random_state=137)

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    for jj in range(n_seeds):

        best_params['random_state'] = 3*jj**2+1777
    
        print(jj)
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, best_params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        val_predictions = xgb.dask.predict(client, booster, dval)
        val_predictions = val_predictions.compute()

        test_predictions = xgb.dask.predict(client, booster, dtest)
        test_predictions = test_predictions.compute()
    
        train_oof[val_index] += val_predictions/n_seeds
        test_preds += test_predictions/(n_folds*n_seeds)
    
        del dtrain, dval, output
        gc.collect()
        gc.collect()

gini = gini_normalized(target, train_oof)
print(gini)

0
0


[14:32:21] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



1


[14:32:56] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



2


[14:33:30] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



3


[14:34:04] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



4


[14:34:38] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



1
0


[14:35:13] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



1


[14:35:47] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



2


[14:36:22] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



3


[14:36:56] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



4


[14:37:30] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



2
0


[14:38:04] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



1


[14:38:37] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



2


[14:39:09] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



3


[14:39:42] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



4


[14:40:15] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



3
0


[14:40:48] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



1


[14:41:22] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



2


[14:41:55] task [xgboost.dask-3]:tcp://127.0.0.1:38175 got new rank 0
Parameters: { "predictor" } are not used.



3


[14:42:28] task [xgboost.dask-2]:tcp://127.0.0.1:43677 got new rank 0
Parameters: { "predictor" } are not used.



4


[14:43:02] task [xgboost.dask-6]:tcp://127.0.0.1:38997 got new rank 0
Parameters: { "predictor" } are not used.



4
0


[14:43:35] task [xgboost.dask-5]:tcp://127.0.0.1:34431 got new rank 0
Parameters: { "predictor" } are not used.



1


[14:44:08] task [xgboost.dask-7]:tcp://127.0.0.1:41419 got new rank 0
Parameters: { "predictor" } are not used.



2


[14:44:40] task [xgboost.dask-4]:tcp://127.0.0.1:38713 got new rank 0
Parameters: { "predictor" } are not used.



3


[14:45:14] task [xgboost.dask-1]:tcp://127.0.0.1:45615 got new rank 0
Parameters: { "predictor" } are not used.



4


[14:45:46] task [xgboost.dask-0]:tcp://127.0.0.1:40795 got new rank 0
Parameters: { "predictor" } are not used.



0.289393324895057
CPU times: user 53 s, sys: 33.1 s, total: 1min 26s
Wall time: 13min 57s


In [80]:
submission['target'] = test_preds
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_4.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,0,0.028226
1,1,0.02236
2,2,0.02291
3,3,0.013812
4,4,0.035636


0.2849 public, 0.2913 private