In [1]:
%load_ext watermark

In [2]:
%watermark

Last updated: 2024-02-02T14:53:57.138826-08:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 8.21.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 5.15.0-1042-nvidia
Machine     : x86_64
Processor   : x86_64
CPU cores   : 224
Architecture: 64bit



In [3]:
%watermark --gpu

GPU Info: Install the gpu extra (pip install 'watermark[gpu]') to display GPU information for NVIDIA chipsets



In [4]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask import dataframe as dd
from dask.delayed import delayed
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import optuna
import gc
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
%watermark --iversions

dask   : 2023.11.0
numpy  : 1.26.3
pandas : 1.5.3
logging: 0.5.1.2
optuna : 3.5.0
xgboost: 2.0.3



In [6]:
!nvidia-smi -L

GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-5c583ee7-8fb1-b26a-4bea-16e45d984a32)
GPU 1: NVIDIA H100 80GB HBM3 (UUID: GPU-a6ab06f5-a6e2-18c1-9dec-0dcd29f44a46)
GPU 2: NVIDIA H100 80GB HBM3 (UUID: GPU-bb8d5098-3c56-c48d-a0a3-fcdfcec6d3f5)
GPU 3: NVIDIA H100 80GB HBM3 (UUID: GPU-cdbe686b-1611-999b-8e8d-a3c5f35b40c4)
GPU 4: NVIDIA H100 80GB HBM3 (UUID: GPU-0df1cef0-fc95-cc88-b5f4-239889b3acba)
GPU 5: NVIDIA H100 80GB HBM3 (UUID: GPU-9dca657c-dbe4-08b7-fe7c-5653f183f0b6)
GPU 6: NVIDIA H100 80GB HBM3 (UUID: GPU-33389782-e2ad-5022-997d-cf470313879c)
GPU 7: NVIDIA H100 80GB HBM3 (UUID: GPU-926eaa05-87fb-35e0-0a98-adb2ad41d0be)


In [7]:
cluster = LocalCUDACluster(n_workers=8)
client = Client(cluster)

In [8]:
!nvidia-smi

Fri Feb  2 14:54:24 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  | 00000000:1B:00.0 Off |                    0 |
| N/A   27C    P0              67W / 700W |      4MiB / 81559MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80GB HBM3          On  | 00000000:43:00.0 Off |  

In [52]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [10]:
%%time
train_folds = []
val_folds = []
train_ys = []
val_ys = []

for i in range(5):
    print(f'Loading fold {i}')
    train_fold_d = delayed(pd.read_csv)(f'../input/xgtrain_oho_fold_{i}_5X.csv.gz')
    train_fold = dd.from_delayed(train_fold_d)
    
    val_fold_d = delayed(pd.read_csv)(f'../input/xgval_oho_fold_{i}.csv.gz')
    val_fold = dd.from_delayed(val_fold_d)
    
    
    train_y = train_fold['target']
    train_fold = train_fold[train_fold.columns.difference(['target'])]
    
    val_y = val_fold['target']
    val_fold = val_fold[val_fold.columns.difference(['target'])]
    
    train_folds.append(train_fold)
    val_folds.append(val_fold)
    
    train_ys.append(train_y)
    val_ys.append(val_y)

Loading fold 0
Loading fold 1
Loading fold 2
Loading fold 3
Loading fold 4
CPU times: user 2.13 s, sys: 7.45 s, total: 9.58 s
Wall time: 1min 53s


In [11]:
train = pd.read_csv('../input/train.csv.zip')

target = train['target'].values
target

array([0, 0, 0, ..., 0, 0, 0])

In [12]:
train_oof = np.zeros((target.shape[0],))

num_round = 1000

def objective(trial):
        
    params = {
        'objective': trial.suggest_categorical('objective',['binary:logistic']),
        'tree_method': trial.suggest_categorical('tree_method',['hist']),  # 'gpu_hist','hist'
        'device': trial.suggest_categorical('device',['cuda']),
        'lambda': trial.suggest_float('lambda',1e-3,10.0, log=True),
        'alpha': trial.suggest_float('alpha',1e-3,10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3,1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,0.1, log=True),
        #'n_estimators': trial.suggest_categorical('n_estimators', [1000]),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        #'random_state': trial.suggest_categorical('random_state', [24,48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1,300),
        'eval_metric': trial.suggest_categorical('eval_metric',['logloss']),

    }

    kf = KFold(5, shuffle=True, random_state=137)

    for i, (train_index, val_index) in enumerate(kf.split(train,target)):
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        predictions = xgb.dask.predict(client, booster, dval)
        predictions = predictions.compute()
        train_oof[val_index] = predictions
        del dtrain, dval, output
        gc.collect()
        gc.collect()

    gini = gini_normalized(target, train_oof)
    
    return gini

In [13]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Setup the root logger.
logger.addHandler(logging.FileHandler("optuna_xgb_output_6.log", mode="w"))

optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

study = optuna.create_study(storage="sqlite:///xgb_optuna_porto_6.db", study_name="five_fold_optuna_xgb_6", direction='maximize')

In [14]:
%%time
logger.info("Start optimization.")
study.optimize(objective, n_trials=3)

[14:57:47] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[14:58:58] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:00:03] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:01:04] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:02:04] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:03:05] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:04:10] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:05:12] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:06:10] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" 

CPU times: user 54.9 s, sys: 31.3 s, total: 1min 26s
Wall time: 14min 46s


In [15]:
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.head()

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_device,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_objective,params_subsample,params_tree_method,state
0,0,0.203132,0.005223,0.66707,cuda,logloss,0.438581,0.056886,21,232,binary:logistic,0.873862,hist,COMPLETE
1,1,0.280209,1.319635,0.536039,cuda,logloss,0.220817,0.003086,13,244,binary:logistic,0.447068,hist,COMPLETE
2,2,0.255587,6.146192,0.655108,cuda,logloss,0.083982,0.054008,8,160,binary:logistic,0.599467,hist,COMPLETE


In [16]:
%%time
study.optimize(objective, n_trials=5)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_6.csv', index=False)
df.head(8)

[15:12:28] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:13:30] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:14:31] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:15:27] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:16:24] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:17:22] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:18:24] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:19:25] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:20:23] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" 

CPU times: user 53.4 s, sys: 1min 3s, total: 1min 56s
Wall time: 25min 50s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_device,params_eval_metric,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_objective,params_subsample,params_tree_method,state
0,0,0.203132,0.005223,0.66707,cuda,logloss,0.438581,0.056886,21,232,binary:logistic,0.873862,hist,COMPLETE
1,1,0.280209,1.319635,0.536039,cuda,logloss,0.220817,0.003086,13,244,binary:logistic,0.447068,hist,COMPLETE
2,2,0.255587,6.146192,0.655108,cuda,logloss,0.083982,0.054008,8,160,binary:logistic,0.599467,hist,COMPLETE
3,3,0.276599,0.006279,0.751818,cuda,logloss,0.396653,0.011434,13,201,binary:logistic,0.96805,hist,COMPLETE
4,4,0.278449,0.005882,0.694174,cuda,logloss,6.293813,0.014028,17,288,binary:logistic,0.570029,hist,COMPLETE
5,5,0.195938,0.00563,0.331,cuda,logloss,0.003866,0.02599,21,27,binary:logistic,0.572549,hist,COMPLETE
6,6,0.218852,0.131168,0.760137,cuda,logloss,0.136189,0.053183,17,179,binary:logistic,0.456171,hist,COMPLETE
7,7,0.286798,1.752776,0.425398,cuda,logloss,5.914039,0.006924,9,207,binary:logistic,0.688676,hist,COMPLETE


In [17]:
study.best_trial.params

{'objective': 'binary:logistic',
 'tree_method': 'hist',
 'device': 'cuda',
 'lambda': 5.914039063058422,
 'alpha': 1.7527764647364508,
 'colsample_bytree': 0.4253976417781009,
 'subsample': 0.688675862971879,
 'learning_rate': 0.0069238827296158306,
 'max_depth': 9,
 'min_child_weight': 207,
 'eval_metric': 'logloss'}

In [18]:
df.value.max()

0.2867979219969091

In [None]:
%%time
study.optimize(objective, n_trials=1000)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_6.csv', index=False)
df.head(20)

[15:38:19] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:39:12] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:40:04] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:40:53] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:41:42] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:44:22] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:45:15] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:46:07] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.

[15:47:00] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" 

In [25]:
df.value.max()

0.2893081424033739

In [26]:
study.best_trial.params

{'objective': 'binary:logistic',
 'tree_method': 'hist',
 'device': 'cuda',
 'lambda': 0.023609723689578282,
 'alpha': 2.401042111777277,
 'colsample_bytree': 0.5166833151741854,
 'subsample': 0.46914803789445053,
 'learning_rate': 0.012387688807599696,
 'max_depth': 7,
 'min_child_weight': 277,
 'eval_metric': 'logloss'}

In [27]:
best_params = study.best_trial.params

In [28]:
test = delayed(pd.read_csv)(f'../input/X_test_df.csv.zip')
test = dd.from_delayed(test)
test = test[test.columns.difference(['id'])]

In [32]:
test.shape

(Delayed('int-54936ab0-f805-41aa-aaee-2750e5bf29f8'), 217)

In [29]:
test.head()

Unnamed: 0,col_0,col_1,col_10,col_100,col_101,col_102,col_103,col_104,col_105,col_106,...,col_90,col_91,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99
0,0.0,8.0,0.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,5.0,0.9,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,3.0,0.4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,6.0,0.1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,7.0,0.9,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
dtest = xgb.dask.DaskDMatrix(client, test.values, enable_categorical=True)

In [31]:
%%time
num_round = 1000
train_oof = np.zeros((target.shape[0],))
test_preds = 0

kf = KFold(5, shuffle=True, random_state=137)

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
        print(i)
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, best_params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        val_predictions = xgb.dask.predict(client, booster, dval)
        val_predictions = val_predictions.compute()

        test_predictions = xgb.dask.predict(client, booster, dtest)
        test_predictions = test_predictions.compute()
    
        train_oof[val_index] = val_predictions
        test_preds += test_predictions/5
    
        del dtrain, dval, output
        gc.collect()
        gc.collect()

gini = gini_normalized(target, train_oof)
print(gini)

0


[08:59:06] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:00:05] task [xgboost.dask-7]:tcp://127.0.0.1:40291 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:01:03] task [xgboost.dask-2]:tcp://127.0.0.1:44181 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:01:57] task [xgboost.dask-3]:tcp://127.0.0.1:42325 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:02:52] task [xgboost.dask-5]:tcp://127.0.0.1:34867 got new rank 0
Parameters: { "predictor" } are not used.



0.2893081424033739
CPU times: user 11.6 s, sys: 9.41 s, total: 21 s
Wall time: 4min 39s


In [32]:
test_preds

array([0.02774813, 0.02304585, 0.02404194, ..., 0.03249284, 0.02122148,
       0.02917948], dtype=float32)

In [33]:
test_preds.min()

0.006559409

In [34]:
train_oof.max()

0.29273033142089844

In [35]:
train_oof.min()

0.006362931802868843

In [36]:
submission = pd.read_csv('../input/sample_submission.csv.zip')

In [37]:
submission['target'] = test_preds
submission.head()

Unnamed: 0,id,target
0,0,0.027748
1,1,0.023046
2,2,0.024042
3,3,0.013742
4,4,0.036637


In [38]:
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_5X_augment_0.csv', index=False)

0.28541 public, 0.29097 private

In [39]:
%%time
num_round = 1000
n_seeds = 5
n_folds = 5

train_oof = np.zeros((target.shape[0],))
test_preds = 0

kf = KFold(n_folds, shuffle=True, random_state=137)

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    for jj in range(n_seeds):

        best_params['random_state'] = 3*jj**2+1777
    
        print(i)
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, best_params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        val_predictions = xgb.dask.predict(client, booster, dval)
        val_predictions = val_predictions.compute()

        test_predictions = xgb.dask.predict(client, booster, dtest)
        test_predictions = test_predictions.compute()
    
        train_oof[val_index] += val_predictions/n_seeds
        test_preds += test_predictions/(n_folds*n_seeds)
    
        del dtrain, dval, output
        gc.collect()
        gc.collect()

gini = gini_normalized(target, train_oof)
print(gini)

0


[09:10:05] task [xgboost.dask-4]:tcp://127.0.0.1:43803 got new rank 0
Parameters: { "predictor" } are not used.



0


[09:11:03] task [xgboost.dask-0]:tcp://127.0.0.1:41675 got new rank 0
Parameters: { "predictor" } are not used.



0


[09:12:01] task [xgboost.dask-6]:tcp://127.0.0.1:45909 got new rank 0
Parameters: { "predictor" } are not used.



0


[09:12:59] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.



0


[09:13:57] task [xgboost.dask-7]:tcp://127.0.0.1:40291 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:14:58] task [xgboost.dask-2]:tcp://127.0.0.1:44181 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:15:56] task [xgboost.dask-3]:tcp://127.0.0.1:42325 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:16:55] task [xgboost.dask-5]:tcp://127.0.0.1:34867 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:17:55] task [xgboost.dask-4]:tcp://127.0.0.1:43803 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:18:53] task [xgboost.dask-0]:tcp://127.0.0.1:41675 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:19:52] task [xgboost.dask-6]:tcp://127.0.0.1:45909 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:22:37] task [xgboost.dask-2]:tcp://127.0.0.1:44181 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:23:32] task [xgboost.dask-3]:tcp://127.0.0.1:42325 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:24:28] task [xgboost.dask-5]:tcp://127.0.0.1:34867 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:25:24] task [xgboost.dask-4]:tcp://127.0.0.1:43803 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:26:18] task [xgboost.dask-0]:tcp://127.0.0.1:41675 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:27:14] task [xgboost.dask-6]:tcp://127.0.0.1:45909 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:28:10] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:29:05] task [xgboost.dask-7]:tcp://127.0.0.1:40291 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:30:00] task [xgboost.dask-2]:tcp://127.0.0.1:44181 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:30:55] task [xgboost.dask-3]:tcp://127.0.0.1:42325 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:31:50] task [xgboost.dask-5]:tcp://127.0.0.1:34867 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:32:45] task [xgboost.dask-4]:tcp://127.0.0.1:43803 got new rank 0
Parameters: { "predictor" } are not used.



TypeError: 'numpy.float64' object is not callable

In [40]:
train_oof.max()

0.31878868862986565

In [43]:
gini(target, train_oof)

0.1391760444592586

In [44]:
train_oof

array([0.04439605, 0.03109433, 0.01929239, ..., 0.01864094, 0.02432294,
       0.01950788])

In [50]:
train_oof.astype(np.float32)

array([0.04382015, 0.03149953, 0.01924309, ..., 0.01722216, 0.02299856,
       0.01950087], dtype=float32)

In [45]:
gini_normalized(target, train_oof)

0.2888810875148966

In [46]:
submission['target'] = test_preds
submission.head()

Unnamed: 0,id,target
0,0,0.027514
1,1,0.023573
2,2,0.024141
3,3,0.013834
4,4,0.036634


In [47]:
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_5X_augment_5_seed.csv', index=False)

0.28531 public, 0.2912 private

In [None]:
%%time
num_round = 2000
n_seeds = 5
n_folds = 5

best_params['learning_rate'] = 0.008
best_params['device'] = 'cuda'
best_params['tree_method'] = 'hist'


train_oof = np.zeros((target.shape[0],))
test_preds = 0

kf = KFold(n_folds, shuffle=True, random_state=137)

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    for jj in range(n_seeds):

        best_params['random_state'] = 3*jj**2+1777
    
        print(jj)
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, best_params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.save_model(f'model_{i}_{jj}.json')
        booster.set_param({'predictor': 'gpu_predictor'})
        val_predictions = xgb.dask.predict(client, booster, dval)
        val_predictions = val_predictions.compute()

        test_predictions = xgb.dask.predict(client, booster, dtest)
        test_predictions = test_predictions.compute()
    
        train_oof[val_index] += val_predictions/n_seeds
        test_preds += test_predictions/(n_folds*n_seeds)
    
        del dtrain, dval, output
        gc.collect()
        gc.collect()

gini = gini_normalized(target, train_oof)
print(gini)

0
0


[09:34:23] task [xgboost.dask-0]:tcp://127.0.0.1:41675 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:35:35] task [xgboost.dask-6]:tcp://127.0.0.1:45909 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:36:46] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:37:56] task [xgboost.dask-7]:tcp://127.0.0.1:40291 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:39:08] task [xgboost.dask-2]:tcp://127.0.0.1:44181 got new rank 0
Parameters: { "predictor" } are not used.



1
0


[09:40:20] task [xgboost.dask-3]:tcp://127.0.0.1:42325 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:41:31] task [xgboost.dask-5]:tcp://127.0.0.1:34867 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:42:44] task [xgboost.dask-4]:tcp://127.0.0.1:43803 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:43:56] task [xgboost.dask-0]:tcp://127.0.0.1:41675 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:45:09] task [xgboost.dask-6]:tcp://127.0.0.1:45909 got new rank 0
Parameters: { "predictor" } are not used.



2
0


[09:46:19] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:47:26] task [xgboost.dask-7]:tcp://127.0.0.1:40291 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:48:34] task [xgboost.dask-2]:tcp://127.0.0.1:44181 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:49:42] task [xgboost.dask-3]:tcp://127.0.0.1:42325 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:50:51] task [xgboost.dask-5]:tcp://127.0.0.1:34867 got new rank 0
Parameters: { "predictor" } are not used.



3
0


[09:51:59] task [xgboost.dask-4]:tcp://127.0.0.1:43803 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:53:06] task [xgboost.dask-0]:tcp://127.0.0.1:41675 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:54:15] task [xgboost.dask-6]:tcp://127.0.0.1:45909 got new rank 0
Parameters: { "predictor" } are not used.



3


[09:55:22] task [xgboost.dask-1]:tcp://127.0.0.1:43387 got new rank 0
Parameters: { "predictor" } are not used.



4


[09:56:29] task [xgboost.dask-7]:tcp://127.0.0.1:40291 got new rank 0
Parameters: { "predictor" } are not used.



4
0


[09:57:37] task [xgboost.dask-2]:tcp://127.0.0.1:44181 got new rank 0
Parameters: { "predictor" } are not used.



1


[09:58:45] task [xgboost.dask-3]:tcp://127.0.0.1:42325 got new rank 0
Parameters: { "predictor" } are not used.



2


[09:59:53] task [xgboost.dask-5]:tcp://127.0.0.1:34867 got new rank 0
Parameters: { "predictor" } are not used.



3


[10:01:01] task [xgboost.dask-4]:tcp://127.0.0.1:43803 got new rank 0
Parameters: { "predictor" } are not used.



4


In [53]:
gini = gini_normalized(target, train_oof)
print(gini)

0.2888335622957465


In [70]:
submission['target'] = test_preds
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_3.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,0,0.028294
1,1,0.022451
2,2,0.022705
3,3,0.013813
4,4,0.035861


0.285 public, 0.2913 public

In [54]:
submission['target'] = test_preds
submission.to_csv('../submissions/best_xgb_oho_optuna_h100_5x_augment_slow_lr.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,0,0.02802
1,1,0.02272
2,2,0.023056
3,3,0.013974
4,4,0.036825


0.28546 public, 0.29182 private