In [1]:
# Load the `watermark` IPython extension, which provides the %watermark magic used in the following cells.
%load_ext watermark

In [2]:
# Record the run timestamp together with Python, OS and hardware details.
%watermark

Last updated: 2023-05-27T06:42:43.167033-07:00

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.13.2

Compiler    : GCC 11.3.0
OS          : Linux
Release     : 5.15.0-1017-nvidia
Machine     : x86_64
Processor   : x86_64
CPU cores   : 224
Architecture: 64bit



In [3]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask import dataframe as dd
from dask.delayed import delayed
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import optuna
import gc
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Print the versions of the packages imported in the previous cell.
%watermark --iversions

pandas : 1.5.3
xgboost: 1.7.5
optuna : 3.1.1
dask   : 2023.3.2
logging: 0.5.1.2
numpy  : 1.23.5



In [5]:
# List the GPUs visible on this machine.
!nvidia-smi -L

GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-5c583ee7-8fb1-b26a-4bea-16e45d984a32)
GPU 1: NVIDIA H100 80GB HBM3 (UUID: GPU-a6ab06f5-a6e2-18c1-9dec-0dcd29f44a46)
GPU 2: NVIDIA H100 80GB HBM3 (UUID: GPU-bb8d5098-3c56-c48d-a0a3-fcdfcec6d3f5)
GPU 3: NVIDIA H100 80GB HBM3 (UUID: GPU-cdbe686b-1611-999b-8e8d-a3c5f35b40c4)
GPU 4: NVIDIA H100 80GB HBM3 (UUID: GPU-0df1cef0-fc95-cc88-b5f4-239889b3acba)
GPU 5: NVIDIA H100 80GB HBM3 (UUID: GPU-9dca657c-dbe4-08b7-fe7c-5653f183f0b6)
GPU 6: NVIDIA H100 80GB HBM3 (UUID: GPU-33389782-e2ad-5022-997d-cf470313879c)
GPU 7: NVIDIA H100 80GB HBM3 (UUID: GPU-926eaa05-87fb-35e0-0a98-adb2ad41d0be)


In [6]:
# Start a local Dask CUDA cluster with eight workers and connect a client to it.
# `client` is the handle every later xgboost.dask call goes through.
cluster = LocalCUDACluster(
    n_workers=8,
)
client = Client(address=cluster)

2023-05-27 06:42:55,621 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-05-27 06:42:55,621 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-05-27 06:42:55,621 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-05-27 06:42:55,621 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-05-27 06:42:55,632 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-05-27 06:42:55,632 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-05-27 06:42:55,632 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-05-27 06:42:55,632 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-05-27 06:42:55,632 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-05-27 06:42:55,632 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-

In [7]:
# Snapshot of GPU memory and utilisation, taken right after the Dask CUDA cluster was started.
!nvidia-smi

Sat May 27 06:43:04 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA H100 80G...  On   | 00000000:1B:00.0 Off |                    0 |
| N/A   31C    P0   109W / 700W |    525MiB / 81559MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA H100 80G...  On   | 00000000:43:00.0 Off |                    0 |
| N/A   30C    P0   109W / 700W |    525MiB / 81559MiB |      0%      Default |
|       

In [8]:
# Per-fold feature frames and target series, indexed by fold number (0-4).
train_folds = []
val_folds = []
train_ys = []
val_ys = []

for i in range(5):
    print(f'Loading fold {i}')

    # Each gzip CSV is wrapped in a single delayed `pd.read_csv`, so every fold
    # is a one-partition Dask DataFrame. Persisting keeps the parsed partition
    # in worker memory; otherwise the lazy graph (decompress + parse) is
    # re-executed each time a DaskDMatrix is built from it, i.e. for every fold
    # of every Optuna trial.
    train_fold = dd.from_delayed(
        delayed(pd.read_csv)(f'../input/xgtrain_fold_{i}_l.csv.gz')
    ).persist()
    val_fold = dd.from_delayed(
        delayed(pd.read_csv)(f'../input/xgval_fold_{i}_l.csv.gz')
    ).persist()

    # Split the label off from the features. `Index.difference` returns the
    # remaining column names sorted, so train and validation features end up
    # in the same (alphabetical) column order.
    train_y = train_fold['target']
    train_fold = train_fold[train_fold.columns.difference(['target'])]

    val_y = val_fold['target']
    val_fold = val_fold[val_fold.columns.difference(['target'])]

    train_folds.append(train_fold)
    val_folds.append(val_fold)

    train_ys.append(train_y)
    val_ys.append(val_y)

Loading fold 0
Loading fold 1
Loading fold 2
Loading fold 3
Loading fold 4


In [9]:
# Raw training table; the `loss` column is the quantity being modelled.
train = pd.read_csv('../input/train.csv.zip')

# Constant added to the loss before taking the logarithm.
shift = 200

# `target0` is the loss on its original scale, `target` its shifted-log transform.
target0 = train['loss'].to_numpy()
target = np.log(shift + target0)

In [10]:
# Out-of-fold predictions on the original loss scale; overwritten by every trial.
train_oof = np.zeros((target.shape[0],))

# Number of boosting rounds used for every fold of every trial.
num_round = 1000

def objective(trial):
    """Optuna objective: 5-fold out-of-fold MAE of a Dask/GPU XGBoost regressor.

    For each fold a model is trained on ``train_folds[i]`` / ``train_ys[i]``
    and used to predict ``val_folds[i]``. Predictions are mapped back to the
    original loss scale with ``exp(pred) - shift`` and written into the
    module-level ``train_oof`` array at the validation indices produced by
    ``KFold(5, shuffle=True, random_state=137)``.

    NOTE(review): this assumes the pre-built fold files contain exactly the
    rows (in the same order) that this KFold split yields -- confirm against
    the code that generated them.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the XGBoost hyper-parameters.

    Returns
    -------
    float
        Mean absolute error between ``target0`` and ``train_oof``.

    Raises
    ------
    ValueError
        If a fold yields a different number of predictions than validation
        indices.
    """
    params = {
        'objective': 'reg:squarederror',
        'base_score': 7.76,
        'tree_method': 'gpu_hist',  # 'gpu_hist','hist'
        # `suggest_float` replaces the deprecated `suggest_loguniform` /
        # `suggest_uniform`; names and ranges are unchanged.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'eval_metric': trial.suggest_categorical('eval_metric', ['rmse']),
    }

    kf = KFold(5, shuffle=True, random_state=137)

    # Only the validation indices are needed here: the training rows come
    # from the pre-built fold frames rather than from `train` itself.
    for i, (_, val_index) in enumerate(kf.split(train, target)):
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)

        output = xgb.dask.train(client, params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        predictions = xgb.dask.predict(client, booster, dval).compute()

        # Fail with a clear message if the fold file and the KFold split disagree.
        if predictions.shape[0] != len(val_index):
            raise ValueError(
                f'Fold {i}: got {predictions.shape[0]} predictions for '
                f'{len(val_index)} validation rows'
            )

        # Undo the log(loss + shift) transform before storing.
        train_oof[val_index] = np.exp(predictions) - shift

        # Release per-fold objects before the next fold is built.
        del dtrain, dval, output, booster, predictions
        gc.collect()

    return mean_absolute_error(target0, train_oof)

In [11]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Setup the root logger.

# Attach the file handler only once: re-running this cell would otherwise stack
# duplicate handlers on the root logger and write every message several times.
_file_handler_name = "optuna_xgb_output_l_4"
if all(h.name != _file_handler_name for h in logger.handlers):
    _file_handler = logging.FileHandler("optuna_xgb_output_l_4.log", mode="w")
    _file_handler.set_name(_file_handler_name)
    logger.addHandler(_file_handler)

optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

# `load_if_exists=True` resumes the study already stored in the SQLite file
# instead of raising DuplicatedStudyError when the notebook is run again.
study = optuna.create_study(
    storage="sqlite:///xgb_optuna_allstate_l_4.db",
    study_name="five_fold_optuna_xgb_l_4",
    direction='minimize',
    load_if_exists=True,
)

In [12]:
%%time
logger.info("Start optimization.")
study.optimize(objective, n_trials=3)

  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
[06:44:50] task [xgboost.dask-6]:tcp://127.0.0.1:39581 got new rank 0
[06:45:00] task [xgboost.dask-5]:tcp://127.0.0.1:37117 got new rank 0
[06:45:09] task [xgboost.dask-7]:tcp://127.0.0.1:36377 got new rank 0
[06:45:16] task [xgboost.dask-2]:tcp://127.0.0.1:35535 got new rank 0
[06:45:24] task [xgboost.dask-3]:tcp://127.0.0.1:34427 got new rank 0
  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest

CPU times: user 37.5 s, sys: 5.71 s, total: 43.2 s
Wall time: 1min 49s


In [13]:
# Tabulate the trials recorded so far (number, objective value, sampled
# parameters and state) and preview the first rows.
df = study.trials_dataframe(
    attrs=('number', 'value', 'params', 'state'),
)
df.head()

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1153.90471,0.373965,0.980291,rmse,2.497604,0.014629,0.007077,21,103,0.583175,COMPLETE
1,1,1152.803715,0.004224,0.856359,rmse,0.004733,3.143904,0.041572,11,39,0.653761,COMPLETE
2,2,1152.649488,1.043722,0.903962,rmse,0.709577,3.394003,0.012027,6,49,0.431573,COMPLETE


In [14]:
%%time
study.optimize(objective, n_trials=5)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_l_4.csv', index=False)
df

  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
[06:46:39] task [xgboost.dask-0]:tcp://127.0.0.1:39641 got new rank 0
[06:46:56] task [xgboost.dask-6]:tcp://127.0.0.1:39581 got new rank 0
[06:47:14] task [xgboost.dask-5]:tcp://127.0.0.1:37117 got new rank 0
[06:47:33] task [xgboost.dask-7]:tcp://127.0.0.1:36377 got new rank 0
[06:47:51] task [xgboost.dask-2]:tcp://127.0.0.1:35535 got new rank 0
  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest

CPU times: user 24.3 s, sys: 21.5 s, total: 45.8 s
Wall time: 6min 29s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1153.90471,0.373965,0.980291,rmse,2.497604,0.014629,0.007077,21,103,0.583175,COMPLETE
1,1,1152.803715,0.004224,0.856359,rmse,0.004733,3.143904,0.041572,11,39,0.653761,COMPLETE
2,2,1152.649488,1.043722,0.903962,rmse,0.709577,3.394003,0.012027,6,49,0.431573,COMPLETE
3,3,1200.088834,0.056953,0.339802,rmse,0.002243,0.008525,0.073172,16,45,0.594994,COMPLETE
4,4,1255.695543,0.246479,0.914929,rmse,0.445993,0.173519,0.002016,25,255,0.547362,COMPLETE
5,5,1139.048144,2.871421,0.473,rmse,0.001276,0.577053,0.032036,16,251,0.951706,COMPLETE
6,6,1270.241529,0.130132,0.585736,rmse,1.629854,0.507099,0.001669,17,17,0.891022,COMPLETE
7,7,1138.299682,0.988401,0.420743,rmse,0.004308,0.002085,0.014421,24,139,0.624062,COMPLETE


In [15]:
%%time
study.optimize(objective, n_trials=100)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_l_4.csv', index=False)
df.head()

  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
[06:53:08] task [xgboost.dask-6]:tcp://127.0.0.1:39581 got new rank 0
[06:53:12] task [xgboost.dask-5]:tcp://127.0.0.1:37117 got new rank 0
[06:53:16] task [xgboost.dask-7]:tcp://127.0.0.1:36377 got new rank 0
[06:53:19] task [xgboost.dask-2]:tcp://127.0.0.1:35535 got new rank 0
[06:53:23] task [xgboost.dask-3]:tcp://127.0.0.1:34427 got new rank 0
  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest

CPU times: user 5min 48s, sys: 3min 23s, total: 9min 11s
Wall time: 1h 19min 9s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1153.90471,0.373965,0.980291,rmse,2.497604,0.014629,0.007077,21,103,0.583175,COMPLETE
1,1,1152.803715,0.004224,0.856359,rmse,0.004733,3.143904,0.041572,11,39,0.653761,COMPLETE
2,2,1152.649488,1.043722,0.903962,rmse,0.709577,3.394003,0.012027,6,49,0.431573,COMPLETE
3,3,1200.088834,0.056953,0.339802,rmse,0.002243,0.008525,0.073172,16,45,0.594994,COMPLETE
4,4,1255.695543,0.246479,0.914929,rmse,0.445993,0.173519,0.002016,25,255,0.547362,COMPLETE


In [16]:
# Display the trial table; the rendered output elides the middle rows.
df

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1153.904710,0.373965,0.980291,rmse,2.497604,0.014629,0.007077,21,103,0.583175,COMPLETE
1,1,1152.803715,0.004224,0.856359,rmse,0.004733,3.143904,0.041572,11,39,0.653761,COMPLETE
2,2,1152.649488,1.043722,0.903962,rmse,0.709577,3.394003,0.012027,6,49,0.431573,COMPLETE
3,3,1200.088834,0.056953,0.339802,rmse,0.002243,0.008525,0.073172,16,45,0.594994,COMPLETE
4,4,1255.695543,0.246479,0.914929,rmse,0.445993,0.173519,0.002016,25,255,0.547362,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
103,103,1135.769548,3.706865,0.433950,rmse,0.003413,0.313685,0.017379,17,216,0.944976,COMPLETE
104,104,1144.954486,2.962983,0.489431,rmse,0.001621,0.438582,0.010861,15,5,0.989317,COMPLETE
105,105,1135.953851,1.841624,0.512196,rmse,0.003137,0.219404,0.014701,16,193,0.970973,COMPLETE
106,106,1138.415958,4.095898,0.458030,rmse,0.001146,0.372069,0.012790,12,207,0.934205,COMPLETE


In [17]:
# Lowest objective value (out-of-fold MAE) over all recorded trials.
df['value'].min()

1135.212585472503