In [1]:
%load_ext watermark

In [2]:
%watermark

Last updated: 2023-08-24T09:29:17.872814-07:00

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 8.14.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 5.15.0-1029-nvidia
Machine     : x86_64
Processor   : x86_64
CPU cores   : 256
Architecture: 64bit



In [4]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask import dataframe as dd
from dask.delayed import delayed
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import optuna
import gc
import logging

In [5]:
%watermark --iversions

logging: 0.5.1.2
optuna : 3.3.0
pandas : 1.5.3
dask   : 2023.3.2
xgboost: 1.7.6
numpy  : 1.24.4



In [6]:
!nvidia-smi -L

GPU 0: NVIDIA A100-SXM4-80GB (UUID: GPU-cf1d9297-d9c3-6b2a-8afa-e63e4b5abb2d)
GPU 1: NVIDIA A100-SXM4-80GB (UUID: GPU-6d18acdc-2136-40ac-5eeb-724e81476385)
GPU 2: NVIDIA A100-SXM4-80GB (UUID: GPU-a87c7d38-52d2-befc-e2de-3cf90f75d6eb)
GPU 3: NVIDIA A100-SXM4-80GB (UUID: GPU-8e290e7b-dec5-1828-2ec2-a6930ecd4b5b)
GPU 4: NVIDIA A100-SXM4-80GB (UUID: GPU-36f4174f-2697-e22e-3ae5-0d30e814c4f1)
GPU 5: NVIDIA A100-SXM4-80GB (UUID: GPU-d0929bb4-8c8c-8418-70d2-f658013f4b33)
GPU 6: NVIDIA A100-SXM4-80GB (UUID: GPU-78f54132-3566-5a8c-b51b-fb3cc695d648)
GPU 7: NVIDIA A100-SXM4-80GB (UUID: GPU-61b9f63d-cb20-77e8-8b22-40f40fbb8715)


In [7]:
cluster = LocalCUDACluster(n_workers=8)
client = Client(cluster)

2023-08-24 09:31:34,495 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-08-24 09:31:34,495 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-08-24 09:31:34,503 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-08-24 09:31:34,503 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-08-24 09:31:34,508 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-08-24 09:31:34,508 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-08-24 09:31:34,508 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-08-24 09:31:34,508 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-08-24 09:31:34,512 - distributed.preloading - INFO - Creating preload: dask_cuda.initialize
2023-08-24 09:31:34,512 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize
2023-

In [8]:
!nvidia-smi

Thu Aug 24 09:31:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.125.06   Driver Version: 525.125.06   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   35C    P0    71W / 400W |    417MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:0F:00.0 Off |                    0 |
| N/A   33C    P0    68W / 400W |    417MiB / 81920MiB |      0%      Default |
|       

In [9]:
train_folds = []
val_folds = []
train_ys = []
val_ys = []

for i in range(5):
    print(f'Loading fold {i}')
    train_fold_d = delayed(pd.read_csv)(f'../input/xgtrain_fold_{i}_l.csv.gz')
    train_fold = dd.from_delayed(train_fold_d)
    
    val_fold_d = delayed(pd.read_csv)(f'../input/xgval_fold_{i}_l.csv.gz')
    val_fold = dd.from_delayed(val_fold_d)
    
    
    train_y = train_fold['target']
    train_fold = train_fold[train_fold.columns.difference(['target'])]
    
    val_y = val_fold['target']
    val_fold = val_fold[val_fold.columns.difference(['target'])]
    
    train_folds.append(train_fold)
    val_folds.append(val_fold)
    
    train_ys.append(train_y)
    val_ys.append(val_y)

Loading fold 0
Loading fold 1
Loading fold 2
Loading fold 3
Loading fold 4


In [10]:
train = pd.read_csv('../input/train.csv.zip')

shift = 200

target0 = train['loss'].values
target = np.log(target0+shift)

In [11]:
train_oof = np.zeros((target.shape[0],))

num_round = 1000

def objective(trial):
        
    params = {
        'objective': 'reg:squarederror', 
        'base_score':7.76,
        'tree_method':'gpu_hist',  # 'gpu_hist','hist'
        'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
        'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
        'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
        'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        'min_child_weight': trial.suggest_int('min_child_weight', 1,300),
        'eval_metric': trial.suggest_categorical('eval_metric',['rmse']),

    }

    kf = KFold(5, shuffle=True, random_state=137)

    for i, (train_index, val_index) in enumerate(kf.split(train,target)):
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.dask.train(client, params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        predictions = xgb.dask.predict(client, booster, dval)
        predictions = predictions.compute()
        train_oof[val_index] = np.exp(predictions) - shift
        del dtrain, dval, output
        gc.collect()
        gc.collect()

    mae = mean_absolute_error(target0, train_oof)
    
    return mae

In [13]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Setup the root logger.
logger.addHandler(logging.FileHandler("optuna_xgb_output_l_5.log", mode="w"))

optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

study = optuna.create_study(study_name="five_fold_optuna_xgb_l_5", direction='minimize')

In [14]:
%%time
logger.info("Start optimization.")
study.optimize(objective, n_trials=3)

  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
[09:35:00] task [xgboost.dask-4]:tcp://127.0.0.1:37121 got new rank 0
[09:35:14] task [xgboost.dask-3]:tcp://127.0.0.1:37027 got new rank 0
[09:35:26] task [xgboost.dask-1]:tcp://127.0.0.1:36975 got new rank 0
[09:35:38] task [xgboost.dask-7]:tcp://127.0.0.1:35899 got new rank 0
[09:35:49] task [xgboost.dask-6]:tcp://127.0.0.1:33861 got new rank 0
  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest

CPU times: user 29.9 s, sys: 15.2 s, total: 45.1 s
Wall time: 3min 51s


In [15]:
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.head()

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1190.001472,0.002029,0.645026,rmse,0.188941,0.044873,0.003243,12,213,0.690521,COMPLETE
1,1,1214.553923,0.012982,0.508748,rmse,0.047713,0.661674,0.072797,20,47,0.567355,COMPLETE
2,2,1223.15593,7.095743,0.46423,rmse,0.026537,2.551509,0.005636,4,189,0.510813,COMPLETE


In [16]:
%%time
study.optimize(objective, n_trials=5)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_l_5.csv', index=False)
df

  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
[09:41:38] task [xgboost.dask-2]:tcp://127.0.0.1:41353 got new rank 0
[09:41:49] task [xgboost.dask-4]:tcp://127.0.0.1:37121 got new rank 0
[09:41:59] task [xgboost.dask-3]:tcp://127.0.0.1:37027 got new rank 0
[09:42:10] task [xgboost.dask-1]:tcp://127.0.0.1:36975 got new rank 0
[09:42:21] task [xgboost.dask-7]:tcp://127.0.0.1:35899 got new rank 0
  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest

CPU times: user 17.2 s, sys: 11.8 s, total: 29 s
Wall time: 3min 18s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1190.001472,0.002029,0.645026,rmse,0.188941,0.044873,0.003243,12,213,0.690521,COMPLETE
1,1,1214.553923,0.012982,0.508748,rmse,0.047713,0.661674,0.072797,20,47,0.567355,COMPLETE
2,2,1223.15593,7.095743,0.46423,rmse,0.026537,2.551509,0.005636,4,189,0.510813,COMPLETE
3,3,1175.212443,0.001856,0.516115,rmse,0.089468,6.777876,0.004042,11,234,0.903623,COMPLETE
4,4,1167.556218,0.00265,0.836034,rmse,0.027081,0.007916,0.014568,4,5,0.64949,COMPLETE
5,5,1139.772621,0.0036,0.860221,rmse,0.002255,0.002139,0.02381,12,259,0.684108,COMPLETE
6,6,1172.661932,0.057375,0.983912,rmse,0.077003,0.01382,0.007072,6,285,0.76861,COMPLETE
7,7,1148.44335,0.001981,0.802602,rmse,0.310054,0.346462,0.042305,23,130,0.771725,COMPLETE


In [17]:
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.head(8)

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1190.001472,0.002029,0.645026,rmse,0.188941,0.044873,0.003243,12,213,0.690521,COMPLETE
1,1,1214.553923,0.012982,0.508748,rmse,0.047713,0.661674,0.072797,20,47,0.567355,COMPLETE
2,2,1223.15593,7.095743,0.46423,rmse,0.026537,2.551509,0.005636,4,189,0.510813,COMPLETE
3,3,1175.212443,0.001856,0.516115,rmse,0.089468,6.777876,0.004042,11,234,0.903623,COMPLETE
4,4,1167.556218,0.00265,0.836034,rmse,0.027081,0.007916,0.014568,4,5,0.64949,COMPLETE
5,5,1139.772621,0.0036,0.860221,rmse,0.002255,0.002139,0.02381,12,259,0.684108,COMPLETE
6,6,1172.661932,0.057375,0.983912,rmse,0.077003,0.01382,0.007072,6,285,0.76861,COMPLETE
7,7,1148.44335,0.001981,0.802602,rmse,0.310054,0.346462,0.042305,23,130,0.771725,COMPLETE


In [18]:
%%time
study.optimize(objective, n_trials=100)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_l_5.csv', index=False)
df.head()

  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
[09:48:24] task [xgboost.dask-4]:tcp://127.0.0.1:37121 got new rank 0
[09:48:27] task [xgboost.dask-3]:tcp://127.0.0.1:37027 got new rank 0
[09:48:31] task [xgboost.dask-1]:tcp://127.0.0.1:36975 got new rank 0
[09:48:35] task [xgboost.dask-7]:tcp://127.0.0.1:35899 got new rank 0
[09:48:39] task [xgboost.dask-6]:tcp://127.0.0.1:33861 got new rank 0
  'lambda': trial.suggest_loguniform('lambda',1e-3,10.0),
  'alpha': trial.suggest_loguniform('alpha',1e-3,10.0),
  'gamma': trial.suggest_loguniform('gamma',1e-3,10.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3,1.0),
  'subsample': trial.suggest

CPU times: user 7min 45s, sys: 4min 43s, total: 12min 29s
Wall time: 1h 29min 31s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1190.001472,0.002029,0.645026,rmse,0.188941,0.044873,0.003243,12,213,0.690521,COMPLETE
1,1,1214.553923,0.012982,0.508748,rmse,0.047713,0.661674,0.072797,20,47,0.567355,COMPLETE
2,2,1223.15593,7.095743,0.46423,rmse,0.026537,2.551509,0.005636,4,189,0.510813,COMPLETE
3,3,1175.212443,0.001856,0.516115,rmse,0.089468,6.777876,0.004042,11,234,0.903623,COMPLETE
4,4,1167.556218,0.00265,0.836034,rmse,0.027081,0.007916,0.014568,4,5,0.64949,COMPLETE


In [19]:
df

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1190.001472,0.002029,0.645026,rmse,0.188941,0.044873,0.003243,12,213,0.690521,COMPLETE
1,1,1214.553923,0.012982,0.508748,rmse,0.047713,0.661674,0.072797,20,47,0.567355,COMPLETE
2,2,1223.155930,7.095743,0.464230,rmse,0.026537,2.551509,0.005636,4,189,0.510813,COMPLETE
3,3,1175.212443,0.001856,0.516115,rmse,0.089468,6.777876,0.004042,11,234,0.903623,COMPLETE
4,4,1167.556218,0.002650,0.836034,rmse,0.027081,0.007916,0.014568,4,5,0.649490,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
103,103,1135.869617,0.005817,0.338481,rmse,0.003247,0.001389,0.016276,15,206,0.999222,COMPLETE
104,104,1135.593256,0.005858,0.335532,rmse,0.003024,0.001970,0.015954,16,212,0.997828,COMPLETE
105,105,1135.450475,0.006291,0.333303,rmse,0.003143,0.001345,0.016706,16,185,0.997285,COMPLETE
106,106,1135.652158,0.006019,0.334024,rmse,0.003323,0.001533,0.016839,17,186,0.971556,COMPLETE


In [20]:
df.head(20)

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1190.001472,0.002029,0.645026,rmse,0.188941,0.044873,0.003243,12,213,0.690521,COMPLETE
1,1,1214.553923,0.012982,0.508748,rmse,0.047713,0.661674,0.072797,20,47,0.567355,COMPLETE
2,2,1223.15593,7.095743,0.46423,rmse,0.026537,2.551509,0.005636,4,189,0.510813,COMPLETE
3,3,1175.212443,0.001856,0.516115,rmse,0.089468,6.777876,0.004042,11,234,0.903623,COMPLETE
4,4,1167.556218,0.00265,0.836034,rmse,0.027081,0.007916,0.014568,4,5,0.64949,COMPLETE
5,5,1139.772621,0.0036,0.860221,rmse,0.002255,0.002139,0.02381,12,259,0.684108,COMPLETE
6,6,1172.661932,0.057375,0.983912,rmse,0.077003,0.01382,0.007072,6,285,0.76861,COMPLETE
7,7,1148.44335,0.001981,0.802602,rmse,0.310054,0.346462,0.042305,23,130,0.771725,COMPLETE
8,8,1198.18898,0.001084,0.426592,rmse,1.048344,0.001766,0.007574,4,241,0.783647,COMPLETE
9,9,1149.864368,0.44148,0.869913,rmse,0.004837,0.007952,0.036756,18,227,0.629913,COMPLETE


In [21]:
df.tail(20)

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
88,88,1135.905665,0.002532,0.348138,rmse,0.005808,0.00102,0.021711,14,256,0.999959,COMPLETE
89,89,1136.287056,0.002667,0.349966,rmse,0.00656,0.001145,0.017413,14,256,0.977861,COMPLETE
90,90,1136.588817,0.004426,0.343979,rmse,0.006071,0.001241,0.015965,16,255,0.939597,COMPLETE
91,91,1135.867863,0.00253,0.33584,rmse,0.003959,0.001019,0.018834,14,268,0.999887,COMPLETE
92,92,1135.790152,0.002582,0.33298,rmse,0.003714,0.001577,0.016956,14,248,0.981259,COMPLETE
93,93,1135.603978,0.003044,0.333913,rmse,0.003634,0.001176,0.017612,15,235,0.977843,COMPLETE
94,94,1135.808697,0.003065,0.337994,rmse,0.003622,0.001471,0.017698,14,237,0.952538,COMPLETE
95,95,1135.643982,0.006155,0.337812,rmse,0.004369,0.001343,0.021285,15,232,0.99888,COMPLETE
96,96,1136.818159,0.005935,0.329883,rmse,0.003633,0.001517,0.013661,15,235,0.952704,COMPLETE
97,97,1135.938297,0.00459,0.314994,rmse,0.004798,0.001886,0.02057,15,228,0.943804,COMPLETE


In [22]:
df.value.min()

1135.4086836779738