In [1]:
%load_ext watermark

In [2]:
# Record when and where this notebook ran: timestamp, Python/IPython versions, OS and CPU details.
%watermark

Last updated: 2023-05-25T20:15:26.809045-04:00

Python implementation: CPython
Python version       : 3.7.11
IPython version      : 7.10.1

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 4.15.0-161-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 40
Architecture: 64bit



In [3]:
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask import dataframe as dd
from dask.delayed import delayed
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import optuna
import gc
import logging

In [4]:
# Print the version numbers of the modules imported in the previous cell.
%watermark --iversions

pandas : 1.3.5
optuna : 2.10.1
dask   : 2022.1.0
logging: 0.5.1.2
xgboost: 1.6.2
numpy  : 1.19.2



In [5]:
# List the GPUs visible on this machine, one line per device.
!nvidia-smi -L

GPU 0: Tesla V100-DGXS-32GB (UUID: GPU-d752af09-1f62-bf3b-4f70-78b84e9e41f6)
GPU 1: Tesla V100-DGXS-32GB (UUID: GPU-054a4a35-f98a-3ebc-9100-0f697c246b43)
GPU 2: Tesla V100-DGXS-32GB (UUID: GPU-454525d4-bebd-7fb8-0ba3-3b85e2f99abd)
GPU 3: Tesla V100-DGXS-32GB (UUID: GPU-af0fb74e-f5eb-0833-17ff-494cc6bdcee1)


In [6]:
# Start a local Dask CUDA cluster with four workers (the machine lists four GPUs
# above) and connect a client to it. `client` is the handle passed to the
# xgb.dask calls inside `objective`.
cluster = LocalCUDACluster(n_workers=4)
client = Client(cluster)

In [7]:
# Snapshot of per-GPU memory use and utilisation right after the cluster was started.
!nvidia-smi

Thu May 25 20:15:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.142.00   Driver Version: 450.142.00   CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-DGXS...  On   | 00000000:07:00.0  On |                    0 |
| N/A   47C    P0    53W / 300W |    668MiB / 32499MiB |      9%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-DGXS...  On   | 00000000:08:00.0 Off |                    0 |
| N/A   48C    P0    51W / 300W |    309MiB / 32508MiB |      3%      Default |
|       

In [8]:
def _read_fold(csv_path):
    """Read one fold CSV through Dask and split it into (features, target).

    ``pd.read_csv`` is wrapped in a delayed task and exposed as a Dask
    DataFrame. The feature frame holds every column except 'target', in the
    order produced by ``columns.difference``.
    """
    fold = dd.from_delayed(delayed(pd.read_csv)(csv_path))
    feature_columns = fold.columns.difference(['target'])
    return fold[feature_columns], fold['target']


# Per-fold feature frames and target series, indexed by fold number.
train_folds, val_folds = [], []
train_ys, val_ys = [], []

for i in range(5):
    print(f'Loading fold {i}')
    train_fold, train_y = _read_fold(f'../input/xgtrain_fold_{i}_l.csv.gz')
    val_fold, val_y = _read_fold(f'../input/xgval_fold_{i}_l.csv.gz')

    train_folds.append(train_fold)
    val_folds.append(val_fold)
    train_ys.append(train_y)
    val_ys.append(val_y)

Loading fold 0
Loading fold 1
Loading fold 2
Loading fold 3
Loading fold 4


In [9]:
# Offset added to the loss before taking the log; `objective` undoes it with
# exp(prediction) - shift.
shift = 200

train = pd.read_csv('../input/train.csv.zip')

# Raw loss values and their log-shifted counterpart.
target0 = train['loss'].to_numpy()
target = np.log(shift + target0)

In [10]:
# Number of boosting rounds trained on each fold in every trial.
num_round = 1000

# Out-of-fold predictions on the original loss scale, one slot per training row.
train_oof = np.zeros(len(target))

def objective(trial):
    """Score one hyper-parameter sample by 5-fold out-of-fold MAE.

    Draws an XGBoost parameter set from ``trial``, then for each of the five
    pre-loaded folds trains ``num_round`` boosting rounds through ``xgb.dask``
    and predicts the matching validation fold. Predictions are mapped back
    with ``exp(pred) - shift`` and written into the module-level
    ``train_oof`` buffer at that fold's validation positions.

    NOTE(review): the validation positions come from
    ``KFold(5, shuffle=True, random_state=137)`` applied to ``train``, while
    the data itself comes from the ``xgtrain_fold_*`` / ``xgval_fold_*``
    files. This assumes the files were written from the same split, in the
    same row order, and that their 'target' column is already
    ``log(loss + shift)`` -- confirm against the code that produced them.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean absolute error between ``target0`` and the out-of-fold
        predictions.

    Raises:
        ValueError: if a fold yields a different number of predictions than
            it has validation positions.
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform;
    # the sampled ranges and scales are unchanged.
    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'gpu_hist',  # 'gpu_hist','hist'
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'eval_metric': trial.suggest_categorical('eval_metric', ['rmse']),
    }

    kf = KFold(5, shuffle=True, random_state=137)

    # Only the validation positions are used; the training rows come from the
    # pre-split fold files.
    for i, (_, val_index) in enumerate(kf.split(train, target)):
        dtrain = xgb.dask.DaskDMatrix(client, train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.dask.DaskDMatrix(client, val_folds[i].values, val_ys[i], enable_categorical=True)

        output = xgb.dask.train(client, params, dtrain, num_round)
        booster = output['booster']  # booster is the trained model
        booster.set_param({'predictor': 'gpu_predictor'})
        predictions = xgb.dask.predict(client, booster, dval).compute()

        # Fail loudly on a fold/split mismatch instead of relying on NumPy's
        # assignment rules (a length-1 result would silently broadcast).
        if len(predictions) != len(val_index):
            raise ValueError(
                f'fold {i}: got {len(predictions)} predictions '
                f'for {len(val_index)} validation rows'
            )

        train_oof[val_index] = np.exp(predictions) - shift

        # Drop the per-fold objects before the next fold is built.
        del dtrain, dval, output, booster
        # Collected twice, as in the original notebook -- presumably to pick up
        # garbage released by the first pass; TODO confirm it is needed.
        gc.collect()
        gc.collect()

    mae = mean_absolute_error(target0, train_oof)

    return mae

In [11]:
# Send Optuna's log records to a file instead of the notebook's stderr.
log_file = "optuna_xgb_output_l_2.log"

logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Setup the root logger.

# Re-running this cell must not stack a second FileHandler on the root logger,
# otherwise every record would be written to the file twice.
already_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename.endswith(log_file)
    for handler in logger.handlers
)
if not already_attached:
    logger.addHandler(logging.FileHandler(log_file, mode="w"))

optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

# The study is persisted in a SQLite file, so its name is already taken the
# second time this cell runs (or after a kernel restart). load_if_exists=True
# resumes that study instead of raising DuplicatedStudyError; delete the .db
# file to start a fresh search.
study = optuna.create_study(
    storage="sqlite:///xgb_optuna_allstate_l_2.db",
    study_name="five_fold_optuna_xgb_l_2",
    direction='minimize',
    load_if_exists=True,
)

In [12]:
%%time
logger.info("Start optimization.")
# Short first batch of three trials; later cells add more trials to the same study.
study.optimize(objective, n_trials=3)

  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)


CPU times: user 23.3 s, sys: 2.85 s, total: 26.2 s
Wall time: 3min 8s


In [13]:
# Trial attributes to export: trial number, objective value, sampled parameters, state.
trial_columns = ('number', 'value', 'params', 'state')
df = study.trials_dataframe(attrs=trial_columns)
df.head()

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,3002.942327,0.102395,0.798416,rmse,3.018186,0.008297,0.001113,8,186,0.578304,COMPLETE
1,1,1154.936308,1.205273,0.699385,rmse,0.192682,5.308009,0.064321,11,131,0.521107,COMPLETE
2,2,2950.836243,0.001247,0.380844,rmse,0.657445,6.98442,0.001209,12,233,0.724222,COMPLETE


In [14]:
%%time
# Five more trials on the same study.
study.optimize(objective, n_trials=5)

# Export every trial recorded so far and save it as CSV.
trial_columns = ('number', 'value', 'params', 'state')
df = study.trials_dataframe(attrs=trial_columns)
df.to_csv('optuna_xgb_output_l_2.csv', index=False)
df

  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)


CPU times: user 30.9 s, sys: 4 s, total: 34.9 s
Wall time: 3min 52s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,3002.942327,0.102395,0.798416,rmse,3.018186,0.008297,0.001113,8,186,0.578304,COMPLETE
1,1,1154.936308,1.205273,0.699385,rmse,0.192682,5.308009,0.064321,11,131,0.521107,COMPLETE
2,2,2950.836243,0.001247,0.380844,rmse,0.657445,6.98442,0.001209,12,233,0.724222,COMPLETE
3,3,1268.632547,0.011421,0.76118,rmse,0.001195,0.004254,0.005523,3,191,0.609041,COMPLETE
4,4,1156.999418,0.09125,0.936719,rmse,0.009424,0.024693,0.007807,7,51,0.946939,COMPLETE
5,5,1172.27867,2.687309,0.929464,rmse,8.343771,0.275237,0.038481,17,139,0.957722,COMPLETE
6,6,1139.674258,0.046386,0.483151,rmse,0.102547,0.00198,0.01338,9,31,0.513972,COMPLETE
7,7,1144.334812,0.00101,0.751471,rmse,0.005705,1.646903,0.034477,5,57,0.690446,COMPLETE


In [15]:
%%time
# Main search: 100 further trials on the same study (see the wall time reported below).
study.optimize(objective, n_trials=100)

# Re-export the full trial table, overwriting the earlier CSV.
trial_columns = ('number', 'value', 'params', 'state')
df = study.trials_dataframe(attrs=trial_columns)
df.to_csv('optuna_xgb_output_l_2.csv', index=False)
df.head()

  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
  client.wait_for_workers(n_workers)
 

CPU times: user 41min 51s, sys: 7min 42s, total: 49min 34s
Wall time: 7h 43min 7s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,3002.942327,0.102395,0.798416,rmse,3.018186,0.008297,0.001113,8,186,0.578304,COMPLETE
1,1,1154.936308,1.205273,0.699385,rmse,0.192682,5.308009,0.064321,11,131,0.521107,COMPLETE
2,2,2950.836243,0.001247,0.380844,rmse,0.657445,6.98442,0.001209,12,233,0.724222,COMPLETE
3,3,1268.632547,0.011421,0.76118,rmse,0.001195,0.004254,0.005523,3,191,0.609041,COMPLETE
4,4,1156.999418,0.09125,0.936719,rmse,0.009424,0.024693,0.007807,7,51,0.946939,COMPLETE


In [16]:
# Display the trial table; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,3002.942327,0.102395,0.798416,rmse,3.018186,0.008297,0.001113,8,186,0.578304,COMPLETE
1,1,1154.936308,1.205273,0.699385,rmse,0.192682,5.308009,0.064321,11,131,0.521107,COMPLETE
2,2,2950.836243,0.001247,0.380844,rmse,0.657445,6.984420,0.001209,12,233,0.724222,COMPLETE
3,3,1268.632547,0.011421,0.761180,rmse,0.001195,0.004254,0.005523,3,191,0.609041,COMPLETE
4,4,1156.999418,0.091250,0.936719,rmse,0.009424,0.024693,0.007807,7,51,0.946939,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
103,103,1135.998724,0.013327,0.362895,rmse,0.014798,0.123017,0.012735,15,190,0.971885,COMPLETE
104,104,1135.802254,0.019072,0.315376,rmse,0.008756,0.094673,0.015937,14,229,0.935908,COMPLETE
105,105,1135.899878,0.009609,0.356282,rmse,0.018859,0.066969,0.013579,15,221,0.986455,COMPLETE
106,106,1135.985037,0.030414,0.347617,rmse,0.026399,0.106279,0.012216,16,171,0.953545,COMPLETE


In [17]:
# Lowest objective value (out-of-fold MAE) across all recorded trials.
df['value'].min()

1134.8684593270546