In [1]:
%load_ext watermark

In [2]:
%watermark

Last updated: 2025-09-25T07:35:01.790164-07:00

Python implementation: CPython
Python version       : 3.13.3
IPython version      : 9.1.0

Compiler    : Clang 15.0.0 (clang-1500.3.9.4)
OS          : Darwin
Release     : 24.6.0
Machine     : arm64
Processor   : arm
CPU cores   : 16
Architecture: 64bit



In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import optuna
import gc
import logging

In [4]:
%watermark --iversions

optuna : 4.5.0
numpy  : 2.3.2
pandas : 2.3.2
sklearn: 1.7.1
xgboost: 3.0.4
logging: 0.5.1.2



In [5]:
%%time
train_folds = []
val_folds = []
train_ys = []
val_ys = []

for i in range(5):
    print(f'Loading fold {i}')
    train_fold = pd.read_csv(f'../input/xgtrain_fold_{i}.csv.gz')

    
    val_fold = pd.read_csv(f'../input/xgval_fold_{i}.csv.gz')

    
    
    train_y = train_fold['target']
    train_fold = train_fold[train_fold.columns.difference(['target'])]
    
    val_y = val_fold['target']
    val_fold = val_fold[val_fold.columns.difference(['target'])]
    
    train_folds.append(train_fold)
    val_folds.append(val_fold)
    
    train_ys.append(train_y)
    val_ys.append(val_y)

Loading fold 0
Loading fold 1
Loading fold 2
Loading fold 3
Loading fold 4
CPU times: user 1.62 s, sys: 197 ms, total: 1.82 s
Wall time: 1.83 s


In [6]:
train = pd.read_csv('../input/train.csv.zip')

#shift = 200

#target0 = train['loss'].values
#target = np.log(target0+shift)

target = train['loss'].values

In [8]:
train_oof = np.zeros((target.shape[0],))

num_round = 1000

def objective(trial):
        
    params = {
        'objective': 'reg:squarederror', 
        'base_score':7.76,
        'tree_method':'hist',  # 'gpu_hist','hist'
        'lambda': trial.suggest_float('lambda',1e-3,10.0, log=True),
        'alpha': trial.suggest_float('alpha',1e-3,10.0, log=True),
        'gamma': trial.suggest_float('gamma',1e-3,10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3,1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        'min_child_weight': trial.suggest_int('min_child_weight', 1,300),
        'eval_metric': trial.suggest_categorical('eval_metric',['rmse']),

    }

    kf = KFold(5, shuffle=True, random_state=137)

    for i, (train_index, val_index) in enumerate(kf.split(train,target)):
        dtrain = xgb.DMatrix(train_folds[i].values, train_ys[i], enable_categorical=True)
        dval = xgb.DMatrix(val_folds[i].values, val_ys[i], enable_categorical=True)
        
        output = xgb.train(params, dtrain, num_round)
        #booster = output['booster']  # booster is the trained model
        #booster.set_param({'predictor': 'gpu_predictor'})
        predictions = output.predict(dval)
        train_oof[val_index] = predictions
        del dtrain, dval, output
        gc.collect()
        gc.collect()

    mae = mean_absolute_error(target, train_oof)
    
    return mae

In [9]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Setup the root logger.
logger.addHandler(logging.FileHandler("optuna_xgb_output_4_M4.log", mode="w"))

optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

study = optuna.create_study(storage="sqlite:///xgb_optuna_allstate_4_M4.db", study_name="five_fold_optuna_xgb_l_4", direction='minimize')

In [10]:
%%time
logger.info("Start optimization.")
study.optimize(objective, n_trials=3)

CPU times: user 7min 51s, sys: 3min 48s, total: 11min 39s
Wall time: 1min 48s


In [11]:
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.head()

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1174.652525,0.005408,0.823922,rmse,0.254102,0.001308,0.008459,14,72,0.813074,COMPLETE
1,1,1198.662635,0.710093,0.633897,rmse,0.268103,0.445519,0.033542,19,34,0.452828,COMPLETE
2,2,1183.883722,0.727648,0.75799,rmse,0.014168,3.809995,0.071799,7,99,0.528597,COMPLETE


In [12]:
%%time
study.optimize(objective, n_trials=5)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_4_M4.csv', index=False)
df

CPU times: user 12min 9s, sys: 5min 19s, total: 17min 28s
Wall time: 2min 38s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1174.652525,0.005408,0.823922,rmse,0.254102,0.001308,0.008459,14,72,0.813074,COMPLETE
1,1,1198.662635,0.710093,0.633897,rmse,0.268103,0.445519,0.033542,19,34,0.452828,COMPLETE
2,2,1183.883722,0.727648,0.75799,rmse,0.014168,3.809995,0.071799,7,99,0.528597,COMPLETE
3,3,1195.369622,0.009371,0.827324,rmse,4.070143,2.02704,0.061405,4,98,0.447126,COMPLETE
4,4,1198.499387,4.307158,0.635089,rmse,1.156021,0.002189,0.025592,4,59,0.656956,COMPLETE
5,5,1278.314322,0.006108,0.685846,rmse,0.002233,0.001093,0.001324,21,124,0.922459,COMPLETE
6,6,1245.982161,0.005423,0.341299,rmse,0.046815,0.027086,0.09984,20,159,0.498116,COMPLETE
7,7,1205.423828,4.355739,0.438711,rmse,0.088649,0.00141,0.007426,6,142,0.619601,COMPLETE


In [13]:
%%time
study.optimize(objective, n_trials=100)
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_xgb_output_4_M4.csv', index=False)
df.head(20)

CPU times: user 5h 43min 45s, sys: 2h 41min 19s, total: 8h 25min 5s
Wall time: 1h 23min 13s


Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1174.652525,0.005408,0.823922,rmse,0.254102,0.001308,0.008459,14,72,0.813074,COMPLETE
1,1,1198.662635,0.710093,0.633897,rmse,0.268103,0.445519,0.033542,19,34,0.452828,COMPLETE
2,2,1183.883722,0.727648,0.75799,rmse,0.014168,3.809995,0.071799,7,99,0.528597,COMPLETE
3,3,1195.369622,0.009371,0.827324,rmse,4.070143,2.02704,0.061405,4,98,0.447126,COMPLETE
4,4,1198.499387,4.307158,0.635089,rmse,1.156021,0.002189,0.025592,4,59,0.656956,COMPLETE
5,5,1278.314322,0.006108,0.685846,rmse,0.002233,0.001093,0.001324,21,124,0.922459,COMPLETE
6,6,1245.982161,0.005423,0.341299,rmse,0.046815,0.027086,0.09984,20,159,0.498116,COMPLETE
7,7,1205.423828,4.355739,0.438711,rmse,0.088649,0.00141,0.007426,6,142,0.619601,COMPLETE
8,8,1194.538115,0.664259,0.951218,rmse,0.207578,0.328667,0.010628,7,211,0.537679,COMPLETE
9,9,1460.658193,0.520661,0.496057,rmse,1.235324,0.117394,0.001077,4,268,0.65649,COMPLETE


In [14]:
df

Unnamed: 0,number,value,params_alpha,params_colsample_bytree,params_eval_metric,params_gamma,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,1174.652525,0.005408,0.823922,rmse,0.254102,0.001308,0.008459,14,72,0.813074,COMPLETE
1,1,1198.662635,0.710093,0.633897,rmse,0.268103,0.445519,0.033542,19,34,0.452828,COMPLETE
2,2,1183.883722,0.727648,0.757990,rmse,0.014168,3.809995,0.071799,7,99,0.528597,COMPLETE
3,3,1195.369622,0.009371,0.827324,rmse,4.070143,2.027040,0.061405,4,98,0.447126,COMPLETE
4,4,1198.499387,4.307158,0.635089,rmse,1.156021,0.002189,0.025592,4,59,0.656956,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
103,103,1171.527338,0.001413,0.580237,rmse,0.001347,5.688138,0.008162,24,96,0.780747,COMPLETE
104,104,1172.904859,0.002156,0.535187,rmse,0.004627,9.737602,0.006124,24,78,0.784790,COMPLETE
105,105,1171.666891,0.002591,0.610371,rmse,0.038993,4.167532,0.009437,22,101,0.766611,COMPLETE
106,106,1171.090958,0.001542,0.514734,rmse,0.001029,6.181112,0.008187,23,91,0.791562,COMPLETE


In [15]:
df.value.min()

np.float64(1170.4737772728065)

In [16]:
study.best_params

{'lambda': 4.135465980937047,
 'alpha': 1.230795809852035,
 'gamma': 2.487159270482105,
 'colsample_bytree': 0.5174849748565589,
 'subsample': 0.5750672450350401,
 'learning_rate': 0.010574501239103015,
 'max_depth': 16,
 'min_child_weight': 52,
 'eval_metric': 'rmse'}

In [17]:
%%time
train_oof = np.zeros((target.shape[0],))
test_preds = 0
kf = KFold(5, shuffle=True, random_state=137)

params = study.best_params

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    dtrain = xgb.DMatrix(train_folds[i].values, train_ys[i], enable_categorical=True)
    dval = xgb.DMatrix(val_folds[i].values, val_ys[i], enable_categorical=True)
        
    output = xgb.train(params, dtrain, num_round)

    oof_predictions = output.predict(dval)
    train_oof[val_index] = oof_predictions

0
1
2
3
4
CPU times: user 3min 13s, sys: 1min 33s, total: 4min 47s
Wall time: 47.8 s


In [18]:
mean_absolute_error(target, train_oof)

1170.9941324393053

In [19]:
%%time
num_round = 2500


train_oof = np.zeros((target.shape[0],))
test_preds = 0
kf = KFold(5, shuffle=True, random_state=137)

params = study.best_params

params['learning_rate'] = 0.01

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    dtrain = xgb.DMatrix(train_folds[i].values, train_ys[i], enable_categorical=True)
    dval = xgb.DMatrix(val_folds[i].values, val_ys[i], enable_categorical=True)
        
    output = xgb.train(params, dtrain, num_round)

    oof_predictions = output.predict(dval)
    train_oof[val_index] = oof_predictions

print(mean_absolute_error(target, train_oof))

0
1
2
3
4
1174.1179314438575
CPU times: user 6min 53s, sys: 3min 53s, total: 10min 47s
Wall time: 1min 46s


In [21]:
%%time
num_round = 2000


train_oof = np.zeros((target.shape[0],))
test_preds = 0
kf = KFold(5, shuffle=True, random_state=137)

params = study.best_params

params['learning_rate'] = 0.01

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    dtrain = xgb.DMatrix(train_folds[i].values, train_ys[i], enable_categorical=True)
    dval = xgb.DMatrix(val_folds[i].values, val_ys[i], enable_categorical=True)
        
    output = xgb.train(params, dtrain, num_round)

    oof_predictions = output.predict(dval)
    train_oof[val_index] = oof_predictions

print(mean_absolute_error(target, train_oof))

0
1
2
3
4
1172.1810921223614
CPU times: user 5min 38s, sys: 3min 7s, total: 8min 45s
Wall time: 1min 26s


In [23]:
%%time
num_round = 1700


train_oof = np.zeros((target.shape[0],))
test_preds = 0
kf = KFold(5, shuffle=True, random_state=137)

params = study.best_params

params['learning_rate'] = 0.01

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    dtrain = xgb.DMatrix(train_folds[i].values, train_ys[i], enable_categorical=True)
    dval = xgb.DMatrix(val_folds[i].values, val_ys[i], enable_categorical=True)
        
    output = xgb.train(params, dtrain, num_round)

    oof_predictions = output.predict(dval)
    train_oof[val_index] = oof_predictions

print(mean_absolute_error(target, train_oof))

0
1
2
3
4
1171.2860707260324
CPU times: user 4min 54s, sys: 2min 38s, total: 7min 32s
Wall time: 1min 14s


In [24]:
%%time
num_round = 1400


train_oof = np.zeros((target.shape[0],))
test_preds = 0
kf = KFold(5, shuffle=True, random_state=137)

params = study.best_params

params['learning_rate'] = 0.01

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    dtrain = xgb.DMatrix(train_folds[i].values, train_ys[i], enable_categorical=True)
    dval = xgb.DMatrix(val_folds[i].values, val_ys[i], enable_categorical=True)
        
    output = xgb.train(params, dtrain, num_round)

    oof_predictions = output.predict(dval)
    train_oof[val_index] = oof_predictions

print(mean_absolute_error(target, train_oof))

0
1
2
3
4
1170.7706300683164
CPU times: user 4min 13s, sys: 2min 10s, total: 6min 23s
Wall time: 1min 2s


In [25]:
%%time
num_round = 1200


train_oof = np.zeros((target.shape[0],))
test_preds = 0
kf = KFold(5, shuffle=True, random_state=137)

params = study.best_params

params['learning_rate'] = 0.01

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    dtrain = xgb.DMatrix(train_folds[i].values, train_ys[i], enable_categorical=True)
    dval = xgb.DMatrix(val_folds[i].values, val_ys[i], enable_categorical=True)
        
    output = xgb.train(params, dtrain, num_round)

    oof_predictions = output.predict(dval)
    train_oof[val_index] = oof_predictions

print(mean_absolute_error(target, train_oof))

0
1
2
3
4
1170.7723387321257
CPU times: user 3min 46s, sys: 1min 51s, total: 5min 38s
Wall time: 55.5 s


In [26]:
%%time
num_round = 2600


train_oof = np.zeros((target.shape[0],))
test_preds = 0
kf = KFold(5, shuffle=True, random_state=137)

params = study.best_params

params['learning_rate'] = 0.005

for i, (train_index, val_index) in enumerate(kf.split(train,target)):
    print(i)
    dtrain = xgb.DMatrix(train_folds[i].values, train_ys[i], enable_categorical=True)
    dval = xgb.DMatrix(val_folds[i].values, val_ys[i], enable_categorical=True)
        
    output = xgb.train(params, dtrain, num_round)

    oof_predictions = output.predict(dval)
    train_oof[val_index] = oof_predictions

print(mean_absolute_error(target, train_oof))

0
1
2
3
4
1169.3465728203666
CPU times: user 7min 54s, sys: 4min 1s, total: 11min 56s
Wall time: 1min 57s


In [30]:
x_test = pd.read_csv('../input/x_test_l.csv')

In [31]:
output.predict(xgb.DMatrix(x_test))

array([4785.83  , 8722.662 , 5049.362 , ..., 8381.605 , 3118.1438,
       4653.048 ], shape=(125546,), dtype=float32)