<a href="https://www.kaggle.com/tunguz/tps-11-21-histgradientboosting-with-optuna?scriptVersionId=82613741" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import optuna
import gc

In [2]:
import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/tabular-playground-series-nov-2021/sample_submission.csv
../input/tabular-playground-series-nov-2021/train.csv
../input/tabular-playground-series-nov-2021/test.csv


In [3]:
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')


In [4]:
columns = train.columns[1:-1]
target = train['target'].values

In [5]:
data = train[columns]


In [6]:
def objective(trial,data=data,target=target):
    
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    params = {
        'l2_regularization': trial.suggest_loguniform('l2_regularization',1e-10,10.0),
        'early_stopping': trial.suggest_categorical('early_stopping', ['False']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
        'max_iter': trial.suggest_categorical('max_iter', [1000]),
        'max_depth': trial.suggest_int('max_depth', 2,30),
        'max_bins': trial.suggest_int('max_bins', 100,255),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20,100000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20,80),
    }

    model = HistGradientBoostingClassifier(**params)
    model.fit(train_x, train_y)
    predictions = model.predict_proba(test_x)[:,1]
    auc = roc_auc_score(test_y, predictions)
    
    return auc

In [7]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=150)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-12-17 18:14:38,346][0m A new study created in memory with name: no-name-7fee91ca-2e4f-4893-9780-8bd907df27fb[0m
[32m[I 2021-12-17 18:16:46,964][0m Trial 0 finished with value: 0.6856152580067415 and parameters: {'l2_regularization': 1.4722641411014514e-05, 'early_stopping': 'False', 'learning_rate': 0.002831001748487097, 'max_iter': 1000, 'max_depth': 5, 'max_bins': 231, 'min_samples_leaf': 50955, 'max_leaf_nodes': 49}. Best is trial 0 with value: 0.6856152580067415.[0m
[32m[I 2021-12-17 18:18:49,511][0m Trial 1 finished with value: 0.727595720117145 and parameters: {'l2_regularization': 4.9731492513102174e-08, 'early_stopping': 'False', 'learning_rate': 0.018077032376593197, 'max_iter': 1000, 'max_depth': 20, 'max_bins': 176, 'min_samples_leaf': 63594, 'max_leaf_nodes': 34}. Best is trial 1 with value: 0.727595720117145.[0m
[32m[I 2021-12-17 18:20:58,175][0m Trial 2 finished with value: 0.7423380055024136 and parameters: {'l2_regularization': 9.590520052456686e-

Number of finished trials: 150
Best trial: {'l2_regularization': 6.137174553545883e-09, 'early_stopping': 'False', 'learning_rate': 0.03284238446050584, 'max_iter': 1000, 'max_depth': 22, 'max_bins': 251, 'min_samples_leaf': 1211, 'max_leaf_nodes': 30}
CPU times: user 1d 2h 42min 16s, sys: 2min 17s, total: 1d 2h 44min 33s
Wall time: 6h 56min 55s


In [8]:
study.best_trial.params

{'l2_regularization': 6.137174553545883e-09,
 'early_stopping': 'False',
 'learning_rate': 0.03284238446050584,
 'max_iter': 1000,
 'max_depth': 22,
 'max_bins': 251,
 'min_samples_leaf': 1211,
 'max_leaf_nodes': 30}