In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import optuna
import gc

In [3]:
import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/sample_submission.csv
../input/train.csv
../input/test.csv


In [5]:
train = pd.read_csv('../input/train.csv')


In [7]:
columns = train.columns[1:-1]
target = train['target'].values

In [8]:
data = train[columns]


In [9]:
def objective(trial,data=data,target=target):
    
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    params = {
        'l2_regularization': trial.suggest_loguniform('l2_regularization',1e-10,10.0),
        'early_stopping': trial.suggest_categorical('early_stopping', ['False']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
        'max_iter': trial.suggest_categorical('max_iter', [1000]),
        'max_depth': trial.suggest_int('max_depth', 2,30),
        'max_bins': trial.suggest_int('max_bins', 100,255),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20,100000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20,80),
    }

    model = HistGradientBoostingClassifier(**params)
    model.fit(train_x, train_y)
    predictions = model.predict_proba(test_x)[:,1]
    auc = roc_auc_score(test_y, predictions)
    
    return auc

In [10]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-11-23 09:15:10,645][0m A new study created in memory with name: no-name-d12f9dec-d651-433f-b58e-8ebc36164ad2[0m
[32m[I 2021-11-23 09:16:10,773][0m Trial 0 finished with value: 0.7255587331644143 and parameters: {'l2_regularization': 5.006988926584862e-06, 'early_stopping': 'False', 'learning_rate': 0.018942847307460108, 'max_iter': 1000, 'max_depth': 10, 'max_bins': 212, 'min_samples_leaf': 76242, 'max_leaf_nodes': 24}. Best is trial 0 with value: 0.7255587331644143.[0m
[32m[I 2021-11-23 09:17:34,662][0m Trial 1 finished with value: 0.7276100687662733 and parameters: {'l2_regularization': 9.264791813631477e-10, 'early_stopping': 'False', 'learning_rate': 0.009115259000632236, 'max_iter': 1000, 'max_depth': 25, 'max_bins': 124, 'min_samples_leaf': 20808, 'max_leaf_nodes': 80}. Best is trial 1 with value: 0.7276100687662733.[0m


Number of finished trials: 2
Best trial: {'l2_regularization': 9.264791813631477e-10, 'early_stopping': 'False', 'learning_rate': 0.009115259000632236, 'max_iter': 1000, 'max_depth': 25, 'max_bins': 124, 'min_samples_leaf': 20808, 'max_leaf_nodes': 80}
CPU times: user 13min 24s, sys: 1.76 s, total: 13min 26s
Wall time: 2min 24s


In [11]:
study.best_trial.params

{'l2_regularization': 9.264791813631477e-10,
 'early_stopping': 'False',
 'learning_rate': 0.009115259000632236,
 'max_iter': 1000,
 'max_depth': 25,
 'max_bins': 124,
 'min_samples_leaf': 20808,
 'max_leaf_nodes': 80}