In [1]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import optuna
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
train = pd.read_csv('../../TPS_2021/input/tabular-playground-series-nov-2021/train.csv')


In [4]:
columns = train.columns[1:-1]
target = train['target'].values

In [5]:
data = train[columns]


In [6]:
def objective(trial,data=data,target=target):
    
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=42)
    params = {
        'l2_regularization': trial.suggest_loguniform('l2_regularization',1e-10,10.0),
        'early_stopping': trial.suggest_categorical('early_stopping', ['False']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
        'max_iter': trial.suggest_categorical('max_iter', [1000]),
        'max_depth': trial.suggest_int('max_depth', 2,30),
        'max_bins': trial.suggest_int('max_bins', 100,255),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20,100000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20,80),
    }

    model = HistGradientBoostingClassifier(**params)
    model.fit(train_x, train_y)
    predictions = model.predict_proba(test_x)[:,1]
    auc = roc_auc_score(test_y, predictions)
    
    return auc

In [7]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=2)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2022-04-20 14:09:00,523][0m A new study created in memory with name: no-name-36ee43d9-284b-4f98-b4fc-57f85bdc2d13[0m
[32m[I 2022-04-20 14:09:59,176][0m Trial 0 finished with value: 0.7381471122224359 and parameters: {'l2_regularization': 5.601261885687851e-07, 'early_stopping': 'False', 'learning_rate': 0.009074136201017935, 'max_iter': 1000, 'max_depth': 18, 'max_bins': 222, 'min_samples_leaf': 605, 'max_leaf_nodes': 65}. Best is trial 0 with value: 0.7381471122224359.[0m
[32m[I 2022-04-20 14:10:30,864][0m Trial 1 finished with value: 0.6965191761167373 and parameters: {'l2_regularization': 1.0035623612206752e-10, 'early_stopping': 'False', 'learning_rate': 0.003762088391134226, 'max_iter': 1000, 'max_depth': 23, 'max_bins': 169, 'min_samples_leaf': 42381, 'max_leaf_nodes': 65}. Best is trial 0 with value: 0.7381471122224359.[0m


Number of finished trials: 2
Best trial: {'l2_regularization': 5.601261885687851e-07, 'early_stopping': 'False', 'learning_rate': 0.009074136201017935, 'max_iter': 1000, 'max_depth': 18, 'max_bins': 222, 'min_samples_leaf': 605, 'max_leaf_nodes': 65}
CPU times: user 2h 13min 51s, sys: 5.85 s, total: 2h 13min 57s
Wall time: 1min 30s


In [11]:
study.best_trial.params

{'l2_regularization': 9.264791813631477e-10,
 'early_stopping': 'False',
 'learning_rate': 0.009115259000632236,
 'max_iter': 1000,
 'max_depth': 25,
 'max_bins': 124,
 'min_samples_leaf': 20808,
 'max_leaf_nodes': 80}

In [8]:
study.best_trial.params

{'l2_regularization': 5.601261885687851e-07,
 'early_stopping': 'False',
 'learning_rate': 0.009074136201017935,
 'max_iter': 1000,
 'max_depth': 18,
 'max_bins': 222,
 'min_samples_leaf': 605,
 'max_leaf_nodes': 65}