In [9]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import logging
import optuna
import gc

In [5]:
train_x = pd.read_csv('../../TPS_2021/input/tabular-playground-series-dec-2021/xgtrain.csv')
test_x = pd.read_csv('../../TPS_2021/input/tabular-playground-series-dec-2021/xgval.csv')

train_x = train_x.replace([np.inf, -np.inf], np.nan)
train_y = train_x['target'] 
train_x = train_x[train_x.columns.difference(['target'])]

test_x = test_x.replace([np.inf, -np.inf], np.nan)
test_y = test_x['target']
test_x = test_x[test_x.columns.difference(['target'])]

In [42]:
def objective(trial, train_x=train_x, test_x=test_x, train_y=train_y, test_y=test_y):

    params = {
        'l2_regularization': trial.suggest_loguniform('l2_regularization',1e-10,10.0),
        'early_stopping': trial.suggest_categorical('early_stopping', ['False']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001,0.1),
        'max_iter': trial.suggest_categorical('max_iter', [1000]),
        'max_depth': trial.suggest_int('max_depth', 2,30),
        'max_bins': trial.suggest_int('max_bins', 100,255),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20,100000),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20,80),
    }

    model = HistGradientBoostingClassifier(**params)
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)
    
    acc = accuracy_score(test_y, predictions)
    
    return acc


In [43]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)  # Setup the root logger.
logger.addHandler(logging.FileHandler("optuna_hgb_output_2.log", mode="w"))

In [44]:
optuna.logging.enable_propagation()  # Propagate logs to the root logger.
optuna.logging.disable_default_handler()  # Stop showing logs in sys.stderr.

In [45]:
study = optuna.create_study(direction='maximize', storage="sqlite:///hgb_optuna_tests.db", study_name="dec_2021_test_2_8")
logger.info("Start optimization.")
study.optimize(objective, n_trials=2)

In [46]:
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_hgb_output_2.csv', index=False)
df.head()

Unnamed: 0,number,value,params_early_stopping,params_l2_regularization,params_learning_rate,params_max_bins,params_max_depth,params_max_iter,params_max_leaf_nodes,params_min_samples_leaf,state
0,0,0.950109,False,0.005616,0.003857,112,17,1000,64,22051,COMPLETE
1,1,0.939852,False,1.50373,0.009993,115,24,1000,40,89916,COMPLETE


In [35]:
np.unique(test_y.values)

array([1, 2, 3, 4, 5, 6])

In [36]:
model = HistGradientBoostingClassifier(max_iter=1)
model.fit(train_x, train_y)

HistGradientBoostingClassifier(max_iter=1)

In [40]:
predictions = model.predict(test_x)
    
predictions 

array([1, 1, 5, ..., 1, 1, 1])

In [41]:
accuracy_score(test_y, predictions)

0.83001375

In [39]:
acc

0.0041325

In [47]:
study = optuna.load_study(storage="sqlite:///hgb_optuna_tests.db", study_name="dec_2021_test_2_8")
logger.info("Start optimization.")
study.optimize(objective, n_trials=100)

In [48]:
df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))
df.to_csv('optuna_hgb_output_2.csv', index=False)
df.head()

Unnamed: 0,number,value,params_early_stopping,params_l2_regularization,params_learning_rate,params_max_bins,params_max_depth,params_max_iter,params_max_leaf_nodes,params_min_samples_leaf,state
0,0,0.950109,False,0.005615559,0.003857,112,17,1000,64,22051,COMPLETE
1,1,0.939852,False,1.50373,0.009993,115,24,1000,40,89916,COMPLETE
2,2,0.93765,False,1.437565e-09,0.004745,217,13,1000,36,71714,COMPLETE
3,3,0.94905,False,6.570549e-07,0.002343,243,30,1000,79,10636,COMPLETE
4,4,0.94845,False,1.010396e-08,0.050985,232,18,1000,61,73142,COMPLETE


In [49]:
study.best_value

0.96216375

In [50]:
study.best_params

{'early_stopping': 'False',
 'l2_regularization': 8.31762676496183,
 'learning_rate': 0.0277004401573284,
 'max_bins': 218,
 'max_depth': 28,
 'max_iter': 1000,
 'max_leaf_nodes': 67,
 'min_samples_leaf': 5845}