# Models

Let's run a bayesian optimization to find good hyper parameters among different algorithms. Multiple libraries can be used such as Optuna, Hyperopt, scikit-optimize, and bayes_opt.

* https://neptune.ai/blog/optuna-vs-hyperopt


In [1]:
import os
import sys
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
import pickle

In [3]:
#!conda install -c plotly plotly -y

In [4]:
import plotly

In [5]:
#!conda install -c conda-forge optuna -y

In [6]:
import optuna
print(optuna.__version__)

1.5.0


In [7]:
#!conda install -c conda-forge lightgbm -y

In [8]:
import lightgbm as lgb
print(lgb.__version__)

2.3.1


In [9]:
# not available for windows, try !conda install -c anaconda py-xgboost or !pip install xgboost
#!conda install -c conda-forge xgboost -y

In [10]:
import xgboost as xgb
print(xgb.__version__)

1.1.1


In [11]:
#!conda install -c conda-forge catboost -y

In [12]:
import catboost as cat
print(cat.__version__)

0.23.2


In [13]:
import sklearn as sk
print(sk.__version__)

0.22.1


In [14]:
#!conda install -c conda-forge tsfresh -y

In [15]:
import tsfresh as tsf
print(tsf.__version__)

0.16.0


In [16]:
models_path = os.path.join('..', 'models')

# Features

In [17]:
def create_tsfresh_dataframe(np_array, id_name='id', sort_name='time'):
    df = pd.DataFrame(data=np_array)
    df[id_name] = 0
    df[sort_name] = df.index
    return df

## Regression

Use these to calculate the transdermal alcohol concentration

In [18]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (ExtraTreesRegressor,
                              RandomForestRegressor,
                              AdaBoostRegressor,
                              GradientBoostingRegressor,
                              HistGradientBoostingRegressor)


In [215]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error

class RegressionObjective(object):
    def __init__(self, dataset):
        self.dataset = dataset
        self.logging = optuna.logging.get_logger('RegressionObjective')
        self.n_jobs =  int(os.cpu_count() / 2) # setting to number of physical cores is recommend by lightgbm instead of -1
        self.name = 'regressor'
        
        self.models = {
            #'ETS': self.create_ETS,
            #'RF': self.create_RF,
            #'ADA': self.create_ADA,
            #'GB': self.create_GB,
            #'HGB': self.create_HGB,
            'LGB': self.create_LGB,
            #'XGB': self.create_XGB,
            #'CAT': self.create_CAT # currently not working correctly
        }
    
    def __call__(self, trial):
        x, y = self.dataset.data, self.dataset.target        
      
        shared_param = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 1000, 5)
        }   
        
        if(len(self.models.keys()) == 1):
            regressor_name = list(self.models.keys())[0]
        else:
            regressor_name = trial.suggest_categorical('regressor', self.models.keys())
        
        self.name = regressor_name
        
        regressor_obj = self.models[regressor_name](trial, shared_param)        
        
        # https://scikit-learn.org/stable/modules/model_evaluation.html#scoring
        scoring = make_scorer(RegressionObjective.custom_scorer_metric, trial=trial)
        #scoring = 'neg_root_mean_squared_error'
        
        
        self.logging.info(f'{trial.number} - {regressor_name}')
        score = cross_val_score(regressor_obj, x, y, n_jobs=-1, cv=3, scoring=scoring)
        
        accuracy = score.mean()
        if np.isnan(accuracy):
            return -sys.maxsize
        
        return accuracy
    
    def custom_scorer_metric(y_true, y_pred, trial):
        result = mean_squared_error(y_true, y_pred)
        intermediate_value = -result
        trial.report(intermediate_value, trial.number)
        
        if trial.should_prune():
            raise optuna.TrialPruned()
        
        return intermediate_value
    
    def get_name(self, best_trial):        
        if(len(self.models.keys()) == 1):
            regressor_name = list(self.models.keys())[0]
        else:
            regressor_name = best_trial.params['regressor']
        return regressor_name
    
    def create_model(self, best_trial):        
        return self.models[self.get_name(best_trial)](optuna.trial.FixedTrial(study.best_trial.params), study.best_trial.params)
    
    def create_ETS(self, trial, param):
        # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html
        return ExtraTreesRegressor(n_estimators=param['n_estimators'])
    
    def create_RF(self, trial, param):
        # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
        return RandomForestRegressor(n_estimators=param['n_estimators'])
    
    def create_ADA(self, trial, param):
        # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html
        from sklearn.tree import DecisionTreeRegressor
        
        param.setdefault('max_depth', trial.suggest_int('max_depth', 1, 10))
        ada_base_estimator = DecisionTreeRegressor(max_depth=param['max_depth'])
        
        return AdaBoostRegressor(base_estimator=ada_base_estimator,
                                 learning_rate=param['learning_rate'],
                                 n_estimators=param['n_estimators']
                                )
    
    def create_GB(self, trial, param):
        # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
        param.setdefault('loss_GB', trial.suggest_categorical('loss_GB', ['ls', 'lad', 'huber', 'quantile']))
        param.setdefault('learning_rate', trial.suggest_discrete_uniform('learning_rate', 0.1, 1.0, 0.025))
        param.setdefault('subsample', trial.suggest_discrete_uniform('subsample', 0.05, 1.0, 0.025))
        
        return GradientBoostingRegressor(
            loss=param['loss_GB'],
            learning_rate=param['learning_rate'],
            n_estimators=param['n_estimators'],
            subsample=param['subsample']
        )
    
    
    def create_HGB(self, trial, param):
        # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingRegressor.html
        param.setdefault('loss_HGB', trial.suggest_categorical('loss_HGB', ['least_squares', 'least_absolute_deviation', 'poisson']))
        param.setdefault('learning_rate', trial.suggest_discrete_uniform('learning_rate', 0.1, 1.0, 0.025))
        param.setdefault('max_iter', trial.suggest_int('max_iter', 75, 150, 5))
        param.setdefault('num_leaves', trial.suggest_int('num_leaves', 2, 256, 2))
        param.setdefault('min_samples_leaf', trial.suggest_int('min_samples_leaf', 10, 30, 2))
        
        return HistGradientBoostingRegressor(
            loss=param['loss_HGB'],
            learning_rate=param['learning_rate'],
            max_iter=param['max_iter'],
            max_leaf_nodes=param['num_leaves'],
            min_samples_leaf=param['min_samples_leaf']
        )
    
    def create_LGB(self, trial, param):
        # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
        param.setdefault('boosting_type', trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss']))
        param.setdefault('num_leaves', trial.suggest_int('num_leaves', 2, 256, 2))
        param.setdefault('learning_rate', trial.suggest_discrete_uniform('learning_rate', 0.1, 1.0, 0.025))
        param.setdefault('subsample', trial.suggest_discrete_uniform('subsample', 0.05, 1.0, 0.025))
        param.setdefault('reg_alpha', trial.suggest_discrete_uniform('reg_alpha', 0, 0.25, 0.025))
        #param.setdefault('reg_lambda_lgb', trial.suggest_discrete_uniform('reg_lambda_lgb', 0, 0.25, 0.025))
        
        return lgb.LGBMRegressor(
            boosting_type=param['boosting_type'],
            num_leaves=param['num_leaves'],
            max_depth=-1, # no limit
            learning_rate=param['learning_rate'],
            n_estimators=param['n_estimators'],
            subsample=param['subsample'],
            reg_alpha=param['reg_alpha'],
            #reg_lambda=param['reg_lambda_lgb'],
            n_jobs=self.n_jobs
        )
    
    def create_XGB(self, trial, param):
        # https://xgboost.readthedocs.io/en/latest/parameter.html
        param.setdefault('boosting_type', trial.suggest_categorical('booster', ['gbtree', 'dart']))
        param.setdefault('learning_rate', trial.suggest_discrete_uniform('learning_rate', 0.1, 1.0, 0.025))
        param.setdefault('subsample', trial.suggest_discrete_uniform('subsample', 0.05, 1.0, 0.025))
        param.setdefault('reg_lambda_xgb', trial.suggest_discrete_uniform('reg_lambda_xgb', 0.75, 1, 0.025))
        param.setdefault('reg_alpha', trial.suggest_discrete_uniform('reg_alpha', 0, 0.25, 0.025))
        
        return xgb.XGBRegressor(        
            booster=param['boosting_type'],
            learning_rate=param['learning_rate'],
            max_depth=6, # default
            subsample=param['subsample'],
            reg_lambda=param['reg_lambda_xgb'],
            reg_alpha=param['reg_alpha'],
            n_jobs=self.n_jobs
        )
    
    
    def create_CAT(self, trial, param):
        # https://catboost.ai/docs/concepts/python-reference_parameters-list.html
        param.setdefault('reg_lambda_cat', trial.suggest_int('reg_lambda_cat', 1, 5))
        param.setdefault('learning_rate', trial.suggest_discrete_uniform('learning_rate', 0.1, 1.0, 0.025))
        param.setdefault('subsample', trial.suggest_discrete_uniform('subsample', 0.05, 1.0, 0.025))
        param.setdefault('max_depth', trial.suggest_int('max_depth', 1, 10))
        param.setdefault('num_leaves', trial.suggest_int('num_leaves', 2, 256, 2))
        
        return cat.CatBoostRegressor(            
            n_estimators=param['n_estimators'],
            learning_rate=param['learning_rate'],
            reg_lambda=param['reg_lambda_cat'],
            subsample=param['subsample'],
            max_depth=param['max_depth'],
            max_leaves=param['num_leaves'],
            thread_count=self.n_jobs
        )

## Model exploration

In [197]:
data_path = os.path.join('..','data','interim','no_gaps_resampled')

In [198]:
#file = 'JB3156_0.csv' # hardcoded, to choose the largest file
#df = pd.read_csv(os.path.join(data_path, file))
file = 'JB3156_0.feather' # hardcoded, to choose the largest file
df = pd.read_feather(os.path.join(data_path, file))
df.head(10)

Unnamed: 0,timestamp,x,y,z,tac_clean,tac_raw,ir_voltage,temperature
0,2017-05-02 15:42:12.000,-2.556815,8.312009,14.090558,0.027959,0.0,1.096,25.368868
1,2017-05-02 15:42:12.025,-2.499538,8.636412,13.906186,0.027959,0.0,1.096,25.368885
2,2017-05-02 15:42:12.050,-2.471457,8.687223,13.677473,0.027959,0.0,1.096,25.368903
3,2017-05-02 15:42:12.075,-2.349152,8.942586,12.949615,0.02796,0.0,1.096,25.36892
4,2017-05-02 15:42:12.100,-2.367242,9.273101,11.795341,0.02796,0.0,1.096,25.368937
5,2017-05-02 15:42:12.125,-2.322153,9.41102,10.971564,0.02796,0.0,1.096,25.368954
6,2017-05-02 15:42:12.150,-2.436511,9.456464,10.152822,0.02796,0.0,1.096,25.368971
7,2017-05-02 15:42:12.175,-2.389312,9.401663,8.984318,0.02796,0.0,1.096,25.368988
8,2017-05-02 15:42:12.200,-2.404972,9.184252,7.860963,0.027961,0.0,1.096,25.369005
9,2017-05-02 15:42:12.225,-2.434133,9.017045,7.175739,0.027961,0.0,1.096,25.369022


In [199]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1773740 entries, 0 to 1773739
Data columns (total 8 columns):
 #   Column       Dtype         
---  ------       -----         
 0   timestamp    datetime64[ns]
 1   x            float32       
 2   y            float32       
 3   z            float32       
 4   tac_clean    float64       
 5   tac_raw      float64       
 6   ir_voltage   float64       
 7   temperature  float64       
dtypes: datetime64[ns](1), float32(3), float64(4)
memory usage: 88.0 MB


## Sliding window

In [23]:
import numpy as np
from numpy.lib.stride_tricks import as_strided as ast

def norm_shape(shape):
    '''
    Normalize numpy array shapes so they're always expressed as a tuple,
    even for one-dimensional shapes.

    Parameters
        shape - an int, or a tuple of ints

    Returns
        a shape tuple
    '''
    try:
        i = int(shape)
        return (i,)
    except TypeError:
        # shape was not a number
        pass

    try:
        t = tuple(shape)
        return t
    except TypeError:
        # shape was not iterable
        pass

    raise TypeError('shape must be an int, or a tuple of ints')

def sliding_window(a,ws,ss = None,flatten = True):
    '''
    Return a sliding window over a in any number of dimensions

    Parameters:
        a  - an n-dimensional numpy array
        ws - an int (a is 1D) or tuple (a is 2D or greater) representing the size
             of each dimension of the window
        ss - an int (a is 1D) or tuple (a is 2D or greater) representing the
             amount to slide the window in each dimension. If not specified, it
             defaults to ws.
        flatten - if True, all slices are flattened, otherwise, there is an
                  extra dimension for each dimension of the input.

    Returns
        an array containing each n-dimensional window from a
    '''

    if None is ss:
        # ss was not provided. the windows will not overlap in any direction.
        ss = ws
    ws = norm_shape(ws)
    ss = norm_shape(ss)

    # convert ws, ss, and a.shape to numpy arrays so that we can do math in every
    # dimension at once.
    ws = np.array(ws)
    ss = np.array(ss)
    shape = np.array(a.shape)


    # ensure that ws, ss, and a.shape all have the same number of dimensions
    ls = [len(shape),len(ws),len(ss)]
    if 1 != len(set(ls)):
        raise ValueError(\
        'a.shape, ws and ss must all have the same length. They were %s' % str(ls))

    # ensure that ws is smaller than a in every dimension
    if np.any(ws > shape):
        raise ValueError(\
        'ws cannot be larger than a in any dimension.\
 a.shape was %s and ws was %s' % (str(a.shape),str(ws)))

    # how many slices will there be in each dimension?
    newshape = norm_shape(((shape - ws) // ss) + 1)
    # the shape of the strided array will be the number of slices in each dimension
    # plus the shape of the window (tuple addition)
    newshape += norm_shape(ws)
    # the strides tuple will be the array's strides multiplied by step size, plus
    # the array's strides (tuple addition)
    newstrides = norm_shape(np.array(a.strides) * ss) + a.strides
    strided = ast(a,shape = newshape,strides = newstrides)
    if not flatten:
        return strided

    # Collapse strided so that it has one more dimension than the window.  I.e.,
    # the new array is a flat list of slices.
    meat = len(ws) if ws.shape else 0
    firstdim = (np.product(newshape[:-meat]),) if ws.shape else ()
    dim = firstdim + (newshape[-meat:])
    # remove any dimensions with size 1
#     dim = filter(lambda i : i != 1,dim)
    return strided.reshape(dim)

---

## Features

In [241]:
#from librosa.feature import spectral_centroid
from sklearn.decomposition import PCA
from tsfresh.feature_extraction import feature_calculators as tffc


agg_ac_param = [{'f_agg': 'mean', 'maxlag': 10}]
agg_lin_param = [{'attr': 'intercept', 'chunk_len': 5, 'f_agg': 'mean'}]
ar_coef_param = [{'coeff': 2, 'k': 3}]
aug_dick_param = [{'attr': 'pvalue', 'autolag': 'AIC'}]
fft_agg_param = [{'aggtype': 'centroid'}]
lin_t_param = [{'attr': 'pvalue'},{'attr': 'rvalue'}, {'attr': 'intercept'}]

def apply_extraction(x):
    pca = PCA(n_components=1).fit_transform(x)
    return np.hstack([
    #return np.concatenate([
        x[0],
        x[-1],
        np.mean(x, axis=0),
        np.median(x, axis=0),
        np.var(x, axis=0),
        np.std(x, axis=0),
        np.max(x, axis=0),
        np.min(x, axis=0),
        np.argmax(x, axis=0),
        np.argmin(x, axis=0),
        np.sum(x, axis=0),
        np.quantile(x, q=.5, axis=0),
        np.quantile(x, q=.75, axis=0),
        np.quantile(x, q=.25, axis=0),
        #spectral_centroid(y=x, sr=40, hop_length=x.shape[0]+1),
        np.min([pca]),
        np.max([pca]),
        np.mean([pca]),
        np.array([pca]).flatten(),
        # 33.257962465286255
        tffc.abs_energy(x[:,0]),
        tffc.abs_energy(x[:,1]),
        tffc.abs_energy(x[:,2]),
        # 33.27296829223633
        tffc.absolute_sum_of_changes(x[:,0]),
        tffc.absolute_sum_of_changes(x[:,1]),
        tffc.absolute_sum_of_changes(x[:,2]),
        # 34.12496304512024
        #np.array(tffc.agg_autocorrelation(x[:,0], agg_ac_param)).flatten(), # weird return value
        #np.array(tffc.agg_autocorrelation(x[:,1], agg_ac_param)).flatten(),
        #np.array(tffc.agg_autocorrelation(x[:,2], agg_ac_param)).flatten(),
        #tffc.agg_linear_trend(x[:,0], agg_lin_param), # 130s extra, slow and unzip required
        #tffc.agg_linear_trend(x[:,1], agg_lin_param),
        #tffc.agg_linear_trend(x[:,2], agg_lin_param),
        #tffc.approximate_entropy(x[:,0], m=40, r=1.0), # suuuper slow
        #tffc.approximate_entropy(x[:,1], m=40, r=1.0),
        #tffc.approximate_entropy(x[:,2], m=40, r=1.0),
        #np.array(tffc.ar_coefficient(x[:,0], ar_coef_param)).flatten(), # spams warnings
        #np.array(tffc.ar_coefficient(x[:,1], ar_coef_param)).flatten(),
        #np.array(tffc.ar_coefficient(x[:,2], ar_coef_param)).flatten(),
        #np.array(tffc.augmented_dickey_fuller(x[:,0], aug_dick_param)).flatten(), # slow, spams warnings
        #np.array(tffc.augmented_dickey_fuller(x[:,1], aug_dick_param)).flatten(),
        #np.array(tffc.augmented_dickey_fuller(x[:,2], aug_dick_param)).flatten(),
        
        tffc.autocorrelation(x[:,0], 40),
        tffc.autocorrelation(x[:,1], 40),
        tffc.autocorrelation(x[:,2], 40),
        # 45.55896019935608
        tffc.binned_entropy(x[:,0], 40),
        tffc.binned_entropy(x[:,1], 40),
        tffc.binned_entropy(x[:,2], 40),
        # 52.41102385520935 + without 45
        
        tffc.c3(x[:,0], 40),
        tffc.c3(x[:,1], 40),
        tffc.c3(x[:,2], 40),
        # 37.15099740028381 without 52
        tffc.cid_ce(x[:,0], normalize=True),
        tffc.cid_ce(x[:,1], normalize=True),
        tffc.cid_ce(x[:,2], normalize=True),
        # 42.66399002075195
        tffc.count_above(x[:,0], t=0),
        tffc.count_above(x[:,1], t=0),
        tffc.count_above(x[:,2], t=0),
        # 44.1319637298584
        tffc.count_above_mean(x[:,0]),
        tffc.count_above_mean(x[:,1]),
        tffc.count_above_mean(x[:,2]),
        # 47.29397225379944
        tffc.count_below(x[:,0], t=0),
        tffc.count_below(x[:,1], t=0),
        tffc.count_below(x[:,2], t=0),
        # 48.55599308013916
        tffc.count_below_mean(x[:,0]),
        tffc.count_below_mean(x[:,1]),
        tffc.count_below_mean(x[:,2]),
        # 50.72899270057678
        
        #tffc.fft_aggregated(x[:,0], fft_agg_param).to_numpy(), # return zip not pd.series
        #tffc.fft_aggregated(x[:,1], fft_agg_param).to_numpy(),
        #tffc.fft_aggregated(x[:,2], fft_agg_param).to_numpy(),
        tffc.first_location_of_maximum(x[:,0]),
        tffc.first_location_of_maximum(x[:,1]),
        tffc.first_location_of_maximum(x[:,2]),
        tffc.first_location_of_minimum(x[:,0]),
        tffc.first_location_of_minimum(x[:,1]),
        tffc.first_location_of_minimum(x[:,2]),
        tffc.kurtosis(x[:,0]),
        tffc.kurtosis(x[:,1]),
        tffc.kurtosis(x[:,2]),
        tffc.last_location_of_maximum(x[:,0]),
        tffc.last_location_of_maximum(x[:,1]),
        tffc.last_location_of_maximum(x[:,2]),
        tffc.last_location_of_minimum(x[:,0]),
        tffc.last_location_of_minimum(x[:,1]),
        tffc.last_location_of_minimum(x[:,2]),
        #tffc.linear_trend(x[:,0], lin_t_param).to_numpy(), # returns list not pd.series
        #tffc.linear_trend(x[:,1], lin_t_param).to_numpy(),
        #tffc.linear_trend(x[:,2], lin_t_param).to_numpy(),
        tffc.longest_strike_above_mean(x[:,0]),
        tffc.longest_strike_above_mean(x[:,1]),
        tffc.longest_strike_above_mean(x[:,2]),
        tffc.longest_strike_below_mean(x[:,0]),
        tffc.longest_strike_below_mean(x[:,1]),
        tffc.longest_strike_below_mean(x[:,2]),
        tffc.mean_abs_change(x[:,0]),
        tffc.mean_abs_change(x[:,1]),
        tffc.mean_abs_change(x[:,2]),
        tffc.mean_change(x[:,0]),
        tffc.mean_change(x[:,1]),
        tffc.mean_change(x[:,2]),
        tffc.mean_second_derivative_central(x[:,0]),
        tffc.mean_second_derivative_central(x[:,1]),
        tffc.mean_second_derivative_central(x[:,2]),
        tffc.number_crossing_m(x[:,0], m=0),
        tffc.number_crossing_m(x[:,1], m=0),
        tffc.number_crossing_m(x[:,2], m=0),
        tffc.number_peaks(x[:,0], n=2),
        tffc.number_peaks(x[:,1], n=2),
        tffc.number_peaks(x[:,2], n=2),
        
        #tffc.sample_entropy(x[:,0]), # super slow
        #tffc.sample_entropy(x[:,1]),
        #tffc.sample_entropy(x[:,2]),
        tffc.skewness(x[:,0]), # 60s extra, slow
        tffc.skewness(x[:,1]),
        tffc.skewness(x[:,2]),
        
        tffc.time_reversal_asymmetry_statistic(x[:,0], lag=10),
        tffc.time_reversal_asymmetry_statistic(x[:,1], lag=10),
        tffc.time_reversal_asymmetry_statistic(x[:,2], lag=10),
        tffc.variation_coefficient(x[:,0]),
        tffc.variation_coefficient(x[:,1]),
        tffc.variation_coefficient(x[:,2]),
    ])

In [242]:
import time

def apply_sliding_window(X, y, ws=400, ss=80): # maybe use bigger windows to reduce the number of samples?
    
    begin = time.time()
    start = time.time()
    data_x = sliding_window(X,(ws,X.shape[1]),(ss,1))
    #print(f'window x: {time.time() - start}')
    
    start = time.time()
    data_x = np.array([ apply_extraction(x) for x in data_x])
    #print(f'features: {time.time() - start}')
    
    start = time.time()
    data_y = np.asarray([[i[-1]] for i in sliding_window(y,ws,ss)])
    #print(f'window y: {time.time() - start}')
    
    data_y *= 100 # Try to increate numbers to reduce error
    print(f'Total: {time.time() - begin}')
    
    return data_x.astype(np.float32), data_y.reshape(len(data_y))

In [165]:
from collections import namedtuple

def df_to_dataset(df):
    data = df.drop(['timestamp', 'tac_clean', 'tac_raw', 'ir_voltage', 'temperature'], axis='columns')
    target = df['tac_clean']
    
    data, target = apply_sliding_window(data.to_numpy(), target.to_numpy())
    
    dataset = namedtuple('Dataset', ['data', 'target'])
    dataset.data = data
    dataset.target = target

    return dataset    

---

In [243]:
dataset = df_to_dataset(df)

Total: 189.1859998703003


In [244]:
dataset.data.shape

(22167, 520)

In [245]:
dataset.target.shape

(22167,)

# HPO

In [246]:
objective = RegressionObjective(dataset)

In [247]:
# https://optuna.readthedocs.io/en/stable/faq.html#how-to-suppress-log-messages-of-optuna
#optuna.logging.set_verbosity(optuna.logging.WARNING) # default.INFO

In [248]:
sampler = optuna.samplers.TPESampler(seed=42) # Make the sampler behave in a deterministic way.

Use Hyperband pruning:
    
* https://tech.preferred.jp/en/blog/how-we-implement-hyperband-in-optuna/

In [249]:
pruner = optuna.pruners.MedianPruner(n_startup_trials=10)
#pruner = optuna.pruners.HyperbandPruner(min_resource=1, max_resource=100, reduction_factor=3)

In [250]:
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=1000)

[I 2020-07-26 16:08:14,271] Finished trial#0 with value: -6.226782020000555 with parameters: {'n_estimators': 520, 'boosting_type': 'gbdt', 'num_leaves': 30, 'learning_rate': 0.775, 'subsample': 0.6250000000000001, 'reg_alpha': 0.025, 'reg_lambda_lgb': 0.025}. Best is trial#0 with value: -6.226782020000555.
[I 2020-07-26 16:08:24,970] Finished trial#1 with value: -1.3923741595941456e+79 with parameters: {'n_estimators': 380, 'boosting_type': 'goss', 'num_leaves': 176, 'learning_rate': 0.4, 'subsample': 0.175, 'reg_alpha': 0.17500000000000002, 'reg_lambda_lgb': 0.0}. Best is trial#0 with value: -6.226782020000555.
[I 2020-07-26 16:09:33,890] Finished trial#2 with value: -4.925650870174299 with parameters: {'n_estimators': 445, 'boosting_type': 'dart', 'num_leaves': 76, 'learning_rate': 0.1, 'subsample': 1.0, 'reg_alpha': 0.15000000000000002, 'reg_lambda_lgb': 0.15000000000000002}. Best is trial#2 with value: -4.925650870174299.
[I 2020-07-26 16:10:15,453] Finished trial#3 with value: -6

[I 2020-07-26 16:43:51,778] Finished trial#51 with value: -4.448325776122324 with parameters: {'n_estimators': 665, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.375, 'subsample': 0.6500000000000001, 'reg_alpha': 0.17500000000000002, 'reg_lambda_lgb': 0.25}. Best is trial#16 with value: -4.413042242721769.
[I 2020-07-26 16:43:53,335] Finished trial#52 with value: -3.98650085988287 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.375, 'subsample': 0.7000000000000001, 'reg_alpha': 0.15000000000000002, 'reg_lambda_lgb': 0.25}. Best is trial#52 with value: -3.98650085988287.
[I 2020-07-26 16:43:59,299] Finished trial#53 with value: -4.64764364797123 with parameters: {'n_estimators': 140, 'boosting_type': 'dart', 'num_leaves': 12, 'learning_rate': 0.275, 'subsample': 0.675, 'reg_alpha': 0.17500000000000002, 'reg_lambda_lgb': 0.2}. Best is trial#52 with value: -3.98650085988287.
[I 2020-07-26 16:44:05,074] Finished trial#54 wit

[I 2020-07-26 16:48:45,790] Finished trial#101 with value: -4.373646584839265 with parameters: {'n_estimators': 100, 'boosting_type': 'dart', 'num_leaves': 4, 'learning_rate': 0.25, 'subsample': 0.1, 'reg_alpha': 0.15000000000000002, 'reg_lambda_lgb': 0.07500000000000001}. Best is trial#82 with value: -3.8975948611898104.
[I 2020-07-26 16:48:47,887] Finished trial#102 with value: -4.083560848388084 with parameters: {'n_estimators': 85, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.17500000000000002, 'subsample': 0.15000000000000002, 'reg_alpha': 0.15000000000000002, 'reg_lambda_lgb': 0.125}. Best is trial#82 with value: -3.8975948611898104.
[I 2020-07-26 16:48:49,626] Finished trial#103 with value: -4.291779904028735 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 14, 'learning_rate': 0.17500000000000002, 'subsample': 0.125, 'reg_alpha': 0.125, 'reg_lambda_lgb': 0.05}. Best is trial#82 with value: -3.8975948611898104.
[I 2020-07-26 16:48:51,6

[I 2020-07-26 16:53:42,869] Finished trial#151 with value: -4.1518163562516435 with parameters: {'n_estimators': 50, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.1, 'reg_alpha': 0.15000000000000002, 'reg_lambda_lgb': 0.17500000000000002}. Best is trial#143 with value: -3.8011721050219904.
[I 2020-07-26 16:53:45,839] Finished trial#152 with value: -4.350069595082824 with parameters: {'n_estimators': 80, 'boosting_type': 'dart', 'num_leaves': 6, 'learning_rate': 0.1, 'subsample': 0.15000000000000002, 'reg_alpha': 0.15000000000000002, 'reg_lambda_lgb': 0.125}. Best is trial#143 with value: -3.8011721050219904.
[I 2020-07-26 16:53:47,737] Finished trial#153 with value: -4.063887986860357 with parameters: {'n_estimators': 30, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.17500000000000002, 'subsample': 0.175, 'reg_alpha': 0.125, 'reg_lambda_lgb': 0.17500000000000002}. Best is trial#143 with value: -3.8011721050219904.
[I 2020-07-26 16:53:51

[I 2020-07-26 16:55:30,712] Finished trial#202 with value: -3.8295601432112734 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.15000000000000002, 'reg_alpha': 0.225, 'reg_lambda_lgb': 0.0}. Best is trial#201 with value: -3.80117036930062.
[I 2020-07-26 16:55:32,276] Finished trial#203 with value: -3.8295601432112734 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.125, 'reg_alpha': 0.225, 'reg_lambda_lgb': 0.0}. Best is trial#201 with value: -3.80117036930062.
[I 2020-07-26 16:55:34,123] Finished trial#204 with value: -4.0855410900956715 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 8, 'learning_rate': 0.125, 'subsample': 0.125, 'reg_alpha': 0.225, 'reg_lambda_lgb': 0.0}. Best is trial#201 with value: -3.80117036930062.
[I 2020-07-26 16:55:36,027] Finished trial#205 with value: -4.128289906078131 with parameters: {'n

[I 2020-07-26 16:57:42,900] Finished trial#255 with value: -4.0426228529832295 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 6, 'learning_rate': 0.125, 'subsample': 0.2, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#201 with value: -3.80117036930062.
[I 2020-07-26 16:57:44,600] Finished trial#256 with value: -3.949653810899758 with parameters: {'n_estimators': 30, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.175, 'reg_alpha': 0.225, 'reg_lambda_lgb': 0.0}. Best is trial#201 with value: -3.80117036930062.
[I 2020-07-26 16:57:46,594] Finished trial#257 with value: -4.036443756075716 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 10, 'learning_rate': 0.1, 'subsample': 0.1, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#201 with value: -3.80117036930062.
[I 2020-07-26 16:57:48,542] Finished trial#258 with value: -4.1518165759912415 with parameters: {'n_estimators': 50, 'boost

[I 2020-07-26 17:00:58,362] Finished trial#309 with value: -4.450676125305645 with parameters: {'n_estimators': 25, 'boosting_type': 'dart', 'num_leaves': 16, 'learning_rate': 0.125, 'subsample': 0.15000000000000002, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#201 with value: -3.80117036930062.
[I 2020-07-26 17:01:01,984] Finished trial#310 with value: -4.391866634057127 with parameters: {'n_estimators': 85, 'boosting_type': 'dart', 'num_leaves': 8, 'learning_rate': 0.1, 'subsample': 0.1, 'reg_alpha': 0.225, 'reg_lambda_lgb': 0.0}. Best is trial#201 with value: -3.80117036930062.
[I 2020-07-26 17:01:03,597] Finished trial#311 with value: -3.801170320792275 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.15000000000000002, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.025}. Best is trial#311 with value: -3.801170320792275.
[I 2020-07-26 17:01:05,544] Finished trial#312 with value: -4.053514218281574 with paramete

[I 2020-07-26 17:03:31,342] Finished trial#363 with value: -4.159866089057107 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 16, 'learning_rate': 0.125, 'subsample': 0.07500000000000001, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.025}. Best is trial#311 with value: -3.801170320792275.
[I 2020-07-26 17:03:34,343] Finished trial#364 with value: -4.3209485815326545 with parameters: {'n_estimators': 55, 'boosting_type': 'dart', 'num_leaves': 8, 'learning_rate': 0.1, 'subsample': 0.125, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.05}. Best is trial#311 with value: -3.801170320792275.
[I 2020-07-26 17:03:36,165] Finished trial#365 with value: -84.67838763984467 with parameters: {'n_estimators': 10, 'boosting_type': 'goss', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.2, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.025}. Best is trial#311 with value: -3.801170320792275.
[I 2020-07-26 17:03:38,825] Finished trial#366 with value: -4.472123985871144 with parameters: {'n

[I 2020-07-26 17:07:46,958] Finished trial#415 with value: -4.788974952011841 with parameters: {'n_estimators': 30, 'boosting_type': 'dart', 'num_leaves': 140, 'learning_rate': 0.1, 'subsample': 0.175, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.05}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:07:50,237] Finished trial#416 with value: -4.350016233047502 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 80, 'learning_rate': 0.125, 'subsample': 0.5750000000000001, 'reg_alpha': 0.225, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:07:52,324] Finished trial#417 with value: -4.223796022626689 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 20, 'learning_rate': 0.15000000000000002, 'subsample': 0.22500000000000003, 'reg_alpha': 0.2, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:08:02,558] Finished trial#418 with value: -4.841452

[I 2020-07-26 17:10:42,846] Finished trial#468 with value: -4.027062742572048 with parameters: {'n_estimators': 45, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.22500000000000003, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:10:44,924] Finished trial#469 with value: -4.053555834001158 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 12, 'learning_rate': 0.1, 'subsample': 0.2, 'reg_alpha': 0.225, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:10:47,215] Finished trial#470 with value: -4.313932587210446 with parameters: {'n_estimators': 30, 'boosting_type': 'dart', 'num_leaves': 8, 'learning_rate': 0.1, 'subsample': 0.175, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.07500000000000001}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:10:53,903] Finished trial#471 with value: -5.0756172814688165 with pa

[I 2020-07-26 17:13:53,999] Finished trial#521 with value: -4.081200887136621 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 18, 'learning_rate': 0.1, 'subsample': 0.125, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:14:15,011] Finished trial#522 with value: -4.7995997256321585 with parameters: {'n_estimators': 950, 'boosting_type': 'dart', 'num_leaves': 6, 'learning_rate': 0.15000000000000002, 'subsample': 0.125, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:14:17,112] Finished trial#523 with value: -4.053534421422496 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 12, 'learning_rate': 0.1, 'subsample': 0.15000000000000002, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:14:19,949] Finished trial#524 with value: -4.504781937600861 with p

[I 2020-07-26 17:17:06,408] Finished trial#574 with value: -4.127874224032274 with parameters: {'n_estimators': 10, 'boosting_type': 'goss', 'num_leaves': 14, 'learning_rate': 0.1, 'subsample': 0.175, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:17:08,613] Finished trial#575 with value: -4.065931966092685 with parameters: {'n_estimators': 45, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.15000000000000002, 'subsample': 0.15000000000000002, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:17:11,053] Finished trial#576 with value: -4.265469223513601 with parameters: {'n_estimators': 25, 'boosting_type': 'dart', 'num_leaves': 8, 'learning_rate': 0.1, 'subsample': 0.125, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:17:13,109] Finished trial#577 with value: -4.4631469067680145 with par

[I 2020-07-26 17:19:56,293] Finished trial#628 with value: -4.0038132360888 with parameters: {'n_estimators': 30, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.175, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:19:59,292] Finished trial#629 with value: -4.393625633388724 with parameters: {'n_estimators': 45, 'boosting_type': 'dart', 'num_leaves': 10, 'learning_rate': 0.1, 'subsample': 0.22500000000000003, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:20:01,600] Finished trial#630 with value: -118.9817808923895 with parameters: {'n_estimators': 10, 'boosting_type': 'goss', 'num_leaves': 16, 'learning_rate': 0.125, 'subsample': 0.175, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.0}. Best is trial#389 with value: -3.8011702019246445.
[I 2020-07-26 17:20:05,037] Finished trial#631 with value: -4.41903811386322 with parameters: {'n_es

[I 2020-07-26 17:22:48,954] Finished trial#681 with value: -4.289940234054799 with parameters: {'n_estimators': 45, 'boosting_type': 'dart', 'num_leaves': 6, 'learning_rate': 0.1, 'subsample': 0.07500000000000001, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.07500000000000001}. Best is trial#659 with value: -3.801170145087335.
[I 2020-07-26 17:22:51,180] Finished trial#682 with value: -3.829558872754154 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.07500000000000001}. Best is trial#659 with value: -3.801170145087335.
[I 2020-07-26 17:22:53,349] Finished trial#683 with value: -3.8011702019246445 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.07500000000000001}. Best is trial#659 with value: -3.801170145087335.
[I 2020-07-26 17:22:56,213] Finished trial#684 with value:

[I 2020-07-26 17:25:24,277] Finished trial#733 with value: -4.065913128554988 with parameters: {'n_estimators': 45, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.15000000000000002, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#725 with value: -3.8011700914007047.
[I 2020-07-26 17:25:26,659] Finished trial#734 with value: -3.9825230392308875 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 6, 'learning_rate': 0.1, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.1}. Best is trial#725 with value: -3.8011700914007047.
[I 2020-07-26 17:25:32,559] Finished trial#735 with value: -4.224434626073736 with parameters: {'n_estimators': 540, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.07500000000000001, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.1}. Best is trial#725 with value: -3.8011700914007047.
[I 2020-07-26 17:25:34,660] Finished trial#736 with value: -3.8011700914007047 with pa

[I 2020-07-26 17:29:35,002] Finished trial#786 with value: -3.8011700914007047 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.1, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#725 with value: -3.8011700914007047.
[I 2020-07-26 17:29:38,078] Finished trial#787 with value: -4.388387495303182 with parameters: {'n_estimators': 45, 'boosting_type': 'dart', 'num_leaves': 10, 'learning_rate': 0.1, 'subsample': 0.1, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#725 with value: -3.8011700914007047.
[I 2020-07-26 17:29:40,473] Finished trial#788 with value: -4.003798294133011 with parameters: {'n_estimators': 30, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.07500000000000001, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#725 with value: -3.8011700914007047.
[I 2020-07-26 17:29:50,214] Finished trial#789 with value: -4.5880260480266015 with parameters: {'

[I 2020-07-26 17:32:31,980] Finished trial#838 with value: -4.1314455505481495 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 12, 'learning_rate': 0.125, 'subsample': 0.07500000000000001, 'reg_alpha': 0.225, 'reg_lambda_lgb': 0.1}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:32:34,290] Finished trial#839 with value: -3.9314556536851186 with parameters: {'n_estimators': 25, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.1, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:32:36,746] Finished trial#840 with value: -4.0230883616223165 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 8, 'learning_rate': 0.1, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:32:39,744] Finished trial#841 with value: -7.52819086218358e+75 with parameters

[I 2020-07-26 17:36:15,038] Finished trial#890 with value: -4.68869108380714 with parameters: {'n_estimators': 60, 'boosting_type': 'dart', 'num_leaves': 12, 'learning_rate': 0.35, 'subsample': 0.07500000000000001, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.15000000000000002}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:36:17,354] Finished trial#891 with value: -3.8011700914007047 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:36:21,627] Finished trial#892 with value: -4.581193336245966 with parameters: {'n_estimators': 45, 'boosting_type': 'dart', 'num_leaves': 20, 'learning_rate': 0.125, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:36:23,928] Finished trial#893 with value: -3.801170145087335 wit

[I 2020-07-26 17:39:19,294] Finished trial#942 with value: -4.496299268247015 with parameters: {'n_estimators': 50, 'boosting_type': 'dart', 'num_leaves': 12, 'learning_rate': 0.15000000000000002, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:39:21,979] Finished trial#943 with value: -4.0522433776174696 with parameters: {'n_estimators': 90, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.125, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.15000000000000002}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:39:24,754] Finished trial#944 with value: -4.100714511282641 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 20, 'learning_rate': 0.1, 'subsample': 0.07500000000000001, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:39:27,631] Finished trial#945 with value: -4.365

[I 2020-07-26 17:42:43,743] Finished trial#994 with value: -4.229837138930464 with parameters: {'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 38, 'learning_rate': 0.125, 'subsample': 0.1, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:42:47,317] Finished trial#995 with value: -4.5321519353203525 with parameters: {'n_estimators': 45, 'boosting_type': 'dart', 'num_leaves': 12, 'learning_rate': 0.15000000000000002, 'subsample': 0.7750000000000001, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.125}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:42:49,909] Finished trial#996 with value: -3.9314571515533445 with parameters: {'n_estimators': 25, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.05, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.1}. Best is trial#807 with value: -3.8011700330839697.
[I 2020-07-26 17:42:52,868] Finished trial#997 with value: -4.189007336286164 with

In [302]:
df_s = study.trials_dataframe()

In [301]:
# dirty fix
import random
#tx = study.trials[7]
#tx.value = -5
#study._storage._set_trial(study.trials[7].number, tx)
#study.trials[7]
for t in study.trials:
    if t.value <= -5:
        t.value = -5 - random.random()
        study._storage._set_trial(t.number, t)

In [303]:
df_s.tail(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_boosting_type,params_learning_rate,params_n_estimators,params_num_leaves,params_reg_alpha,params_reg_lambda_lgb,params_subsample,state
990,990,-4.423465,2020-07-26 17:42:26.383186,2020-07-26 17:42:29.527409,00:00:03.144223,dart,0.125,30,10,0.225,0.125,0.05,COMPLETE
991,991,-4.727712,2020-07-26 17:42:29.531378,2020-07-26 17:42:32.710379,00:00:03.179001,dart,0.1,60,6,0.25,0.125,0.075,COMPLETE
992,992,-4.41541,2020-07-26 17:42:32.713378,2020-07-26 17:42:37.768480,00:00:05.055102,dart,0.125,10,154,0.25,0.1,0.05,COMPLETE
993,993,-3.949638,2020-07-26 17:42:37.771477,2020-07-26 17:42:40.489623,00:00:02.718146,dart,0.1,30,2,0.25,0.15,0.075,COMPLETE
994,994,-4.229837,2020-07-26 17:42:40.493619,2020-07-26 17:42:43.742423,00:00:03.248804,dart,0.125,10,38,0.25,0.125,0.1,COMPLETE
995,995,-4.532152,2020-07-26 17:42:43.746392,2020-07-26 17:42:47.317766,00:00:03.571374,dart,0.15,45,12,0.25,0.125,0.775,COMPLETE
996,996,-3.931457,2020-07-26 17:42:47.320732,2020-07-26 17:42:49.909582,00:00:02.588850,dart,0.1,25,2,0.25,0.1,0.05,COMPLETE
997,997,-4.189007,2020-07-26 17:42:49.913544,2020-07-26 17:42:52.867605,00:00:02.954061,dart,0.125,10,18,0.225,0.125,0.075,COMPLETE
998,998,-4.413591,2020-07-26 17:42:52.871607,2020-07-26 17:42:56.461994,00:00:03.590387,dart,0.1,75,6,0.25,0.125,0.075,COMPLETE
999,999,-4.065913,2020-07-26 17:42:56.465959,2020-07-26 17:42:59.116223,00:00:02.650264,dart,0.15,45,2,0.25,0.125,0.1,COMPLETE


In [305]:
print(f'Best trial until now: {study.best_trial.number}')
print(' Value: ', study.best_trial.value)
print(' Params: ')
for key, value in study.best_trial.params.items():
    print(f'    {key}: {value}')

Best trial until now: 807
 Value:  -3.8011700330839697
 Params: 
    n_estimators: 10
    boosting_type: dart
    num_leaves: 2
    learning_rate: 0.1
    subsample: 0.07500000000000001
    reg_alpha: 0.25
    reg_lambda_lgb: 0.15000000000000002


In [288]:
#optuna.visualization.plot_contour(study, params=study.best_trial.params)

In [289]:
optuna.visualization.plot_intermediate_values(study) # apparently not working in current conda version?

[W 2020-07-26 18:26:54,233] You need to set up the pruning feature to utilize `plot_intermediate_values()`


In [304]:
optuna.visualization.plot_optimization_history(study)

In [307]:
optuna.visualization.plot_parallel_coordinate(study)

In [308]:
optuna.importance.get_param_importances(study)
#optuna.visualization.plot_param_importances(study)


get_param_importances is experimental (supported from v1.3.0). The interface can change in the future.


MeanDecreaseImpurityImportanceEvaluator is experimental (supported from v1.5.0). The interface can change in the future.



OrderedDict([('num_leaves', 0.33549095833491716),
             ('n_estimators', 0.3329596561540712),
             ('learning_rate', 0.16027389715185955),
             ('boosting_type', 0.12930899842422203),
             ('subsample', 0.015124982533618367),
             ('reg_lambda_lgb', 0.014859234137949272),
             ('reg_alpha', 0.01198227326336244)])

In [231]:
##!conda install -c plotly plotly-orca -y
#!conda install -c plotly python-kaleido -y

In [232]:
import plotly.io as pio
pio.kaleido.scope.default_format = "png"
pio.kaleido.scope.default_width = 800
pio.kaleido.scope.default_height = 400
pio.kaleido.scope.default_scale = 2

optuna.visualization.plot_slice(study).write_image("fig1.png")

AttributeError: module 'plotly.io' has no attribute 'kaleido'

In [313]:
optuna.visualization.plot_slice(study)

---

## Training

In [312]:
study.best_trial

FrozenTrial(number=807, value=-3.8011700330839697, datetime_start=datetime.datetime(2020, 7, 26, 17, 30, 54, 987829), datetime_complete=datetime.datetime(2020, 7, 26, 17, 30, 57, 374692), params={'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.07500000000000001, 'reg_alpha': 0.25, 'reg_lambda_lgb': 0.15000000000000002}, distributions={'n_estimators': IntUniformDistribution(high=1000, low=10, step=5), 'boosting_type': CategoricalDistribution(choices=('gbdt', 'dart', 'goss')), 'num_leaves': IntUniformDistribution(high=256, low=2, step=2), 'learning_rate': DiscreteUniformDistribution(high=1.0, low=0.1, q=0.025), 'subsample': DiscreteUniformDistribution(high=1.0, low=0.05, q=0.025), 'reg_alpha': DiscreteUniformDistribution(high=0.25, low=0, q=0.025), 'reg_lambda_lgb': DiscreteUniformDistribution(high=0.25, low=0, q=0.025)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=807, state=TrialState.COMPLETE)

In [181]:
study.best_trial

FrozenTrial(number=348, value=-1.830178203294662, datetime_start=datetime.datetime(2020, 7, 26, 6, 43, 44, 825069), datetime_complete=datetime.datetime(2020, 7, 26, 6, 43, 46, 263068), params={'n_estimators': 10, 'boosting_type': 'dart', 'num_leaves': 2, 'learning_rate': 0.1, 'subsample': 0.05, 'reg_alpha': 0.17500000000000002, 'reg_lambda_lgb': 0.07500000000000001}, distributions={'n_estimators': IntUniformDistribution(high=1000, low=10, step=5), 'boosting_type': CategoricalDistribution(choices=('gbdt', 'dart', 'goss')), 'num_leaves': IntUniformDistribution(high=256, low=2, step=2), 'learning_rate': DiscreteUniformDistribution(high=1.0, low=0.1, q=0.025), 'subsample': DiscreteUniformDistribution(high=1.0, low=0.05, q=0.025), 'reg_alpha': DiscreteUniformDistribution(high=0.25, low=0, q=0.025), 'reg_lambda_lgb': DiscreteUniformDistribution(high=0.25, low=0, q=0.025)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=348, state=TrialState.COMPLETE)

In [314]:
model = objective.create_model(study.best_trial)

In [315]:
model

LGBMRegressor(boosting_type='dart', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10, n_jobs=12, num_leaves=2, objective=None,
              random_state=None, reg_alpha=0.25, reg_lambda=0.15000000000000002,
              silent=True, subsample=0.07500000000000001,
              subsample_for_bin=200000, subsample_freq=0)

In [316]:
lgb.plot_importance(model, max_num_features=10)

NotFittedError: No booster found. Need to call fit beforehand.

In [317]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

error_records = {}

for file in os.listdir(data_path):
    df = pd.read_feather(os.path.join(data_path, file))
    dataset = df_to_dataset(df)
    print(file)
    
    X_train, X_val, y_train, y_val = train_test_split(dataset.data, dataset.target, random_state=42)
    
    model.fit(X_train, y_train)    
    
    y_pred = model.predict(X_val)    
    
    error = mean_squared_error(y_val, y_pred)
    error_records[file] = error    

Total: 67.959965467453
BK7610_0.feather
Total: 104.6629991531372
BK7610_1.feather
Total: 4.2199976444244385
BU4707_0.feather
Total: 41.3999981880188
BU4707_1.feather
Total: 1.824997901916504
BU4707_2.feather
Total: 7.391995191574097
BU4707_3.feather
Total: 4.8919997215271
BU4707_4.feather
Total: 31.614002466201782
CC6740_0.feather
Total: 2.792994737625122
CC6740_1.feather
Total: 1.663999080657959
CC6740_10.feather
Total: 1.6829981803894043
CC6740_11.feather
Total: 1.7819995880126953
CC6740_12.feather
Total: 13.786998271942139
CC6740_13.feather
Total: 10.676993370056152
CC6740_14.feather
Total: 40.39599800109863
CC6740_15.feather
Total: 14.657029628753662
CC6740_16.feather
Total: 8.334999561309814
CC6740_17.feather
Total: 146.8540003299713
CC6740_18.feather
Total: 2.1239991188049316
CC6740_2.feather
Total: 1.6589956283569336
CC6740_3.feather
Total: 13.263997793197632
CC6740_4.feather
Total: 4.778994798660278
CC6740_5.feather
Total: 6.158995151519775
CC6740_6.feather
Total: 3.56399512290

In [318]:
error_records

{'BK7610_0.feather': 0.7784136459054397,
 'BK7610_1.feather': 20.107831370300758,
 'BU4707_0.feather': 0.013696581024420956,
 'BU4707_1.feather': 2.564417173710098,
 'BU4707_2.feather': 9.354013224797345e-07,
 'BU4707_3.feather': 0.12460574872668395,
 'BU4707_4.feather': 0.23677365037507284,
 'CC6740_0.feather': 0.3640912397262533,
 'CC6740_1.feather': 4.165267648767137e-05,
 'CC6740_10.feather': 0.005281239924077104,
 'CC6740_11.feather': 0.019646112936999867,
 'CC6740_12.feather': 0.03462278948621458,
 'CC6740_13.feather': 3.6212127225076176,
 'CC6740_14.feather': 0.3014053224303431,
 'CC6740_15.feather': 32.542652836979414,
 'CC6740_16.feather': 0.10422179200362967,
 'CC6740_17.feather': 1.2362607284774165,
 'CC6740_18.feather': 3.0036751107078246,
 'CC6740_2.feather': 0.0001208294775494167,
 'CC6740_3.feather': 0.0007644482149312817,
 'CC6740_4.feather': 1.4427598682518223,
 'CC6740_5.feather': 0.43942009507857765,
 'CC6740_6.feather': 0.5297979882838226,
 'CC6740_7.feather': 0.024

In [188]:
error_records

{'BK7610_0.feather': 0.7784345677019169,
 'BK7610_1.feather': 20.107696311072612,
 'BU4707_0.feather': 0.013598424215543085,
 'BU4707_1.feather': 2.564625092628471,
 'BU4707_2.feather': 9.354013224797345e-07,
 'BU4707_3.feather': 0.12453919152725415,
 'BU4707_4.feather': 0.23664450582715937,
 'CC6740_0.feather': 0.36403692624465045,
 'CC6740_1.feather': 3.680099868739128e-05,
 'CC6740_10.feather': 0.005114271537152934,
 'CC6740_11.feather': 0.01952403313367465,
 'CC6740_12.feather': 0.034809803624167246,
 'CC6740_13.feather': 3.5991827388525546,
 'CC6740_14.feather': 0.3013241846629774,
 'CC6740_15.feather': 32.54214569217832,
 'CC6740_16.feather': 0.10418906047206207,
 'CC6740_17.feather': 1.23589759733376,
 'CC6740_18.feather': 3.0099221704154613,
 'CC6740_2.feather': 0.0001164075099735769,
 'CC6740_3.feather': 0.0007485011416534549,
 'CC6740_4.feather': 1.4425868762219916,
 'CC6740_5.feather': 0.4391325872968607,
 'CC6740_6.feather': 0.5296724457758348,
 'CC6740_7.feather': 0.025016

In [193]:
name = objective.get_name(study.best_trial)
with open(os.path.join(models_path, f'{name}.pickle'), 'wb') as fout:
    pickle.dump(model, fout)