# Loading packages

In [1]:
import optuna
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Constants

In [2]:
# Seed reused by the KFold split, the model params and the optuna sampler
random_state = 54321

In [3]:
# Logging backend identifier; same value that is passed to trainer.init_run
logging_server = 'mlflow'

# Sample data

In [4]:
# Load the dataset once: return_X_y=True yields the (data, target) pair directly
X, y = load_diabetes(return_X_y=True)

# Validation scheme

In [5]:
# Shuffled 5-fold splitter seeded with random_state; the folds are materialised
# into a list so the same splits can be handed to the trainer
kf = KFold(n_splits=5, shuffle=True, random_state=random_state)
cv_list = [fold for fold in kf.split(X)]

# Example of model training and logging

In [6]:
# Function which returns set of parameters
def params_func(trial):
    """Build the parameter set for a single optuna trial.

    Args:
        trial: object exposing ``suggest_categorical``, ``suggest_float`` and
            ``suggest_int`` (an optuna trial).

    Returns:
        dict with two keys: ``'model_params'`` (fixed settings plus the values
        suggested by ``trial``) and ``'lgb_data_set_params'`` (an empty dict).
    """
    # Suggestions are drawn in the same order as the keys appear in the result.
    objective = trial.suggest_categorical('objective', ['huber', 'fair', 'l2', 'l1', 'mape'])
    boosting = trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss'])
    # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq is
    # non-zero -- confirm whether the trainer sets it elsewhere.
    bagging_fraction = trial.suggest_float('bagging_fraction', 0.01, 1)
    feature_fraction = trial.suggest_float('feature_fraction', 0.01, 1)
    min_child_samples = trial.suggest_int('min_child_samples', 2, 256)
    num_leaves = trial.suggest_int('num_leaves', 2, 256)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 1.5)

    model_params = {
        'objective': objective,
        'boosting': boosting,
        # Fixed (non-tuned) settings
        'n_jobs': -1,
        'n_estimators': 50,
        'random_state': random_state,
        # Tuned settings
        'bagging_fraction': bagging_fraction,
        'feature_fraction': feature_fraction,
        'min_child_samples': min_child_samples,
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
    }
    return {'model_params': model_params, 'lgb_data_set_params': {}}

In [7]:
from mlexp.trainers import LgbTrainer

In [8]:
# Init trainer object
trainer = LgbTrainer(
    validation_metric=mean_squared_error,        # MSE is used as the validation metric
    direction='minimize',                        # MSE should be minimised during hyperparameter optimization
    saved_files_path=r'temp_files/',             # files are saved to temp_files/ before being logged to the server
    use_average_n_estimators_on_test_fold=True,  # test-fold model uses the mean n_estimators of the validation folds
    optimization_metric='metric_mean_cv',        # optimize the mean metric over the validation folds
)

In [9]:
# Init mlflow run
run_id = trainer.init_run(
    # Init run on the logging backend named by the `logging_server` constant ('mlflow')
    logging_server=logging_server,
    # Run will be started in experiment 'example_exp'
    experiment_name='example_exp',
    # URI of mlflow server (it will be printed in console after starting mlflow server)
    tracking_uri='http://127.0.0.1:5000/',
    # Let's set run_name to 'Example. LGBM' and add tag Data = sklearn.datasets.load_diabetes
    start_run_params={'run_name': 'Example. LGBM', 'tags': {'Data': 'sklearn.datasets.load_diabetes'}},
    # Let's also log example.ipynb to mlflow server
    upload_files=['example.ipynb'])

2022/10/03 23:49:41 INFO mlflow.tracking.fluent: Experiment with name 'example_exp' does not exist. Creating a new experiment.


In [11]:
%%capture
# Run hyperparameters search and logging
sampler = optuna.samplers.TPESampler(seed=random_state)
trainer.train(X=X,
              y=y,
              cv=cv_list,
              n_trials=20,
              params_func=params_func,
              sampler=sampler)

[32m[I 2022-10-03 23:49:55,685][0m A new study created in memory with name: optuna_study[0m
[32m[I 2022-10-03 23:49:55,949][0m Trial 0 finished with value: 5437.023094610712 and parameters: {'objective': 'huber', 'boosting': 'goss', 'bagging_fraction': 0.4940601862857099, 'feature_fraction': 0.06002537432948098, 'min_child_samples': 139, 'num_leaves': 12, 'learning_rate': 1.2140607335242475}. Best is trial 0 with value: 5437.023094610712.[0m
[32m[I 2022-10-03 23:49:56,166][0m Trial 1 finished with value: 28457.59344352733 and parameters: {'objective': 'huber', 'boosting': 'goss', 'bagging_fraction': 0.0831454234041396, 'feature_fraction': 0.2157826542576255, 'min_child_samples': 207, 'num_leaves': 235, 'learning_rate': 0.2318429220789418}. Best is trial 0 with value: 5437.023094610712.[0m
[32m[I 2022-10-03 23:49:56,519][0m Trial 2 finished with value: 5173.716578485264 and parameters: {'objective': 'huber', 'boosting': 'goss', 'bagging_fraction': 0.18719551926876488, 'featur

# Example of inference

In [12]:
from mlexp.inference import LgbInference

In [13]:
# Selectors passed to inference.get_params_model
step, fold_num, trained_model = 'best', 'test', True

In [14]:
# Initialize inference object against the run created by trainer.init_run
inference_server_params = {'tracking_uri': 'http://127.0.0.1:5000/', 'run_id': run_id}
inference = LgbInference(
    downloaded_files_path=r'temp_files/',
    inference_server_params=inference_server_params,
    server='mlflow',
)
# Fetch the logged parameters/model for the selected step and fold, then show them
params_model = inference.get_params_model(step=step, fold_num=fold_num, trained_model=trained_model)
print(params_model)

{'direction': 'minimize', 'model_type': 'lgb', 'validation_metric': <function mean_squared_error at 0x00000237F8D1AC10>, 'use_average_n_estimators_on_test_fold': 'True', 'step': 18, 'metric': {'metric_mean_cv': 3393.5389853697197}, 'params': {'model_params': {'objective': 'l2', 'boosting': 'dart', 'n_jobs': -1, 'n_estimators': 50, 'random_state': 54321, 'bagging_fraction': 0.4030800550542788, 'feature_fraction': 0.4021452640008721, 'min_child_samples': 81, 'num_leaves': 212, 'learning_rate': 0.7861663254820528}, 'lgb_data_set_params': {}, 'validation_mean_estimators': 50}, 'optuna_study': <optuna.study.study.Study object at 0x00000237816E9CA0>, 'trained_model': <lightgbm.basic.Booster object at 0x00000237827BC4F0>}
