In [1]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
import warnings
import mlflow

warnings.filterwarnings("ignore")

In [2]:
# connect to tracking URI
# URI can either be a HTTP/HTTPS URI for a remote server, or a local path to log data to a directory
mlflow.set_tracking_uri('./myml') 

# set experiment name to organize runs
experiment_name = 'iris'
mlflow.set_experiment(experiment_name)

2022/02/25 08:16:05 INFO mlflow.tracking.fluent: Experiment with name 'iris' does not exist. Creating a new experiment.


<Experiment: artifact_location='./myml/1', experiment_id='1', lifecycle_stage='active', name='iris', tags={}>

In [3]:
# import some data to play with
data = load_iris()
x, y = data.data, data.target

# split data
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=123)

In [8]:
params = {
    'num_class': 3,
    'learning_rate': 0.1,
    'num_leaves': 3,
    'max_depth': 2,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'seed': 42,
}
train_params = {
    'num_boost_round': 200,
    'verbose_eval': 50,
    'early_stopping_rounds': 5,
}

In [9]:
def log_metrics(y_proba):
    y_pred = y_proba.argmax(axis=1)
    auc = roc_auc_score(y_val, y_proba, multi_class='ovo')
    acc = accuracy_score(y_val, y_pred)
    mlflow.log_metrics({"val_roc_auc": auc, "val_accuracy": acc})
    return auc, acc

In [10]:
# LGBM

mlflow.lightgbm.autolog()

# lgbm data format
dtrain = lgb.Dataset(x_train, label=y_train)
dval = lgb.Dataset(x_val, label=y_val)

with mlflow.start_run():
    # train model
    pp = {'objective': 'multiclass'}
    pp.update(params)
    model_lgb = lgb.train(
        params = pp,
        train_set = dtrain,
        valid_sets = [dtrain, dval],
        valid_names = ['train', 'val'],
        **train_params,
    )
    # evaluate model
    y_proba = model_lgb.predict(x_val)
    log_metrics(y_proba)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 4
[LightGBM] [Info] Start training from score -1.176574
[LightGBM] [Info] Start training from score -1.003302
[LightGBM] [Info] Start training from score -1.123930
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[38]	train's multi_logloss: 0.0633269	val's multi_logloss: 0.11451


In [11]:
# XGBM

mlflow.xgboost.autolog()

# xgboost data format
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)

with mlflow.start_run():
    pp = {'objective': 'multi:softprob'}
    pp.update(params)
    # train model
    model_xgb = xgb.train(
        params = pp,
        dtrain = dtrain,
        evals=[(dtrain, 'train'), (dval, 'val')],
        **train_params,
    )
    # evaluate model
    y_proba = model_xgb.predict(dval)
    log_metrics(y_proba)

Parameters: { "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-mlogloss:0.97647	val-mlogloss:0.98181
[50]	train-mlogloss:0.06091	val-mlogloss:0.09176
[64]	train-mlogloss:0.04813	val-mlogloss:0.09109
