# Train with LightGBM in an Interactive Run

## Install requirements

In [1]:
!python --version

Python 3.7.13


In [2]:
!which python

/miniconda/bin/python


In [1]:
!conda --version

conda 4.12.0


In [11]:
#!conda list
!conda list | grep numpy
!conda list | grep scipy

numpy                     1.21.6                   pypi_0    pypi
scipy                     1.7.3                    pypi_0    pypi


In [8]:
%pip install -r requirements.txt

Collecting numpy
  Downloading numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 7.1 MB/s eta 0:00:01
[?25hCollecting scipy
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 157 kB/s  eta 0:00:01
[?25hCollecting pandas>=1.2.0
  Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 31.7 MB/s eta 0:00:01
[?25hCollecting adlfs>=2021.8.1
  Downloading adlfs-2022.2.0.tar.gz (39 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[K     |████████████████████████████████| 24.8 MB 34.4 MB/s eta 0:00:01
[?25hCollecting lightgbm>=3.0.0
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 

## Setup cloud tracking

[MLflow](https://github.com/mlflow/mlflow) is a great tool for local ML experiment tracking. However, using it alone is like using git without GitHub. Your Azure Machine Learning workspace can easily be used to set up a remote tracking URI for MLflow:

In [12]:
!pip list | grep azureml.core

azureml-core                            1.41.0.post3


# 加载workspace


In [1]:
# Load the Azure ML workspace, caching its configuration under ./aml_config
# so later runs can reconnect without repeating the identifiers below.

import os

from azureml.core import Workspace

# NOTE(review): the workspace identifiers are hardcoded in the notebook;
# consider reading them from environment variables before sharing it.
subscription_id = 'cc80fb14-49de-4506-997b-89f34562676e'
resource_group  = 'shoufei'
workspace_name  = 'xw-ml-ws'

config_dir = './aml_config'
# This is the location the cell probes after write_config(path=config_dir).
config_file = os.path.join(config_dir, '.azureml', 'config.json')

# exist_ok avoids a race/failure when the directory is already there.
os.makedirs(config_dir, exist_ok=True)

if os.path.exists(config_file):
    # A cached config exists: reuse it.
    ws = Workspace.from_config(path=config_dir)
    print(ws.name)
else:
    try:
        ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
        ws.write_config(path=config_dir)
        print('Library configuration succeeded')
    except Exception as exc:
        # Report the real cause and stop: every later cell needs `ws`, so
        # continuing would only surface as a confusing NameError.
        print('Workspace not found: {}'.format(exc))
        raise

xw-ml-ws


In [15]:
import mlflow

# Send MLflow tracking data to the URI exposed by the workspace object, then
# select the experiment by name. The call is left as the last expression so
# the notebook displays the returned Experiment.
tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("lightgbm-iris-local-example")

2022/05/24 09:12:08 INFO mlflow.tracking.fluent: Experiment with name 'lightgbm-iris-local-example' does not exist. Creating a new experiment.


<Experiment: artifact_location='', experiment_id='29e35629-fa97-4c0a-97cf-89631f845674', lifecycle_stage='active', name='lightgbm-iris-local-example', tags={}>

## Load data

You can read directly from public URIs into Pandas. For private Blob or ADLS data, you can use the built-in Azure data protocols and pass in `storage_options` for credentials.

In [16]:
data_uri = "https://azuremlexamples.blob.core.windows.net/datasets/iris.csv"

In [17]:
import pandas as pd

df = pd.read_csv(data_uri)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [58]:
print(df)

     sepal_length  sepal_width  petal_length  petal_width         species
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]


## Write functions

After some experimentation, you may refactor your code into a few functions for logical steps in the ML training process:

In [18]:
# imports
import time

import lightgbm as lgb

from sklearn.metrics import log_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# define functions
def preprocess_data(df):
    """Split a frame into train/test features and integer-encoded labels.

    The ``species`` column is the target; every other column is kept as a
    feature. Labels are encoded with a ``LabelEncoder`` and 20% of the rows
    are held out using ``random_state=42``.

    Returns:
        ``(X_train, X_test, y_train, y_test, enc)`` where ``enc`` is the
        fitted ``LabelEncoder``.
    """
    target_column = "species"
    features = df.drop(columns=[target_column])

    enc = LabelEncoder()
    labels = enc.fit_transform(df[target_column])

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test, enc


def train_model(params, num_boost_round, X_train, X_test, y_train, y_test):
    """Train a LightGBM booster and measure how long it took.

    The test split is passed to ``lgb.train`` as the single validation set,
    named ``"test"``.

    Returns:
        ``(model, seconds)``: the value returned by ``lgb.train`` and the
        elapsed wall-clock time, which includes building both ``lgb.Dataset``
        objects.
    """
    started = time.time()
    booster = lgb.train(
        params,
        lgb.Dataset(X_train, label=y_train),
        num_boost_round=num_boost_round,
        valid_sets=[lgb.Dataset(X_test, label=y_test)],
        valid_names=["test"],
    )
    elapsed = time.time() - started

    return booster, elapsed


def evaluate_model(model, X_test, y_test):
    """Compute log-loss and accuracy of a multiclass model on held-out data.

    Args:
        model: object whose ``predict(X_test)`` returns a 2-D array of class
            probabilities (one column per class).
        X_test: features to score.
        y_test: true labels encoded as column indices (0..n_classes-1), as
            produced by ``preprocess_data``.

    Returns:
        ``(loss, acc)``: the log-loss and the accuracy.
    """
    y_proba = model.predict(X_test)
    # The predicted class is the column with the highest probability.
    y_pred = y_proba.argmax(axis=1)

    # Pass the full label set explicitly: without it log_loss infers the
    # labels from y_test and raises when the split lacks one of the classes.
    class_labels = list(range(y_proba.shape[1]))
    loss = log_loss(y_test, y_proba, labels=class_labels)
    acc = accuracy_score(y_test, y_pred)

    return loss, acc

In [21]:
!pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.5.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
[K     |████████████████████████████████| 11.2 MB 1.1 MB/s eta 0:00:01
Collecting fonttools>=4.22.0
  Downloading fonttools-4.33.3-py3-none-any.whl (930 kB)
[K     |████████████████████████████████| 930 kB 48.8 MB/s eta 0:00:01
[?25hCollecting cycler>=0.10
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting pillow>=6.2.0
  Downloading Pillow-9.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 50.1 MB/s eta 0:00:01
[?25hCollecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.4.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 41.2 MB/s eta 0:00:01
Installing collected packages: pillow, kiwisolver, fonttools, cycler, matplotlib
Successfully installed cycler-0.11.0 fonttools-4.33.3 kiwisolver-1.4.2 matplotlib-3.5.2 pillow-9

## Run a trial

Now, you can easily run local trials, editing the parameters and seeing how the model performs.

In [32]:
!pip list | grep light


lightgbm                                3.3.2


In [31]:
# One end-to-end trial: split the data, train inside an MLflow run, then log
# timing and evaluation metrics to that run.
from sklearn.metrics import accuracy_score, log_loss

# preprocess data
X_train, X_test, y_train, y_test, enc = preprocess_data(df)

# set training parameters
params = {
    "objective": "multiclass",
    "num_class": 3,
    "learning_rate": 0.1,
    "metric": "multi_logloss",
    "colsample_bytree": 1.0,
    "subsample": 1.0,
    "seed": 42,
}

num_boost_round = 32

# start run (not closed in this cell; mlflow.end_run() is called later in the notebook)
run = mlflow.start_run()

# enable automatic logging
mlflow.lightgbm.autolog()

# train model
model, train_time = train_model(
    params, num_boost_round, X_train, X_test, y_train, y_test
)



print(model)
# training_time is the elapsed seconds returned by train_model
mlflow.log_metric("training_time", train_time)

# evaluate model
loss, acc = evaluate_model(model, X_test, y_test)
print(loss)
print(acc)
mlflow.log_metrics({"loss": loss, "accuracy": acc})

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 4
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.073920
[LightGBM] [Info] Start training from score -1.123930
<lightgbm.basic.Booster object at 0x7fa2a760d2d0>
0.05792422826443045
1.0


Note that the run is still active. You can continue experimenting with your model and log metrics or artifacts. For instance, you can log the notebook to the run. Save your notebook first to capture the outputs.

In [34]:
!pip list | grep joblib

joblib                                  1.1.0


# 通过joblib保存并加载模型 lgb2.pkl

In [81]:
import joblib
# save model
joblib.dump(model, 'lgb2.pkl')


['lgb2.pkl']

In [109]:
joblib.dump(model, 'model.pkl')

['model.pkl']

In [88]:
# load model
jlmodel = joblib.load('lgb2.pkl')

In [89]:
# Predict with the model reloaded from lgb2.pkl.
#
# Reference rows (row index: features -> species, expected class):
#   102: 7.1, 3.0, 5.9, 2.1 -> Iris-virginica,  class 2
#    73: 6.1, 2.8, 4.7, 1.2 -> Iris-versicolor, class 1
#    30: 4.8, 3.1, 1.6, 0.2 -> Iris-setosa,     class 0
# Other samples to try: [[6.1,2.8,4.7,1.2]], [[4.8,3.1,1.6,0.2]],
# [[5.1,3.5,1.4,0.2]], [[5.9,3.2,4.8,1.8]]
X_new = [[7.1,3.0,5.9,2.1]]

divider = '*************************'

print ('New sample: {}'.format(X_new))
print(divider)
# One row of class probabilities per input sample.
y_proba = jlmodel.predict(X_new)
print(y_proba)
print(divider)
# The predicted class is the index of the largest probability.
y_pred = y_proba.argmax(axis=1)
print('Predicted class is {}'.format(y_pred))
print(divider)

New sample: [[7.1, 3.0, 5.9, 2.1]]
*************************
[[0.00586549 0.01606059 0.97807392]]
*************************
Predicted class is [2]
*************************


# 测试score.py

## 测试init

In [104]:
import joblib
from azureml.core.model import Model

# Resolve the path of the model named 'lgb2.pkl' through the Azure ML Model
# API, then deserialize it with joblib.
model5_path = Model.get_model_path(model_name='lgb2.pkl')
model5 = joblib.load(filename=model5_path)

In [105]:
# Predict with model5 (loaded via Model.get_model_path).
#
# Reference rows (row index: features -> species, expected class):
#   102: 7.1, 3.0, 5.9, 2.1 -> Iris-virginica,  class 2
#    73: 6.1, 2.8, 4.7, 1.2 -> Iris-versicolor, class 1
#    30: 4.8, 3.1, 1.6, 0.2 -> Iris-setosa,     class 0
# Other samples to try: [[6.1,2.8,4.7,1.2]], [[4.8,3.1,1.6,0.2]],
# [[5.1,3.5,1.4,0.2]], [[5.9,3.2,4.8,1.8]]
X_new = [[7.1,3.0,5.9,2.1]]

divider = '*************************'

print ('New sample: {}'.format(X_new))
print(divider)
# One row of class probabilities per input sample.
y_proba = model5.predict(X_new)
print(y_proba)
print(divider)
# The predicted class is the index of the largest probability.
y_pred = y_proba.argmax(axis=1)
print('Predicted class is {}'.format(y_pred))
print(divider)

New sample: [[7.1, 3.0, 5.9, 2.1]]
*************************
[[0.00586549 0.01606059 0.97807392]]
*************************
Predicted class is [2]
*************************


In [106]:
print(X_new)

[[7.1, 3.0, 5.9, 2.1]]


## 测试run

In [107]:

import json
import pandas

# Build a nested JSON sample, round-trip it through dumps/loads, and flatten
# it into a one-row DataFrame in the order
# sepal length, sepal width, petal length, petal width.
# NOTE(review): presumably this mirrors the request body handled by the
# scoring script's run(); confirm against that script.
payload = {
    "sepal": {"length": 7.1, "width": 3.0},
    "petal": {"length": 5.9, "width": 2.1},
}
test_sample = json.dumps(payload)

print(test_sample)

input_json = json.loads(test_sample)
print(input_json)

sepal, petal = input_json['sepal'], input_json['petal']
feature_row = [sepal['length'], sepal['width'], petal['length'], petal['width']]
input_df = pandas.DataFrame([feature_row])

print(input_df)


{"sepal": {"length": 7.1, "width": 3.0}, "petal": {"length": 5.9, "width": 2.1}}
{'sepal': {'length': 7.1, 'width': 3.0}, 'petal': {'length': 5.9, 'width': 2.1}}
     0    1    2    3
0  7.1  3.0  5.9  2.1


In [108]:
# Score the one-row DataFrame built from the JSON sample.
divider = '*************************'

y_proba = model5.predict(input_df)
print(y_proba)
print(divider)
# The predicted class is the index of the largest probability.
y_pred = y_proba.argmax(axis=1)
print('Predicted class is {}'.format(y_pred))
print(divider)

[[0.00586549 0.01606059 0.97807392]]
*************************
Predicted class is [2]
*************************


# 通过save_model保存模型lgb3.pkl

In [37]:
model.save_model('lgb3.pkl')


<lightgbm.basic.Booster at 0x7fa2a760d2d0>

In [38]:
# 下面的效果跟上面的lgb3.pkl是一样的，模型md5都一样
model.save_model('lgb_classifier.txt', num_iteration=model.best_iteration) 

<lightgbm.basic.Booster at 0x7fa2a760d2d0>

# 输出模型

In [39]:
print(model.model_to_string())

tree
version=v3
num_class=3
num_tree_per_iteration=3
label_index=0
max_feature_idx=3
objective=multiclass num_class:3
feature_names=sepal_length sepal_width petal_length petal_width
feature_infos=[4.2999999999999998:7.7000000000000002] [2:4.4000000000000004] [1:6.7000000000000002] [0.10000000000000001:2.5]
tree_sizes=549 526 543 652 542 646 651 541 647 650 528 655 652 544 651 649 529 650 654 545 652 654 528 654 555 543 652 656 530 656 657 544 652 651 527 658 651 546 551 639 525 549 653 544 549 652 530 548 656 545 547 651 528 757 654 545 758 656 546 758 651 544 652 640 643 649 654 646 545 641 645 651 651 647 547 644 547 544 662 649 654 656 650 547 657 651 543 645 548 662 642 648 546 660 650 648

Tree=0
num_leaves=4
num_cat=0
split_feature=2 2 0
split_gain=77.0732 0.0696864 3.55271e-15
threshold=3.1500000000000004 1.4500000000000002 6.1500000000000012
decision_type=2 2 2
left_child=1 -1 -2
right_child=2 -3 -4
leaf_value=-0.89861228866810972 -1.1986122886681096 -0.91289800295382406 -1.198

# 模型评估

In [45]:
# evaluate model
loss, acc = evaluate_model(model, X_test, y_test)

print({"loss": loss, "accuracy": acc})

{'loss': 0.05792422826443045, 'accuracy': 1.0}


# 测试模型

In [69]:
# Predict with the in-memory trained model.
#
# Reference rows (row index: features -> species, expected class):
#   102: 7.1, 3.0, 5.9, 2.1 -> Iris-virginica,  class 2
#    73: 6.1, 2.8, 4.7, 1.2 -> Iris-versicolor, class 1
#    30: 4.8, 3.1, 1.6, 0.2 -> Iris-setosa,     class 0
# Other samples to try: [[6.1,2.8,4.7,1.2]], [[4.8,3.1,1.6,0.2]],
# [[5.1,3.5,1.4,0.2]], [[5.9,3.2,4.8,1.8]]
X_new = [[7.1,3.0,5.9,2.1]]

divider = '*************************'

print ('New sample: {}'.format(X_new))
print(divider)
# One row of class probabilities per input sample.
y_proba = model.predict(X_new)
print(y_proba)
print(divider)
# The predicted class is the index of the largest probability.
y_pred = y_proba.argmax(axis=1)
print('Predicted class is {}'.format(y_pred))
print(divider)

New sample: [[7.1, 3.0, 5.9, 2.1]]
*************************
[[0.00586549 0.01606059 0.97807392]]
*************************
Predicted class is [2]
*************************


In [55]:
print(X_train)

print('*************************')

print(y_train)

     sepal_length  sepal_width  petal_length  petal_width
22            4.6          3.6           1.0          0.2
15            5.7          4.4           1.5          0.4
65            6.7          3.1           4.4          1.4
11            4.8          3.4           1.6          0.2
42            4.4          3.2           1.3          0.2
..            ...          ...           ...          ...
71            6.1          2.8           4.0          1.3
106           4.9          2.5           4.5          1.7
14            5.8          4.0           1.2          0.2
92            5.8          2.6           4.0          1.2
102           7.1          3.0           5.9          2.1

[120 rows x 4 columns]
*************************
[0 0 1 0 0 2 1 0 0 0 2 1 1 0 0 1 2 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1
 2 0 1 2 0 2 2 1 1 2 1 0 1 2 0 0 1 1 0 2 0 0 1 1 2 1 2 2 1 0 0 2 2 0 0 0 1
 2 0 2 2 0 1 1 2 1 2 0 2 1 2 1 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1
 1 2 2 0 1 2 0 1 2]


In [51]:
print(X_test)

print('*************************')

print(y_test)

     sepal_length  sepal_width  petal_length  petal_width
73            6.1          2.8           4.7          1.2
18            5.7          3.8           1.7          0.3
118           7.7          2.6           6.9          2.3
78            6.0          2.9           4.5          1.5
76            6.8          2.8           4.8          1.4
31            5.4          3.4           1.5          0.4
64            5.6          2.9           3.6          1.3
141           6.9          3.1           5.1          2.3
68            6.2          2.2           4.5          1.5
82            5.8          2.7           3.9          1.2
110           6.5          3.2           5.1          2.0
12            4.8          3.0           1.4          0.1
36            5.5          3.5           1.3          0.2
9             4.9          3.1           1.5          0.1
19            5.1          3.8           1.5          0.3
56            6.3          3.3           4.7          1.6
104           

# 通过lgb加载模型并测试

In [76]:
model3 = lgb.Booster(model_file='lgb_classifier.txt')

print(model3)

<lightgbm.basic.Booster object at 0x7fa2a7574610>


In [77]:
# Test the booster reloaded from lgb_classifier.txt.
# Other samples to try: [[6.1,2.8,4.7,1.2]], [[4.8,3.1,1.6,0.2]],
# [[5.1,3.5,1.4,0.2]], [[5.9,3.2,4.8,1.8]]
X_new = [[7.1,3.0,5.9,2.1]]

divider = '*************************'

print ('New sample: {}'.format(X_new))
print(divider)
# One row of class probabilities per input sample.
y_proba = model3.predict(X_new)
print(y_proba)
print(divider)
# The predicted class is the index of the largest probability.
y_pred = y_proba.argmax(axis=1)
print('Predicted class is {}'.format(y_pred))
print(divider)

New sample: [[7.1, 3.0, 5.9, 2.1]]
*************************
[[0.00586549 0.01606059 0.97807392]]
*************************
Predicted class is [2]
*************************


In [29]:
mlflow.log_artifact("1.train-lightgbm-local.ipynb")

Finally, mark the run as completed:

In [2]:
# end run
mlflow.end_run()

NameError: name 'mlflow' is not defined

# 通过pickle保存并加载模型

In [90]:
# Save the trained model with pickle.
import pickle

# The context manager closes the file even if pickling raises.
with open('lgb4.pkl', 'wb') as f:
    pickle.dump(model, f)
print ("Exported the model to lgb4.pkl")

Exported the model to lgb4.pkl


In [91]:
# Load the pickled model back.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.

# The message names the file that is actually opened below.
print("Import the model from lgb4.pkl")
# The context manager makes sure the file handle is closed after loading.
with open('lgb4.pkl', 'rb') as f2:
    model4 = pickle.load(f2)

Import the model from model-arm64.pkl


In [95]:
# Test the model reloaded from lgb4.pkl.
# Other samples to try: [[6.1,2.8,4.7,1.2]], [[4.8,3.1,1.6,0.2]],
# [[5.1,3.5,1.4,0.2]], [[5.9,3.2,4.8,1.8]]
X_new = [[7.1,3.0,5.9,2.1]]

divider = '*************************'

print ('New sample: {}'.format(X_new))
print(divider)
# One row of class probabilities per input sample.
y_proba = model4.predict(X_new)
print(y_proba)
print(divider)
# The predicted class is the index of the largest probability.
y_pred = y_proba.argmax(axis=1)
print('Predicted class is {}'.format(y_pred))
print(divider)

New sample: [[7.1, 3.0, 5.9, 2.1]]
*************************
[[0.00586549 0.01606059 0.97807392]]
*************************
Predicted class is [2]
*************************


# 注册模型

In [3]:


# Load the workspace and write its config to the default location.


from azureml.core import Workspace

# NOTE(review): the workspace identifiers are hardcoded in the notebook;
# consider reading them from environment variables before sharing it.
subscription_id = 'cc80fb14-49de-4506-997b-89f34562676e'
resource_group  = 'shoufei'
workspace_name  = 'xw-ml-ws'

try:
    ws = Workspace(subscription_id = subscription_id, resource_group = resource_group, workspace_name = workspace_name)
    ws.write_config()
    print('Library configuration succeeded')
except Exception as exc:
    # Catch Exception rather than everything so Ctrl-C still interrupts, show
    # the real cause, and stop: the following cells cannot work without `ws`.
    print('Workspace not found: {}'.format(exc))
    raise

Library configuration succeeded


In [5]:


from azureml.core.model import Model

# Register the joblib-serialized model file in the workspace. The registered
# name stays 'lgb2.pkl' because Model.get_model_path('lgb2.pkl') is used
# elsewhere in this notebook. Tag and description spell "iris" correctly
# (they previously read "irsi").
model7 = Model.register(model_path = "lgb2.pkl",
                       model_name = "lgb2.pkl",
                       tags = {'area': "iris", 'type': "classification"},
                       description = "Sample iris detection model for IOT tutorial",
                       workspace = ws)

KeyboardInterrupt: 

# create images

In [5]:

from azureml.core.image import Image, ContainerImage

# Describe the container: Python runtime, scoring entry script and the conda
# environment file used to build it.
image_config = ContainerImage.image_configuration(runtime= "python",
                                 execution_script="iot_score.py",
                                 conda_file="myenv.yml",
                                 tags = {'area': "iot", 'type': "classification"},
                                 description = "iris classification")


# Build the image from the model registered in this notebook. The cell used
# to reference `model6`, which is never defined here; `model7` is the object
# returned by Model.register.
# NOTE(review): confirm model7 is the model intended for this image.
image = Image.create(name = "irisclassify2",
                     # this is the model object
                     models = [model7],
                     image_config = image_config,
                     workspace = ws)

  import sys
  


Creating image


In [6]:
image.wait_for_creation(show_output = True)


Running...........................
Succeeded
Image creation operation finished for image irisclassify2:2, operation "Succeeded"


In [7]:
# List the workspace images that carry an "area" tag, with their state and
# build-log location.
# NOTE: the printed build-log URIs include signed query strings (sig=...);
# clear this cell's output before sharing the notebook.
area_images = Image.list(workspace = ws,tags = ["area"])
for i in area_images:
    summary = '{}(v.{} [{}]) stored at {} with build log {}'.format(
        i.name,
        i.version,
        i.creation_state,
        i.image_location,
        i.image_build_log_uri,
    )
    print(summary)


  """Entry point for launching an IPython kernel.


irisclassify2(v.2 [Succeeded]) stored at xwcontainerregistry.azurecr.io/irisclassify2:2 with build log https://storageavaxw.blob.core.windows.net/azureml/ImageLogs/6e95f7a9-e2f5-4be6-b01a-726101a61074/build.log?sv=2019-07-07&sr=b&sig=TOMhFMvZy9h7oERaXdJQ6rNDiLOtzl957PAS7ik8UOE%3D&st=2022-05-25T01%3A44%3A50Z&se=2022-06-24T01%3A49%3A50Z&sp=rl
irisclassify2(v.1 [Failed]) stored at xwcontainerregistry.azurecr.io/irisclassify2:1 with build log https://storageavaxw.blob.core.windows.net/azureml/ImageLogs/19cfc54d-6f56-4181-a2d5-f76eb3b1bbf7/build.log?sv=2019-07-07&sr=b&sig=CF27Csx05XCzJG052qWkumaRb%2BGKdxkLO9ePct9Qe5g%3D&st=2022-05-25T01%3A44%3A50Z&se=2022-06-24T01%3A49%3A50Z&sp=rl
irisclassify(v.1 [Failed]) stored at xwcontainerregistry.azurecr.io/irisclassify:1 with build log https://storageavaxw.blob.core.windows.net/azureml/ImageLogs/962eb001-48ee-41dc-8e43-e318008876f5/build.log?sv=2019-07-07&sr=b&sig=sfX8qiGdYi2N0CbDtZVRuhIhy2T4F45Qe4onY3OsUKU%3D&st=2022-05-25T01%3A44%3A50Z&se=2022-06-