# 学習ジョブを使わず、ノートブック上で動作を確認する

# LightGBMを、SageMakerカスタムコンテナで実行する

https://dev.classmethod.jp/articles/sagemaker-container-image-lightgbm/


https://github.com/aws/amazon-sagemaker-examples/blob/main/advanced_functionality/scikit_bring_your_own/scikit_bring_your_own.ipynb


* カスタムコンテナ作成
* SageMaker学習ジョブ - ローカルモード
* SageMaker学習ジョブ
* エンドポイントデプロイ
* 推論実施

* データセットはiris(動作確認が目的のため)

# データロード、分割

In [4]:
import boto3
import re
import os
from os import path
import numpy as np
import pandas as pd
from sagemaker import get_execution_role
import sagemaker as sage
from sagemaker.predictor import csv_serializer
import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
import json

In [5]:
# Load the iris dataset.
iris = datasets.load_iris()

# Split into training and validation sets (20% validation, stratified by class).
train_x, validation_x, train_y, validation_y = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    stratify=iris.target,
)

In [9]:
# Show the container type of each split, in the order train_test_split returned them.
for _split in (train_x, validation_x, train_y, validation_y):
    print(type(_split))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


# LGBM用に変換

In [10]:
# Build the LightGBM training dataset.
# NOTE: the name `train` is also bound by the `def train()` cell further down
# in this notebook; whichever cell ran last decides what `train` refers to.
train = lgb.Dataset(data=train_x, label=train_y)

# Derive the validation dataset from the training dataset so the two are associated.
validation = train.create_valid(data=validation_x, label=validation_y)

S3へのデータ配置は省略し、上で作成したデータセットをそのまま使う。

In [15]:
# Hyperparameters for multiclass classification.
# (`num_round = 10` is left disabled in this cell.)
params = {
    'objective': 'multiclass',
    'num_class': len(iris.target_names),
}

In [16]:
# Train on the in-memory training dataset and evaluate on the validation dataset.
model = lgb.train(
    params=params,
    train_set=train,
    valid_sets=[validation],
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 92
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 4
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[1]	valid_0's multi_logloss: 0.960385
[2]	valid_0's multi_logloss: 0.857109
[3]	valid_0's multi_logloss: 0.778504
[4]	valid_0's multi_logloss: 0.717401
[5]	valid_0's multi_logloss: 0.663228
[6]	valid_0's multi_logloss: 0.615628
[7]	valid_0's multi_logloss: 0.576664
[8]	valid_0's multi_logloss: 0.544561
[9]	valid_0's multi_logloss: 0.518526
[10]	valid_0's multi_logloss: 0.493947
[11]	valid_0's multi_logloss: 0.475208
[12]	valid_0's multi_logloss: 0.460197
[13]	valid_0's multi_logloss: 0.448248
[14]	valid_0's multi_logloss: 0.43888
[15]	valid_0's multi_logloss: 0.432943
[16]	valid_0's multi_logloss: 0.428266
[17]	valid_0's multi_logloss: 0.425343
[18]	

# 別サンプルで確認
https://blog.amedama.jp/entry/2018/05/01/081842

In [17]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import lightgbm as lgb

from sklearn import datasets
from sklearn.model_selection import train_test_split

import numpy as np

"""LightGBM を使った多値分類のサンプルコード"""


def main():
    """Train a LightGBM multiclass classifier on iris and print its test accuracy."""
    # Load the Iris dataset.
    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    # Split into training and test data.
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Build the LightGBM datasets; the evaluation set references the training set.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # LightGBM hyperparameters.
    lgbm_params = {
        # Multiclass classification problem.
        'objective': 'multiclass',
        # Take the class count from the dataset instead of hard-coding it.
        'num_class': len(iris.target_names),
    }

    # Train the model with the parameters above.
    model = lgb.train(lgbm_params, lgb_train, valid_sets=[lgb_eval])

    # Predict class probabilities for the test data.
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    # Pick the most likely class for each row.
    y_pred_max = np.argmax(y_pred, axis=1)

    # Accuracy is the fraction of rows whose predicted class matches the label.
    accuracy = np.mean(y_test == y_pred_max)
    print(accuracy)


if __name__ == '__main__':
    main()

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85
[LightGBM] [Info] Number of data points in the train set: 112, number of used features: 4
[LightGBM] [Info] Start training from score -1.221991
[LightGBM] [Info] Start training from score -1.029619
[LightGBM] [Info] Start training from score -1.054937
[1]	valid_0's multi_logloss: 0.956585
[2]	valid_0's multi_logloss: 0.827216
[3]	valid_0's multi_logloss: 0.72509
[4]	valid_0's multi_logloss: 0.636561
[5]	valid_0's multi_logloss: 0.566888
[6]	valid_0's multi_logloss: 0.508384
[7]	valid_0's multi_logloss: 0.454896
[8]	valid_0's multi_logloss: 0.413161
[9]	valid_0's multi_logloss: 0.378837
[10]	valid_0's multi_logloss: 0.34437
[11]	valid_0's multi_logloss: 0.319728
[12]	valid_0's multi_logloss: 0.295413
[13]	valid_0's multi_logloss: 0.273789
[14]	valid_0's multi_logloss: 0.255989
[15]	valid_0's multi_logloss: 0.23932
[16]	valid_0's multi_logloss: 0.227785
[17]	valid_0's multi_logloss: 0.216424
[18]	va

# train.pyの動作確認

https://dev.classmethod.jp/articles/sagemaker-container-image-lightgbm/

In [25]:
# Debug check of what the name `train` is currently bound to
# (the recorded output below shows a function, not a Dataset).
print(train)

<function train at 0x7fa9f2a0fee0>


In [28]:
# Build the LightGBM training dataset.
train_data = lgb.Dataset(data=train_x, label=train_y)

# Derive the validation dataset from the training dataset so the two are associated.
valid_data = train_data.create_valid(data=validation_x, label=validation_y)

In [54]:
#!/usr/bin/env python3

import os
import json
import sys
import traceback
import lightgbm as lgb


# Root path under which SageMaker mounts data inside the container.
# Switch to '/opt/ml/' in the container; './data/' is for checking on a notebook instance.
#prefix = '/opt/ml/'
prefix = './data/'

# All paths are built with os.path.join so they stay valid even if `prefix`
# is given without a trailing slash.
input_path = os.path.join(prefix, 'input/data')
output_path = os.path.join(prefix, 'output')
model_path = os.path.join(prefix, 'model')
param_path = os.path.join(prefix, 'input/config/hyperparameters.json')
inputdataconfig_path = os.path.join(prefix, 'input/config/inputdataconfig.json')


# Accepted data channels (File mode only).
valid_channel_names = ['train', 'validation']


def train():
    """Train a LightGBM model from SageMaker-style input files and save it.

    Reads the hyperparameters from ``param_path`` and the channel
    configuration from ``inputdataconfig_path``, loads
    ``<channel>/<channel>.bin`` under ``input_path`` as an ``lgb.Dataset``
    for every configured channel, trains with ``lgb.train`` and writes the
    model to ``lightgbm_model.txt`` under ``model_path``.

    The ``train`` channel is required; the ``validation`` channel is
    optional and is used as the validation set when present.

    On any exception the message and traceback are printed to stderr and
    written to the ``failure`` file under ``output_path``, then the process
    exits with status 255.
    """
    print('Starting the training.')
    try:
        # Load the hyperparameters.
        with open(param_path, 'r') as f:
            hyperparams = json.load(f)
        print(hyperparams)  # debug output

        # Load the input data channel configuration.
        with open(inputdataconfig_path, 'r') as f:
            inputdataconfig = json.load(f)
        print(inputdataconfig)  # debug output

        # Load one Dataset per configured channel.
        inputdata_dic = {}
        for channel_name in inputdataconfig:
            print(f'channel_name: {channel_name}')  # debug output
            # Explicit raise instead of assert so the check survives `python -O`.
            if channel_name not in valid_channel_names:
                raise ValueError('input data channel must be included in '+str(valid_channel_names))
            inputdata_path = os.path.join(input_path, channel_name, channel_name+'.bin')
            inputdata_dic[channel_name] = lgb.Dataset(inputdata_path)
        if 'train' not in inputdata_dic:
            raise ValueError("input data channel 'train' is required")
        print(f'inputdata_dic: {inputdata_dic}')  # debug output
        print(inputdata_dic['train'])  # debug output
        if 'validation' in inputdata_dic:
            print(inputdata_dic['validation'])  # debug output
        print('='*10)

        # Train with LightGBM; the validation channel is only used when configured.
        valid_sets = [inputdata_dic['validation']] if 'validation' in inputdata_dic else None
        model = lgb.train(
            hyperparams,
            inputdata_dic['train'],
            valid_sets=valid_sets,
        )

        # Save the model, creating the directory first (it may not exist
        # under the local './data/' prefix).
        os.makedirs(model_path, exist_ok=True)
        model.save_model(os.path.join(model_path, 'lightgbm_model.txt'))
        print('Training complete.')

    except Exception as e:
        trc = traceback.format_exc()
        message = 'Exception during training: ' + str(e) + '\n' + trc
        # Print to stderr first so the error reaches the log even if writing
        # the failure file below fails.
        print(message, file=sys.stderr)
        # Write the error to the `failure` file to report why the run failed.
        os.makedirs(output_path, exist_ok=True)
        with open(os.path.join(output_path, 'failure'), 'w') as s:
            s.write(message)
        # A non-zero exit status signals that the run failed.
        sys.exit(255)
        

if __name__ == '__main__':
    train()

    # Returning 0 signals that the run succeeded.
    #sys.exit(0)   ### Commented out when running on a notebook instance (it raises an error there)

Starting the training.
{'num_round': 10, 'objective': 'multiclass', 'num_class': 3}
{'train': {'TrainingInputMode': 'File', 'S3DistributionType': 'FullyReplicated', 'RecordWrapperType': 'None'}, 'validation': {'TrainingInputMode': 'File', 'S3DistributionType': 'FullyReplicated', 'RecordWrapperType': 'None'}}
channel_name: train
channel_name: validation
inputdata_dic: {'train': <lightgbm.basic.Dataset object at 0x7fa9f28f6400>, 'validation': <lightgbm.basic.Dataset object at 0x7fa9f2993610>}
<lightgbm.basic.Dataset object at 0x7fa9f28f6400>
<lightgbm.basic.Dataset object at 0x7fa9f2993610>
[LightGBM] [Info] Load from binary file ./data/input/data/train/train.bin
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 87
[LightGBM] [Info] Number of data points in the train set: 120, number of used features: 4
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from scor

# END: 以上 ==========================