# MLflow Trackingを利用したDriverless AI Experimentの管理と比較

#### MLflow Tracking
https://mlflow.org/docs/latest/tracking.html#mlflow-tracking

In [1]:
import json
import os
from pathlib import Path
#from random import random, randint

import driverlessai
import mlflow
import pandas as pd
from sklearn import metrics

In [2]:
# Load the Driverless AI user name and password from a JSON file kept outside the notebook.
# The encoding is pinned to UTF-8 (the JSON standard) so the read does not depend on the
# platform's default locale encoding.
with open(os.path.join('..', 'idpass.json'), encoding='utf-8') as f:
    idpass = json.load(f)

In [3]:
# Connect to the Driverless AI server using the credentials read from idpass.json
dai = driverlessai.Client(
    address='http://3.93.24.9',
    username=idpass['id'],
    password=idpass['pass110'],
)
dai

<class 'driverlessai._core.Client'> http://3.93.24.9

In [4]:
# List the datasets available on the connected Driverless AI server
dai.datasets.list()

    | Type    | Key                                  | Name
----+---------+--------------------------------------+------------------------------------------------
  0 | Dataset | 0888b202-7f20-11ec-9c75-0242ac110002 | UCI_Credit_Card_mli.csv
  1 | Dataset | 5e986a6c-7e70-11ec-a429-0242ac110002 | credit_test
  2 | Dataset | 5e984d48-7e70-11ec-a429-0242ac110002 | credic_train
  3 | Dataset | c5270fa4-79b6-11ec-8b1f-0242ac110002 | AirPassengers.csv
  4 | Dataset | 9d1cb68e-78bd-11ec-9b1a-0242ac110002 | amazon_reviews_JP_Books_Test_TopLowPredict.csv
  5 | Dataset | 6d54837a-7500-11ec-856a-0242ac110002 | TS_Simulation_202112_train.csv
  6 | Dataset | 6cfd729c-7500-11ec-856a-0242ac110002 | TS_Simulation_202112_test.csv
  7 | Dataset | 08323dee-710d-11ec-89d0-0242ac110002 | interaction_sample.csv
  8 | Dataset | 22b86e7a-63b3-11ec-b15b-0242ac110002 | BostonHousing.csv
  9 | Dataset | 9d4726e8-62b6-11ec-9725-0242ac110002 | UCI_Credit_Card3.csv
 10 | Dataset | 8883ff6c-6232-11ec-af63-0242ac1100

In [5]:
# Fetch one dataset by its key (BostonHousing.csv in the listing above)
ds = dai.datasets.get('22b86e7a-63b3-11ec-b15b-0242ac110002')   # dataset used in this notebook
ds

<class 'Dataset'> 22b86e7a-63b3-11ec-b15b-0242ac110002 BostonHousing.csv

In [6]:
# Print the column names and then the dataset shape, one per line
print(ds.columns, ds.shape, sep='\n')

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
(506, 14)


In [7]:
# Split the dataset into named train/test datasets (80% of the rows go to train);
# the fixed seed keeps the split reproducible
ds_split = ds.split_to_train_test(
    train_size=0.8,
    train_name='boston_train',
    test_name='boston_test',
    seed=12345,
)
ds_split

Complete


{'train_dataset': <class 'Dataset'> fd3e19f8-83e5-11ec-be01-0242ac110002 boston_train,
 'test_dataset': <class 'Dataset'> fd3e3a8c-83e5-11ec-be01-0242ac110002 boston_test}

In [8]:
# Print the shape of the train split and then of the test split, one per line
print(ds_split['train_dataset'].shape, ds_split['test_dataset'].shape, sep='\n')

(404, 14)
(102, 14)


## 実施ステップ
MLFlow Experimentの作成  
(1) MLFlow Runの開始  
(2) DAI Experimentの設定と実施  
(3) DAI Experimentの設定(Parameter)をMLFlowへロギング  
(4) DAI Experimentの予測結果(Metric)をMLFlowへロギング  
(5) DAI Experiment成果物のMLFlowへロギング  
(6) MLFlow Runの終了  

試したい実験の回数だけ、(1)~(6)を繰り返す

#### MLFlow Logging関数に関して  
https://mlflow.org/docs/latest/tracking.html#logging-functions


### MLFlow Experimentの作成

In [10]:
# Choose where MLflow stores its experiment data. If no tracking URI is set, an 'mlruns'
# directory is created in the current directory instead.
# Path.resolve().as_uri() produces an absolute, normalized, percent-encoded file URI
# (no '..' segment, spaces escaped, valid on Windows too), unlike plain string formatting.
mlflow_tracking_dir = Path('..', 'tmp', 'mlflow_tracking').resolve()    # use the 'mlflow_tracking' directory as the store
mlflow.set_tracking_uri(mlflow_tracking_dir.as_uri())
#print(mlflow.get_tracking_uri())    # show the tracking URI currently in use

In [11]:
# Name the MLflow experiment and make it the active one
# (the log output below shows it being created when it does not exist yet)
mlflow_experiment_name = 'dai-experiment-test'
mlflow.set_experiment(experiment_name=mlflow_experiment_name)

# Look the experiment up by name and print its ID
mlflow_experiment = mlflow.get_experiment_by_name(mlflow_experiment_name)
print(mlflow_experiment.experiment_id)
#print(mlflow_experiment.artifact_location)     # storage path of the MLflow experiment

2022/02/02 14:07:44 INFO mlflow.tracking.fluent: Experiment with name 'dai-experiment-test' does not exist. Creating a new experiment.


0


### (1) MLFlow Runの開始

In [12]:
# Start the MLflow run; it is closed by the mlflow.end_run() call at the end of the notebook
mlflow.start_run()

<ActiveRun: >

In [13]:
# Attach free-form tags to the MLflow run as notes about the model being built
for tag_key, tag_value in (('model', 'TestModel1'), ('memo', 'This is a test')):
    mlflow.set_tag(key=tag_key, value=tag_value)

### (2) DAI Experimentの設定と実施

In [14]:
ds.columns[-1]         # target column (the last column of the dataset)

'MEDV'

#### DAI Experimentの設定

In [15]:
# Driverless AI experiment settings, reused for the preview, the run and the MLflow parameter log
dai_settings = dict(
    # data and task
    train_dataset=ds_split['train_dataset'],
    test_dataset=ds_split['test_dataset'],
    task='regression',
    target_column='MEDV',
    # accuracy / time / interpretability levels and the scorer
    accuracy=2,
    time=4,
    interpretability=6,
    scorer='RMSE',
    # expert settings
    target_transformer='identity',    # Expert Settings > Experiment: do not transform the target variable
    enable_xgboost_gbm='off',         # Expert Settings > Models: do not use XGBoost
    enable_lightgbm='off',            # Expert Settings > Models: do not use LightGBM
)

In [16]:
# Preview what the experiment would do with these settings before launching it
dai.experiments.preview(**dai_settings)

ACCURACY [2/10]:
- Training data size: *404 rows, 14 cols*
- Feature evolution: *[Constant, GLM]*, *1/3 validation split*
- Final pipeline: *One of [Constant, GLM], single final model, validated with 3-fold CV*

TIME [4/10]:
- Feature evolution: *2 individuals*, up to *40 iterations*
- Early stopping: After *5* iterations of no improvement

INTERPRETABILITY [6/10]:
- Feature pre-pruning strategy: None
- Monotonicity constraints: disabled
- Feature engineering search space: [CVCatNumEncode, CVTargetEncode, ClusterTE, Frequent, Interactions, NumCatTE, NumToCatTE, OneHotEncoding, Original]

[Constant, GLM] models to train:
- Model and feature tuning: *2*
- Feature evolution: *82*
- Final pipeline: *7*

Estimated runtime: *minutes*
Auto-click Finish/Abort if not done in: *1 day*/*7 days*


#### dai.experiments.preview  
https://docs.h2o.ai/driverless-ai/pyclient/docs/html/client.html#driverlessai._experiments.Experiments.preview

Keyword Arguments（Expert Settings）のサーチ方法  
https://docs.h2o.ai/driverless-ai/pyclient/docs/html/examples/experiments/search-expert-settings.html#Search-Expert-Settings  

`dai.experiments.search_expert_settings(search_term)`  
search_term = 'experiment' , 'model', 'features', 'timeseries', 'nlp', 'image', 'recipes', 'system', or 'autodoc'

In [17]:
#dai.experiments.search_expert_settings('model')

#### Expert Settingsに関して
https://docs.h2o.ai/driverless-ai/latest-stable/docs/userguide/expert-settings.html

#### DAI Experimentの実行

In [18]:
# Run the experiment with the same settings that were previewed
ex = dai.experiments.create(**dai_settings)
ex

Experiment launched at: http://3.93.24.9/#/experiment?key=37081a26-83e6-11ec-be01-0242ac110002
Complete 100.00% - Status: Complete                                           


<class 'Experiment'> 37081a26-83e6-11ec-be01-0242ac110002 mopotunu

In [19]:
# Check whether the experiment has finished
ex.is_complete()

True

In [20]:
# Experiment summary
ex.summary()

Status: Complete
Experiment: mopotunu (37081a26-83e6-11ec-be01-0242ac110002)
  Version: 1.10.0, 2022-02-02 05:10
  Settings: 2/4/6, seed=537750190, GPUs disabled
  Train data: boston_train (404, 14)
  Validation data: N/A
  Test data: [Test] (102, 13)
  Target column: MEDV (regression)
System specs: Docker/Linux, 31 GB, 8 CPU cores, 0/0 GPU
  Max memory usage: 0.549 GB, 0 GB GPU
Recipe: AutoDL (7 iterations, 2 individuals)
  Validation scheme: random, 1 internal holdout
  Feature engineering: 32 features scored (21 selected)
Timing: MOJO latency 0.0358 millis (40.6kB), Python latency 65.9525 millis (28.8kB)
  Data preparation: 5.50 secs
  Shift/Leakage detection: 3.81 secs
  Model and feature tuning: 7.33 secs (3 models trained)
  Feature evolution: 16.60 secs (5 of 82 models trained)
  Final pipeline training: 22.56 secs (7 models trained)
  Python / MOJO scorer building: 31.93 secs / 11.28 secs
Validation score: RMSE = 9.461989 (constant preds of 22.58)
Validation score: RMSE = 19.83

### (3) DAI Experimentの設定(Parameter)をMLFlowへロギング

In [21]:
# DAI experiment settings (the values that are logged to MLflow in the next cell)
dai_settings

{'train_dataset': <class 'Dataset'> fd3e19f8-83e5-11ec-be01-0242ac110002 boston_train,
 'test_dataset': <class 'Dataset'> fd3e3a8c-83e5-11ec-be01-0242ac110002 boston_test,
 'task': 'regression',
 'target_column': 'MEDV',
 'accuracy': 2,
 'time': 4,
 'interpretability': 6,
 'scorer': 'RMSE',
 'target_transformer': 'identity',
 'enable_xgboost_gbm': 'off',
 'enable_lightgbm': 'off'}

In [22]:
# Log every experiment setting as an MLflow parameter.
# NOTE(review): the two Dataset objects are presumably stored as their string
# representation - confirm in the MLflow UI.
mlflow.log_params(dai_settings)

### (4) DAI Experimentの予測結果(Metric)をMLFlowへロギング

Python Client(1.10)では、Testデータに対する各種スコアの取得機能が実装されていないため、Testデータに対するスコアリングを実施し、その結果に対してMetrics（R2, RMSE）の計算を実施

In [23]:
# Score the test data, keeping the actual MEDV column next to the predictions
test_prediction = ex.predict(dataset=ds_split['test_dataset'], include_columns=['MEDV'])
# Download the scored file into the working directory, overwriting any earlier copy
download_pred_file = test_prediction.download(overwrite=True)
download_pred_file    # name of the downloaded file

Complete
Downloaded '37081a26-83e6-11ec-be01-0242ac110002_preds_8a2f1424.csv'


'37081a26-83e6-11ec-be01-0242ac110002_preds_8a2f1424.csv'

In [24]:
# Load the downloaded predictions and take a quick look at them
df = pd.read_csv(download_pred_file)
print(df.shape)
df.head()

(102, 4)


Unnamed: 0,MEDV,MEDV.predicted,MEDV.predicted.lower,MEDV.predicted.upper
0,21.7,24.31793,18.897108,34.200844
1,24.8,26.09036,20.669538,35.973274
2,20.6,15.781856,10.361034,25.66477
3,23.0,16.669207,11.248385,26.55212
4,19.1,12.200898,6.780076,22.083813


In [25]:
# R2 on the test data
r2_test = metrics.r2_score(y_true=df['MEDV'], y_pred=df['MEDV.predicted'])
print('R2 on test data: {}'.format(r2_test))

# RMSE on the test data, computed as the square root of the MSE.
# The `squared=False` shortcut was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly keeps this cell working across versions.
rmse_test = metrics.mean_squared_error(y_true=df['MEDV'], y_pred=df['MEDV.predicted']) ** 0.5
print('RMSE on test data: {}'.format(rmse_test))

R2 on test data: 0.11628904061183798
RMSE on test data: 7.528240298303414


In [26]:
# Log the test-set R2 and RMSE as metrics of the active MLflow run
mlflow.log_metrics({'R2':r2_test, 'RMSE':rmse_test})

In [27]:
# Delete the downloaded scoring file now that its metrics are logged
try:
    os.remove(download_pred_file)
except FileNotFoundError:
    # Best-effort cleanup: the file is already gone (e.g. the cell was re-run)
    pass

### (5) DAI Experiment成果物のMLFlowへロギング

In [28]:
# List the artifacts available for the experiment
ex.artifacts.list()

['autodoc',
 'logs',
 'mojo_pipeline',
 'python_pipeline',
 'summary',
 'test_predictions',
 'train_predictions']

In [29]:
# Download the AutoDoc
ex.artifacts.download(only='autodoc', dst_dir='', overwrite=True)    # saved as report.docx in the working directory

Downloaded 'report.docx'


{'autodoc': 'report.docx'}

In [30]:
# Attach the downloaded AutoDoc to the active MLflow run
mlflow.log_artifact('report.docx')

# mlflow.log_artifacts()  # use this to log the contents of a whole directory

In [31]:
# Delete the downloaded AutoDoc (report.docx) now that it is logged to MLflow
try:
    os.remove('report.docx')
except FileNotFoundError:
    # Best-effort cleanup: the file is already gone (e.g. the cell was re-run)
    pass

### (6) MLFlow Runの終了

In [32]:
# Delete the DAI experiment from the server
ex.delete()

Driverless AI Server reported experiment 37081a26-83e6-11ec-be01-0242ac110002 deleted.


In [33]:
# Delete the train/test split datasets from the DAI server (train first, then test)
for split_key in ('train_dataset', 'test_dataset'):
    ds_split[split_key].delete()

Driverless AI Server reported dataset fd3e19f8-83e5-11ec-be01-0242ac110002 deleted.
Driverless AI Server reported dataset fd3e3a8c-83e5-11ec-be01-0242ac110002 deleted.


In [34]:
# End the MLflow run that was opened with mlflow.start_run()
mlflow.end_run()

## mlflow uiの実行  
`$ mlflow ui --backend-store-uri 'file:///Path_to/tmp/mlflow_tracking'`  

ブラウザからlocalhost:5000へアクセス

<img src="img/mlflow1.png" width=800px>

<img src="img/mlflow2.png" width=800px>