In [1]:
import mlflow
from mlflow.tracking import MlflowClient

In [2]:
import torch
import numpy as np

import pprint

## 创建mlruns文件夹和mlruns.db

mlruns文件夹用于存储artifacts，mlruns.db则存储tracking和registry相关的内容。

In [3]:
# Root directory from which the cells below build the paths of the mlruns
# folder, the SQLite file and test_dir.
# NOTE(review): the recorded outputs show file:///data/defeng/... while this
# constant is /data16/defeng — confirm both refer to the same location.
ROOT_PATH = '/data16/defeng'

In [4]:
# step1: start from a clean artifact folder and pre-create the SQLite file
import sqlite3
import os

# The SQLite file is placed inside the mlruns folder because the inspection
# cells of this notebook open ROOT_PATH/mlruns/mlruns.db; creating it directly
# under ROOT_PATH would leave a stray, never-used file.
mlruns_dir = os.path.join(ROOT_PATH, 'mlruns')
db_path = os.path.join(mlruns_dir, 'mlruns.db')

if os.path.exists(mlruns_dir):
    import shutil
    # Best-effort removal of the previous folder and everything inside it.
    shutil.rmtree(mlruns_dir, ignore_errors=True)
    print("deleting... Done!")

os.mkdir(mlruns_dir)

# sqlite3.connect() creates the file when it does not exist yet; nothing is
# written here, so the connection is closed right away.
conn = sqlite3.connect(db_path)
conn.close()

deleting... Done!


## 启动mlflow server服务

In [16]:
# step2
# bash kill_proc_by_name.sh mlflow   (first kill the previously started service)
# bash Mlflow.sh

**然后执行bash Mlflow.sh**

通过下面的命令，可以查看mlflow server使用之后mlruns.db中含有的表。<br>
这里为了方便，toggle了下面这个cell的输出。

In [6]:
# List the names of the tables stored in the SQLite file.
import pprint

conn = sqlite3.connect(os.path.join(ROOT_PATH, 'mlruns/mlruns.db'))
try:
    cur = conn.cursor()
    cur.execute("select name from sqlite_master where type='table' order by name")
    pprint.pprint(cur.fetchall())
finally:
    # Close the connection even when the query raises.
    conn.close()

[('alembic_version',),
 ('experiment_tags',),
 ('experiments',),
 ('latest_metrics',),
 ('metrics',),
 ('model_version_tags',),
 ('model_versions',),
 ('params',),
 ('registered_model_tags',),
 ('registered_models',),
 ('runs',),
 ('tags',)]


In [24]:
# Print every row of the experiments table.
# In short: with the current (client-based) code, meta.yaml can no longer be
# found in the exp and run folders of mlruns under notebooks/~, because the
# content of meta.yaml has been moved into the db.
conn = sqlite3.connect(os.path.join(ROOT_PATH, 'mlruns/mlruns.db'))
try:
    cur = conn.cursor()
    cur.execute("select * from experiments")
    #cur.execute("select * from runs")
    pprint.pprint(cur.fetchall())
finally:
    # Close the connection even when the query raises.
    conn.close()

[(0, 'Default', 'file:///data/defeng/mlruns/0', 'active'),
 (1, 'test3', 'file:///data/defeng/mlruns/1', 'active')]


## mlflow.client

以下所有API的用法来源于：https://mlflow.org/docs/latest/python_api/mlflow.tracking.html

### 创建client

In [17]:
# step3 (from here on, just create the experiment and the runs as usual)
# ****reference: https://mlflow.org/docs/latest/python_api/mlflow.tracking.html#mlflow.tracking.MlflowClient
# Create an experiment with a name that is unique and case sensitive.
client = MlflowClient(tracking_uri='http://localhost:10500')
# the registry will automatically set to the same as tracking_uri
# old: tracking_uri='file:///data/defeng/mlruns/',registry_uri='sqlite:////data/defeng/mlruns/registry.db'
# tracking_uri='sqlite:////data/defeng/mlruns/mlruns.db',registry_uri='sqlite:////data/defeng/mlruns/mlruns.db'

只需要设置tracking_uri即可，然后registry_uri会**自动设置为和他相同**。<br>
**之所以设置这里的uri为localhost，是为了让mlflow的client API关联到已经启动的mlflow server服务**<br>
这样，实际使用的tracking and registry uri就会自动改为**同一个值**，即和mlflow server对应的backend-store-uri！<br>
而artifact uri也会改为mlflow server对应的**default-artifact-root**。<br>
**所以在python代码这里，如果需要设置uri，一般都是localhost，毕竟你可视化的主体是mlflow server** <br><br>

解决方法来源：https://github.com/mlflow/mlflow/issues/1815#issuecomment-531640930

In [22]:
# run_id of the first entry returned by list_run_infos for experiment 0.
client.list_run_infos(0)[0].run_id

'938355ac84084d33898f1db597f11630'

### 创建experiment

In [18]:
# Experiment names are unique, so calling create_experiment a second time
# with the same name fails. Look the experiment up first so that this cell
# can be re-run; get_experiment_by_name returns None when it does not exist.
experiment_name = "test2"
experiment = client.get_experiment_by_name(experiment_name)

if experiment is None:
    experiment_id = client.create_experiment(name=experiment_name)
    # no need to set: ,artifact_location='file:///data/defeng/mlruns/'
    #client.set_experiment_tag(experiment_id, "preparation", "incremental_learning")

    # Fetch experiment metadata information
    experiment = client.get_experiment(experiment_id)
else:
    experiment_id = experiment.experiment_id

print("Name: {}".format(experiment.name))
print("Experiment_id: {}".format(experiment.experiment_id))
print("Artifact Location: {}".format(experiment.artifact_location)) # actual path: experiment.artifact_location/artifacts
# print("Tags: {}".format(experiment.tags))
# print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))

Name: test2
Experiment_id: 2
Artifact Location: file:///data/defeng/mlruns/2


In [7]:
# Display the artifact location stored on the current `experiment` object.
experiment.artifact_location

'file:///data/defeng/mlruns/1'

### 创建run

In [8]:
# ****reference: https://mlflow.org/docs/latest/python_api/mlflow.tracking.html#mlflow.tracking.MlflowClient.create_run
# start_time is deliberately not passed to create_run: passing it produced a
# wrong time, while omitting it gives the correct one.
run = client.create_run(experiment_id)

# (format string, value) pairs, printed in this order.
run_summary = [
    ("Run tags: {}", run.data.tags),
    ("Experiment id: {}", run.info.experiment_id),
    ("Run id: {}", run.info.run_id),
    ("lifecycle_stage: {}", run.info.lifecycle_stage),
    ("status: {}", run.info.status),
]
for template, value in run_summary:
    print(template.format(value))

Run tags: {}
Experiment id: 1
Run id: 1c1c953e75dc4081adda72120f0a9733
lifecycle_stage: active
status: RUNNING


In [10]:
# Pretty-print the RunInfo attached to the current `run` object.
pprint.pprint(run.info)

<RunInfo: artifact_uri='file:///data/defeng/mlruns/1/1c1c953e75dc4081adda72120f0a9733/artifacts', end_time=None, experiment_id='1', lifecycle_stage='active', run_id='1c1c953e75dc4081adda72120f0a9733', run_uuid='1c1c953e75dc4081adda72120f0a9733', start_time=1626946369631, status='RUNNING', user_id='unknown'>


In [38]:
# Print the run status, mark the run as terminated, then print the status again.
print(run.info.status)
client.set_terminated(run.info.run_id)
run = client.get_run(run.info.run_id)
print(run.info.status)
# the run has changed, so it has to be fetched again with get_run

RUNNING
FINISHED


### 模拟训练代码执行

下面的代码模拟了训练代码执行过程一般需要用到的数据记录函数log_XX

In [11]:
# Remove a previous test_dir under ROOT_PATH (best effort) and create it anew.
test_dir = os.path.join(ROOT_PATH, 'test_dir')

if os.path.exists(test_dir):
    import shutil
    shutil.rmtree(test_dir, ignore_errors=True)
    print("deleting... Done!")

os.mkdir(test_dir)

deleting... Done!


In [12]:
# ****reference: https://mlflow.org/docs/latest/python_api/mlflow.tracking.html#mlflow.tracking.MlflowClient
# Log one parameter, one file and one directory against the current run.
current_run_id = run.info.run_id
script_path = os.path.join(ROOT_PATH, 'jupyter.sh')
artifacts_dir = os.path.join(ROOT_PATH, 'test_dir')

client.log_param(current_run_id, 'name', 'defeng')
client.log_artifact(current_run_id, script_path)
client.log_artifacts(current_run_id, artifacts_dir)

In [13]:

# NOTE: the triple-quoted string below is a bare expression used as a comment;
# it is not executed, and the notebook echoes it as this cell's output.
'''
    if want to log mulitple items at a time:
        # []中的内容对应log_metric, log_param等函数的参数。
        metrics = [Metric('m', 1.5, timestamp, 1)]
        params = [Param("p", 'p')]
        tags = [RunTag("t", "t")]

        # Log entities, terminate the run, and fetch run status
        client.log_batch(run.info.run_id, metrics=metrics, params=params, tags=tags)
'''

# For log_dict, log_figure, log_image, log_text and similar functions, see the reference.
# log_metric is shown in the next cell.

'\n    if want to log mulitple items at a time:\n        # []中的内容对应log_metric, log_param等函数的参数。\n        metrics = [Metric(\'m\', 1.5, timestamp, 1)]\n        params = [Param("p", \'p\')]\n        tags = [RunTag("t", "t")]\n\n        # Log entities, terminate the run, and fetch run status\n        client.log_batch(run.info.run_id, metrics=metrics, params=params, tags=tags)\n'

In [13]:
# Log 100 random values as successive steps of the metric 'data'.
data = np.random.random(100)

metric_key = 'data'
target_run_id = run.info.run_id
for step, value in enumerate(data):
    client.log_metric(target_run_id, metric_key, value, step=step)

In [14]:
# First five entries of the history recorded for the metric 'data'.
client.get_metric_history(run.info.run_id,key='data')[:5]

[<Metric: key='data', step=0, timestamp=1626946418758, value=0.4258597857536043>,
 <Metric: key='data', step=1, timestamp=1626946418795, value=0.2926135076095694>,
 <Metric: key='data', step=2, timestamp=1626946419132, value=0.9019204595704268>,
 <Metric: key='data', step=3, timestamp=1626946419157, value=0.5791149667847949>,
 <Metric: key='data', step=4, timestamp=1626946419181, value=0.32800999015404064>]

### 创建registered_model

In [11]:
# Create a registered model named 'defeng'.
# NOTE(review): presumably fails when re-run because the name already exists — confirm.
client.create_registered_model(name='defeng')

<RegisteredModel: creation_timestamp=1626753578987, description=None, last_updated_timestamp=1626753578987, latest_versions=[], name='defeng', tags={}>

然后即可在mlflow server的UI中看到执行结果（本例中server监听10500端口，即 http://localhost:10500 ；若做了端口转发，则为对应的本地地址，如127.0.0.1:5000）

### 创建model_version

基于各个创建的registered_model来创建model_version

In [12]:
# Create a version of the registered model 'defeng' linked to the current run.
# NOTE(review): source='~/temp' is passed as a literal string and the recorded
# output shows it stored unexpanded — confirm this is the intended location.
client.create_model_version(name='defeng',source='~/temp',run_id=run.info.run_id)

2021/07/20 12:00:12 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: defeng, version 1


<ModelVersion: creation_timestamp=1626753612371, current_stage='None', description=None, last_updated_timestamp=1626753612371, name='defeng', run_id='83c14eabe1514b41b7140383124aa4b3', run_link=None, source='~/temp', status='READY', status_message=None, tags={}, user_id=None, version=1>

然后即可在mlflow server的UI中看到执行结果（本例中server监听10500端口，即 http://localhost:10500 ；若做了端口转发，则为对应的本地地址，如127.0.0.1:5000）

### 更多有关client的API

见OneNote笔记：《client API》  <br> <br>
或者链接：https://mlflow.org/docs/latest/python_api/mlflow.tracking.html

=============================================================================================

## deprecated code

经过测试✔，不需要执行下面设置tracking_uri cell(cell 内容✔)的内容，也能实现client的tracking，registry和artifacts存到期望的地方。<br>


In [5]:
# Set the tracking URI used by the module-level mlflow functions.
mlflow.tracking.set_tracking_uri('http://localhost:10500') # 'sqlite:////data/defeng/mlruns/registry.db'
#mlflow.tracking.set_registry_uri('sqlite:////data/defeng/mlruns/registry.db')

In [25]:
# Read the registry and tracking URIs, then report them in that order.
mr_uri = mlflow.get_registry_uri()
tracking_uri = mlflow.get_tracking_uri()

for template, uri in (
    ("Current registry uri: {}", mr_uri),
    ("Current tracking uri: {}", tracking_uri),
):
    print(template.format(uri))

Current registry uri: http://localhost:10500
Current tracking uri: http://localhost:10500


In [7]:
# Display the artifact URI reported by the module-level mlflow API.
mlflow.get_artifact_uri()

'file:///data/defeng/mlruns/0/7431ea75988440b3840841d536eda969/artifacts'

In [15]:
# Pretty-print the client object (the recorded output is only its default repr).
pprint.pprint(client)

<mlflow.tracking.client.MlflowClient object at 0x7fe55353a190>


In [7]:
# Display the start_time stored on the current run's info.
run.info.start_time

1626787985841

In [8]:
# Raises AttributeError: per the recorded output, RunInfo has no attribute 'run_link'.
run.info.run_link

AttributeError: 'RunInfo' object has no attribute 'run_link'

In [44]:
# not working!
with client.start_run() as run:
    #mlflow.log_artifact('../requirements.txt')
    #mlflow.log_param("child", "yes")
    
    run1 = mlflow.active_run()
    print(run1.info.run_id)
    
    client.log_param(run.info.run_id,"name","defeng")

AttributeError: 'MlflowClient' object has no attribute 'start_run'

In [3]:
# Two float32 scalar tensors; they are not used by any other cell shown in this notebook.
k = torch.tensor(3.0,dtype=torch.float32)
b = torch.tensor(5.0,dtype=torch.float32)

In [1]:
# Display the microsecond component of the current local time.
from datetime import datetime
dt = datetime.now()
dt.microsecond

844683

In [3]:
# Display the current time as seconds since the epoch (a float).
import time 
time.time()

1626614241.9734983

In [10]:
# List experiments using the ViewType.ALL filter.
from mlflow.entities import ViewType

client.list_experiments(ViewType.ALL) # ACTIVE_ONLY, DELETED_ONLY, or ALL

[<Experiment: artifact_location='file:///data/defeng/mlruns/1', experiment_id='1', lifecycle_stage='active', name='test1', tags={}>,
 <Experiment: artifact_location='file:///data/defeng/mlruns/0', experiment_id='0', lifecycle_stage='active', name='Default', tags={}>,
 <Experiment: artifact_location='file:///data/defeng/mlruns/2', experiment_id='2', lifecycle_stage='active', name='test2', tags={}>]

TODO 测试这两个create函数和ui的关系！ ✔

mlflow server --backend-store-uri sqlite:////data/defeng/mlruns/registry.db --port 10500 --default-artifact-root file:///data/defeng/mlruns/  <br>
这样设置artifacts路径之后，和之前使用mlflow ui时一样，层级结构仍然是 **~/mlruns/exp_id/run_id/artifacts**。

从上面这部分的例子可以看出，client像是一个服务，在这个服务下可以建设exp，而在exp下又可以建构各个run。<br>
在mlflow官方的quickstart和例子中，大多使用with mlflow.start_run()，或其他mlflow.XX的API(https://mlflow.org/docs/latest/python_api/mlflow.html#mlflow.start_run) 。<br>
这是因为这些官方例子中，没有考虑到设置exp name等相关参数的事情(而是交给了系统自行设置)。<br>
可在上面这样的例子中，不能再使用with mlflow.start_run()或其他mlflow.XX的API，因为**我们使用了client，而client本身就相当于mlflow**。<br>所以我们要改用client的API(https://mlflow.org/docs/latest/python_api/mlflow.tracking.html#mlflow.tracking.MlflowClient) 。<br>
TODO more ✔
- 如何在exp和run中集成代码和git管理(mlflow projects)
- 如何恢复之前的run的运行，查询所有的run，可视化之前的run的数据等。
- 数据库作为存储的模式(src: https://mlflow.org/docs/latest/tracking.html#scenario-2-mlflow-on-localhost-with-sqlite)