In [149]:
#!pip install requests colorlog PyHive mlrun kubernetes

In [150]:
from os import path, getenv
from mlrun import new_project, mlconf

# The project is rooted at the notebook's working directory and uses a fixed name.
# (A per-user name could instead be built by joining a base name with
# getenv('V3IO_USERNAME'); that variant is not used here.)
project_path = path.abspath('./')
project_name = "parquez"

# Create the project object and save it straight away.
project = new_project(project_name, project_path)
project.save()

print(f'Project path: {project_path}\nProject name: {project_name}')

Project path: /User/parquez
Project name: parquez


In [151]:
# Base output location: the configured MLRun artifact path when it is set,
# otherwise a local ./data directory.
out = mlconf.artifact_path
if not out:
    out = path.abspath('./data')

# {{run.uid}} will be substituted with the run id, so output is written to a different directory per run
artifact_path = path.join(out, '{{run.uid}}')

In [152]:
# Set the PYTHONPATH environment variable to the current directory for this kernel session.
%env PYTHONPATH=./

env: PYTHONPATH=./


In [153]:
# project.set_function("parquez.py", 'parquezrun', kind='job', image='aviaigz/parquez')

In [154]:
# Register validate-input.py on the project as the 'validate' job function.
project.set_function(
    "validate-input.py",
    'validate',
    kind='job',
    image='aviaigz/parquez',
)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff43040bcd0>

In [155]:
from mlrun import run_local, mount_v3io

In [156]:
# Look the registered function up once and configure it in place.
validate_fn = project.func('validate')
validate_fn.apply(mount_v3io())                   # apply the v3io mount modifier
validate_fn.set_env('PYTHONPATH', project_path)   # PYTHONPATH is set to the project directory
# NOTE(review): 'User/artifacts' is a relative path, while other paths in this
# notebook are rooted at /User — confirm this is intended.
validate_fn.spec.artifact_path = 'User/artifacts'

In [157]:
# Register get_table_schema.py on the project as the 'get_schema' job function.
project.set_function(
    "get_table_schema.py",
    'get_schema',
    kind='job',
    image='aviaigz/parquez',
)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff4304123d0>

In [158]:
# Look the registered function up once and configure it in place.
get_schema_fn = project.func('get_schema')
get_schema_fn.apply(mount_v3io())                   # apply the v3io mount modifier
get_schema_fn.set_env('PYTHONPATH', project_path)   # PYTHONPATH is set to the project directory
get_schema_fn.spec.artifact_path = 'User/artifacts'

In [159]:
# Register create_parquet_table.py on the project as the 'create_parquet' job function.
project.set_function(
    "create_parquet_table.py",
    'create_parquet',
    kind='job',
    image='aviaigz/parquez',
)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff430395dd0>

In [160]:
# Look the registered function up once and configure it in place.
create_parquet_fn = project.func('create_parquet')
create_parquet_fn.apply(mount_v3io())                   # apply the v3io mount modifier
create_parquet_fn.set_env('PYTHONPATH', project_path)   # PYTHONPATH is set to the project directory
create_parquet_fn.spec.artifact_path = 'User/artifacts'
# This function is additionally assigned the 'mlrun-api' service account.
create_parquet_fn.spec.service_account = 'mlrun-api'

In [161]:
# Register create_kv_view.py on the project as the 'create_kv_view' job function.
project.set_function(
    "create_kv_view.py",
    'create_kv_view',
    kind='job',
    image='aviaigz/parquez',
)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff4303d27d0>

In [162]:
# Look the registered function up once and configure it in place.
create_kv_view_fn = project.func('create_kv_view')
create_kv_view_fn.apply(mount_v3io())                   # apply the v3io mount modifier
create_kv_view_fn.set_env('PYTHONPATH', project_path)   # PYTHONPATH is set to the project directory
create_kv_view_fn.spec.artifact_path = 'User/artifacts'

In [163]:
# Register create_unified_view.py on the project as the 'create_unified_view' job function.
project.set_function(
    "create_unified_view.py",
    'create_unified_view',
    kind='job',
    image='aviaigz/parquez',
)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff4303a4c10>

In [164]:
# Look the registered function up once and configure it in place.
create_unified_view_fn = project.func('create_unified_view')
create_unified_view_fn.apply(mount_v3io())                   # apply the v3io mount modifier
create_unified_view_fn.set_env('PYTHONPATH', project_path)   # PYTHONPATH is set to the project directory
create_unified_view_fn.spec.artifact_path = 'User/artifacts'

In [165]:
# Register run_parquez_interval.py on the project as the 'run_parquez_interval' job function.
project.set_function(
    "run_parquez_interval.py",
    'run_parquez_interval',
    kind='job',
    image='aviaigz/parquez',
)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff4303d2b90>

In [166]:
# Look the registered function up once and configure it in place.
run_parquez_interval_fn = project.func('run_parquez_interval')
run_parquez_interval_fn.apply(mount_v3io())                   # apply the v3io mount modifier
run_parquez_interval_fn.set_env('PYTHONPATH', project_path)   # PYTHONPATH is set to the project directory
run_parquez_interval_fn.spec.artifact_path = 'User/artifacts'
run_parquez_interval_fn.spec.service_account = 'mlrun-api'
# Save the configured function; the returned hash string is shown as the cell output.
run_parquez_interval_fn.save()

'eaa799d2b25e075291df055d5a6c20b030b5f625'

In [167]:
# Register run_scheduler.py on the project as the 'run_scheduler' job function.
project.set_function(
    "run_scheduler.py",
    'run_scheduler',
    kind='job',
    image='aviaigz/parquez',
)

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7ff430467450>

In [168]:
# Look the registered function up once and configure it in place.
run_scheduler_fn = project.func('run_scheduler')
run_scheduler_fn.apply(mount_v3io())                   # apply the v3io mount modifier
run_scheduler_fn.set_env('PYTHONPATH', project_path)   # PYTHONPATH is set to the project directory
run_scheduler_fn.spec.artifact_path = 'User/artifacts'
run_scheduler_fn.spec.service_account = 'mlrun-api'
# Save the configured function; the returned hash string is shown as the cell output.
run_scheduler_fn.save()


'd1756c87178b6b2ac7ed87ad75900b2a908787af'

In [169]:
# artifact_path = '/User/artifacts'
# # #project.func('parquezrun').run()
# project.func('run_scheduler').run( params = {'view_name':'view_name'
#          ,'partition_by':'h'
#          ,'partition_interval':'1h'
#          ,'real_time_window':'1d'
#          ,'historical_retention':'7d'
#          ,'real_time_table_name':'faker'
#          ,'config_path':'/User/parquez/config/parquez.ini'},artifact_path=artifact_path)

<a id="gs-step-create-n-run-ml-pipeline"></a>
## Create and Run a Fully Automated ML Pipeline

You're now ready to create a full ML pipeline.
This is done by using [Kubeflow Pipelines](https://www.kubeflow.org/docs/pipelines/overview/pipelines-overview/), which is integrated into the Iguazio Data Science Platform.
Kubeflow Pipelines is an open-source framework for building and deploying portable, scalable machine-learning workflows based on Docker containers.
MLRun leverages this framework to take your existing code and deploy it as steps in the pipeline.

In [170]:
%%writefile {path.join(project_path, 'workflow.py')}

from kfp import dsl
from mlrun import mount_v3io

# Step functions keyed by name, read by kfpipeline(). Starts empty;
# presumably filled in by MLRun before the pipeline is built — TODO confirm.
funcs = {}

# Parameters passed unchanged to every step of the pipeline.
parquez_params = {
    'view_name': 'view_name',
    'partition_by': 'h',
    'partition_interval': '1h',
    'real_time_window': '1d',
    'historical_retention': '7d',
    'real_time_table_name': 'faker',
    'config_path': '/User/parquez/config/parquez.ini',
}


# Configure function resources and local settings
def init_functions(functions: dict, project=None, secrets=None):
    """Apply the v3io mount modifier to every function in ``functions``.

    Args:
        functions: mapping whose values are function objects exposing ``apply``;
            only the values are used.
        project: accepted for the caller's benefit; not used here.
        secrets: accepted for the caller's benefit; not used here.
    """
    for fn in functions.values():
        fn.apply(mount_v3io())



# Create a Kubeflow Pipelines pipeline
@dsl.pipeline(
    name="parquez-pipeline",
    description="parquez description"
)
def kfpipeline():
    """Chain the parquez steps into one pipeline.

    Every step receives ``parquez_params``. Each step after the first takes the
    named output of the step before it as its ``table`` input, giving the chain
    validate -> get_schema -> create_parquet -> create_kv_view ->
    create_unified_view -> run_scheduler.

    The ``table`` inputs reference one named output (``step.outputs['name']``)
    rather than the whole ``outputs`` mapping, so every link in the chain is
    wired the same way as the ``create_parquet`` step.
    """
    # Validate the input parameters.
    validate = funcs['validate'].as_step(
        name="validate",
        params=parquez_params,
        outputs=['validate'],
    )

    # Get the table schema.
    schema = funcs['get_schema'].as_step(
        name="get_schema",
        params=parquez_params,
        inputs={'table': validate.outputs['validate']},
        outputs=['schema'],
    )

    # Create the parquet table.
    parquet = funcs["create_parquet"].as_step(
        name="create_parquet",
        params=parquez_params,
        inputs={'table': schema.outputs['schema']},
        outputs=['create_parquet'],
    )

    # Create the KV view.
    kv_view = funcs["create_kv_view"].as_step(
        name="create_kv_view",
        params=parquez_params,
        inputs={'table': parquet.outputs['create_parquet']},
        outputs=['kv_view'],
    )

    # Create the unified view.
    unified_view = funcs["create_unified_view"].as_step(
        name="create_unified_view",
        params=parquez_params,
        inputs={'table': kv_view.outputs['kv_view']},
        outputs=['unified_view'],
    )

    # Final step: run the scheduler. Nothing consumes its result, so it is not
    # bound to a name (the original rebound `unified_view` here).
    funcs["run_scheduler"].as_step(
        name="run_scheduler",
        params=parquez_params,
        inputs={'table': unified_view.outputs['unified_view']},
        outputs=['run_scheduler'],
    )

Overwriting /User/parquez/workflow.py


<a id="gs-register-workflow"></a>
#### Register the Workflow

Use the `set_workflow` MLRun project method to register your workflow with MLRun.
The following code sets the `name` parameter to the selected workflow name ("main") and the `code` parameter to the name of the workflow file that is found in your project directory (**workflow.py**).

In [171]:
# Register workflow.py on the project under the workflow name "main".
project.set_workflow(
    'main',
    'workflow.py',
)

In [172]:
# Save the project again now that the functions and the 'main' workflow have been registered on it.
project.save()

In [173]:
# Artifacts for this run go under <cwd>/pipeline/{{workflow.uid}}; the
# '{{workflow.uid}}' placeholder is passed through literally in the path.
pipeline_artifact_path = path.abspath(path.join('pipeline', '{{workflow.uid}}'))

# Launch the registered 'main' workflow with no extra arguments.
# NOTE(review): dirty=True presumably permits running with uncommitted
# project changes — confirm against the MLRun documentation.
run_id = project.run(
    'main',
    arguments={},
    artifact_path=pipeline_artifact_path,
    dirty=True,
)

[mlrun] 2020-07-22 12:49:43,510 using in-cluster config.


[mlrun] 2020-07-22 12:49:44,582 Pipeline run id=4f5c41fe-86aa-4b28-be05-7b6825a3ea63, check UI or DB for progress
