In [30]:
#!pip install requests colorlog PyHive mlrun kubernetes


In [31]:
from os import path, getenv
from mlrun import new_project, mlconf

#project_name = '-'.join(filter(None, ['getting-started-iris', getenv('V3IO_USERNAME', None)]))
# Name under which the MLRun project is created.
project_name = "parquez"
# Absolute path of the current working directory; handed to new_project as the project's path.
project_path = path.abspath('./')
# Create the project object and save it right away.
project = new_project(project_name, project_path)
project.save()
# Echo the resolved path and name so the reader can confirm them in the cell output.
print(f'Project path: {project_path}\nProject name: {project_name}')




Project path: /User/parquez
Project name: parquez


In [32]:
# Base output location: MLRun's configured artifact path when it is set,
# otherwise the absolute path of ./data under the current working directory.
out = mlconf.artifact_path or path.abspath('./data')
# {{run.uid}} will be substituted with the run id, so output will be written to a different directory per run
artifact_path = path.join(out, '{{run.uid}}')

In [33]:
%env PYTHONPATH=./

env: PYTHONPATH=./


In [34]:
# project.set_function("parquez.py", 'parquezrun', kind='job', image='aviaigz/parquez')

In [35]:
# Register validate-input.py in the project under the name 'validate' as a
# 'job'-kind function that uses the aviaigz/parquez image.
project.set_function("validate-input.py", 'validate', kind='job', image='aviaigz/parquez')

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fb0d5e8b908>

In [36]:
from mlrun import run_local, mount_v3io

In [37]:
# project.func('parquezrun').apply(mount_v3io())
# project.func('parquezrun').set_env('PYTHONPATH', project_path)
# project.func('parquezrun').spec.service_account='mlrun-api'
# Configure the registered 'validate' function before it is run.
# Apply the v3io mount modifier to the function.
project.func('validate').apply(mount_v3io())
# Set PYTHONPATH for the function to the project directory.
project.func('validate').set_env('PYTHONPATH', project_path)
# Use an absolute path: the original 'User/artifacts' had no leading slash and
# so was a relative path, unlike the '/User/...' paths used everywhere else in
# this notebook (including the '/User/artifacts' passed when running validate).
project.func('validate').spec.artifact_path = '/User/artifacts'


In [38]:
# Register get_table_schema.py in the project under the name 'get_schema' as a
# 'job'-kind function that uses the aviaigz/parquez image.
project.set_function("get_table_schema.py", 'get_schema', kind='job', image='aviaigz/parquez')


<mlrun.runtimes.kubejob.KubejobRuntime at 0x7fb0d5e3d198>

In [39]:
# Configure the registered 'get_schema' function before it is run.
# Apply the v3io mount modifier to the function.
project.func('get_schema').apply(mount_v3io())
# Set PYTHONPATH for the function to the project directory.
project.func('get_schema').set_env('PYTHONPATH', project_path)
# Use an absolute path: the original 'User/artifacts' had no leading slash and
# so was a relative path, unlike the '/User/...' paths used everywhere else in
# this notebook (including the '/User/artifacts' passed when running validate).
project.func('get_schema').spec.artifact_path = '/User/artifacts'

In [47]:
# From here on, artifacts go to a fixed location; this replaces any
# artifact_path value assigned earlier in the notebook.
artifact_path = '/User/artifacts'
# #project.func('parquezrun').run()

# Parameters handed to the 'validate' job.
validate_params = {
    'view_name': 'view_name',
    'partition_by': 'h',
    'partition_interval': '1h',
    'real_time_window': '1d',
    'historical_retention': '7d',
    'real_time_table_name': 'faker',
    'config_path': '/User/parquez/config/parquez.ini',
}

# Run the registered 'validate' function; the returned run object is the cell's output.
project.func('validate').run(params=validate_params, artifact_path=artifact_path)

[mlrun] 2020-06-22 16:51:55,104 starting run validate uid=0df40da47893484d95fea2988840d99f  -> http://mlrun-api:8080
[mlrun] 2020-06-22 16:51:55,192 Job is running in the background, pod: validate-gvxmf
[mlrun] 2020-06-22 16:51:59,761 starting local run: main.py # main
[mlrun] 2020-06-22 16:52:01,429 logging run results to: http://mlrun-api:8080
[mlrun] 2020-06-22 16:52:01,448 {'view_name': 'view_name', 'partition_by': 'h', 'partition_interval': '1h', 'real_time_window': '1d', 'historical_retention': '7d', 'real_time_table_name': 'faker', 'config_path': '/User/parquez/config/parquez.ini'}
[mlrun] 2020-06-22 16:52:01,448 Starting to Parquezzzzzzzz
[mlrun] 2020-06-22 16:52:01,449 Namespace(config='/User/parquez/config/parquez.ini', historical_retention='7d', partition_by='h', partition_interval='1h', real_time_table_name='faker', real_time_window='1d', view_name='view_name')
[mlrun] 2020-06-22 16:52:01,449 input parsed

[mlrun] 2020-06-22 16:52:01,747 run executed, status=completed
final

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
parquez,...8840d99f,0,Jun 22 16:52:01,completed,validate,host=validate-gvxmfkind=jobowner=adminv3io_user=admin,,config_path=/User/parquez/config/parquez.inihistorical_retention=7dpartition_by=hpartition_interval=1hreal_time_table_name=fakerreal_time_window=1dview_name=view_name,,


to track results use .show() or .logs() or in CLI: 
!mlrun get run 0df40da47893484d95fea2988840d99f --project parquez , !mlrun logs 0df40da47893484d95fea2988840d99f --project parquez
[mlrun] 2020-06-22 16:52:04,339 run executed, status=completed


<mlrun.model.RunObject at 0x7fb0d5caaf60>

In [41]:
# Run the data-ingestion function locally in Jupyter Notebook
#get_data_run = run_local(command= project.func('parquezrun'),workdir='./')

<a id="gs-step-create-n-run-ml-pipeline"></a>
## Create and Run a Fully Automated ML Pipeline

You're now ready to create a full ML pipeline.
This is done by using [Kubeflow Pipelines](https://www.kubeflow.org/docs/pipelines/overview/pipelines-overview/), which is integrated into the Iguazio Data Science Platform.
Kubeflow Pipelines is an open-source framework for building and deploying portable, scalable machine-learning workflows based on Docker containers.
MLRun leverages this framework to take your existing code and deploy it as steps in the pipeline.

In [42]:
%%writefile {path.join(project_path, 'workflow.py')}

from kfp import dsl
from mlrun import mount_v3io

# Function registry, empty at import time; kfpipeline looks functions up in it by name.
funcs = dict()

# Parameter set shared by every step of the pipeline.
parquez_params = {
    'view_name': 'view_name',
    'partition_by': 'h',
    'partition_interval': '1h',
    'real_time_window': '1d',
    'historical_retention': '7d',
    'real_time_table_name': 'faker',
    'config_path': '/User/parquez/config/parquez.ini',
}


# Configure function resources and local settings
def init_functions(functions: dict, project=None, secrets=None):
    """Apply the v3io mount modifier to every function in ``functions``.

    ``functions`` maps names to function objects; each object gets its own
    ``mount_v3io()`` modifier. ``project`` and ``secrets`` are accepted but
    not used here. Returns ``None``.
    """
    for fn_name in functions:
        functions[fn_name].apply(mount_v3io())


# Create a Kubeflow Pipelines pipeline
@dsl.pipeline(name="parquez-pipeline", description="parquez description")
def kfpipeline():
    # Step 1: run the registered 'validate' function with the shared parquez parameters.
    validate_step = funcs['validate'].as_step(name="validate", params=parquez_params)

    # Step 2: run 'get_schema' with the same parameters, feeding it the
    # validate step's outputs as its 'table' input (presumably so that it
    # is ordered after validation).
    # NOTE(review): this passes the whole `outputs` mapping rather than one
    # named output -- confirm that is what get_schema expects.
    schema_step = funcs['get_schema'].as_step(
        name="get_schema",
        params=parquez_params,
        inputs={'table': validate_step.outputs},
    )
#     describe = funcs["describe"].as_step(
#         name="summary",
#         params={"label_column": LABELS},
#         inputs={"table": ingest.outputs[DATASET]})

#     # Train a model with hyperparameters
#     train = funcs["train"].as_step(
#         name="train",
#         params={"sample": -1,
#                 "label_column": LABELS,
#                 "test_size": 0.10},
#         hyperparams={'model_pkg_class': MODELS},
#         selector='max.accuracy',
#         inputs={"dataset": ingest.outputs[DATASET]},
#         outputs=['model', 'test_set'])

#     # Test and visualize the model
#     test = funcs["test"].as_step(
#         name="test",
#         params={"label_column": LABELS},
#         inputs={"models_path": train.outputs['model'],
#                 "test_set": train.outputs['test_set']})

#     # Deploy the model as a serverless function
#     deploy = funcs["serving"].deploy_step(
#         models={f"{DATASET}_v1": train.outputs['model']})

#     # Test the new model server (via REST API calls)
#     tester = funcs["serving-tester"].as_step(
#         name='serving-tester',
#         params={'addr': deploy.outputs['endpoint'], 'model': f"{DATASET}_v1"},
#         inputs={'table': train.outputs['test_set']})

Overwriting /User/parquez/workflow.py


<a id="gs-register-workflow"></a>
#### Register the Workflow

Use the `set_workflow` MLRun project method to register your workflow with MLRun.
The following code sets the `name` parameter to the selected workflow name ("main") and the `code` parameter to the name of the workflow file that is found in your project directory (**workflow.py**).

In [43]:
# Register the workflow file (workflow.py, written to the project directory by
# the %%writefile cell above) under the workflow name "main"
project.set_workflow('main', 'workflow.py')

In [44]:
# Save the project again now that the 'main' workflow has been registered.
project.save()

In [45]:
# Artifacts for this workflow run go under ./pipeline/<workflow uid>; the
# {{workflow.uid}} placeholder is left in the path as a template.
pipeline_artifact_path = path.abspath(path.join('pipeline', '{{workflow.uid}}'))

# Submit the 'main' workflow with no arguments and keep the returned run id.
run_id = project.run(
    'main',
    arguments={},
    artifact_path=pipeline_artifact_path,
    dirty=True,
)



[mlrun] 2020-06-22 16:48:55,006 Pipeline run id=b52d8375-ccbb-4390-94dd-649130e13fc0, check UI or DB for progress
