# Running Parquez pipeline

In [1]:
#!pip uninstall mlrun -y

#!pip install colorlog PyHive mlrun kubernetes
!pip show mlrun

Name: mlrun
Version: 0.6.0rc7
Summary: Tracking and config of machine learning runs
Home-page: https://github.com/mlrun/mlrun
Author: Yaron Haviv
Author-email: yaronh@iguazio.com
License: MIT
Location: /User/.pythonlibs/jupyter/lib/python3.7/site-packages
Requires: GitPython, v3io-frames, semver, dask, tabulate, google-auth, sqlalchemy, humanfriendly, azure-storage-blob, alembic, kfp, ipython, click, v3io, pandas, orjson, nuclio-jupyter, aiohttp, nest-asyncio, pydantic, pyyaml, mergedeep, urllib3, fastapi, pyarrow, boto3, kubernetes, requests
Required-by: 


### create the mlrun project 

In [2]:
from os import path, getenv
from mlrun import new_project, mlconf

#project_name = '-'.join(filter(None, ['getting-started-iris', getenv('V3IO_USERNAME', None)]))
project_name = "parquez"
project_path = path.abspath('./')
project = new_project(project_name, project_path)
project.save()
print(f'Project path: {project_path}\nProject name: {project_name}')

RunDBError: Failed storing project parquez

In [None]:
out = mlconf.artifact_path or path.abspath('./data')
# {{run.uid}} will be substituted with the run id, so output will be written to different directoried per run
artifact_path = path.join(out, '{{run.uid}}')
%env PYTHONPATH=./

### set the project functions

In [4]:
from mlrun import mount_v3io
#project.set_function("functions/clean_parquez.py", 'clean', kind='job', image='aviaigz/parquez')
project.set_function("functions/validate_input.py", 'validate', kind='job', image='aviaigz/parquez')
project.set_function("functions/get_table_schema.py", 'get_schema', kind='job', image='aviaigz/parquez')
project.set_function("functions/create_parquet_table.py", 'create_parquet', kind='job', image='aviaigz/parquez')
project.set_function("functions/create_kv_view.py", 'create_kv_view', kind='job', image='aviaigz/parquez')
project.set_function("functions/create_unified_view.py", 'create_unified_view', kind='job', image='aviaigz/parquez')
project.set_function(kind='spark', command='/User/parquez/functions/kv_to_parquet.py', name='run_parquez_interval')
project.func('run_parquez_interval').with_driver_limits(cpu="1300m")
project.func('run_parquez_interval').with_driver_requests(cpu=1, mem="512m") # gpu_type & gpus=<number_of_gpus> are supported too
project.func('run_parquez_interval').with_executor_limits(cpu="1400m")
project.func('run_parquez_interval').with_executor_requests(cpu=2, mem="512m") # gpu_type & gpus=<number_of_gpus> are supported too
project.func('run_parquez_interval').with_igz_spark() # Adds fuse, daemon & iguazio's jars support

# Args are also supported:
# sj.spec.args = ['-arg1', '-arg2']
 
project.func('run_parquez_interval').spec.replicas = 2 # Number of executors
 
project.func('run_parquez_interval').deploy() # Rebuilds the image with MLRun - This is needed in order to support artifact logging etc. This step is too long (~3 minutes)
# project.set_function("functions/run_parquez_interval.py", 'run_parquez_interval', kind='job', image='aviaigz/parquez')
# project.func('run_parquez_interval').apply(mount_v3io())
# project.func('run_parquez_interval').set_env('PYTHONPATH', project_path)
# project.func('run_parquez_interval').spec.artifact_path = 'User/artifacts'
# project.func('run_parquez_interval').spec.service_account='mlrun-api'
# project.func('run_parquez_interval').save()
project.set_function("functions/run_scheduler.py", 'run_scheduler', kind='job', image='aviaigz/parquez')



<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f19c741b890>

<a id="gs-step-create-n-run-ml-pipeline"></a>
## Create and Run a Fully Automated ML Pipeline

You're now ready to create a full ML pipeline.
This is done by using [Kubeflow Pipelines](https://www.kubeflow.org/docs/pipelines/overview/pipelines-overview/), which is integrated into the Iguazio Data Science Platform.
Kubeflow Pipelines is an open-source framework for building and deploying portable, scalable machine-learning workflows based on Docker containers.
MLRun leverages this framework to take your existing code and deploy it as steps in the pipeline.

In [5]:
%%writefile {path.join(project_path, 'workflow.py')}

from kfp import dsl
from mlrun import mount_v3io
from os import path
import os

V3IO_ACCESS_KEY = os.environ['V3IO_ACCESS_KEY']
V3IO_USERNAME = os.getenv('V3IO_USERNAME')

funcs = {}
project_path = path.abspath('./')
parquez_params = {'view_name':'view_name'
         ,'partition_by':'h'
         ,'partition_interval':'1h'
         ,'real_time_window':'1d'
         ,'historical_retention':'7d'
         ,'real_time_table_name':'faker'
         ,'config_path':'/User/parquez/config/parquez.ini'
         ,'user_name':V3IO_USERNAME
         ,'access_key':V3IO_ACCESS_KEY          
         ,'project_path': project_path
         ,'shell_pod_name' : 'parquez-shell'       }


# Configure function resources and local settings
def init_functions(functions: dict, project=None, secrets=None):
    project_path = path.abspath('./')
    for f in functions.values():
        f.apply(mount_v3io())
        f.set_env('PYTHONPATH', project_path)
        f.spec.artifact_path = 'User/artifacts'
        f.spec.service_account='mlrun-api'
        
        
# Create a Kubeflow Pipelines pipeline
@dsl.pipeline(
    name = "parquez-pipeline",
    description = "parquez description"
)
def kfpipeline():
    
#     # clean the tables
#     clean = funcs['clean'].as_step(
#         name="clean",
#         params=parquez_params,
#         outputs=['clean']
#     )
    
    # Ingest the data set
    validate = funcs['validate'].as_step(
        name="validate",
        params=parquez_params,
#         inputs={'table': clean.outputs},
        outputs=['validate']
    )
    
    # Analyze the dataset
    schema = funcs['get_schema'].as_step(
        name="get_schema",
        params = parquez_params,
        inputs={'table': validate.outputs},                       
        outputs=['schema']
    )
    
    parquet = funcs["create_parquet"].as_step(
        name="create_parquet",
        params=parquez_params,
        inputs={"table": schema.outputs['schema']},
        outputs=['create_parquet']
    )
    
    kv_view = funcs["create_kv_view"].as_step(
        name="create_kv_view",
        params=parquez_params,
        inputs={'table': parquet.outputs},
        outputs=['kv_view']
    )
    
    unified_view = funcs["create_unified_view"].as_step(
        name="create_unified_view",
        params=parquez_params,
        inputs={'table': kv_view.outputs},
        outputs=['unified_view']
    )
    
    unified_view = funcs["run_scheduler"].as_step(
        name="run_scheduler",
        params=parquez_params,
        inputs={'table': unified_view.outputs},
        outputs=['run_scheduler']
    )    

Overwriting /User/parquez/workflow.py


<a id="gs-register-workflow"></a>
#### Register the Workflow

Use the `set_workflow` MLRun project method to register your workflow with MLRun.
The following code sets the `name` parameter to the selected workflow name ("main") and the `code` parameter to the name of the workflow file that is found in your project directory (**workflow.py**).

In [6]:
# Register the workflow file as "main"
project.set_workflow('main', 'workflow.py')

In [7]:
project.save()

In [8]:
run_id = project.run(
    'main',
    arguments={}, 
    
    artifact_path=path.abspath(path.join('pipeline','{{workflow.uid}}'),
    
                              )
    ,dirty=True)

> 2020-08-30 14:44:18,659 [info] using in-cluster config.


> 2020-08-30 14:44:19,173 [info] Pipeline run id=1935c12a-cb38-4cf1-8239-cc93af526412, check UI or DB for progress


In [9]:
from mlrun import get_run_db
get_run_db().list_schedules('parquez')

SchedulesOutput(schedules=[ScheduleOutput(name='run-parquez-interval', kind=<ScheduleKinds.job: 'job'>, scheduled_object={'task': {'spec': {'parameters': {'view_name': 'view_name', 'partition_by': 'h', 'partition_interval': '1h', 'real_time_window': '1d', 'historical_retention': '7d', 'real_time_table_name': 'faker', 'config_path': '/User/parquez/config/parquez.ini', 'user_name': 'avia', 'access_key': '8e0eab92-86ab-4b8d-b7fd-902b2e761267', 'project_path': '/User/parquez', 'shell_pod_name': 'parquez-shell'}, 'output_path': '/User/artifacts', 'function': 'parquez/run-parquez-interval@9c82bd912b4426d5cfcb7eb038ab97067e3d241d', 'secret_sources': [], 'scrape_metrics': False}, 'metadata': {'uid': 'c58383ca9faf45b6ae0912aadcb345e6', 'name': 'run-parquez-interval', 'project': 'parquez', 'labels': {'v3io_user': 'avia', 'kind': 'job', 'owner': 'avia'}, 'iteration': 0}, 'status': {'state': 'created'}}, 'schedule': '0 */1 * * * '}, cron_trigger=ScheduleCronTrigger(year=None, month='*', day='*', w