# Introduction
Author: Yingding Wang\
Created on: 05.01.2024

This notebook demonstrates a simple Kubeflow Pipelines (KFP) v2 pipeline on a Kubeflow 1.8.0 manifests deployment with the ML Pipeline API server 2.0.5.

## KFP v2 docs
* KFP v1 to v2 migration https://www.kubeflow.org/docs/components/pipelines/v2/migration/
* Hello world v2 https://www.kubeflow.org/docs/components/pipelines/v2/hello-world/

In [1]:
import sys
from platform import python_version
# Report the Python version of the interpreter running this kernel.
print(f"current platform python version: {python_version()}")

current platform python version: 3.11.6


In [2]:
#!{sys.executable} -m pip install --upgrade --user kfp==2.5.0 kfp-kubernetes==1.0.0 kfp-server-api==2.0.5

In [3]:
!{sys.executable} -m pip list | grep kfp

kfp                           2.4.0
kfp-kubernetes                1.0.0
kfp-pipeline-spec             0.2.2
kfp-server-api                2.0.5


In [4]:
from kfp.dsl.pipeline_task import PipelineTask

def set_res_limit(task: "PipelineTask", mem_req="200Mi", cpu_req="2000m", mem_lim="4000Mi", cpu_lim='4000m') -> "PipelineTask":
    """Set the CPU and memory requests and limits of a KFP pipeline task.

    Every value that is not ``None`` is applied to the task; passing ``None``
    for a parameter leaves that particular request or limit untouched.
    Should the memory limit be set too small, the task pod would be stopped
    by Kubernetes with an OOMKilled status.

    Args:
        task (PipelineTask): the KFP PipelineTask whose resources are configured
        mem_req (str | None): memory request, e.g. '200Mi'
        cpu_req (str | None): cpu request, e.g. '2000m' or '0.5' for half a cpu
        mem_lim (str | None): memory limit, e.g. '500M' for 500MB RAM
        cpu_lim (str | None): cpu limit, e.g. '1' for one cpu

    Returns:
        (PipelineTask): the task as returned by the last setter that was
        applied, or the given task itself when every value is ``None``
    """
    # The setters are applied in the same order as before: cpu first, then memory.
    if cpu_req is not None:
        task = task.set_cpu_request(cpu_req)
    if cpu_lim is not None:
        task = task.set_cpu_limit(cpu_lim)
    if mem_req is not None:
        task = task.set_memory_request(mem_req)
    if mem_lim is not None:
        task = task.set_memory_limit(mem_lim)
    return task

### Simple pipeline
* hello world v2 https://www.kubeflow.org/docs/components/pipelines/v2/hello-world/

In [5]:
from dataclasses import dataclass

@dataclass
class Settings:
    """Notebook-wide settings shared by the pipeline components."""

    # Container image used as ``base_image`` of the components.
    # Alternative image: 'python:3.10.13-bullseye'
    # The type annotation is required: without it ``@dataclass`` treats the name
    # as a plain class attribute rather than a field, so it could not be
    # overridden through ``Settings(base_image=...)``.
    base_image: str = 'python:3.11.7-bullseye'


settings = Settings()

def gen_compiled_file_path(file_name: str, pipeline_path_dir="./compiled") -> str:
    """Build the path of the compiled YAML package for ``file_name``.

    In KFP SDK v2, YAML is the preferred serialization format (JSON also works).
    Reference:
    https://www.kubeflow.org/docs/components/pipelines/v2/migration/#sdk-v1-v2-namespace-to-sdk-v2

    Args:
        file_name (str): name of the package file without extension
        pipeline_path_dir: directory the package path is placed in

    Returns:
        (str): ``<pipeline_path_dir>/<file_name>.yaml``
    """
    path_template = "{}/{}.yaml"
    return path_template.format(pipeline_path_dir, file_name)

## Pass Artifacts and Parameters
* Parameters (Data Type with small amount of data): https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/
* Artifacts (Data Types for tracking ML artifacts): https://www.kubeflow.org/docs/components/pipelines/v2/data-types/artifacts/

Python objects such as `dict` and `list` can also be passed as parameters: https://www.kubeflow.org/docs/components/pipelines/v2/data-types/parameters/

Multiple output parameters with NamedTuple
```python
from kfp import dsl
from typing import NamedTuple

@dsl.component
def my_comp() -> NamedTuple('outputs', a=int, b=str):
    outputs = NamedTuple('outputs', a=int, b=str)
    return outputs(1, 'hello')
```

In [6]:
from kfp import dsl

# KFP component that greets `name`: it prints the greeting and returns it as its output.
@dsl.component(base_image=settings.base_image)
def say_hello(name: str) -> str:
    # Build the text once so the printed line and the returned value are the same.
    greeting = f"Hello, {name}!"
    print(greeting)
    return greeting

# Single-step pipeline: runs `say_hello` for `recipient` and returns its output.
@dsl.pipeline(
    name='Helloworld pipeline',
    description='An example kfp v2 simple pipeline',
)
def hello_pipeline(recipient: str) -> str:
    # Components have to be called with keyword arguments (name=...).
    greet_task = say_hello(name=recipient)
    # Request 200Mi memory / 1 CPU and cap the task pod at 500Mi memory / 1 CPU.
    limited_task = set_res_limit(
        task=greet_task,
        mem_req="200Mi",
        cpu_req="1000m",
        mem_lim="500Mi",
        cpu_lim='1000m',
    )
    return limited_task.output


# Alias used by the compile, run and upload cells below.
my_pipeline = hello_pipeline

In [7]:
from kfp import compiler
import os

# Single source of truth for the directory that receives the compiled packages;
# it is passed to the path helper instead of relying on a second hard-coded copy.
pipeline_path_dir = "./compiled"
# exist_ok=True makes the cell re-runnable and avoids the check-then-create race.
os.makedirs(pipeline_path_dir, exist_ok=True)

component_file_name = "hello_component"
component_file_path = gen_compiled_file_path(
    component_file_name, pipeline_path_dir=pipeline_path_dir
)

my_pipeline_file_name = "hello_pipeline_v2"
pipeline_package_path = gen_compiled_file_path(
    my_pipeline_file_name, pipeline_path_dir=pipeline_path_dir
)

# compile component, instead of using output_component_file in the @dsl.component decorator
compiler.Compiler().compile(
    pipeline_func=say_hello,
    package_path=component_file_path,
)

# compile the pipeline into the package that the upload cells use
compiler.Compiler().compile(
    pipeline_func=my_pipeline,
    package_path=pipeline_package_path,
)

In [8]:
from kfp.client import Client
import warnings

# Run settings shared by the cells below.
EXPERIMENT_NAME = "demo"
ENABLE_CACHING = True
args = dict(recipient='World')

# Suppress the kfp v2 client FutureWarning, see
# https://stackoverflow.com/questions/14463277/how-to-disable-python-warnings/14463362#14463362
with warnings.catch_warnings(action="ignore", category=FutureWarning):
    # The Kubeflow pipeline PodDefault passes the credential to the client.
    client = Client()

NAMESPACE = client.get_user_namespace()
print(NAMESPACE)

kubeflow-kindfor


### Run from pipeline function

In [9]:
# Start a run directly from the pipeline function, using the settings defined above.
run_options = dict(
    experiment_name=EXPERIMENT_NAME,
    namespace=NAMESPACE,
    enable_caching=ENABLE_CACHING,
    arguments=args,
)
run = client.create_run_from_pipeline_func(pipeline_func=my_pipeline, **run_options)
run

RunPipelineResult(run_id=1bb4c34a-fd68-41bd-9ece-025576f7b171)

### Upload pipeline

In [10]:
import json

PIPELINE_NAME = "helloworld-pipeline"

# Look the pipeline up by its display name with a JSON filter.
# With a namespace given, only the private pipelines of that namespace are
# searched; without a namespace the shared pipelines are listed.
name_filter = json.dumps({
    "predicates": [{
        "operation": "EQUALS",
        "key": "display_name",
        "stringValue": PIPELINE_NAME,
    }]
})
result_dict = client.list_pipelines(filter=name_filter, namespace=NAMESPACE)
pipelines = result_dict.pipelines

if not pipelines:
    # create pipeline only if there is none in the namespace
    pipeline = client.upload_pipeline(
        pipeline_package_path=pipeline_package_path,
        pipeline_name=PIPELINE_NAME,
        namespace=NAMESPACE,
    )
    # Record the id in this branch as well: the version upload cell reads
    # `pipeline_id` and would otherwise hit a NameError (or a stale value)
    # right after the pipeline has been created for the first time.
    pipeline_id = pipeline.pipeline_id
    print(f"pipeline: {PIPELINE_NAME} uploaded to namespace: {NAMESPACE}\nhas id: {pipeline_id}")
else:
    # get the first kfp_server_api.V2beta1Pipeline
    pipeline = pipelines[0]
    pipeline_id = pipeline.pipeline_id
    print(f"pipeline: {PIPELINE_NAME} exists in namespace: {NAMESPACE}\nhas id: {pipeline_id}")

pipeline: helloworld-pipeline exists in namespace: kubeflow-kindfor
has id: 5ad9f545-de4e-4b69-83ec-b885d88d17bc


### Upload pipeline version

In [11]:
VERSION_NAME = "v2"
NEW_PIPELINE_VERSION_NAME = f"{PIPELINE_NAME}-{VERSION_NAME}"

if pipeline_id is not None:
    # Look for an existing version of the pipeline with the new display name.
    version_filter = json.dumps({
        "predicates": [{
            "operation": "EQUALS",
            "key": "display_name",
            "stringValue": NEW_PIPELINE_VERSION_NAME,
        }]
    })
    version_response = client.list_pipeline_versions(
        pipeline_id=pipeline_id,
        filter=version_filter,
    )
    pipeline_versions = version_response.pipeline_versions

    if pipeline_versions is not None and len(pipeline_versions) >= 1:
        # Reuse the first match, a
        # kfp_server_api.models.v2beta1_pipeline_version.V2beta1PipelineVersion
        pipeline_version = pipeline_versions[0]
        print(f"pipeline version: {pipeline_version.display_name} exists in namespace: {NAMESPACE}")
    else:
        # No version with that name yet: upload the compiled package as a new one.
        pipeline_version = client.upload_pipeline_version(
            pipeline_package_path=pipeline_package_path,
            pipeline_version_name=NEW_PIPELINE_VERSION_NAME,
            pipeline_id=pipeline_id,
        )

pipeline version: helloworld-pipeline-v2 exists in namespace: kubeflow-kindfor


### (optional) KFP cli for upload pipeline

The CLI has no namespace option. For private pipelines in a namespace, use the KFP client SDK instead.

* https://www.kubeflow.org/docs/components/pipelines/v2/cli/

In [12]:
# create pipeline
# !kfp pipeline create -p helloworld-pipeline -d v1 $HOME/kf-examples/sdkV2/compiled/hello_pipeline_v2.yaml

In [13]:
# list pipeline ids
# !kfp pipeline list

In [14]:
# show pipeline version with ids
# !kfp pipeline list-versions 3f528a1d-9368-4f25-ae90-04f9490836d2

In [15]:
# create a new version of the pipeline by id or name
# !kfp pipeline create-version -p 3f528a1d-9368-4f25-ae90-04f9490836d2 -v helloworld-pipeline-v2 $HOME/kf-examples/sdkV2/compiled/hello_pipeline_v2.yaml

In [16]:
# show pipeline version with ids
# !kfp pipeline list-versions 3f528a1d-9368-4f25-ae90-04f9490836d2

### (optional) Run from pipeline package using SDK

In [17]:
# run = client.create_run_from_pipeline_package(
#     pipeline_file=pipeline_package_path,
#     experiment_name = EXPERIMENT_NAME,
#     namespace = NAMESPACE,
#     enable_caching=ENABLE_CACHING,
#     arguments=args,
# )
# run