### **irpf.ipynb** ###
### **Pipeline for converting IRPF pdf to xml** ###

* ##### 01 - Install packages
* ##### 02 - Import packages
* ##### 03 - Create tasks
* ##### 04 - Create pipeline
* ##### 05 - Create pipeline yaml
* ##### 06 - Create pipeline run

### 01 - Install packages

In [None]:
!pip install --upgrade kfp[kubernetes]

### 02 - Import packages

In [None]:
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))

import kfp
import kfp.kubernetes as kubernetes

from components.download_document import download_document
from components.upload_document   import upload_document

### 03 - Create tasks

In [None]:
# Container image used as base_image for both components below.
# '<image_boto3>' is a placeholder: replace it with a real image reference
# (presumably one with boto3 installed; confirm against the components).
image_boto3 = '<image_boto3>'

In [None]:
# Wrap the download_document function as a KFP component running on image_boto3.
download_document_op = kfp.dsl.component(func=download_document, base_image=image_boto3)

In [None]:
# Wrap the upload_document function as a KFP component running on image_boto3.
upload_document_op = kfp.dsl.component(func=upload_document, base_image=image_boto3)

### 04 - Create pipeline

In [None]:
# pipeline_name is reused as the pipeline's registered name, as the last
# segment of the PVC mount directory, and as the stem of the compiled YAML file.
pipeline_name        = 'irpf'
# Description passed to the kfp.dsl.pipeline decorator.
pipeline_description = 'Convert IRPF .pdf to .xml'

In [None]:
# Pipeline definition.
#
# Steps wired below:
#   1. Create a 1Gi ReadWriteOnce PVC using the requested storage class.
#   2. Run download_document_op with the PVC mounted at pvc_directory.
#   3. Delete the PVC once the download task has finished.
#
# Parameters:
#   s3_service_name, s3_endpoint_url, s3_access_key_id, s3_secret_access_key,
#   s3_region, s3_bucket, s3_document : forwarded unchanged to download_document_op.
#   s3_directory       : accepted but not used by any task yet.
#   storage_class_name : storage class requested for the PVC.
#
# NOTE(review): upload_document_op is defined in this notebook but is not part
# of the pipeline; s3_directory is presumably intended for it - confirm.
# NOTE(review): the S3 credentials travel as plain pipeline parameters;
# consider sourcing them from a Kubernetes secret instead.
@kfp.dsl.pipeline(
    name        = pipeline_name,
    description = pipeline_description
)
def pipeline(
    s3_service_name      : str,
    s3_endpoint_url      : str,
    s3_access_key_id     : str,
    s3_secret_access_key : str,
    s3_region            : str,
    s3_bucket            : str,
    s3_document          : str,
    s3_directory         : str,
    storage_class_name   : str
):

    # Imports

    import os

    # Create PVC task

    # Use the storage_class_name pipeline parameter (not a hard-coded
    # placeholder) so the value supplied at run time actually takes effect.
    create_pvc_task = kubernetes.CreatePVC(
        pvc_name_suffix    = '-pipeline-pvc',
        size               = '1Gi',
        access_modes       = ['ReadWriteOnce'],
        storage_class_name = storage_class_name
    )

    # Directory where the PVC is mounted inside the task container; it is also
    # handed to the download component as its pvc_directory argument.
    pvc_directory = os.path.join('/', 'pipeline', pipeline_name)
    pvc_name      = create_pvc_task.outputs['name']

    # Download document task

    download_document_task = download_document_op(
        s3_service_name      = s3_service_name,
        s3_endpoint_url      = s3_endpoint_url,
        s3_access_key_id     = s3_access_key_id,
        s3_secret_access_key = s3_secret_access_key,
        s3_region            = s3_region,
        s3_bucket            = s3_bucket,
        s3_document          = s3_document,
        pvc_directory        = pvc_directory
    )

    kubernetes.mount_pvc(
        task       = download_document_task,
        pvc_name   = pvc_name,
        mount_path = pvc_directory
    )

    # Explicit ordering: the download must not start before the PVC exists.
    download_document_task.after(create_pvc_task)

    # Delete PVC task

    # The PVC is removed only after the download task has completed.
    delete_pvc_task = kubernetes.DeletePVC(pvc_name = pvc_name)
    delete_pvc_task.after(download_document_task)

### 05 - Create pipeline yaml

In [None]:
# Location of the compiled pipeline package: yaml/<pipeline_name>.yaml
pipeline_package_path = os.path.join('yaml', f'{ pipeline_name }.yaml')

# Make sure the output directory exists so compilation does not fail on a
# fresh checkout; exist_ok keeps re-running this cell harmless.
os.makedirs(os.path.dirname(pipeline_package_path), exist_ok = True)

In [None]:
# Compile the pipeline function into the YAML package at pipeline_package_path.
compiler = kfp.compiler.Compiler()
compiler.compile(pipeline_func=pipeline, package_path=pipeline_package_path)

### 06 - Create pipeline run

In [None]:
# Host passed to kfp.client.Client when submitting the run.
# '<kubeflow_host>' is a placeholder: replace it before running the cells below.
kubeflow_host = '<kubeflow_host>'

In [None]:
# Run arguments, one entry per pipeline parameter.
# Every '<...>' value is a placeholder that must be replaced before submitting.
pipeline_arguments = dict(
    s3_service_name='s3',
    s3_endpoint_url='<s3_endpoint_url>',
    s3_access_key_id='<s3_access_key_id>',
    s3_secret_access_key='<s3_secret_access_key>',
    s3_region='<s3_region>',
    s3_bucket='<s3_bucket>',
    s3_document='<s3_document>',
    s3_directory='<s3_directory>',
    storage_class_name='<storage_class_name>',
)

In [None]:
# Connect to the Kubeflow Pipelines API at kubeflow_host, then submit the
# compiled package as a new run with the arguments defined above.
client = kfp.client.Client(host=kubeflow_host)
client.create_run_from_pipeline_package(
    pipeline_file=pipeline_package_path,
    arguments=pipeline_arguments,
)