### **04_document_insights.ipynb**
### **Document Insights Pipeline**

* ##### 01 - Install packages
* ##### 02 - Import packages
* ##### 03 - Create tasks
* ##### 04 - Create pipeline
* ##### 05 - Create pipeline yaml
* ##### 06 - Create pipeline run

### 01 - Install packages

In [None]:
!pip install kfp-tekton==1.5.9

### 02 - Import packages

In [None]:
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))

import kfp
import kfp_tekton

from components.download_document               import download_document
from components.evaluate_document_names         import evaluate_document_names
from components.extract_document_info_cnh       import extract_document_info_cnh
from components.extract_document_info_escritura import extract_document_info_escritura

### 03 - Create tasks

In [None]:
# Container images the pipeline components run in.
# task_base_image is used by the download and evaluate components.
task_base_image = 'registry.access.redhat.com/ubi9/python-311'
# image_tesseract is used by the two extract_document_info_* components.
# NOTE(review): presumably this image ships Tesseract OCR, and 'my-project'
# looks like a placeholder namespace — confirm before running.
image_tesseract = 'image-registry.openshift-image-registry.svc:5000/my-project/tesseract'

In [None]:
# Turn download_document into a pipeline component that runs on the generic
# task image, with boto3 requested as an extra package to install.
download_document_op = kfp.components.create_component_from_func(
    download_document,
    base_image          = task_base_image,
    packages_to_install = ['boto3'],
)

In [None]:
# Turn extract_document_info_cnh into a pipeline component that runs on the
# Tesseract image.
extract_document_info_cnh_op = kfp.components.create_component_from_func(
    extract_document_info_cnh,
    base_image = image_tesseract,
)

In [None]:
# Turn extract_document_info_escritura into a pipeline component that runs on
# the Tesseract image.
extract_document_info_escritura_op = kfp.components.create_component_from_func(
    extract_document_info_escritura,
    base_image = image_tesseract,
)

In [None]:
# Turn evaluate_document_names into a pipeline component that runs on the
# generic task image.
evaluate_document_names_op = kfp.components.create_component_from_func(
    evaluate_document_names,
    base_image = task_base_image,
)

### 04 - Create pipeline

In [None]:
# Name and description given to the @kfp.dsl.pipeline decorator.
# pipeline_name is also passed to both download tasks and is used to build the
# file name of the compiled YAML package.
pipeline_name        = '04_document_insights'
pipeline_description = 'Document Insights Pipeline'

In [None]:
@kfp.dsl.pipeline(name = pipeline_name, description = pipeline_description)
def pipeline(
    s3_service_name      : str,
    s3_endpoint_url      : str,
    s3_access_key_id     : str,
    s3_secret_access_key : str,
    s3_region            : str,
    s3_bucket            : str,
    tesseract_config     : str,
    document_cnh         : str,
    document_escritura   : str
):
    # Task graph built below:
    #   create_pvc -> download (cnh)       -> extract info (cnh)       \
    #   create_pvc -> download (escritura) -> extract info (escritura) -> evaluate names
    # Every task mounts the same PVC at the same path; ordering between tasks
    # is expressed through the dependencies attached to the mounted volume.
    # This is documented with comments rather than a docstring so the
    # function's own metadata stays exactly as it was.

    import os

    # Persistent volume claim shared by all tasks of the run.
    pvc_op = kfp.dsl.VolumeOp(
        name          = 'create_pvc',
        resource_name = 'pvc',
        size          = '1Gi',
        modes         = kfp.dsl.VOLUME_MODE_RWO
    )
    shared_volume = pvc_op.volume
    mount_path    = os.path.join('/', 'pipeline')

    def mount_shared_volume(task, *upstream_tasks):
        # Mount the PVC on `task`, making it wait for each upstream task in turn.
        volume = shared_volume
        for upstream_task in upstream_tasks:
            volume = volume.after(upstream_task)
        task.add_pvolumes({ mount_path : volume })

    # Arguments that are identical for both download tasks.
    download_settings = {
        's3_service_name'      : s3_service_name,
        's3_endpoint_url'      : s3_endpoint_url,
        's3_access_key_id'     : s3_access_key_id,
        's3_secret_access_key' : s3_secret_access_key,
        's3_region'            : s3_region,
        's3_bucket'            : s3_bucket,
        'pipeline_name'        : pipeline_name
    }

    # Download tasks: one per document, each starting once the PVC exists.
    cnh_download = download_document_op(document_name = document_cnh, **download_settings)
    cnh_download.set_display_name('download-document-cnh')
    mount_shared_volume(cnh_download, pvc_op)

    escritura_download = download_document_op(document_name = document_escritura, **download_settings)
    escritura_download.set_display_name('download-document-escritura')
    mount_shared_volume(escritura_download, pvc_op)

    # Extraction tasks: each waits for the download of its own document.
    cnh_extraction = extract_document_info_cnh_op(document_cnh, tesseract_config)
    mount_shared_volume(cnh_extraction, cnh_download)

    escritura_extraction = extract_document_info_escritura_op(document_escritura, tesseract_config)
    mount_shared_volume(escritura_extraction, escritura_download)

    # Final task: runs only after both extraction tasks.
    names_evaluation = evaluate_document_names_op(document_cnh, document_escritura)
    mount_shared_volume(names_evaluation, cnh_extraction, escritura_extraction)

### 05 - Create pipeline yaml

In [None]:
# Relative path of the compiled pipeline package: yaml/<pipeline_name>.yaml.
# This cell only builds the path string; it does not create the 'yaml' directory.
pipeline_package_path = os.path.join('yaml', f'{ pipeline_name }.yaml')

In [None]:
# Create the output directory first: the compiler writes the package straight
# to package_path, so a missing parent directory would make this cell fail.
# `or '.'` covers a package path that has no directory component.
os.makedirs(os.path.dirname(pipeline_package_path) or '.', exist_ok = True)

# Compile the pipeline function into a Tekton YAML package at pipeline_package_path.
kfp_tekton.compiler.TektonCompiler().compile(
    pipeline_func = pipeline,
    package_path  = pipeline_package_path
)

### 06 - Create pipeline run

In [None]:
# Connection settings passed to TektonClient as `host` and `existing_token`.
# Both values are placeholders that must be replaced before submitting a run.
# NOTE(review): kubeflow_token is a credential — avoid committing a real value
# together with the notebook.
kubeflow_host  = '<kubeflow_host>'
kubeflow_token = '<kubeflow_token>'

In [None]:
# Run-time values for the pipeline parameters. The keys mirror the parameter
# names of pipeline(); replace every '<...>' placeholder before submitting.
pipeline_arguments = {
    's3_service_name': 's3',
    's3_endpoint_url': '<s3_endpoint_url>',
    's3_access_key_id': '<s3_access_key_id>',
    's3_secret_access_key': '<s3_secret_access_key>',
    's3_region': '<s3_region>',
    's3_bucket': '<s3_bucket>',
    # Example value: r'--oem 3 --psm 4 -l por'
    'tesseract_config': '<tesseract_config>',
    'document_cnh': '<document_cnh>',
    'document_escritura': '<document_escritura>',
}

In [None]:
# Open a client against the Kubeflow Pipelines API, authenticating with the
# existing token, then submit the compiled package as a new run.
tekton_client = kfp_tekton.TektonClient(
    host           = kubeflow_host,
    existing_token = kubeflow_token
)
tekton_client.create_run_from_pipeline_package(
    pipeline_file = pipeline_package_path,
    arguments     = pipeline_arguments
)