### **index_document.ipynb**
### **Index a PDF document in Elasticsearch**

* ##### 01 - Install packages
* ##### 02 - Import packages
* ##### 03 - Create pipeline
* ##### 04 - Create pipeline yaml
* ##### 05 - Create pipeline run

### 01 - Install packages

In [None]:
from sys import executable

In [None]:
# Install into the kernel's own interpreter. Both arguments are quoted so that
# an interpreter path containing spaces stays a single word and the extras
# brackets in kfp[kubernetes] are not treated as a glob pattern by the shell.
!"{ executable }" -m pip install --upgrade "kfp[kubernetes]"

### 02 - Import packages

In [None]:
from kfp          import kubernetes
from kfp.client   import Client
from kfp.compiler import Compiler
from kfp.dsl      import pipeline
from os           import getcwd, pardir, path
from sys          import path as sys_path

sys_path.append(path.dirname(getcwd()))

from components.download_document import download_document
from components.index_document    import index_document

### 03 - Create pipeline

In [None]:
# Summary text and identifier handed to the @pipeline decorator; the name is
# also reused for the PVC mount directory and the compiled YAML file name.
pipeline_description = 'Index a PDF document in Elasticsearch'
pipeline_name        = 'index_document'

In [None]:
@pipeline(
    name        = pipeline_name,
    description = pipeline_description
)
def pipeline_func(
    pvc_name                           : str,
    pvc_storage_class                  : str,
    pvc_access_modes                   : list,
    pvc_size                           : str,
    s3_service_name                    : str,
    s3_endpoint_url                    : str,
    s3_access_key_id                   : str,
    s3_secret_access_key               : str,
    s3_region                          : str,
    s3_bucket                          : str,
    s3_file                            : str,
    elasticsearch_host                 : str,
    elasticsearch_username             : str,
    elasticsearch_password             : str,
    elasticsearch_document_index       : str,
    tensorflow_hub_embedding_model_url : str
) -> None:
    """Create a PVC, run download_document on it, then run index_document.

    Three tasks run in sequence: the PVC is created, ``download_document``
    runs with the PVC mounted, and ``index_document`` runs with the same PVC
    mounted at the same directory, receiving the output of the download task
    as its ``pvc_file`` argument.

    Args:
        pvc_name: Name of the PVC that is created and mounted into both component tasks.
        pvc_storage_class: Storage class name used when creating the PVC.
        pvc_access_modes: Access modes used when creating the PVC.
        pvc_size: Size requested when creating the PVC.
        s3_service_name: Forwarded unchanged to ``download_document``.
        s3_endpoint_url: Forwarded unchanged to ``download_document``.
        s3_access_key_id: Forwarded unchanged to ``download_document``.
        s3_secret_access_key: Forwarded unchanged to ``download_document``.
        s3_region: Forwarded unchanged to ``download_document``.
        s3_bucket: Forwarded unchanged to ``download_document``.
        s3_file: Forwarded unchanged to ``download_document``.
        elasticsearch_host: Forwarded unchanged to ``index_document``.
        elasticsearch_username: Forwarded unchanged to ``index_document``.
        elasticsearch_password: Forwarded unchanged to ``index_document``.
        elasticsearch_document_index: Forwarded unchanged to ``index_document``.
        tensorflow_hub_embedding_model_url: Forwarded unchanged to ``index_document``.
    """

    # NOTE(review): s3_secret_access_key and elasticsearch_password are plain
    # pipeline parameters, so they are likely stored with the run arguments;
    # consider Kubernetes secrets (kubernetes.use_secret_as_env) - confirm.

    # NOTE(review): no task deletes the PVC created below; if it is not meant
    # to outlive the run, add kubernetes.DeletePVC(...).after(index_document_task).

    # TASK Create PVC

    # Mount path inside the task containers. Written as a literal POSIX path
    # rather than with os.path.join so the compiled YAML does not depend on the
    # path separator of the machine that compiles the pipeline.
    pvc_directory = f'/pipeline/{ pipeline_name }'

    create_pvc_task = kubernetes.CreatePVC(
        pvc_name           = pvc_name,
        storage_class_name = pvc_storage_class,
        access_modes       = pvc_access_modes,
        size               = pvc_size
    )

    # TASK Download document

    download_document_task = download_document(
        s3_service_name      = s3_service_name,
        s3_endpoint_url      = s3_endpoint_url,
        s3_access_key_id     = s3_access_key_id,
        s3_secret_access_key = s3_secret_access_key,
        s3_region            = s3_region,
        s3_bucket            = s3_bucket,
        s3_file              = s3_file,
        pvc_directory        = pvc_directory
    )

    # The PVC is referenced by its name parameter, not by an output of
    # create_pvc_task, so the ordering has to be declared explicitly.
    download_document_task.after(create_pvc_task)

    kubernetes.mount_pvc(
        task       = download_document_task,
        pvc_name   = pvc_name,
        mount_path = pvc_directory
    )

    pvc_file = download_document_task.output

    # TASK Index document

    index_document_task = index_document(
        elasticsearch_host                 = elasticsearch_host,
        elasticsearch_username             = elasticsearch_username,
        elasticsearch_password             = elasticsearch_password,
        elasticsearch_document_index       = elasticsearch_document_index,
        tensorflow_hub_embedding_model_url = tensorflow_hub_embedding_model_url,
        pvc_file                           = pvc_file
    )

    # pvc_file already links the two tasks through data; the explicit .after()
    # keeps the intended ordering visible next to the volume hand-off.
    index_document_task.after(download_document_task)

    kubernetes.mount_pvc(
        task       = index_document_task,
        pvc_name   = pvc_name,
        mount_path = pvc_directory
    )

### 04 - Create pipeline yaml

In [None]:
# The compiled pipeline goes to ../../app_pipeline/pipeline_yaml/<pipeline_name>.yaml,
# a relative path, so it resolves against the notebook's working directory.
pipeline_yaml_directory = path.join(pardir, pardir, 'app_pipeline', 'pipeline_yaml')
pipeline_yaml           = path.join(pipeline_yaml_directory, f'{ pipeline_name }.yaml')

In [None]:
# Compile pipeline_func into the YAML package located at pipeline_yaml.
compiler = Compiler()
compiler.compile(pipeline_func = pipeline_func, package_path = pipeline_yaml)

### 05 - Create pipeline run

In [None]:
# Run arguments, one entry per pipeline_func parameter. Every '<...>' value is
# a placeholder to be replaced before the run is created.
pipeline_arguments = dict(
    pvc_name                           = '<pvc_name>',
    pvc_storage_class                  = '<pvc_storage_class>',
    pvc_access_modes                   = ['<pvc_access_modes>'],
    pvc_size                           = '<pvc_size>',
    s3_service_name                    = '<s3_service_name>',
    s3_endpoint_url                    = '<s3_endpoint_url>',
    s3_access_key_id                   = '<s3_access_key_id>',
    s3_secret_access_key               = '<s3_secret_access_key>',
    s3_region                          = '<s3_region>',
    s3_bucket                          = '<s3_bucket>',
    s3_file                            = '<s3_file>',
    elasticsearch_host                 = '<elasticsearch_host>',
    elasticsearch_username             = '<elasticsearch_username>',
    elasticsearch_password             = '<elasticsearch_password>',
    elasticsearch_document_index       = '<elasticsearch_document_index>',
    tensorflow_hub_embedding_model_url = '<tensorflow_hub_embedding_model_url>'
)

In [None]:
# Placeholder - replace with the host value that is passed to Client(host = ...).
kubeflow_host = '<kubeflow_host>'

In [None]:
# Submit the compiled YAML package as a run on the configured Kubeflow host.
# The call stays the cell's last expression so its result is still displayed.
kubeflow_client = Client(host = kubeflow_host)
kubeflow_client.create_run_from_pipeline_package(
    pipeline_file = pipeline_yaml,
    arguments     = pipeline_arguments
)