In [1]:
import sys

In [2]:
import applyllm
# Report which applyllm release is installed in the notebook kernel.
print("applyllm version:", applyllm.__version__)

applyllm version: 0.0.3


In [3]:
# !{sys.executable} -m pip install --upgrade --user kfp==1.8.22

In [12]:
import kfp
from kfp import dsl
from functools import partial
from kfp.dsl import (
    pipeline,
    ContainerOp
)
from kfp.components import (
    InputPath,
    OutputPath,
    create_component_from_func
)
# Experiment identity as shown in the Kubeflow web UI.
EXPERIMENT_NAME = "llm"
EXPERIMENT_DESC = "llm experiment"

# Connect with the default client settings and look up the namespace of the
# current user; the namespace is reused when the run is submitted.
client = kfp.Client()
NAMESPACE = client.get_user_namespace()

print(NAMESPACE)

kubeflow-kindfor


In [15]:
from dataclasses import dataclass

@dataclass
class Settings:
    """Container images and pinned package versions for pipeline components.

    Every field has a default, so ``Settings()`` yields the full configuration.
    Field order is part of the positional constructor signature.
    """

    # CUDA "devel" flavour of the PyTorch image.
    llm_base_image: str = "pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel"
    # A PyTorch runtime image is used instead of a slim Python image
    # ('python:3.10.13-slim-bullseye') to speed up pip install, because
    # applyllm pulls in a large number of dependencies.
    # TODO: separate applyllm into an applyllm-io package and an applyllm package
    s3_base_image: str = "pytorch/pytorch:2.2.0-cuda11.8-cudnn8-runtime"

    # Pinned package versions, interpolated into `packages_to_install` lists.
    applyllm_version: str = "0.0.3"
    pypdf_version: str = "3.15.5"
    accelerate_version: str = "0.26.1"
    unstructured_version: str = "0.11.0"
    sentence_transformers_version: str = "2.2.2"
    docarray_version: str = "0.39.1"
    pydantic_version: str = "1.10.13"
    boto3_version: str = "1.34.14"
    pandas_version: str = "2.2.1"
    tabula_py_version: str = "2.9.0"


# Shared instance read by the component definitions in this notebook.
settings = Settings()

In [14]:
import os
# Directory that receives the generated component YAML files.
PIPELINE_PATH_DIR = "./compiled"
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test. It also fails
# loudly if the path exists but is not a directory.
os.makedirs(PIPELINE_PATH_DIR, exist_ok=True)

## Check Tabula in s3 pdf files

In [5]:
from functools import partial

# NOTE(review): `pypdf` is imported inside the component but is not listed in
# `packages_to_install`; presumably it arrives as a dependency of applyllm or
# is expected in the base image - confirm.
@partial(
    create_component_from_func,
    output_component_file=f"{PIPELINE_PATH_DIR}/s3_pdf_tabula_check_component.yaml",
    base_image=settings.s3_base_image,
    packages_to_install=[
        f"{package}=={version}"
        for package, version in (
            ("applyllm", settings.applyllm_version),
            ("boto3", settings.boto3_version),
            ("pandas", settings.pandas_version),
            ("tabula-py", settings.tabula_py_version),
        )
    ],
)
def file_comp():
    """Import the S3/PDF/tabula libraries inside the component container.

    The body currently contains only import statements and returns nothing.
    """
    # Imports live inside the function so that they execute in the container.
    import boto3
    import os
    import pandas as pd
    import tabula
    from io import BytesIO
    from pypdf import PdfReader

    

In [6]:
def pod_resource_transformer(op: ContainerOp, mem_req="200Mi", cpu_req="2000m", mem_lim="4000Mi", cpu_lim='4000m'):
    """
    Apply memory/CPU requests and limits to a container operator.

    Units follow the setter conventions, e.g.
    op.set_memory_limit('1000Mi') = 1GB
    op.set_cpu_limit('1000m') = 1 cpu core

    @param op: operator to configure
    @param mem_req: memory request
    @param cpu_req: cpu request
    @param mem_lim: memory limit
    @param cpu_lim: cpu limit
    @return: the value returned by the last setter in the chain
    """
    # Each setter is invoked on the result of the previous one (fluent chain):
    # memory request -> memory limit -> cpu request -> cpu limit.
    configured = op.set_memory_request(mem_req)
    configured = configured.set_memory_limit(mem_lim)
    configured = configured.set_cpu_request(cpu_req)
    configured = configured.set_cpu_limit(cpu_lim)
    return configured

In [7]:
@pipeline(
    name=EXPERIMENT_NAME,
    description=EXPERIMENT_DESC,
)
def custom_pipeline(epochs: int):
    """Single-step pipeline that runs the ``file_comp`` component.

    The step receives explicit memory/CPU requests and has artifact caching
    disabled.

    Args:
        epochs: Not used by the pipeline body; kept so the existing pipeline
            signature stays unchanged.
    """
    # Cache staleness settings: "P0D" disables caching, whereas "P1D" would
    # allow cached results that are up to one day old.
    no_artifact_cache = "P0D"
    cache_setting = no_artifact_cache

    # `file_comp` is the only component defined in this notebook. The previous
    # call to `custom_comp()` referenced a name that is defined nowhere here
    # and raised NameError on a fresh kernel.
    custom_task = file_comp()
    # Request 500Mi of memory and 200m CPU; the limits keep the defaults of
    # pod_resource_transformer.
    custom_task = pod_resource_transformer(custom_task, mem_req="500Mi", cpu_req="200m")
    custom_task.execution_options.caching_strategy.max_cache_staleness = cache_setting
    # NOTE(review): this display name does not describe the tabula/PDF check
    # step - confirm whether it should be renamed.
    custom_task.set_display_name("install github packages")

In [8]:
# Base name (without extension) of the compiled pipeline package.
PIPE_LINE_FILE_NAME = "github_package_pipeline"
# Compile the pipeline function into a YAML package; the relative path places
# it in the current working directory.
kfp.compiler.Compiler().compile(
    pipeline_func=custom_pipeline,
    package_path=PIPE_LINE_FILE_NAME + ".yaml",
)

In [9]:
from datetime import datetime
from pytz import timezone as ptimezone

def get_local_time_str(target_tz_str: str = "Europe/Berlin", format_str: str = "%Y-%m-%d %H-%M-%S") -> str:
    """
    Return the current time in the requested timezone as a formatted string.

    This helper exists because the local timezone is misconfigured on the
    server, so the time is computed for an explicitly named timezone instead.

    @param target_tz_str: name of the target timezone, default "Europe/Berlin"
    @param format_str: strftime pattern; the default "%Y-%m-%d %H-%M-%S"
                       returns e.g. 2022-07-07 12-08-45
    @return: the formatted timestamp
    """
    # Timezone lookup via pytz; on Python 3.9+ the standard-library ZoneInfo
    # could be used instead.
    now_in_zone = datetime.now(ptimezone(target_tz_str))
    return now_in_zone.strftime(format_str)

In [10]:
# from kubernetes import client as k8s_client
# Arguments handed to the pipeline function when the run is submitted.
# NOTE(review): this is empty although `custom_pipeline` declares an `epochs`
# parameter without a default - confirm that this is intended.
pipeline_args = dict()

# Pipeline-level configuration passed along with the run.
pipeline_config = dsl.PipelineConf()

# Alternative settings kept for reference (the first one needs the k8s_client import):
# pipeline_config.set_image_pull_secrets([k8s_client.V1ObjectReference(name=K8_GIT_SECRET_NAME, namespace=NAME_SPACE)])
# pipeline_config.set_image_pull_policy("Always")
pipeline_config.set_image_pull_policy("IfNotPresent")

In [11]:
# Timestamp the run name so that repeated submissions stay distinguishable.
RUN_NAME = "custom package pipeline " + get_local_time_str()

# Submit the pipeline function directly, reusing the `client` created at the
# top of the notebook instead of opening a new connection.
run = client.create_run_from_pipeline_func(
    pipeline_func=custom_pipeline,
    arguments=pipeline_args,
    run_name=RUN_NAME,
    experiment_name=EXPERIMENT_NAME,
    namespace=NAMESPACE,
    pipeline_conf=pipeline_config,
)

# Display the run handle (contains the run id) as the cell output.
run

RunPipelineResult(run_id=4bd91393-817e-461f-a9f5-f3bfbe049068)