In [None]:
!pip install --upgrade kfp[kubernetes]

In [None]:
from kfp          import dsl, kubernetes
from kfp.client   import Client
from kfp.compiler import Compiler
from kfp.dsl      import component, pipeline

In [None]:
# Container images used as `base_image` by the pipeline components below.
# Values wrapped in <...> are placeholders to fill in before compiling.

# Plain Python 3.11 image (used by the placeholder step).
IMAGE_BASE    = "registry.access.redhat.com/ubi9/python-311"

# Images for the steps that talk to S3 and render the final PDF.
IMAGE_BOTO3   = "<task_image_boto3>"
IMAGE_TO_PDF  = "<task_image_to_pdf>"

# Image for the PDF-to-Markdown extraction step.
IMAGE_DOCLING = "<task_image_docling>"

# Image for the LLM-backed steps.
IMAGE_LLM     = "<task_image_llm>"

# NOTE(review): not referenced by any component visible in this notebook.
IMAGE_AGENT   = "<task_image_agent>"

In [None]:
@component(
    base_image = IMAGE_BOTO3
)
def download_json(
    s3_endpoint_url      : str,
    s3_access_key_id     : str,
    s3_secret_access_key : str,
    s3_region_name       : str,
    s3_bucket            : str,
    s3_filename          : str,
    pvc_filename         : str,
):
    """Download one object from S3 and store it at ``pvc_filename``.

    Args:
        s3_endpoint_url: Endpoint URL handed to the boto3 S3 client.
        s3_access_key_id: Access key id handed to the boto3 S3 client.
        s3_secret_access_key: Secret access key handed to the boto3 S3 client.
        s3_region_name: Region name handed to the boto3 S3 client.
        s3_bucket: Bucket that holds the object.
        s3_filename: Key of the object to download.
        pvc_filename: Local path the object is written to; its parent
            directory is created when missing.
    """

    from boto3   import client
    from os      import makedirs
    from os.path import dirname

    # dirname() yields '' for a bare file name and makedirs('') raises
    # FileNotFoundError, so only create the directory when there is one.
    target_directory = dirname(pvc_filename)

    if target_directory:

        makedirs(target_directory, exist_ok = True)

    s3_client = client(
        service_name          = 's3',
        endpoint_url          = s3_endpoint_url,
        aws_access_key_id     = s3_access_key_id,
        aws_secret_access_key = s3_secret_access_key,
        region_name           = s3_region_name
    )

    s3_client.download_file(s3_bucket, s3_filename, pvc_filename)

In [None]:
@component(
    base_image = IMAGE_DOCLING
)
def extract_content(
    pvc_filename : str
):
    """Convert the PDF attachments embedded in a JSON file into one Markdown file.

    The JSON at ``pvc_filename`` is read, every selected attachment is
    base64-decoded to a ``.pdf`` file next to it, converted with docling, and
    the concatenated Markdown is written to the same path with the extension
    replaced by ``.md``.

    Args:
        pvc_filename: Path of the JSON file. It must contain a ``PRODETALHEs``
            list whose items hold an ``ANEXOs`` list; each attachment is read
            through its ``extArquivo``, ``idTipoDoc`` and ``CONTEUDO`` keys.
    """

    from base64                     import b64decode
    from docling.document_converter import DocumentConverter
    from json                       import load
    from os.path                    import basename, dirname, join, splitext

    # Attachments with this document type are skipped.
    # NOTE(review): the meaning of type 437 is not visible here — confirm.
    skipped_doc_type = 437

    # Path of the input file without its extension; every generated file is
    # derived from it. A separate name is used so the imported basename()
    # function is not overwritten.
    filename_base   = join(dirname(pvc_filename), splitext(basename(pvc_filename))[0])
    result_filename = filename_base + '.md'

    # Explicit UTF-8 so the text does not depend on the container locale.
    with open(pvc_filename, encoding = 'utf-8') as file:

        data = load(file)

    # One converter is reused for all attachments instead of building a new
    # one per file.
    converter = DocumentConverter()
    sections  = []

    for index0, detalhe in enumerate(data['PRODETALHEs'], start = 1):

        for index1, anexo in enumerate(detalhe['ANEXOs'], start = 1):

            # Only PDF attachments of a non-skipped type are processed.
            if anexo['extArquivo'] != '.pdf' or anexo['idTipoDoc'] == skipped_doc_type:

                continue

            doc_type = anexo['idTipoDoc']
            filename = f'{filename_base}_{index0}_{index1}_{doc_type}.pdf'
            print(f'Processando arquivo {filename}')

            with open(filename, 'wb') as file:

                file.write(b64decode(anexo['CONTEUDO']))

            sections.append(converter.convert(filename).document.export_to_markdown() + '\n\n')

    result = ''.join(sections)

    print(f'Resultado:\n\n{result}')

    with open(result_filename, 'w', encoding = 'utf-8') as file:

        file.write(result)

In [None]:
@component(
    base_image = IMAGE_LLM
)
def generate_summary(
    pvc_filename   : str,
    llm_api_base   : str,
    llm_api_key    : str,
    llm_model_name : str,
):
    """Summarise the extracted Markdown with an LLM and save the summary.

    Reads ``<pvc_filename without extension>.md``, sends it to the model with
    a system prompt asking for a three-topic summary, prints the answer and
    writes it to ``<pvc_filename without extension>_resumo.txt``.

    Args:
        pvc_filename: Path whose extension-less form is the base name of the
            Markdown input and the summary output.
        llm_api_base: Base URL of the LLM endpoint; ``/v1`` is appended.
        llm_api_key: API key passed to the LLM client.
        llm_model_name: Model name passed to the LLM client.
    """

    from langchain_core.messages       import trim_messages
    from langchain_core.messages.utils import count_tokens_approximately
    from langchain.schema              import SystemMessage, HumanMessage
    from langchain_community.llms      import VLLMOpenAI
    from os.path                       import basename, dirname, join, splitext

    filename_base = splitext(basename(pvc_filename))[0]
    filename_base = join(dirname(pvc_filename), filename_base)

    filename_md      = filename_base + '.md'
    filename_summary = filename_base + '_resumo.txt'

    # Explicit UTF-8 so the text does not depend on the container locale.
    with open(filename_md, 'r', encoding = 'utf-8') as file:

        content = file.read()

    llm = VLLMOpenAI(
        # A trailing slash on the base URL would otherwise produce '//v1'.
        openai_api_base = f"{llm_api_base.rstrip('/')}/v1",
        openai_api_key  = llm_api_key,
        model_name      = llm_model_name,
        max_tokens      = 2048
    )

    messages = [
        SystemMessage(content = '''
            Você é um assistente especialista em resumir documentos de infrações ambientais.
            Seu objetivo é resumir o contexto, extraindo informações importantes e dividindo nos seguintes tópicos:

            1 - Quem cometeu a infração ambiental
            2 - Do que se trata a infração ambiental
            3 - O que alega a defesa do infrator

            Foque nos pontos importantes e ignore pequenos detalhes.
        '''),
        HumanMessage(content = content)
    ]

    # allow_partial must be True: the whole document is a single message, so
    # with allow_partial = False an oversized document is dropped entirely and
    # the model would be called with the system prompt alone. With partial
    # trimming and strategy 'last' the trailing part of the document is kept.
    messages = trim_messages(
        messages,
        strategy       = 'last',
        token_counter  = count_tokens_approximately,
        max_tokens     = 88000,
        include_system = True,
        allow_partial  = True,
    )

    response = llm.invoke(messages)
    print(response)

    with open(filename_summary, 'w', encoding = 'utf-8') as file:

        file.write(response)

In [None]:
@component(
    base_image = IMAGE_BASE
)
def similarity_search():
    """Placeholder step: takes no inputs, performs no work and produces no outputs."""

In [None]:
@component(
    base_image = IMAGE_LLM
)
def validate_inspection(
    pvc_filename   : str,
    llm_api_base   : str,
    llm_api_key    : str,
    llm_model_name : str,
):
    """Ask an LLM whether the extracted document reports an on-site inspection.

    Reads ``<pvc_filename without extension>.md``, sends it to the model with
    a system prompt about inspections and prints the answer. The answer is
    only printed; nothing is written to disk or returned.

    Args:
        pvc_filename: Path whose extension-less form is the base name of the
            Markdown input.
        llm_api_base: Base URL of the LLM endpoint; ``/v1`` is appended.
        llm_api_key: API key passed to the LLM client.
        llm_model_name: Model name passed to the LLM client.
    """

    from langchain_core.messages       import trim_messages
    from langchain_core.messages.utils import count_tokens_approximately
    from langchain.schema              import SystemMessage, HumanMessage
    from langchain_community.llms      import VLLMOpenAI
    from os.path                       import basename, dirname, join, splitext

    filename_base = splitext(basename(pvc_filename))[0]
    filename_base = join(dirname(pvc_filename), filename_base)
    filename_md   = filename_base + '.md'

    # Explicit UTF-8 so the text does not depend on the container locale.
    with open(filename_md, 'r', encoding = 'utf-8') as file:

        content = file.read()

    llm = VLLMOpenAI(
        # A trailing slash on the base URL would otherwise produce '//v1'.
        openai_api_base = f"{llm_api_base.rstrip('/')}/v1",
        openai_api_key  = llm_api_key,
        model_name      = llm_model_name,
        max_tokens      = 128
    )

    messages = [
        SystemMessage(content = '''
            Você é um assistente especialista em extrair informações de infrações ambientais.
            Seu objetivo é buscar no contexto se foi realizado vistoria no local da infração.
            Caso tenha sido realizado a vistoria, apresentar essa informação.
            Caso você não encontre informações sobre vistoria, informar que é necessário realizar vistoria no local.
            Seja objetivo, se é necessário ou não realizar vistoria.
        '''),
        HumanMessage(content = content)
    ]

    # allow_partial must be True: the whole document is a single message, so
    # with allow_partial = False an oversized document is dropped entirely and
    # the model would be called with the system prompt alone. With partial
    # trimming and strategy 'last' the trailing part of the document is kept.
    messages = trim_messages(
        messages,
        strategy       = 'last',
        token_counter  = count_tokens_approximately,
        max_tokens     = 88000,
        include_system = True,
        allow_partial  = True,
    )

    response = llm.invoke(messages)
    print(response)

In [None]:
@component(
    base_image = IMAGE_LLM
)
def generate_output(
    pvc_filename   : str,
    llm_api_base   : str,
    llm_api_key    : str,
    llm_model_name : str,
):
    """Generate a legal opinion from the saved summary with an LLM.

    Reads ``<pvc_filename without extension>_resumo.txt``, embeds it in a
    prompt asking for a legal opinion, prints the answer and writes it to
    ``<pvc_filename without extension>_parecer.txt``.

    Args:
        pvc_filename: Path whose extension-less form is the base name of the
            summary input and the opinion output.
        llm_api_base: Base URL of the LLM endpoint; ``/v1`` is appended.
        llm_api_key: API key passed to the LLM client.
        llm_model_name: Model name passed to the LLM client.
    """

    from langchain_community.llms import VLLMOpenAI
    from os.path                  import basename, dirname, join, splitext

    filename_base = splitext(basename(pvc_filename))[0]
    filename_base = join(dirname(pvc_filename), filename_base)

    filename_summary = filename_base + '_resumo.txt'
    filename_parecer = filename_base + '_parecer.txt'

    # Explicit UTF-8 so the text does not depend on the container locale.
    with open(filename_summary, 'r', encoding = 'utf-8') as file:

        summary = file.read()

    llm = VLLMOpenAI(
        # A trailing slash on the base URL would otherwise produce '//v1'.
        openai_api_base = f"{llm_api_base.rstrip('/')}/v1",
        openai_api_key  = llm_api_key,
        model_name      = llm_model_name,
        max_tokens      = 1024
    )

    message = f'''
        {summary}

        Baseando-se em leis ambientais, crie um parecer jurídico para o texto acima.
        Seu parecer deve levar em conta a infração, a defesa apresentada e leis ambientais aplicáveis.
        Seja claro e objetivo no seu parecer se é favorável ou desfavorável à alegação da defesa.
        Não utilize mais do que 200 palavras.

        Parecer:
    '''

    response = llm.invoke(message)
    print(response)

    with open(filename_parecer, 'w', encoding = 'utf-8') as file:

        file.write(response)

In [None]:
@component(
    base_image = IMAGE_TO_PDF
)
def generate_output_pdf(
    s3_endpoint_url      : str,
    s3_access_key_id     : str,
    s3_secret_access_key : str,
    s3_region_name       : str,
    s3_bucket            : str,
    s3_filename          : str,
    pvc_filename         : str,
):
    """Render the summary and the legal opinion into a PDF and upload it to S3.

    Reads ``<base>_resumo.txt`` and ``<base>_parecer.txt`` (``<base>`` being
    ``pvc_filename`` without its extension), writes ``<base>_parecer.pdf`` with
    a "Resumo" and a "Parecer" section, and uploads it to ``s3_bucket`` under
    the key ``<s3_filename without extension>_parecer.pdf``.

    Args:
        s3_endpoint_url: Endpoint URL handed to the boto3 S3 client.
        s3_access_key_id: Access key id handed to the boto3 S3 client.
        s3_secret_access_key: Secret access key handed to the boto3 S3 client.
        s3_region_name: Region name handed to the boto3 S3 client.
        s3_bucket: Bucket the PDF is uploaded to.
        s3_filename: Key of the original input object; the output key is
            derived from it.
        pvc_filename: Local path whose extension-less form is the base name
            of the text inputs and the PDF output.
    """

    from boto3   import client
    from fpdf    import FPDF
    from os.path import basename, dirname, join, splitext

    filename_base = splitext(basename(pvc_filename))[0]
    filename_base = join(dirname(pvc_filename), filename_base)

    filename_summary     = filename_base + '_resumo.txt'
    filename_parecer     = filename_base + '_parecer.txt'
    filename_parecer_pdf = filename_base + '_parecer.pdf'

    # Explicit UTF-8 so the text does not depend on the container locale.
    with open(filename_summary, 'r', encoding = 'utf-8') as file:

        summary = file.readlines()

    with open(filename_parecer, 'r', encoding = 'utf-8') as file:

        parecer = file.readlines()

    pdf = FPDF()
    pdf.add_page()

    def latin1_safe(text):
        # The built-in helvetica font only covers Latin-1; characters outside
        # it (e.g. typographic quotes or dashes in LLM output) would make
        # fpdf raise, so they are replaced with '?'.
        return text.encode('latin-1', 'replace').decode('latin-1')

    def write_section(title, lines):
        # Bold 12pt heading, blank line, then the body lines in regular 10pt.
        pdf.set_font(family = 'helvetica', style = 'B', size = 12)
        pdf.cell(text = title)
        pdf.ln()
        pdf.ln()
        pdf.set_font(family = 'helvetica', style = '', size = 10)

        for line in lines:

            # Return to the left margin after every cell: with the default
            # new_x the cursor stays at the right edge and the next
            # full-width multi_cell has no horizontal space left.
            pdf.multi_cell(
                w     = 0,
                text  = latin1_safe(line),
                new_x = 'LMARGIN',
                new_y = 'NEXT'
            )

    write_section('Resumo', summary)

    pdf.ln()
    pdf.ln()

    write_section('Parecer', parecer)

    pdf.output(filename_parecer_pdf)

    # Output key: same location as the input key, extension replaced.
    s3_filename_output = splitext(basename(s3_filename))[0]
    s3_filename_output = join(dirname(s3_filename), s3_filename_output) + '_parecer.pdf'

    s3_client = client(
        service_name          = 's3',
        endpoint_url          = s3_endpoint_url,
        aws_access_key_id     = s3_access_key_id,
        aws_secret_access_key = s3_secret_access_key,
        region_name           = s3_region_name
    )

    s3_client.upload_file(filename_parecer_pdf, s3_bucket, s3_filename_output)

In [None]:
# Name and description given to the pipeline decorator.
PIPELINE_NAME              = "Gerar Parecer"
PIPELINE_DESCRIPTION       = "Utilizando IA para gerar parecer jurídico de infrações ambientais"

# File the compiled pipeline is written to and later submitted from.
PIPELINE_YAML              = "pipeline.yaml"

# Persistent volume claim mounted into every task (placeholder value).
PIPELINE_PVC_NAME          = "<pipeline_pvc_name>"

# NOTE(review): not referenced anywhere else in the visible notebook.
PIPELINE_PVC_STORAGE_CLASS = "<pipeline_pvc_storage_class>"

In [None]:
# The decorator is referenced as dsl.pipeline because the function below is
# itself named `pipeline`: once this cell has run, the bare name no longer
# refers to the decorator, and re-running the cell with `@pipeline` would fail.
@dsl.pipeline(
    name        = PIPELINE_NAME,
    description = PIPELINE_DESCRIPTION
)
def pipeline(
    s3_endpoint_url      : str,
    s3_access_key_id     : str,
    s3_secret_access_key : str,
    s3_region_name       : str,
    s3_bucket            : str,
    s3_filename          : str,
    pvc_filename         : str,
    llm_api_base         : str,
    llm_api_key          : str,
    llm_model_name       : str,
):
    """Chain the components into a strictly sequential pipeline.

    Order: download JSON, extract content, generate summary, similarity
    search, validate inspection, generate opinion, generate PDF. Every task
    mounts the PVC named by ``PIPELINE_PVC_NAME`` at ``/pipeline``, has
    caching disabled, and runs after the previous task.

    Args:
        s3_endpoint_url: Forwarded to the S3 download and upload tasks.
        s3_access_key_id: Forwarded to the S3 download and upload tasks.
        s3_secret_access_key: Forwarded to the S3 download and upload tasks.
        s3_region_name: Forwarded to the S3 download and upload tasks.
        s3_bucket: Forwarded to the S3 download and upload tasks.
        s3_filename: Forwarded to the S3 download and upload tasks.
        pvc_filename: File path forwarded to every task that reads or writes
            files.
        llm_api_base: Forwarded to the LLM-backed tasks.
        llm_api_key: Forwarded to the LLM-backed tasks.
        llm_model_name: Forwarded to the LLM-backed tasks.
    """

    # NOTE(review): the S3 and LLM credentials are ordinary string pipeline
    # parameters here; consider supplying them from a Kubernetes secret.

    from os.path import join

    # PVC

    pvc_name      = PIPELINE_PVC_NAME
    pvc_directory = join('/', 'pipeline')

    def configure_task(task, display_name, previous_task = None):
        # Settings shared by every task: PVC mount, display name, caching
        # disabled and, when given, ordering after the previous task.
        kubernetes.mount_pvc(
            task       = task,
            pvc_name   = pvc_name,
            mount_path = pvc_directory
        )

        task.set_display_name(display_name)
        task.set_caching_options(False)

        if previous_task is not None:

            task.after(previous_task)

        return task

    # Task Download JSON

    download_json_task = configure_task(
        download_json(
            s3_endpoint_url      = s3_endpoint_url,
            s3_access_key_id     = s3_access_key_id,
            s3_secret_access_key = s3_secret_access_key,
            s3_region_name       = s3_region_name,
            s3_bucket            = s3_bucket,
            s3_filename          = s3_filename,
            pvc_filename         = pvc_filename
        ),
        'Carregando JSON'
    )

    # Task Extract Content

    extract_content_task = configure_task(
        extract_content(
            pvc_filename = pvc_filename
        ),
        'Extraindo Conteúdo',
        download_json_task
    )

    # Task Generate Summary

    generate_summary_task = configure_task(
        generate_summary(
            pvc_filename   = pvc_filename,
            llm_api_base   = llm_api_base,
            llm_api_key    = llm_api_key,
            llm_model_name = llm_model_name
        ),
        'Criando Resumo',
        extract_content_task
    )

    # Task Similarity Search

    similarity_search_task = configure_task(
        similarity_search(),
        'Busca por Similaridade',
        generate_summary_task
    )

    # Task Validate Inspection

    validate_inspection_task = configure_task(
        validate_inspection(
            pvc_filename   = pvc_filename,
            llm_api_base   = llm_api_base,
            llm_api_key    = llm_api_key,
            llm_model_name = llm_model_name
        ),
        'Validar Vistoria',
        similarity_search_task
    )

    # Task Generate Output

    generate_output_task = configure_task(
        generate_output(
            pvc_filename   = pvc_filename,
            llm_api_base   = llm_api_base,
            llm_api_key    = llm_api_key,
            llm_model_name = llm_model_name
        ),
        'Criando Parecer',
        validate_inspection_task
    )

    # Task Generate Output PDF

    configure_task(
        generate_output_pdf(
            s3_endpoint_url      = s3_endpoint_url,
            s3_access_key_id     = s3_access_key_id,
            s3_secret_access_key = s3_secret_access_key,
            s3_region_name       = s3_region_name,
            s3_bucket            = s3_bucket,
            s3_filename          = s3_filename,
            pvc_filename         = pvc_filename
        ),
        'Criando PDF',
        generate_output_task
    )

In [None]:
# Compile the pipeline function into the YAML package file named by PIPELINE_YAML.
Compiler().compile(
    pipeline_func = pipeline,
    package_path  = PIPELINE_YAML
)

In [None]:
# Address of the Kubeflow Pipelines API the run is submitted to (placeholder).
KUBEFLOW_HOST = "<kubeflow_host>"

# Run arguments, one entry per pipeline parameter. All values are placeholders;
# avoid committing real credentials once they are filled in.
PIPELINE_ARGUMENTS = dict(
    s3_endpoint_url      = "<s3_endpoint_url>",
    s3_access_key_id     = "<s3_access_key_id>",
    s3_secret_access_key = "<s3_secret_access_key>",
    s3_region_name       = "<s3_region_name>",
    s3_bucket            = "<s3_bucket>",
    s3_filename          = "<s3_filename>",
    pvc_filename         = "<pvc_filename>",
    llm_api_base         = "<llm_api_base>",
    llm_api_key          = "<llm_api_key>",
    llm_model_name       = "<llm_model_name>",
)

In [None]:
# Connect to the Kubeflow Pipelines API and start a run from the compiled
# package; the call is left as the last expression so its result is displayed.
kfp_client = Client(host = KUBEFLOW_HOST)

kfp_client.create_run_from_pipeline_package(
    pipeline_file = PIPELINE_YAML,
    arguments     = PIPELINE_ARGUMENTS
)