In [1]:
import sys

In [2]:
# !{sys.executable} -m pip install --upgrade --user kfp==1.8.22

In [3]:
import kfp
from kfp import dsl
from functools import partial
from kfp.dsl import (
    pipeline,
    ContainerOp,
    PipelineVolume
)
from kfp.components import (
    # InputPath,
    # OutputPath,
    create_component_from_func
)

EXPERIMENT_NAME = 'llm' # Name of the experiment in the KF webapp UI
EXPERIMENT_DESC = 'llm med report pipeline experiment'

In [4]:
from dataclasses import dataclass

@dataclass
class Settings:
    """Container images and pinned package versions used by the pipeline components."""

    # Image for the LLM component (CUDA "devel" flavour).
    llm_base_image: str = 'pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel'

    # Image for the S3 listing component.
    # Formerly 'python:3.10.13-slim-bullseye'; a PyTorch runtime image is used instead to
    # speed up the pip install step, because applyllm pulls in many dependencies.
    # TODO: separate applyllm-io and applyllm into two packages
    s3_base_image: str = 'pytorch/pytorch:2.2.0-cuda11.8-cudnn8-runtime'

    # Package versions interpolated into the components' packages_to_install lists.
    applyllm_version: str = '0.0.3'
    pypdf_version: str = '3.15.5'
    accelerate_version: str = '0.26.1'
    unstructured_version: str = '0.11.0'
    sentence_transformers_version: str = '2.2.2'
    docarray_version: str = '0.39.1'
    pydantic_version: str = '1.10.13'
    boto3_version: str = '1.34.14'

settings = Settings()

In [5]:
import os
# Directory that receives the generated component and pipeline YAML files.
PIPELINE_PATH_DIR = "./compiled"
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(PIPELINE_PATH_DIR, exist_ok=True)

#### Load S3 text data objects with prefix and count limits

In [6]:
@partial(
    create_component_from_func,
    output_component_file=f"{PIPELINE_PATH_DIR}/s3_text_report_load_component.yaml",
    base_image=settings.s3_base_image,
    packages_to_install=[
        f"applyllm=={settings.applyllm_version}",
        f"boto3=={settings.boto3_version}",
    ],
)
def file_op(bucket_name: str, s3_file_prefix: str, s3_file_max_count: int, verify_host: bool = True) -> list:
    """List the keys of the S3 objects in a bucket that match a prefix.

    Credentials and endpoint are read from the environment variables
    AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and S3_ENDPOINT.

    Args:
        bucket_name: Name of the bucket to list.
        s3_file_prefix: Prefix handed to S3BucketHelper as file_prefix.
        s3_file_max_count: Passed to get_object_keys as limit_count.
        verify_host: Forwarded to S3AccessConf as verify_host.

    Returns:
        The object keys yielded by S3BucketHelper.get_object_keys, as a list.
    """
    # Imports stay inside the function so the component body is self-contained.
    from os import environ
    from applyllm.io import S3AccessConf, S3BucketHelper

    access_conf = S3AccessConf(
        access_key_id=environ.get('AWS_ACCESS_KEY_ID'),
        secret_access_key=environ.get('AWS_SECRET_ACCESS_KEY'),
        endpoint=environ.get('S3_ENDPOINT'),
        bucket_name=bucket_name,
        verify_host=verify_host,
    )
    bucket_helper = S3BucketHelper(conf=access_conf, file_prefix=s3_file_prefix)
    matching_keys = bucket_helper.get_object_keys(limit_count=s3_file_max_count)
    return list(matching_keys)


#### Get the run ID and run name from within a Kubeflow pipeline component
* https://stackoverflow.com/questions/63473716/how-to-obtain-the-kubeflow-pipeline-run-name-from-within-a-component/67616896#67616896

In [7]:
@partial(
    create_component_from_func,
    output_component_file=f"{PIPELINE_PATH_DIR}/llm_entity_extraction_component.yaml",
    base_image=settings.llm_base_image, 
    packages_to_install=[
        f"applyllm=={settings.applyllm_version}",
        f"pypdf=={settings.pypdf_version}",
        f"accelerate=={settings.accelerate_version}",
        f"unstructured=={settings.unstructured_version}",
        f"sentence-transformers=={settings.sentence_transformers_version}", # for langchain vectorestore embedding model
        f"docarray=={settings.docarray_version}",
        f"pydantic=={settings.pydantic_version}", # must be a version less than 2.x
    ], # adding additional libs
    # pip_index_urls=["https://gitlab.lrz.de/api/v4/projects/150553/packages/pypi/simple"]
    # define my private pypi package registry v2 component decorator
)
def llm_op(model_root: str, 
           lm_model_type: str, 
           max_token_length: int, 
           max_position_embeddings: int,
           max_new_tokens: int,
           repetition_penalty: float,
           temperature: float,
           lm_device_map: str,
           top_k: int,
           top_p: float,
           bucket_name: str,
           # s3_file_prefix: str,
           # s3_file_key: str,
           s3_file_key_list: list,
           embed_model_vendor: str,
           retriever_k: int,
           user_query1: str,
           user_query1_parser_question: str,
           user_query1_target_var: str,
           user_query1_target_var_desc: str,
           user_query2: str,
           user_query2_vars: list,
           user_query2_parser_question: str,
           user_query2_target_var: str,
           user_query2_target_var_desc: str,
           sql_target_table: str,
           run_name: str,
           verbose: bool = False,
    ):
    """
    Args:
        model_root: The root directory of the model
        lm_model_type: The type of the language model
        max_token_length: The maximum token length 4096
        max_position_embeddings: The maximum position embeddings 3072
        max_new_tokens: The maximum new tokens to be generated 80
        repetition_penalty: The repetition penalty 1.15
        temperature: The temperature 0.001
        lm_device_map: The device map for the language model "auto"
        top_k: The top k value 3
        top_p: The top p value 0.8
        bucket_name: The name of the bucket "scivias-medreports"
        s3_file_key: The key of the s3 file "trans2en/KK-SCIVIAS-00003^0053360847^2018-09-28^KIIGAS.txt"
        s3_file_key_list: list, The key of the s3 files to be processed ["trans2en/KK-SCIVIAS-00003^0053360847^2018-09-28^KIIGAS.txt"]
        embed_model_vendor: The embedding model vendor "baai" or "sentence-transformers"
        retriever_k: The retriever k value 3
        user_query1: The user query "What is the name of the patient? (Remember to include 'The name of the patient is' in your answer.)"
        user_query1_parser_question: The user query parser question "retrieve one: patient name"
        user_query1_target_var: The user query target variable "patient_name"
        user_query1_target_var_desc: The user query target variable description "patient name"
        user_query2: "What is the age of the patient {patient_name}? (Remember to include 'The age of the patient is' in your answer.)"
        user_query2_vars: The user query 2 variables ["patient_name"]
        user_query2_parser_question: The user query 2 parser question "retrieve one: patient age as number"
        user_query2_target_var: The user query 2 target variable "patient_age"
        user_query2_target_var_desc: The user query 2 target variable description "patient age"
        sql_target_table: The target table for the sql "llm_med_report_info"
        run_name: The kubeflow pipeline run name "llm-med-report-pipeline"
        verbose: The verbose flag False
    """
    import os
    import applyllm as apl

    from applyllm.accelerators import (
        AcceleratorHelper,
        AcceleratorStatus,
        DirectorySetting,
        # DIR_MODE_MAP,
        TokenHelper as th
    )
    from applyllm.utils import time_func
    from applyllm.pipelines import (
        LocalCausalLMConfig,
        ModelConfig,
        ModelCatalog,
    )

    dir_setting = DirectorySetting(home_dir=model_root)
    
    # debug code to check the mounted model_root, whether DirectorySetting is working
    # print([x[0] for x in os.walk(model_root)])

    gpu_status = AcceleratorStatus.create_accelerator_status()
    '''init gpu helper'''
    gpu_helper = AcceleratorHelper()
    UUIDs = gpu_helper.nvidia_device_uuids_filtered_by(is_mig=True, log_output=False)
    # will set the XDG_CACHE_HOME, this line must be called before import transformers
    gpu_helper.init_cuda_torch(UUIDs, dir_setting)
    print(os.environ["CUDA_VISIBLE_DEVICES"])
    print(os.environ["XDG_CACHE_HOME"])
    '''init llm model to be loaded'''
    # Short model-type aliases -> Hugging Face repository ids.
    model_map = {
        "llama7B-chat":     "meta-llama/Llama-2-7b-chat-hf",
        "llama13B-chat" :   "meta-llama/Llama-2-13b-chat-hf",
        "llama70B-chat" :   "meta-llama/Llama-2-70b-chat-hf",
        "mistral7B-01":     "mistralai/Mistral-7B-v0.1",
        "mistral7B-inst02": "mistralai/Mistral-7B-Instruct-v0.2",
        "mixtral8x7B-01":   "mistralai/Mixtral-8x7B-v0.1",
        "mixtral8x7B-inst01":   "mistralai/Mixtral-8x7B-Instruct-v0.1",
    }
    default_model_type = "mistral7B-inst02"
    model_type = lm_model_type
    if model_type not in model_map:
        # The previous fallback returned the alias itself ("mistral7B-inst02") as the
        # model name, which is not a repository id. Fall back to the alias *and* its
        # mapped repository so model_type and model_name stay consistent downstream.
        print(f"unknown lm_model_type {model_type!r}, falling back to {default_model_type!r}")
        model_type = default_model_type
    model_name = model_map[model_type]
    print(model_name)

    import transformers
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from torch import bfloat16
    
    print(f"applyllm version:     {apl.__version__}")
    print(f"transformers version: {transformers.__version__}")
    print(f"torch version:        {torch.__version__}")

    token_kwargs = th.gen_token_kwargs(model_type=model_type, dir_setting=dir_setting)
    
    """Load CausalLM model"""
    base_lm_config = ModelConfig(
        model_config = {
            "pretrained_model_name_or_path": model_name,
            "device_map": "auto",
            # "max_memory": f"{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB",
        }
    )

    lm_model_kwargs = {
        "quantized": True,
        "model_config": base_lm_config.get_config(),
        "quantization_config": {
            "quantization_config": transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=bfloat16
            )
        }
    }
    lm_config = LocalCausalLMConfig(**lm_model_kwargs)

    @time_func
    def fetch_lm_model():
        """Load the causal LM using the kwargs from lm_config plus token_kwargs."""
        lm_kwargs = lm_config.get_config()
        return AutoModelForCausalLM.from_pretrained(**lm_kwargs, **token_kwargs)

    model = fetch_lm_model()

    gpu_status.gpu_usage()
    """Load CausalLM tokenizer"""

    def config_tokenizer(model_name: str, config: dict, pad_token_id = 2):
        """Return config with "pad_token_id" added for Mistral-family models.

        For any model name that does not start with ModelCatalog.MISTRAL_FAMILY
        the given config object is returned unchanged (same object, not a copy).
        """
        if not model_name.startswith(ModelCatalog.MISTRAL_FAMILY):
            return config
        return {**config, "pad_token_id": pad_token_id}
        
    MAX_POSITION_EMBEDDINGS = max_position_embeddings
    MAX_LENGTH = max_token_length

    model_config= {
        "pretrained_model_name_or_path": model_name,
        "device": "cpu",
        # "device_map": "auto", # put to GPU if GPU is available
        "max_position_embeddings": MAX_LENGTH,
        "max_length": MAX_LENGTH,
    }
    model_config = config_tokenizer(model_name=model_name, config=model_config)
    tokenizer_config = ModelConfig(model_config=model_config)

    tokenizer = AutoTokenizer.from_pretrained(
        **tokenizer_config.get_config(), 
        **token_kwargs,
    )

    """init the transformer pipeline as backend llm for langchain"""
    tp_kwargs = {
        "task": "text-generation",
        "model": model,
        "tokenizer": tokenizer,
        "device_map": lm_device_map,
        "max_length": None, # remove the total length of the generated response
        "max_new_tokens": max_new_tokens, # set the size of new generated token 
    }

    tp_config = ModelConfig(model_config = tp_kwargs)

    generator = transformers.pipeline(
        **tp_config.get_config(),
        **token_kwargs,
    )

    """Huggingface pipeline"""
    from applyllm.pipelines import ModelCatalog, ModelInfo, PromptHelper

    model_info = ModelCatalog.get_model_info(model_name=model_name)
    prompt_helper = PromptHelper(model_info=model_info)

    import langchain
    from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
    print(f"langchain.__version__: {langchain.__version__}")

    llm = HuggingFacePipeline(
        pipeline=generator 
    )

    llm.model_id = model_name
    pipeline_kwargs_config = {
        "device_map": lm_device_map,
        "max_length": MAX_LENGTH, # deactivate to use max_new_tokens
        "max_new_tokens": max_new_tokens, # this is not taken by the model ?
        "eos_token_id": tokenizer.eos_token_id, # also making trouble (optional)
        "temperature": temperature,
        "repetition_penalty": repetition_penalty, # 1.15,
    }
    model_kwargs_config = {
        "do_sample": True, # also making trouble with langchain (optional)
        "top_k": top_k, # this param result in trouble with langchain (optional)
        "num_return_sequences": 1, # (optional)
        "eos_token_id": tokenizer.eos_token_id, # also making trouble (optional)
        "max_length": MAX_LENGTH, # deactivate to use max_new_tokens
        "max_new_tokens": max_new_tokens, # this is not taken by the model ?
        "temperature": temperature,
        "top_p": top_p, # 0.95 # alternative to top_k summerized probability while do_sample=True
        "repetition_penalty": repetition_penalty, # 1.15,
        "trust_remote_code": True,
    }

    llm.model_kwargs = config_tokenizer(model_name=model_name, config=model_kwargs_config, pad_token_id=tokenizer.eos_token_id)
    llm.pipeline_kwargs = config_tokenizer(model_name=model_name, config=pipeline_kwargs_config, pad_token_id=tokenizer.eos_token_id)
    print("HuggingFacePipeline setup done")
    gpu_status.gpu_usage()

    """LangChain pipeline"""
    from langchain.chains import RetrievalQA
    from langchain_community.document_loaders import S3DirectoryLoader, S3FileLoader
    from langchain_community.vectorstores import DocArrayInMemorySearch
    from langchain.indexes import VectorstoreIndexCreator
    from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
    # from langchain.text_splitter import TextSplitter
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_core.documents.base import Document
    from langchain.prompts import PromptTemplate
    from typing import List
    import boto3
    from applyllm.io import S3PdfObjHelper, DocMetaInfo, DocCorpusS3

    print(boto3.__version__)

    bucket_name = bucket_name
    # file_prefix = s3_file_prefix
    # PREFIX = f"{S3PdfObjHelper.DataContract.key_lead}/{file_prefix}"
    access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
    secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
    s3_endpoint = os.environ.get('S3_ENDPOINT')
    # VERIFY = False
    VERIFY = True

    # print(PREFIX)

    def get_single_file_loader(key: str):
        """Build an S3FileLoader for one object key in bucket_name."""
        # Connection settings shared by every loader created in this component.
        connection_kwargs = {
            "aws_access_key_id": access_key_id,
            "aws_secret_access_key": secret_access_key,
            "endpoint_url": s3_endpoint,
            "verify": VERIFY,
            "use_ssl": True,
        }
        return S3FileLoader(bucket=bucket_name, key=key, **connection_kwargs)
    
    @time_func
    def fetch_s3_object(key: str) -> List[Document]:
        """
        Load a single S3 object through get_single_file_loader.

        Returns:
        a list of LangChain Document objects
        """
        return get_single_file_loader(key).load()


    """RAG Setting"""
    from applyllm.utils import token_size

    CHUNK_SIZE = (MAX_POSITION_EMBEDDINGS // 1000) * 1000
    model_config = {
        # Set a really small chunk size, just to show.
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": 20,
        "length_function": token_size, # len,
        "is_separator_regex": False,
    }

    splitter_config = ModelConfig(model_config=model_config)
    text_splitter = RecursiveCharacterTextSplitter(
        **splitter_config.get_config()
    )

    embed_model_map = {
        "sentence-transformers": "sentence-transformers/all-MiniLM-L12-v2", # 384
        "baai" : "BAAI/bge-base-en-v1.5" # 768 embedding dims
    }

    embed_model_vendor = embed_model_vendor
    embed_model_name = embed_model_map[embed_model_vendor]
    model_config = {
        "model_name" : embed_model_name,
        "model_kwargs": {'device': 'cpu'},
        "encode_kwargs": {'normalize_embeddings': True}
    }
    embed_config = ModelConfig(model_config=model_config)

    # is downloaded at "{MODEL_CACHE_DIR}/models/torch/sentence_transformer" folder
    embed_model = HuggingFaceEmbeddings(
        **embed_config.get_config()
    )
    
    
    from sqlalchemy import String, Integer, MetaData, BigInteger
    from sqlalchemy.orm import DeclarativeBase
    from sqlalchemy.orm import mapped_column
    from sqlalchemy.orm import Mapped
    from typing import Optional
    from sqlalchemy.dialects.postgresql import insert as pgsql_upsert
    from sqlalchemy.orm import sessionmaker
    from sqlalchemy.orm import Session
    from datetime import datetime
    import pytz
    from langchain.prompts import PromptTemplate
    from langchain.chains import LLMChain, TransformChain
    from langchain.output_parsers import ResponseSchema
    from langchain.output_parsers import StructuredOutputParser

   
    class Base(DeclarativeBase):
        pass

    class ReportInfo(Base):
        """ORM row holding the age extracted from one report plus run metadata.

        The table name is taken from the sql_target_table component argument.
        """
        __tablename__ = sql_target_table

        # id is supplied by the caller (autoincrement is disabled)
        id: Mapped[str] = mapped_column(String(50), primary_key=True, autoincrement=False)
        age: Mapped[int] = mapped_column(Integer)
        llm_model: Mapped[str] = mapped_column(String(100))
        embed_model: Mapped[str] = mapped_column(String(100))
        run_name: Mapped[str] = mapped_column(String(100))
        timestamp: Mapped[int] = mapped_column(BigInteger)
        # timestamp: Mapped[int] = mapped_column(Integer)

        def __repr__(self) -> str:
            # !r calls repr() on the value
            # https://stackoverflow.com/questions/44800801/in-python-format-f-string-strings-what-does-r-mean
            # The closing parenthesis now comes after the last field instead of after age.
            return (
                f"ReportInfo(id={self.id!r}, age={self.age!r}, llm_model={self.llm_model!r}, "
                f"embed_model={self.embed_model!r}, run_name={self.run_name!r}, timestamp={self.timestamp!r})"
            )

        def to_dict(self) -> dict:
            """Return the column values as a plain dict."""
            return {
                "id": self.id,
                "age": self.age,
                "llm_model": self.llm_model,
                "embed_model": self.embed_model,
                "run_name": self.run_name,
                "timestamp": self.timestamp,
            }
        
    
    from applyllm.io import (
        SqlDBHelperFactory,
    )
    from applyllm.pipelines import StructuredOutputParserHelper as ParserHelper

    db_config = SqlDBHelperFactory.get_db_config_from_env(
        port_key="SCIVIAS_ANALYTICS_DB_PORT"
    )
    sync_engine = SqlDBHelperFactory.get_sync_engine(db_config=db_config, verbose=verbose)

    def build_sql_stmt(id: str, age: int, llm_model: str, embed_model: str, run_name: str, timestamp: int):
        """Build a PostgreSQL upsert statement for one ReportInfo row.

        On a primary-key conflict the existing row is updated with the incoming
        (excluded) values.
        """
        row = {
            "id": id,
            "age": age,
            "llm_model": llm_model,
            "embed_model": embed_model,
            "run_name": run_name,
            "timestamp": timestamp,
        }
        insert_stmt = pgsql_upsert(ReportInfo).values([row])
        return insert_stmt.on_conflict_do_update(
            constraint=ReportInfo.__table__.primary_key,
            set_=insert_stmt.excluded,
        )
    
    def llm_experiment(index, sync_engine, llm_model: str, embed_model: str):
        # build in memory retriever index
        retriever = index.vectorstore.as_retriever(search_kwargs={'k': retriever_k})

        """Prompts"""
        query_template = """
        Context:
        {context}

        Question: {question}

        Only return the helpful answer below and nothing else.
        Helpful answer:
        """

        map_template = prompt_helper.gen_prompt(query_template)

        map_prompt_template = PromptTemplate.from_template(map_template)

        reduce_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
        Always answer as helpfully as possible using the context text provided.
        Always summarise the context text provided.
        Your answers should only answer the question once and not have any text after the answer is done.\n\n
        If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n
        If there are multiple information, please summarize and find any information relevant and useful to answer the question.\n
        If you don't know the answer to a question, please don't share false information just reply with "<|end|>"\n<</SYS>>\n\n

        CONTEXT:/n/n {summaries}/n/n/n

        Question: {question}/n/n

        Only return the summarised answer below and nothing else.
        Summarised answer:
        [/INST]"""

        reduce_prompt_template = PromptTemplate.from_template(reduce_template)

        refine_init_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
        Always answer as helpfully as possible using the context text provided.
        Your answers should only answer the question once and not have any text after the answer is done.\n\n
        If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
        If you don't know the answer to a question, please don't share false information, just reply with "<|end|>"\n<</SYS>>\n\n

        Context:/n/n {context_str}/n/n/n

        Question: {question}/n/n

        Only return the helpful answer below and nothing else.
        Helpful answer:
        [/INST]"""

        init_prompt_template = PromptTemplate.from_template(refine_init_template)

        chain_type = "map_reduce"

        def build_qa_chain(chain_type: str, retriever, map_prompt_template, reduce_prompt_template, init_prompt_template):
            """Create a RetrievalQA chain on the shared llm and install the custom prompts.

            For "map_reduce" the map and reduce prompts are replaced and the reduce
            step's token_max is set to MAX_POSITION_EMBEDDINGS; for "refine" only the
            initial prompt is replaced. Other chain types keep their default prompts.
            """
            chain = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type=chain_type,
                retriever=retriever,
                return_source_documents=True,
                verbose=True,
            )
            docs_chain = chain.combine_documents_chain

            if chain_type == "map_reduce":
                reduce_chain = docs_chain.reduce_documents_chain
                docs_chain.llm_chain.prompt = map_prompt_template
                reduce_chain.combine_documents_chain.llm_chain.prompt = reduce_prompt_template
                # override the reduce step's token budget
                reduce_chain.token_max = MAX_POSITION_EMBEDDINGS
            elif chain_type == "refine":
                docs_chain.initial_llm_chain.prompt = init_prompt_template
            return chain
        
        qa_chain = build_qa_chain(chain_type, retriever, map_prompt_template, reduce_prompt_template, init_prompt_template)
        
        query = user_query1
        response = qa_chain.invoke({"query": query})


        parser_query_template = """<s>[INST] You are a helpful, respectful and honest assistant.
        Always answer as helpfully as possible using the context text provided.
        Your answers should only answer the question once and not have any text after the answer is done.
        If you don't know the answer to a question, please don't share false information. Just return "</s>"

        CONTEXT:
        {text}

        Question: 
        {question}

        {format_instructions}
        [/INST]"""
        
        name_schema = ResponseSchema(name=user_query1_target_var, description=user_query1_target_var_desc)

        def build_parser_and_chain(llm, schema: ResponseSchema, parser_query_template: str, input_variables: Optional[List[str]] = None):
            """Build a structured output parser and the LLMChain that feeds it.

            Args:
                llm: Language model used by the chain.
                schema: Response schema describing the single field to extract.
                parser_query_template: Prompt template text; its
                    {format_instructions} placeholder is pre-filled from the parser.
                input_variables: Names of the remaining template placeholders.
                    Defaults to ["text", "question"], matching parser_query_template.

            Returns:
                Tuple of (output_parser, chain).
            """
            if input_variables is None:
                # A fresh list per call avoids a shared mutable default; the names match
                # the {text} / {question} placeholders (the old default said "questions").
                input_variables = ["text", "question"]
            output_parser = StructuredOutputParser.from_response_schemas([schema])
            format_instructions = output_parser.get_format_instructions()
            prompt_template = PromptTemplate(
                template=parser_query_template,
                input_variables=input_variables,
                partial_variables={"format_instructions": format_instructions},
            )
            return output_parser, LLMChain(prompt=prompt_template, llm=llm)
        
        
        user_query1_parser_chain_input = response['result'].strip()

        query1_output_parser, parser_chain = build_parser_and_chain(llm=llm, schema=name_schema, parser_query_template=parser_query_template, input_variables=["text","question"])
        dict_response = parser_chain.invoke(input={"text":user_query1_parser_chain_input, "question":user_query1_parser_question})
        
        user_query1_target_value = ParserHelper.parse_response_dict(
            parser_response=dict_response,
            output_parser=query1_output_parser,
            text_key="text"
        ).get(user_query1_target_var, "").strip()

        llm_results = {user_query1_target_var: user_query1_target_value}

        """Read User Query 2"""

        query = user_query2
        if len(user_query2_vars) > 0:
            prompt_template = PromptTemplate.from_template(user_query2)
            kwargs = {var: llm_results.get(var, "") for var in user_query2_vars}
            # print(kwargs)
            query = prompt_template.format(**kwargs)
        
        # print(query)

        chain_type = "map_reduce"
        # chain_type = "stuff"
        # chain_type = "refine" 
        qa_chain = build_qa_chain(chain_type, retriever, map_prompt_template, reduce_prompt_template, init_prompt_template)
        response = qa_chain.invoke({"query": query})
        
        age_schema = ResponseSchema(name=user_query2_target_var, description=user_query2_target_var_desc)
        query2_output_parser, parser_chain = build_parser_and_chain(llm=llm, schema=age_schema, parser_query_template=parser_query_template, input_variables=["text","question"])
        
        user_query2_parser_chain_input = response['result'].strip()
        dict_response = parser_chain.invoke(input={"text":user_query2_parser_chain_input, "question":user_query2_parser_question})
        
        patient_age_obj = ParserHelper.parse_response_dict(
            parser_response=dict_response,
            output_parser=query2_output_parser,
            text_key="text",
            verbose=False
        ).get(user_query2_target_var, "")

        # -1 marks "age could not be determined". Binding it up front guarantees that
        # patient_age exists even when the parser returns something that is neither a
        # string nor a number (e.g. None), which previously raised UnboundLocalError.
        patient_age = -1
        try:
            if isinstance(patient_age_obj, str):
                patient_age = int(patient_age_obj.strip())
            elif isinstance(patient_age_obj, (int, float)):
                # JSON numbers may arrive as float (e.g. 45.0)
                patient_age = int(patient_age_obj)
            else:
                print(f"unexpected type for {user_query2_target_var}: {type(patient_age_obj).__name__}")
        except (TypeError, ValueError, OverflowError) as e:
            print(e)
            patient_age = -1

        llm_results[user_query2_target_var] = patient_age

        from datetime import timezone

        # Row id = document name up to the first "." (named report_id so the
        # builtin id() is not shadowed).
        report_id = CUR_DOC_INFO.name.split(".")[0]

        # Use a timezone-aware UTC datetime: calling .timestamp() on the naive value from
        # datetime.utcnow() interprets it as local time and skews the epoch on non-UTC hosts.
        now = datetime.now(timezone.utc)
        print(f"timestamp to save result for {report_id} is {now}")
        current_time = int(now.timestamp())
        stmt = build_sql_stmt(id=report_id, age=llm_results[user_query2_target_var],
                              llm_model=llm_model, embed_model=embed_model, run_name=run_name, timestamp=current_time)
        with Session(sync_engine) as session:
            # Execute the upsert; the returned ScalarResult of ReportInfo objects is
            # bound to the session and is not used here.
            session.scalars(stmt.returning(ReportInfo))
            session.commit()
    
    
    """create target sql table if not exists"""
    table_objects = [ReportInfo.__table__]
    Base.metadata.create_all(sync_engine, tables=table_objects, checkfirst=True)

    for s3_file_key in s3_file_key_list:
        
        data = fetch_s3_object(key=s3_file_key)

        """Fetch S3 text file corpra"""
        s3_corpus = DocCorpusS3(data)
        if verbose:
            print("--- Max Length Doc Info ---")
            print(s3_corpus.max_doc_meta)
            print("--- Min Length Doc Info ---")
            print(s3_corpus.min_doc_meta)

        file_idx = 0
        show_content = False
        CUR_DOC, CUR_DOC_INFO = s3_corpus.get_s3_obj_info(file_idx, show_content=show_content)
        # print(CUR_DOC)
        print(CUR_DOC_INFO)

        # RAG one document
        index = VectorstoreIndexCreator(
            vectorstore_cls=DocArrayInMemorySearch,
            embedding=embed_model,
            text_splitter=text_splitter,
            ).from_documents([data[file_idx]])        
        
        llm_experiment(index=index, sync_engine=sync_engine, llm_model=model_name, embed_model=embed_model_name)

In [8]:
def set_res_limits(task: ContainerOp, mem_req="200Mi", cpu_req="2000m", mem_lim="4000Mi", cpu_lim='4000m', gpu_req=None, gpu_lim=None, gpu_type:str="20gb"):
    """
    Set memory/CPU requests and limits on a container op and, optionally, a GPU
    request and limit.

    Units follow the Kubernetes convention, e.g.
    op.set_memory_limit('1000Mi') = 1GB
    op.set_cpu_limit('1000m') = 1 cpu core

    GPU resources are added only when gpu_req, gpu_lim and gpu_type are all not None.
    gpu_type "20gb" and "40gb" select the matching MIG resource name; any other value
    selects "nvidia.com/mig-1g.10gb".

    Returns the op returned by the last setter call.
    """
    # NOTE(review): the comparisons below are plain `==` / `is not None` on purpose.
    # If gpu_type is a pipeline parameter placeholder rather than a str, confirm that
    # the branch taken is the intended one.
    if gpu_type == "20gb":
        gpu_resource = "nvidia.com/mig-2g.20gb"
    elif gpu_type == "40gb":
        gpu_resource = "nvidia.com/mig-3g.40gb"
    else:
        gpu_resource = "nvidia.com/mig-1g.10gb"

    new_op = task.set_memory_request(mem_req)
    new_op = new_op.set_memory_limit(mem_lim)
    new_op = new_op.set_cpu_request(cpu_req)
    new_op = new_op.set_cpu_limit(cpu_lim)

    # CPU-only ops: nothing more to configure.
    if gpu_req is None or gpu_lim is None or gpu_type is None:
        return new_op

    new_op.add_resource_request(gpu_resource, gpu_req)
    new_op.add_resource_limit(gpu_resource, gpu_lim)
    return new_op

In [9]:
# Two-step pipeline: file_op lists the S3 report keys, llm_op processes that list.
# The first string statement in the body is the function's docstring and is left
# untouched, as are the other bare string statements used as section markers.
@pipeline(
    name = EXPERIMENT_NAME,
    description = EXPERIMENT_DESC
)
def llm_pipeline(
        model_root: str = "/mnt", 
        lm_model_type: str = "mistral7B-inst02", 
        max_token_length: int = 4096,
        max_position_embeddings: int = 3072,
        max_new_tokens: int = 80,
        repetition_penalty: float = 1.15,
        temperature: float = 0.001,
        lm_device_map: str = "auto",
        top_k: int = 3,
        top_p: float = 0.8,
        bucket_name: str = "scivias-medreports",
        s3_file_prefix: str = "KK-SCIVIAS",
        s3_file_max_count: int = -1, # unlimited
        verify_host: bool = True,
        # s3_file_key: str = "trans2en/KK-SCIVIAS-00003^0053360847^2018-09-28^KIIGAS.txt",
        # s3_secrets / sql_secrets are used below as pod label keys (value "true")
        s3_secrets: str="add-scivias-medreport-secret",
        sql_secrets: str="add-scivias-postgresdb-secret",
        embed_model_vendor: str = "baai",
        retriever_k: int = 3,
        user_query1: str = "What is the name of the patient? (Remember to include 'The name of the patient is' in your answer.)",
        user_query1_parser_question: str = "retrieve one: patient name",
        user_query1_target_var: str = "patient_name",
        user_query1_target_var_desc: str = "patient name",
        user_query2: str = "What is the age of the patient {patient_name}? (Remember to include 'The age of the patient is' in your answer.)",
        user_query2_vars: list = ["patient_name"],
        user_query2_parser_question: str = "retrieve one: patient age as number",
        user_query2_target_var: str = "patient_age",
        user_query2_target_var_desc: str = "patient age",
        sql_target_table: str = "llm_med_report_info",
        run_name: str = "llm_rag_kfp_pipeline",
        verbose: bool = False,
        gpu_type: str = "20gb"
    ):
    '''local variable'''
    # Values assigned to max_cache_staleness below; the names suggest "P0D" means
    # no caching and "P1D" a one-day cache (presumably ISO 8601 durations — TODO confirm).
    no_artifact_cache = "P0D"
    artifact_cache_today = "P1D"
    # cache_setting = artifact_cache_today
    cache_setting = no_artifact_cache

    '''Pipeline Volume'''
    # Volume named "llm-models"; it is mounted at model_root for the LLM task below.
    shared_volume = PipelineVolume("llm-models")
    
    '''pipeline'''
    # Step 1: list the S3 object keys to process.
    file_task = file_op(
        bucket_name=bucket_name,
        s3_file_prefix=s3_file_prefix, 
        s3_file_max_count=s3_file_max_count,
        verify_host=verify_host,
    )
    # gpu_type=None makes set_res_limits skip the GPU request/limit for this step.
    file_task = set_res_limits(task=file_task, mem_req="1Gi", mem_lim="4Gi",
                            cpu_req="2000m", cpu_lim="10000m", 
                            gpu_req=0, gpu_lim=0, gpu_type=None)
    file_task.execution_options.caching_strategy.max_cache_staleness = cache_setting
    file_task.add_pod_label(s3_secrets, "true") # s3_secrets is the label for the PodDefault
    file_task.set_display_name("load reports op")
    
    # Step 2: run the LLM extraction over the keys produced by step 1.
    llm_task = llm_op(
        model_root=model_root, 
        lm_model_type=lm_model_type,
        max_token_length=max_token_length,
        max_position_embeddings=max_position_embeddings,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        lm_device_map=lm_device_map,
        top_k=top_k,
        top_p=top_p,
        bucket_name=bucket_name,
        s3_file_key_list=file_task.output, # s3_file_key_list selected from file task output
        embed_model_vendor=embed_model_vendor,
        retriever_k=retriever_k,
        user_query1=user_query1,
        user_query1_parser_question=user_query1_parser_question,
        user_query1_target_var=user_query1_target_var,
        user_query1_target_var_desc=user_query1_target_var_desc,
        user_query2=user_query2,
        user_query2_vars=user_query2_vars,
        user_query2_parser_question=user_query2_parser_question,
        user_query2_target_var=user_query2_target_var,
        user_query2_target_var_desc=user_query2_target_var_desc,
        sql_target_table=sql_target_table,
        run_name=run_name,
        verbose=verbose,
        )
    # NOTE(review): gpu_type is the pipeline parameter here, presumably a compile-time
    # placeholder rather than a plain str — confirm that set_res_limits picks the
    # intended GPU resource name for it.
    llm_task = set_res_limits(task=llm_task, mem_req="20Gi", mem_lim="40Gi",
                            cpu_req="2000m", cpu_lim="10000m", 
                            gpu_req=1, gpu_lim=1, gpu_type=gpu_type)
    # Mount the shared volume at the model_root path.
    llm_task.add_pvolumes({model_root: shared_volume})
    llm_task.execution_options.caching_strategy.max_cache_staleness = cache_setting
    # The LLM step carries both the S3 and the SQL labels.
    llm_task.add_pod_label(s3_secrets, "true") 
    llm_task.add_pod_label(sql_secrets, "true")
    llm_task.set_display_name("llm op")

In [10]:
# import os
# pipeline_path_dir="./compiled"
# if not os.path.exists(pipeline_path_dir):
#     os.makedirs(pipeline_path_dir)

# Base name (without extension) of the compiled pipeline package; also reused for the run name.
PIPE_LINE_FILE_NAME = "llm_rag_kfp_pipeline"
pipeline_package_path = f"{PIPELINE_PATH_DIR}/{PIPE_LINE_FILE_NAME}.yaml"
kfp.compiler.Compiler().compile(llm_pipeline, pipeline_package_path)

In [11]:
from datetime import datetime
from pytz import timezone as ptimezone

def get_local_time_str(target_tz_str: str = "Europe/Berlin", format_str: str = "%Y-%m-%d %H-%M-%S") -> str:
    """
    Return the current time in the requested timezone as a formatted string.

    This helper exists because the local timezone is misconfigured on the
    server, so the timezone is chosen explicitly instead of taken from the
    system settings.

    @param target_tz_str: name of the target timezone, default "Europe/Berlin"
    @param format_str: strftime pattern; the default "%Y-%m-%d %H-%M-%S"
        produces e.g. 2022-07-07 12-08-45
    @return: the current time in the target timezone, rendered with format_str
    """
    # Timezone object built via pytz; on Python 3.9+ the standard library
    # zoneinfo.ZoneInfo could be used instead.
    return datetime.now(ptimezone(target_tz_str)).strftime(format_str)

In [12]:
# Run label: pipeline file name followed by the current timestamp from
# get_local_time_str(), separated by a single space.
RUN_NAME = " ".join((PIPE_LINE_FILE_NAME, get_local_time_str()))

In [13]:
# Pipeline-level configuration object; it is passed as `pipeline_conf` when the run is submitted.
# (Only needed for the disabled pull-secret setup below.)
# from kubernetes import client as k8s_client
pipeline_config = dsl.PipelineConf()

# Disabled: image pull secrets (would require k8s_client, K8_GIT_SECRET_NAME and NAME_SPACE to be defined).
# pipeline_config.set_image_pull_secrets([k8s_client.V1ObjectReference(name=K8_GIT_SECRET_NAME, namespace=NAME_SPACE)])
# Disabled alternative pull policy:
# pipeline_config.set_image_pull_policy("Always")
# Active setting: image pull policy "IfNotPresent".
pipeline_config.set_image_pull_policy("IfNotPresent")

# Argument values for a single pipeline run; this mapping is handed to the
# client as `arguments` when the run is created.
pipeline_args = dict(
    # --- language model and text generation ---
    model_root="/mnt",
    lm_model_type="mixtral8x7B-inst01",  # other options used before: "mistral7B-inst02", "llama13B-chat"
    # Original note: 4096 is the max length for llama2 models.
    # NOTE(review): confirm this limit is still appropriate for the selected model type.
    max_token_length=4096,
    max_position_embeddings=3072,  # original note: chunk size of 3072 for llama2 models
    max_new_tokens=80,  # upper bound on new tokens generated by the causal LM
    repetition_penalty=1.15,
    temperature=0.001,
    lm_device_map="auto",
    top_k=3,
    top_p=0.8,
    # --- S3 input selection and secrets ---
    bucket_name="scivias-medreports",
    s3_file_prefix="trans2en/KK-SCIVIAS",  # prefix filter for S3 files inside the trans2en folder
    s3_file_max_count=1,  # -1 means no limit on the files matching the prefix
    verify_host=True,
    s3_secrets="add-scivias-medreport-secret",
    sql_secrets="add-scivias-postgresdb-secret",
    # --- retrieval ---
    embed_model_vendor="baai",  # alternative: "sentence-transformers"
    retriever_k=3,  # number of context docs retrieved by RAG for the RetrievalQA chain
    # --- query 1: patient name ---
    user_query1="What is the name of the patient? (Remember to include 'The name of the patient is' in your answer.)",
    user_query1_parser_question="retrieve one: patient name",
    user_query1_target_var="patient_name",
    user_query1_target_var_desc="patient name",
    # --- query 2: patient age, templated on the {patient_name} placeholder ---
    user_query2="What is the age of the patient {patient_name}? (Remember to include 'The age of the patient is' in your answer.)",
    user_query2_vars=["patient_name"],
    user_query2_parser_question="retrieve one: patient age as number",
    user_query2_target_var="patient_age",
    user_query2_target_var_desc="patient age",
    # --- output and run settings ---
    sql_target_table="llm_med_report_info",
    run_name=RUN_NAME,
    verbose=False,
    gpu_type="40gb",  # alternative: "20gb"
)

In [14]:
# Create a Kubeflow Pipelines client with its default settings (no host or credentials are passed here).
client = kfp.Client()
# Namespace the client reports for the current user; used as the target namespace of the run.
NAMESPACE = client.get_user_namespace()

In [15]:
# Submit llm_pipeline straight from the Python function as a run in the user's
# namespace, filed under EXPERIMENT_NAME and labelled with RUN_NAME.
run = client.create_run_from_pipeline_func(
    pipeline_func=llm_pipeline,
    arguments=pipeline_args,
    pipeline_conf=pipeline_config,
    run_name=RUN_NAME,
    experiment_name=EXPERIMENT_NAME,
    namespace=NAMESPACE,
)

# Bare last expression so the notebook displays the run result.
run

RunPipelineResult(run_id=003c2c51-4a83-45a6-8bf3-13c210198fce)