In [1]:
import sys

In [2]:
# !{sys.executable} -m pip install --upgrade --user kfp==1.8.22

In [3]:
import kfp
from kfp import dsl
from functools import partial
from kfp.dsl import (
    pipeline,
    ContainerOp,
    PipelineVolume
)
from kfp.components import (
    InputPath,
    OutputPath,
    create_component_from_func
)

# Experiment under which runs are grouped in the Kubeflow web UI; also used as the pipeline name.
EXPERIMENT_NAME = "llm"
# Human-readable description attached to the pipeline definition.
EXPERIMENT_DESC = "llm med report pipeline experiment"

In [4]:
from dataclasses import dataclass

@dataclass
class Settings:
    """Pinned container image and package versions used to build the LLM component."""

    # Base container image the component runs in.
    llm_base_image: str = "pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel"
    # Versions of the packages pip-installed on top of the base image.
    applyllm_version: str = "0.0.2"
    pypdf_version: str = "3.15.5"
    accelerate_version: str = "0.26.1"


# Module-wide instance holding the default pins.
settings = Settings()

In [5]:
@partial(
    create_component_from_func,
    output_component_file="custom_registry_component.yaml",
    base_image=settings.llm_base_image,
    packages_to_install=[
        f"applyllm=={settings.applyllm_version}",
        f"pypdf=={settings.pypdf_version}",
        f"accelerate=={settings.accelerate_version}",
    ], # adding additional libs
    # pip_index_urls=["https://gitlab.lrz.de/api/v4/projects/150553/packages/pypi/simple"]
    # define my private pypi package registry v2 component decorator
)
def llm_op(model_root: str, 
           lm_model_type: str, 
           max_token_length: int, 
           max_position_embeddings: int,
           max_new_tokens: int,
           repetition_penalty: float,
           temperature: float,
           lm_device_map: str,
           top_k: int,
           top_p: float,
    ):
    """Load a 4-bit quantized causal LM plus tokenizer and wrap them in a LangChain HuggingFacePipeline.

    The function is turned into a KFP component, so every import it needs is
    done inside the body. It produces no outputs; progress is printed.

    Args:
        model_root: The root directory of the model
        lm_model_type: The type of the language model, a short alias such as "mistral7B-inst02"; unknown aliases fall back to "mistral7B-inst02"
        max_token_length: The maximum token length 4096
        max_position_embeddings: The maximum position embeddings 3072
        max_new_tokens: The maximum new tokens to be generated 80
        repetition_penalty: The repetition penalty 1.15
        temperature: The temperature 0.001
        lm_device_map: The device map for the language model "auto"
        top_k: The top k value 3
        top_p: The top p value 0.8
    """
    import os
    import applyllm as apl

    from applyllm.accelerators import (
        AcceleratorHelper,
        AcceleratorStatus,
        DirectorySetting,
        # DIR_MODE_MAP,
        TokenHelper as th
    )
    from applyllm.utils import time_func
    from applyllm.pipelines import (
        LocalCausalLMConfig,
        ModelConfig,
        ModelCatalog,
    )

    dir_setting = DirectorySetting(home_dir=model_root)
    
    # debug code to check the mounted model_root, whether DirectorySetting is working
    # print([x[0] for x in os.walk(model_root)])

    gpu_status = AcceleratorStatus.create_accelerator_status()
    # init gpu helper
    gpu_helper = AcceleratorHelper()
    UUIDs = gpu_helper.nvidia_device_uuids_filtered_by(is_mig=True, log_output=False)
    # will set the XDG_CACHE_HOME, this line must be called before import transformers
    gpu_helper.init_cuda_torch(UUIDs, dir_setting)
    print(os.environ["CUDA_VISIBLE_DEVICES"])
    print(os.environ["XDG_CACHE_HOME"])

    # init llm model to be loaded: short alias -> Hugging Face repo id
    model_map = {
        "llama7B-chat":     "meta-llama/Llama-2-7b-chat-hf",
        "llama13B-chat" :   "meta-llama/Llama-2-13b-chat-hf",
        "llama70B-chat" :   "meta-llama/Llama-2-70b-chat-hf",
        "mistral7B-01":     "mistralai/Mistral-7B-v0.1",
        "mistral7B-inst02": "mistralai/Mistral-7B-Instruct-v0.2",
        "mixtral8x7B-01":   "mistralai/Mixtral-8x7B-v0.1",
        "mixtral8x7B-inst01":   "mistralai/Mixtral-8x7B-Instruct-v0.1", 
    }
    default_model_type = "mistral7B-inst02"
    model_type = lm_model_type
    if model_type not in model_map:
        print(f"unknown lm_model_type '{model_type}', falling back to '{default_model_type}'")
    # The fallback must be the repo id of the default alias; the alias itself
    # ("mistral7B-inst02") is not a name that from_pretrained can resolve.
    model_name = model_map.get(model_type, model_map[default_model_type])
    print(model_name)

    import transformers
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from torch import bfloat16
    
    print(f"applyllm version:     {apl.__version__}")
    print(f"transformers version: {transformers.__version__}")
    print(f"torch version:        {torch.__version__}")

    # NOTE(review): token kwargs are still derived from the requested model_type,
    # even when model_name fell back to the default - confirm this is intended.
    token_kwargs = th.gen_token_kwargs(model_type=model_type, dir_setting=dir_setting)
    
    # Load CausalLM model
    # NOTE(review): the model is always loaded with device_map "auto";
    # lm_device_map is only used for the pipelines further down - confirm.
    base_lm_config = ModelConfig(
        model_config = {
            "pretrained_model_name_or_path": model_name,
            "device_map": "auto",
            # "max_memory": f"{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB",
        }
    )

    # 4-bit NF4 quantization with double quantization, computing in bfloat16
    lm_model_kwargs = {
        "quantized": True,
        "model_config": base_lm_config.get_config(),
        "quantization_config": {
            "quantization_config": transformers.BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type='nf4',
                bnb_4bit_use_double_quant=True,
                bnb_4bit_compute_dtype=bfloat16
            )
        }
    }
    lm_config = LocalCausalLMConfig(**lm_model_kwargs)

    @time_func
    def fetch_lm_model():
        """Load the causal LM described by lm_config."""
        return AutoModelForCausalLM.from_pretrained(
        **lm_config.get_config(),
        **token_kwargs,  
        )

    model = fetch_lm_model()

    gpu_status.gpu_usage()

    # Load CausalLM tokenizer
    def config_tokenizer(model_name: str, config: dict, pad_token_id = 2):
        """Return config with "pad_token_id" added when model_name starts with ModelCatalog.MISTRAL_FAMILY, else config unchanged."""
        if model_name.startswith(ModelCatalog.MISTRAL_FAMILY):
            return {**config, "pad_token_id": pad_token_id}
        else:
            return config
        
    # NOTE(review): MAX_POSITION_EMBEDDINGS is assigned but never read; the
    # tokenizer config below uses MAX_LENGTH for "max_position_embeddings".
    # Left unchanged because the intended value is unclear - TODO confirm.
    MAX_POSITION_EMBEDDINGS = max_position_embeddings
    MAX_LENGTH = max_token_length

    model_config= {
        "pretrained_model_name_or_path": model_name,
        "device": "cpu",
        # "device_map": "auto", # put to GPU if GPU is available
        "max_position_embeddings": MAX_LENGTH,
        "max_length": MAX_LENGTH,
    }
    model_config = config_tokenizer(model_name=model_name, config=model_config)
    tokenizer_config = ModelConfig(model_config=model_config)

    tokenizer = AutoTokenizer.from_pretrained(
        **tokenizer_config.get_config(), 
        **token_kwargs,
    )

    # init the transformer pipeline as backend llm for langchain
    tp_kwargs = {
        "task": "text-generation",
        "model": model,
        "tokenizer": tokenizer,
        "device_map": lm_device_map,
        "max_length": None, # remove the total length of the generated response
        "max_new_tokens": max_new_tokens, # set the size of new generated token 
    }

    tp_config = ModelConfig(model_config = tp_kwargs)

    generator = transformers.pipeline(
        **tp_config.get_config(),
        **token_kwargs,
    )

    # Huggingface pipeline
    from applyllm.pipelines import ModelCatalog, ModelInfo, PromptHelper

    # model_info = ModelCatalog.get_model_info(model_name=model_name)
    # prompt_helper = PromptHelper(model_info=model_info)

    import langchain
    from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
    print(f"langchain.__version__: {langchain.__version__}")

    llm = HuggingFacePipeline(
        pipeline=generator 
    )

    llm.model_id = model_name
    pipeline_kwargs_config = {
        "device_map": lm_device_map,
        "max_length": MAX_LENGTH, # deactivate to use max_new_tokens
        "max_new_tokens": max_new_tokens, # this is not taken by the model ?
        "eos_token_id": tokenizer.eos_token_id, # also making trouble (optional)
        "temperature": temperature,
        "repetition_penalty": repetition_penalty, # 1.15,
    }
    model_kwargs_config = {
        "do_sample": True, # also making trouble with langchain (optional)
        "top_k": top_k, # this param result in trouble with langchain (optional)
        "num_return_sequences": 1, # (optional)
        "eos_token_id": tokenizer.eos_token_id, # also making trouble (optional)
        "max_length": MAX_LENGTH, # deactivate to use max_new_tokens
        "max_new_tokens": max_new_tokens, # this is not taken by the model ?
        "temperature": temperature,
        "top_p": top_p, # 0.95 # alternative to top_k summerized probability while do_sample=True
        "repetition_penalty": repetition_penalty, # 1.15,
        "trust_remote_code": True,
    }

    # For Mistral-family models the EOS token id is reused as pad token id
    llm.model_kwargs = config_tokenizer(model_name=model_name, config=model_kwargs_config, pad_token_id=tokenizer.eos_token_id)
    llm.pipeline_kwargs = config_tokenizer(model_name=model_name, config=pipeline_kwargs_config, pad_token_id=tokenizer.eos_token_id)
    print("HuggingFacePipeline setup done")
    gpu_status.gpu_usage()

    # LangChain pipeline
    # The names imported below are not used yet in this function; the imports
    # are kept so a missing package still fails the step here.
    from langchain.chains import RetrievalQA
    from langchain.document_loaders import S3DirectoryLoader, S3FileLoader
    from langchain.vectorstores import DocArrayInMemorySearch
    from langchain.indexes import VectorstoreIndexCreator
    from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
    # from langchain.text_splitter import TextSplitter
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_core.documents.base import Document
    from langchain.prompts import PromptTemplate
    from typing import List
    import boto3
    from applyllm.io import S3PdfObjHelper, DocMetaInfo, DocCorpusS3

    print(boto3.__version__)





In [6]:
def set_res_limits(task: ContainerOp, mem_req="200Mi", cpu_req="2000m", mem_lim="4000Mi", cpu_lim='4000m', gpu_req=None, gpu_lim=None, gpu_type:str="20gb"):
    """Apply memory, CPU and (optionally) MIG GPU requests/limits to a container op.

    Units follow the Kubernetes convention, e.g. '1000Mi' is about 1 GB of
    memory and '1000m' is one CPU core.

    Args:
        task: the op to configure.
        mem_req: memory request.
        cpu_req: CPU request.
        mem_lim: memory limit.
        cpu_lim: CPU limit.
        gpu_req: GPU request count; GPU settings are applied only when both
            gpu_req and gpu_lim are given.
        gpu_lim: GPU limit count.
        gpu_type: "20gb" selects nvidia.com/mig-2g.20gb, "40gb" selects
            nvidia.com/mig-3g.40gb, anything else nvidia.com/mig-1g.10gb.

    Returns:
        The object returned by the last chained setter call.
    """
    # Keep the plain `==` tests in this order.
    # NOTE(review): if gpu_type is not a real str (e.g. a pipeline parameter
    # placeholder), the result of `==` may not be a plain bool - TODO confirm.
    gpu_resource = (
        "nvidia.com/mig-2g.20gb" if gpu_type == "20gb"
        else "nvidia.com/mig-3g.40gb" if gpu_type == "40gb"
        else "nvidia.com/mig-1g.10gb"
    )

    # Each setter returns the object to continue configuring.
    with_mem_request = task.set_memory_request(mem_req)
    with_mem_limit = with_mem_request.set_memory_limit(mem_lim)
    with_cpu_request = with_mem_limit.set_cpu_request(cpu_req)
    configured_op = with_cpu_request.set_cpu_limit(cpu_lim)

    # Without both GPU values there is nothing more to configure.
    if gpu_req is None or gpu_lim is None:
        return configured_op

    configured_op.add_resource_request(gpu_resource, gpu_req)
    configured_op.add_resource_limit(gpu_resource, gpu_lim)
    return configured_op

In [7]:
@pipeline(
    name = EXPERIMENT_NAME,
    description = EXPERIMENT_DESC
)
def llm_pipeline(
        model_root: str = "/mnt", 
        lm_model_type: str = "mistral7B-inst02", 
        max_token_length: int = 4096,
        max_position_embeddings: int = 3072,
        max_new_tokens: int = 80,
        repetition_penalty: float = 1.15,
        temperature: float = 0.001,
        lm_device_map: str = "auto",
        top_k: int = 3,
        top_p: float = 0.8,
        gpu_type: str = "20gb"
    ):
    """Single-step pipeline: run llm_op with resource limits and the model volume mounted at model_root."""
    # Cache staleness as a duration string: "P0D" disables caching
    # ("P1D" would reuse results for one day).
    cache_staleness = "P0D"

    # Volume named "llm-models" - presumably an existing PVC holding the model files; confirm.
    model_volume = PipelineVolume("llm-models")

    # The only step: forward every LLM-related pipeline argument unchanged.
    llm_step = llm_op(
        model_root=model_root,
        lm_model_type=lm_model_type,
        max_token_length=max_token_length,
        max_position_embeddings=max_position_embeddings,
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        lm_device_map=lm_device_map,
        top_k=top_k,
        top_p=top_p,
    )

    # Requests 20Gi RAM / 2 CPU cores, limits 40Gi RAM / 10 CPU cores, one GPU slice.
    # NOTE(review): gpu_type is a pipeline argument here, so it is presumably a
    # placeholder object rather than a plain str while this function is compiled;
    # the string comparisons inside set_res_limits may then not follow the
    # value given at run time - TODO confirm.
    llm_step = set_res_limits(
        task=llm_step,
        mem_req="20Gi",
        mem_lim="40Gi",
        cpu_req="2000m",
        cpu_lim="10000m",
        gpu_req=1,
        gpu_lim=1,
        gpu_type=gpu_type,
    )

    # Mount the model volume at the directory the component reads from.
    llm_step.add_pvolumes({model_root: model_volume})
    llm_step.execution_options.caching_strategy.max_cache_staleness = cache_staleness
    llm_step.set_display_name("llm op")

In [8]:
import os

# Directory that receives the compiled pipeline definition.
pipeline_path_dir = "./compiled"
# exist_ok=True makes re-running the cell safe and avoids the check-then-create race.
os.makedirs(pipeline_path_dir, exist_ok=True)

# Base name of the compiled file; also reused as prefix of the run name.
PIPE_LINE_FILE_NAME = "llm_rag_kfp_pipeline"
kfp.compiler.Compiler().compile(llm_pipeline, f"{pipeline_path_dir}/{PIPE_LINE_FILE_NAME}.yaml")

In [9]:
from datetime import datetime
from pytz import timezone as ptimezone

def get_local_time_str(target_tz_str: str = "Europe/Berlin", format_str: str = "%Y-%m-%d %H-%M-%S") -> str:
    """Return the current time in the given timezone as a formatted string.

    This helper exists because the local timezone is misconfigured on the
    server, so the timezone is named explicitly.

    Args:
        target_tz_str: name of the target timezone, default "Europe/Berlin".
        format_str: strftime pattern; the default "%Y-%m-%d %H-%M-%S"
            gives e.g. "2022-07-07 12-08-45".

    Returns:
        The formatted current time in the target timezone.
    """
    # Timezone object built from the name (on Python 3.9+ the standard-library ZoneInfo is an alternative).
    now_in_target_tz = datetime.now(ptimezone(target_tz_str))
    return now_in_target_tz.strftime(format_str)

In [10]:
# from kubernetes import client as k8s_client
# Pipeline-wide configuration applied when the run is submitted.
pipeline_config = dsl.PipelineConf()

# Image pull secrets are currently not configured:
# pipeline_config.set_image_pull_secrets([k8s_client.V1ObjectReference(name=K8_GIT_SECRET_NAME, namespace=NAME_SPACE)])
# Use "Always" instead to force a fresh pull of the image.
pipeline_config.set_image_pull_policy("IfNotPresent")

# Run arguments, keyed by the parameter names of llm_pipeline.
pipeline_args = dict(
    model_root="/mnt",
    lm_model_type="mistral7B-inst02",  # alternative: "llama13B-chat"
    max_token_length=4096,  # for llama2 models max_length is 4096
    max_position_embeddings=3072,  # for llama2 models, using chunk size of 3072
    max_new_tokens=80,  # the maximum new tokens to be generated by the causalLM
    repetition_penalty=1.15,
    temperature=0.001,
    lm_device_map="auto",
    top_k=3,
    top_p=0.8,
    gpu_type="20gb",  # alternative: "40gb"
)

In [11]:
# Run name: compiled pipeline base name followed by the current local timestamp.
RUN_NAME = " ".join((PIPE_LINE_FILE_NAME, get_local_time_str()))

# Client with default connection settings; the run is submitted to the user's own namespace.
client = kfp.Client()
NAMESPACE = client.get_user_namespace()

In [12]:
# Submit llm_pipeline as a run of EXPERIMENT_NAME in the user's namespace.
run = client.create_run_from_pipeline_func(
    llm_pipeline,
    arguments=pipeline_args,  # pass {} to run with the pipeline defaults
    run_name=RUN_NAME,
    experiment_name=EXPERIMENT_NAME,
    pipeline_conf=pipeline_config,
    namespace=NAMESPACE,
)

# Last expression of the cell: display the run result.
run

RunPipelineResult(run_id=cb4bae2d-e676-447d-89f3-fa9bf21ac0c2)