# About this Jupyter Notebook

@author: Yingding Wang\
@updated: 08.09.2023

This notebook defines and runs a Kubeflow pipeline with the KFP Python SDK v1, using Llama 2 and T5-based de_en translation models to extract information from unstructured PDF data.

The prompt is specially constructed to extract "patient name" and "patient age" information from a doctor's letter.

Notice:
```
urllib3 1.26.16 works with google auth; if you see any HTTP google auth error, reinstall kfp==1.8.22
```

## Install KFP Python SDK to build a V1 pipeline
* Build KF pipeline with python SDK: https://www.kubeflow.org/docs/components/pipelines/sdk/build-pipeline/
* Current KFP python SDK version on pypi.org: https://pypi.org/project/kfp/ 

In [1]:
import sys

In [2]:
#!{sys.executable} -m pip uninstall -y kfp-server-api
#!{sys.executable} -m pip install --user --upgrade kfp-server-api==1.8.5

In [3]:
#!{sys.executable} -m pip install --upgrade --user kfp==2.0.0b13
#!{sys.executable} -m pip install --upgrade --user kfp==1.8.22

## Restart the Kernel

After the installation of KFP python SDK, the notebook kernel must be restarted.

## Getting familiar with Jupyter Notebook ENV 

In [4]:
from platform import python_version

# Report which Python interpreter the notebook kernel is running on.
print("current platform python version: " + python_version())

current platform python version: 3.8.10


In [5]:
# run kubectl command line to see the quota in the name space
!kubectl describe quota

Name:                                                         kf-resource-quota
Namespace:                                                    kubeflow-kindfor
Resource                                                      Used     Hard
--------                                                      ----     ----
basic-csi.storageclass.storage.k8s.io/persistentvolumeclaims  5        15
basic-csi.storageclass.storage.k8s.io/requests.storage        135Gi    150Gi
cpu                                                           2090m    128
longhorn.storageclass.storage.k8s.io/persistentvolumeclaims   1        15
longhorn.storageclass.storage.k8s.io/requests.storage         250Gi    500Gi
memory                                                        24966Mi  512Gi
requests.nvidia.com/mig-1g.10gb                               0        2
requests.nvidia.com/mig-1g.20gb                               0        1
requests.nvidia.com/mig-2g.20gb                               1        1


In [6]:
# examine the kfp python sdk version inside a KubeFlow v1.5.1
!{sys.executable} -m pip list | grep kfp

[0mkfp                       1.8.22
kfp-pipeline-spec         0.1.16
kfp-server-api            1.8.5
[0m

## Setup global variables

In [7]:
import kfp

# --- Kubeflow connection -----------------------------------------------------
# Client created with its default connection settings.
client = kfp.Client()
# Namespace of the current notebook user, as reported by the client.
NAMESPACE = client.get_user_namespace()

# --- Experiment metadata -----------------------------------------------------
# Name of the experiment in the KF webapp UI
EXPERIMENT_NAME = 'scivias'
EXPERIMENT_DESC = 'extract information from doctors letter'
# Prefix used for the generated component yaml file names.
PREFIX = "llm"

# --- Data location -----------------------------------------------------------
DATA_ROOT = "/mnt"
DATA_SUB_PATH = "core-kind/yinwang"
# Folder (relative to DATA_ROOT) holding the pdf reports.
FILE_SUB_PATH = DATA_SUB_PATH + "/data/medreports"
# Glob pattern the pdf reports match.
FILE_PATTERN = "KK-SCIVIAS-*.pdf"

# --- Default model selection -------------------------------------------------
DEFAULT_GEN_MODEL_TYPE = "7B"
DEFAULT_TRANS_MODEL_TYPE = "custom"

print(NAMESPACE)

kubeflow-kindfor


In [8]:
from dataclasses import dataclass

'''
cudf 23.2.0 requires pandas<1.6.0dev0,>=1.0, but you have pandas 2.0.3 which is incompatible.
dask-cudf 23.2.0 requires pandas<1.6.0dev0,>=1.0, but you have pandas 2.0.3 which is incompatible.
'''
@dataclass
class Settings:
    """Container image and pinned package requirements shared by the pipeline components."""

    # Image referenced by components that run on the PyTorch base image.
    base_torch_image: str = "harbor-dmz.srv.med.uni-muenchen.de/core-general/ngc:0.0.0"
    # cudf and dask-cudf 23.2.0 require pandas<1.6.0dev0, hence the 1.5.x pin.
    pandas: str = "pandas==1.5.3"
    pypdf: str = "pypdf==3.15.5"
    pyarrow: str = "pyarrow==10.0.0"


# Single shared instance used by the component decorators.
settings = Settings()
print(settings)

Settings(base_torch_image='harbor-dmz.srv.med.uni-muenchen.de/core-general/ngc:0.0.0', pandas='pandas==1.5.3', pypdf='pypdf==3.15.5', pyarrow='pyarrow==10.0.0')


### Creating KubeFlow component from python function

In [9]:
# import kfp dsl components
import kfp.dsl as dsl
from functools import partial
from kfp.dsl import (
    pipeline,
    ContainerOp,
    PipelineVolume
)
from kfp.components import (
    InputPath,
    OutputPath,
    InputBinaryFile, 
    OutputBinaryFile,
    create_component_from_func
)

#### PDF text_extractor component

In [10]:
@partial(
    create_component_from_func,
    output_component_file=f"{PREFIX}_text_extraction_component.yaml",
    base_image="python:3.8.18", # settings.base_torch_image, # use pt base image
    packages_to_install=[
        settings.pandas,
        settings.pypdf,
        settings.pyarrow,
    ],# adding additional libs
)
def text_extractor(
    data_root: str,
    data_sub_path: str,
    file_pattern: str,
    file_idx: int,
    show_log_txt: bool,
    output_path: OutputPath("Dataset")):
    """Extract the text of one PDF file and store it as a feather file.

    Args:
      data_root: root folder of the mounted data
      data_sub_path: folder below data_root which holds the pdf files
      file_pattern: glob pattern the pdf files match
      file_idx: the idx of the file to extract text
      show_log_txt: print the extracted text to the component log if True
      output_path: path of the feather file, the text is stored in the column "text"

    Raises:
      IndexError: file_idx does not address one of the matched files
    """
    import glob
    import os
    from pypdf import PdfReader
    from pandas import DataFrame, Series

    class PDFHelper():
        """Find the files matching a pattern in a folder and read their content."""

        def __init__(self, data_folder: str, file_pattern: str):
            """
            Args:
              data_folder: where all the pdf files are
              file_pattern: the pdf files match this pattern

            Examples:

              PDFHelper(data_folder = "./data/medreports",
                        file_pattern="KK-SCIVIAS-*.pdf")
            """
            self.data_folder = data_folder
            self.file_pattern = file_pattern
            self.dir_path = f"{data_folder}/{file_pattern}"
            # glob returns the matches in arbitrary order, sort them so that
            # a file index addresses the same file on every run
            self.file_path_list = sorted(glob.glob(self.dir_path))

        def _resolve_path(self, source):
            """map a file path (str) or a file index (int) to a file path,
               returns None if the source can not be resolved
            """
            if isinstance(source, str):
                return source
            if isinstance(source, int) and source < len(self.file_path_list):
                return str(self.file_path_list[source])
            return None

        def read_pdf(self, source) -> str:
            """read the text of a pdf given by path or index and returns a raw string,
               returns an empty string if the source can not be resolved
            """
            file_path = self._resolve_path(source)
            if file_path is None:
                return ""
            reader = PdfReader(file_path)
            print(f"read pages: {len(reader.pages)}")
            return "".join(page.extract_text() for page in reader.pages)

        def count_token(self, source) -> int:
            """count the characters of the text in a pdf file given by path or index,
               returns 0 if the source can not be resolved
            """
            file_path = self._resolve_path(source)
            if file_path is None:
                return 0
            token_size = len(self.read_pdf(file_path))
            print(f"file: {file_path}\n" +
                  f"total token: {token_size}")
            return token_size

        def read_txt(self, source) -> str:
            """read a text file given by path or index and returns a raw string,
               returns an empty string if the source can not be resolved
            """
            file_path = self._resolve_path(source)
            if file_path is None:
                return ""
            with open(file_path, "r") as txt_file:
                return txt_file.read()


    '''helper function'''
    def show_folder_files(folder: str) -> None:
        print(os.listdir(folder))


    def save_txt_to_feather(txt_list, path: str, col_names=None) -> None:
        """store a str (one row) or a list of str (one row each) as feather file"""
        columns = ["text"] if col_names is None else col_names
        df = DataFrame(data=Series(txt_list), columns=columns)
        # pandas opens the target itself, no extra file handle is needed
        df.to_feather(path)


    def log_txt(txt: str, debug: bool) -> None:
        if debug:
            print("Reading string content...")
            print("-"*20)
            print(txt)


    '''Global variable'''
    data_path = f"{data_root}/{data_sub_path}"
    show_folder_files(data_path)
    loader = PDFHelper(data_folder=data_path, file_pattern=file_pattern)
    print(f"all files in directory:\n{loader.file_path_list}")

    # fail with a clear message instead of silently extracting an empty text
    file_count = len(loader.file_path_list)
    if not -file_count <= file_idx < file_count:
        raise IndexError(
            f"file_idx {file_idx} is out of range, "
            f"{file_count} file(s) match {loader.dir_path}")

    print(f"current loading file: {loader.file_path_list[file_idx]}")
    file_content = loader.read_pdf(file_idx)
    # show content in component logs
    log_txt(txt=file_content, debug=show_log_txt)
    # save to output
    save_txt_to_feather(file_content, output_path)

#### translate_bart component

While BERT was trained by using a simple token masking technique, BART empowers the BERT encoder by using more challenging kinds of masking mechanisms in its pre-training.

* loop with index using enumerate: https://treyhunner.com/2016/04/how-to-loop-with-indexes-in-python/
* BART : Generalizing BERT (due to the bidirectional encoder) and GPT2 (with the left to right decoder) : https://www.projectpro.io/article/transformers-bart-model-explained/553
* MarianMT on huggingface (BART): https://huggingface.co/docs/transformers/model_doc/marian

In [11]:
@partial(
    create_component_from_func,
    output_component_file=f"{PREFIX}_trans_component.yaml",
    base_image="tensorflow/tensorflow:2.12.0", # cpu version, small as pytorch https://hub.docker.com/r/pytorch/pytorch/tags?page=1&name=2.0.1
    # base_image="python:3.8.18", # settings.base_torch_image, # use pt base image
    packages_to_install=[
        "transformers==4.31.0",
        "sacremoses==0.0.53",
        "sentencepiece==0.1.99",
        settings.pandas,
        settings.pyarrow,
    ],# adding additional libs
)
def bart_translator(
    data_root: str,
    data_sub_path: str,
    model_type: str,
    show_log_txt: bool,
    input_path: InputPath("Dataset"),
    output_path: OutputPath("Dataset")
    ):
    """Translate the text stored in a feather file and store the result as feather file.

    Args:
      data_root: root folder of the mounted data
      data_sub_path: folder below data_root, XDG_CACHE_HOME is set to its "models" sub folder
      model_type: key of the translation model, unknown keys fall back to "custom"
      show_log_txt: print the translated text to the component log if True
      input_path: feather binary encoded text str to be translated from german to english
      output_path: feather binary encoded the translated english text
    """
    import subprocess, os, re, sys, time
    import pandas as pd
    from pandas import DataFrame, Series

    class GPUInfoHelper():
        """Print accelerator information through torch.

        NOTE(review): torch is not imported in this component (the import is
        commented out below), the helper is only usable after enabling it.
        """
        def __init__(self):
            pass


        def byte_gb_info(self, byte_mem) -> str:
            """calculate the byte size to GB size for better human readable"""
            # ":4f" is a minimum width of 4 with the default 6 decimal digits
            return f"{(byte_mem/1024**3):4f} GB"


        def accelerator_mem_info(self, device_idx: int):
            # total
            t = torch.cuda.get_device_properties(device_idx).total_memory
            # reserved by the torch allocator
            r = torch.cuda.memory_reserved(device_idx)
            # allocated
            a = torch.cuda.memory_allocated(device_idx)
            # reserved but not allocated
            f = r-a
            print(f"Physical  memory : {self.byte_gb_info(t)}\n" +
                  f"Reserved  memory : {self.byte_gb_info(r)}\n" +
                  f"Allocated memory : {self.byte_gb_info(a)}\n" +
                  f"Free      memory : {self.byte_gb_info(f)}")


        def accelerator_compute_info(self, device_idx: int) -> None:
            props = torch.cuda.get_device_properties(device_idx)
            print(f"Device_name      : {props.name} \n" +
                  f"Multi_processor  : {props.multi_processor_count}")


        def gpu_usage(self) -> None:
            num_of_gpus = torch.cuda.device_count()
            # this shows only the gpu device, not the MIG
            print(f"num_of_gpus: {num_of_gpus}")
            for device_idx in range(num_of_gpus):
                print("-"*20)
                self.accelerator_compute_info(device_idx)
                self.accelerator_mem_info(device_idx)
                print("-"*20)


    def display_container_info():
        print("-"*10)
        print(f"python version: {sys.version}")
        print("-"*10)


    def nvidia_device_uuid(input: str):
        """parse the nvidia devices uuid from the nvidia device info str,
           returns an empty str if the line contains no uuid
        """
        found = re.search(r'UUID\:\s(.+?)\)', input)
        return found.group(1) if found is not None else ""


    def nvidia_device_info() -> list:
        """get the non empty output lines of `nvidia-smi -L`,
           which list the GPU and the MIG devices with their uuid
        """
        # blocking call
        result = subprocess.run(["nvidia-smi", "-L"], stdout=subprocess.PIPE)
        # decode the byte object, returns string with \n
        cmd_out_str = result.stdout.decode('utf-8')
        return [line.strip() for line in cmd_out_str.split('\n') if len(line) > 0]


    def nvidia_mig_uuids() -> str:
        """get a comma separated str of nvidia MIGs devices
        """
        info_list = nvidia_device_info()
        # skip the first GPU ID, get the MIGs IDS
        uuid_list = [nvidia_device_uuid(e) for e in info_list[1:]]
        # lines without uuid must not end up as empty entries in the device list
        return ",".join(uuid for uuid in uuid_list if uuid)


    def init_transformers(cache_root: str) -> None:
        """point the XDG cache to the models folder below cache_root"""
        os.environ['XDG_CACHE_HOME']=f"{cache_root}/models"


    def init_cuda_torch(uuids: str, data_path: str) -> None:
        """setup the default env variables for transformers

        Args:
          uuids: a comma separate str of nvidia gpu/mig uuids
          data_path: folder which holds the models cache folder
        """
        os.environ["WORLD_SIZE"] = "1"
        os.environ["CUDA_VISIBLE_DEVICES"] = uuids
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512" #512
        init_transformers(data_path)


    def show_folder_files(folder: str) -> None:
        print(os.listdir(folder))


    # https://stackoverflow.com/questions/13673060/split-string-into-strings-by-length
    def wrap(s, w):
        """
        split a string into a list of strings with length w
        Args:
          s: original str
          w: width of each split of the string

        Return:
          a list of strings of length w, the last element may be shorter
        """
        return [s[i:i + w] for i in range(0, len(s), w)]


    def save_txt_to_feather(txt_list, path: str, col_names=None) -> None:
        """store a str (one row) or a list of str (one row each) as feather file"""
        columns = ["text"] if col_names is None else col_names
        df = DataFrame(data=Series(txt_list), columns=columns)
        # pandas opens the target itself, no extra file handle is needed
        df.to_feather(path)


    def load_feather(path: str, col_names=None) -> DataFrame:
        """read the feather file at path, col_names is accepted but not used"""
        return pd.read_feather(path)


    def log_txt(txt: str, debug: bool) -> None:
        if debug:
            print("Reading string content...")
            print("-"*20)
            print(txt)


    '''Global variable'''
    default_model_type = "custom"
    model_map = {
       #"small": "google/mt5-small", # 1.2 GB
       #"base" : "google/mt5-base", # 2.33 GB
       #"large" : "google/mt5-large", # 4.9 GB,
       #"xl" : "google/mt5-xl", # 15 GB
       #"xxl" : "google/mt5-xxl", # 51.7 GB,
       default_model_type: "Helsinki-NLP/opus-mt-de-en",
    }
    data_path = f"{data_root}/{data_sub_path}"
    # fall back to the default model id, not to the key of the default model
    if model_type not in model_map:
        print(f"unknown model_type {model_type}, using {default_model_type}")
    model_name = model_map.get(model_type, model_map[default_model_type])
    # nvidia_state = GPUInfoHelper()
    split_length = 350 # number of characters of each text split

    # load feather DataFrame
    feather_text_col_name = "text"
    df = load_feather(input_path)
    # get the row series and the column
    origin_text = str(df.iloc[0][feather_text_col_name])
    print(f"origin_text has token length: {len(origin_text)}")

    '''Initialization'''
    # UUIDs = nvidia_mig_uuids()
    init_transformers(data_path)
    # init_cuda_torch(UUIDs, data_path)
    # import torch
    display_container_info()
    # print(UUIDs)

    show_folder_files(data_path)

    # transformers must be imported after the env variables are set
    from transformers import pipeline
    import transformers

    print(f"Loading LLM model {model_name} ...")
    # using the accelerator with id by default, the device_map="auto" doesn't work,
    # model is too outdated to use auto accelerator detection.
    accelerator_id = 0
    generator = pipeline(
        "translation",
        model=model_name,
        # device_map="auto",
        device=accelerator_id,
    )

    def translate_gen(
        generator: transformers.pipelines.text2text_generation.TranslationPipeline,
        nvidia_state: GPUInfoHelper = None,
    ):
        """create a translate function bound to the generator

        Args:
          generator: the translation pipeline
          nvidia_state: optional helper, prints the gpu usage after a translation
        """

        def local(sentences: list, print_mode: bool = True, max_length=400) -> list:
            """single input, no batch input
            Args:
              sentences: the text passed to the generator
              print_mode: print the walltime (and gpu usage) if True
              max_length: passed as max_length to the generator
            """
            start = time.time()
            result = generator(sentences, max_length=max_length)
            duration = time.time() - start
            if print_mode:
                print("-"*20)
                print(f"walltime: {duration} in secs.")
                if nvidia_state is not None:
                    nvidia_state.gpu_usage()
            return result
        return local

    # create the convenient translate function
    # translate = translate_gen(generator, nvidia_state)
    translate = translate_gen(generator, None)

    # translate the input split by split
    splitted_content = wrap(origin_text, split_length)
    split_count = len(splitted_content)
    output = []
    for idx, split_text in enumerate(splitted_content, start=1):
        # print the timing only for the first and the last split
        print_mode = (idx == 1 or idx == split_count)
        print(f"print_mode: {print_mode}")
        translated = translate(split_text, print_mode, 1000)[0]
        output.append(translated.get('translation_text', '').strip())

    en_content = ''.join(output)
    print(f"translated en_content text has token length: {len(en_content)}")
    # show content in component logs
    log_txt(txt=en_content, debug=show_log_txt)
    save_txt_to_feather(en_content, output_path)
    

### Create llm inference component

#### Subprocess call to pass the nvidia-smi output

* Python 3.5 subprocess.run https://stackoverflow.com/questions/4760215/running-shell-command-and-capturing-the-output
* https://stackoverflow.com/questions/7681715/whats-the-difference-between-subprocess-popen-and-call-how-can-i-use-them

#### Issue

```console
RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
/usr/local/lib/python3.8/dist-packages/transformer_engine_extensions.cpython-38-x86_64-linux-gnu.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
Error: exit status 1
```
Workaround:
* https://github.com/microsoft/TaskMatrix/issues/116


* If a component that is declared to return a string returns a None object, it fails with an error: https://github.com/kubeflow/pipelines/issues/8868

#### Example of batch Gen

* https://github.com/huggingface/transformers/issues/18478#issuecomment-1208049618

In [12]:
from typing import NamedTuple
@partial(
    create_component_from_func,
    output_component_file=f"{PREFIX}_inference_component.yaml",
    base_image=settings.base_torch_image, # use pt base image
    packages_to_install=[
        # "transformers==4.32.1",
        "xformers==0.0.21",
        "huggingface_hub==0.17.1",
        # "accelerate==0.21.0", # bug in accelerate 0.22.0 which runs on cpu only https://discuss.huggingface.co/t/could-not-load-model-meta-llama-llama-2-7b-chat-hf-with-any-of-the-following-classes/47641
        settings.pandas,
        settings.pyarrow,
    ],# adding additional libs
)
def llm_gen(
    data_root: str,
    data_sub_path: str,
    model_type: str,
    prompt_templates: list,
    prompt_placeholder: str,
    prompt_context_path: InputPath("Dataset"),
    show_log_txt: bool=False) -> NamedTuple("output", [("answers", list)]):
    """Run every prompt through a Llama 2 chat model and return the generated answers.

    Args:
      data_root: root folder of the mounted data
      data_sub_path: folder below data_root, holds the models cache and the huggingface token
      model_type: key of the model ("7B", "13B", "70B"), unknown keys fall back to "7B"
      prompt_templates: list of prompt string template, which can be extended with prompt_context by the prompt_placeholder
      prompt_placeholder: the special character used in the prompts to be replaced by the prompt_context
      prompt_context_path (prompt_context): additional context string passed as feather binary, which can be injected to the prompts, the path is dropped during the kfp compiling
      show_log_txt: print the final prompts to the component log if True

    Returns:
      output namedtuple with answers, one generated text for each prompt
    """
    from collections import namedtuple
    import subprocess
    import os, time, sys, re
    import pandas as pd
    from pandas import DataFrame
    # https://github.com/huggingface/transformers/issues/23340
    # subprocess.call(["pip", "uninstall", "-y", "transformer-engine"])

    class GPUInfoHelper():
        """Print accelerator information through torch, which is imported
        by the enclosing function before the helper is used.
        """
        def __init__(self):
            pass


        def byte_gb_info(self, byte_mem) -> str:
            """calculate the byte size to GB size for better human readable"""
            # ":4f" is a minimum width of 4 with the default 6 decimal digits
            return f"{(byte_mem/1024**3):4f} GB"


        def accelerator_mem_info(self, device_idx: int):
            # total
            t = torch.cuda.get_device_properties(device_idx).total_memory
            # reserved by the torch allocator
            r = torch.cuda.memory_reserved(device_idx)
            # allocated
            a = torch.cuda.memory_allocated(device_idx)
            # reserved but not allocated
            f = r-a
            print(f"Physical  memory : {self.byte_gb_info(t)}\n" +
                  f"Reserved  memory : {self.byte_gb_info(r)}\n" +
                  f"Allocated memory : {self.byte_gb_info(a)}\n" +
                  f"Free      memory : {self.byte_gb_info(f)}")


        def accelerator_compute_info(self, device_idx: int) -> None:
            props = torch.cuda.get_device_properties(device_idx)
            print(f"Device_name      : {props.name} \n" +
                  f"Multi_processor  : {props.multi_processor_count}")


        def gpu_usage(self) -> None:
            num_of_gpus = torch.cuda.device_count()
            # this shows only the gpu device, not the MIG
            print(f"num_of_gpus: {num_of_gpus}")
            for device_idx in range(num_of_gpus):
                print("-"*20)
                self.accelerator_compute_info(device_idx)
                self.accelerator_mem_info(device_idx)
                print("-"*20)


    def display_container_info():
        print("-"*10)
        print(f"python version: {sys.version}")
        print(f"torch version: {torch.__version__}")
        print("-"*10)


    def nvidia_device_uuid(input: str):
        """parse the nvidia devices uuid from the nvidia device info str,
           returns an empty str if the line contains no uuid
        """
        found = re.search(r'UUID\:\s(.+?)\)', input)
        return found.group(1) if found is not None else ""


    def nvidia_device_info() -> list:
        """get the non empty output lines of `nvidia-smi -L`,
           which list the GPU and the MIG devices with their uuid
        """
        # blocking call
        result = subprocess.run(["nvidia-smi", "-L"], stdout=subprocess.PIPE)
        # decode the byte object, returns string with \n
        cmd_out_str = result.stdout.decode('utf-8')
        return [line.strip() for line in cmd_out_str.split('\n') if len(line) > 0]


    def nvidia_mig_uuids() -> str:
        """get a comma separated str of nvidia MIGs devices
        """
        info_list = nvidia_device_info()
        # skip the first GPU ID, get the MIGs IDS
        uuid_list = [nvidia_device_uuid(e) for e in info_list[1:]]
        # lines without uuid must not end up as empty entries in the device list
        return ",".join(uuid for uuid in uuid_list if uuid)


    def init_cuda_torch(uuids: str, data_path: str) -> None:
        """setup the default env variables for transformers

        Args:
          uuids: a comma separate str of nvidia gpu/mig uuids
          data_path: folder which holds the models cache folder
        """
        os.environ["WORLD_SIZE"] = "1"
        if uuids:
            os.environ["CUDA_VISIBLE_DEVICES"] = uuids
        else:
            # an empty CUDA_VISIBLE_DEVICES would hide every accelerator
            print("no MIG uuid found, CUDA_VISIBLE_DEVICES is left unchanged")
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512" #512
        os.environ["TOKENIZERS_PARALLELISM"]="false"
        os.environ['XDG_CACHE_HOME']=f"{data_path}/models"


    def show_folder_files(folder: str) -> None:
        print(os.listdir(folder))


    def huggingface_access_token(data_path: str) -> str:
        """read the huggingface access token stored below data_path"""
        token_file_path = f"{data_path}/.cache/huggingface/token"
        with open(token_file_path, "r") as file:
            return file.read().strip()


    def load_feather(path: str, col_names=None) -> DataFrame:
        """read the feather file at path, col_names is accepted but not used"""
        return pd.read_feather(path)


    def log_txt(data, debug: bool) -> None:
        """print a str or every element of a list if debug is True"""
        if not debug:
            return
        if isinstance(data, str):
            data = [data]
        elif not isinstance(data, list):
            return
        for txt in data:
            print("Reading string content...")
            print("-"*20)
            print(txt)


    def prompts_with_context(prompts: list, placeholder, context: str) -> list:
        """replace the placeholder in prompts with context

        Returns:
           new prompt list of str replace the placeholder with context
        """
        return [prompt.replace(placeholder, context) for prompt in prompts]


    '''Global variable'''
    default_model_type = "7B"
    model_map = {
        "7B": "meta-llama/Llama-2-7b-chat-hf",
        "13B" : "meta-llama/Llama-2-13b-chat-hf",
        "70B" : "meta-llama/Llama-2-70b-chat-hf",
        # "70B" : "meta-llama/Llama-2-70b-hf"
    }
    data_path = f"{data_root}/{data_sub_path}"
    # fall back to the default model id, not to the key of the default model
    if model_type not in model_map:
        print(f"unknown model_type {model_type}, using {default_model_type}")
    model_name = model_map.get(model_type, model_map[default_model_type])
    nvidia_state = GPUInfoHelper()
    # load feather DataFrame
    feather_text_col_name = "text"
    df = load_feather(prompt_context_path)
    # get the row series and the column
    prompt_context = str(df.iloc[0][feather_text_col_name])
    prompts = prompts_with_context(prompts=prompt_templates, placeholder=prompt_placeholder, context=prompt_context)
    # clean up memory
    del prompt_context
    # print str or list[str] content with log_txt
    log_txt(data=prompts, debug=show_log_txt)

    '''Initialization'''
    UUIDs = nvidia_mig_uuids()
    init_cuda_torch(UUIDs, data_path)
    # torch is imported after the env variables are set
    import torch
    display_container_info()
    print(UUIDs)

    show_folder_files(data_path)

    '''Transformers must be imported after the init_cuda_torch to get env set'''
    import transformers
    from transformers import pipeline, AutoTokenizer
    print(f"transformers version: {transformers.__version__}")

    def chat_gen(
        generator: transformers.pipelines.text_generation.TextGenerationPipeline,
        tokenizer: transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast,
        nvidia_state: GPUInfoHelper,
    ):
        """create a chat function bound to the generator and the tokenizer"""
        # plain `list` as return annotation: `list[str]` is evaluated when the
        # def runs and is not subscriptable on python 3.8
        def local(input: str, print_mode: bool = True) -> list:
            start = time.time()
            sequences = generator(
                input,
                do_sample=True,
                top_k=10,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                # max_length=200,
                max_new_tokens=200,
            )
            result = [f"Result: \n{seq['generated_text']}" for seq in sequences]
            duration = time.time() - start
            if print_mode:
                for s in result:
                    print(s)

                print("-"*20)
                print(f"walltime: {duration} in secs.")
                nvidia_state.gpu_usage()
            return result

        return local


    token = huggingface_access_token(data_path)
    print(f"XDG_CACHE_HOME: {os.environ['XDG_CACHE_HOME']}")
    print(f"Loading LLM model {model_name} ...")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        #use_auth_token=token,
        token=token, #transformers==4.32.1
    )

    generator = pipeline(
        "text-generation",
        model=model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        token=token,
        #use_auth_token=token, #transformers==4.32.1
    )
    # print gpu mem usage after loading the llm
    nvidia_state.gpu_usage()
    # create convenient chat function
    chat = chat_gen(generator, tokenizer, nvidia_state)

    gen_results = []
    print(f"total prompts: {len(prompts)}")
    for prompt in prompts:
        print(f"current prompt token length: {len(prompt)}")
        talk_back_list = chat(prompt)
        # keep one answer for each prompt, an empty str if nothing was generated
        gen_results.append(talk_back_list[0] if talk_back_list else "")

    output = namedtuple('output',['answers'])
    return output(gen_results)
    
    

### Create data processing component

### Define Helper Function
Difference between 2Gi and 2G:
* https://stackoverflow.com/questions/50804915/kubernetes-size-definitions-whats-the-difference-of-gi-and-g/50805048#50805048

Set MIG GPU requests:
* https://github.com/kubeflow/pipelines/issues/6858#issuecomment-1007511676

```python
containerOp.add_resource_request(gpu_resource, gpu_req)
containerOp.add_resource_limit(gpu_resource, gpu_lim)
```

In [13]:
def pod_resource_transformer(op: ContainerOp, mem_req="200Mi", cpu_req="2000m", mem_lim="4000Mi", cpu_lim='4000m', gpu_req=None, gpu_lim=None, gpu_resource="nvidia.com/mig-1g.20gb"):
    """
    Set the memory, CPU and (optionally) GPU resources of a container operator.

    Args:
      op: the container operator to configure.
      mem_req: memory request, e.g. '200Mi'.
      mem_lim: memory limit, e.g. '4000Mi'.
      cpu_req: cpu request, e.g. '1000m' = 1 cpu core.
      cpu_lim: cpu limit, e.g. '4000m' = 4 cpu cores.
      gpu_req: requested amount of `gpu_resource`, or None for no GPU.
      gpu_lim: limit for `gpu_resource`, or None for no GPU.
      gpu_resource: name of the GPU resource to request, defaults to the
        MIG profile "nvidia.com/mig-1g.20gb"; pass e.g.
        "nvidia.com/mig-2g.20gb" to use another profile.

    Returns:
      the operator returned by the chained memory/cpu setter calls.

    Note:
      the GPU request and limit are only applied when BOTH gpu_req and
      gpu_lim are given; if either one is None no GPU resource is set.
    """
    new_op = op.set_memory_request(mem_req)\
        .set_memory_limit(mem_lim)\
        .set_cpu_request(cpu_req)\
        .set_cpu_limit(cpu_lim)
    if (gpu_req is not None) and (gpu_lim is not None):
        new_op.add_resource_request(gpu_resource, gpu_req)
        new_op.add_resource_limit(gpu_resource, gpu_lim)
    return new_op

## Define Pipeline
* Intro Kubeflow pipeline: https://v1-5-branch.kubeflow.org/docs/components/pipelines/introduction/
* Kubeflow pipeline SDK v1: https://v1-5-branch.kubeflow.org/docs/components/pipelines/sdk/sdk-overview/

#### Construct prompt list

In [14]:
# Marker inside a prompt template that stands for the context text.
place_holder = "#"

# Generic chain-of-thought sanity-check prompt (no placeholder inside).
test_prompt = (
    "Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. "
    "Each can has 4 tennis balls. How many tennis balls does he have now?\n"
    "A: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. "
    "3 + 8 = 11. The answer is 11.\n"
    "Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, "
    "how many apples do they have?\n"
)

# One-shot template asking for the patient's name.
name_prompt = (
    "Context: Patient: Fried\n"
    "Question: what is the name of the patient? \n"
    "Answer: Name of the patient is Fried\n"
    f"Context: {place_holder}\n"
    "Question: what is the name of the patient?\n"
    "Answer: the name of patient is"
)

# One-shot template asking for the patient's age.
age_prompt = (
    "Context:\n"
    "Patient: Fried is a 34-year-old patient\n"
    "Question:\n"
    "how old is the patient? \n"
    "Answer:\n"
    "Fried is a patient, 34 year-old, the answers is 34\n"
    "Context:\n"
    f"{place_holder}\n"
    "Question:\n"
    "how old is the patient?\n"
    "Answer: "
)

# Only the age template is active; processing several prompts in one run
# went out of memory. Alternatives that were tried:
#   prompts = [test_prompt, name_prompt, age_prompt]
#   prompts = [test_prompt]
prompts = [age_prompt]

#### Define pipeline DAG

In [15]:
@pipeline(
    name=EXPERIMENT_NAME,
    description=EXPERIMENT_DESC,
)
def custom_pipeline(
    data_root: str = "/mnt",
    data_sub_path: str = "core-kind/yinwang",
    file_sub_path: str = "core-kind/yinwang/data/medreports",
    file_pattern: str = "KK-SCIVIAS-*.pdf",
    file_idx: int = 1,
    trans_model_type: str = "custom",
    gen_model_type: str = "7B",
    show_log_txt: bool = False,
):
    """
    Three-step pipeline: PDF text extraction -> translation -> LLM generation.

    Args:
      data_root: the mount path of shared data volume.
      data_sub_path: the relative path to the data folder, without leading ./
      file_sub_path: relative path handed to the text extractor as its
        data_sub_path.
      file_pattern: file name pattern handed to the text extractor.
      file_idx: file index handed to the text extractor.
      trans_model_type: model type handed to the translator step.
      gen_model_type: model type handed to the LLM generation step.
      show_log_txt: logging flag forwarded to all three steps.
    """
    # Cache staleness values: "P0D" disables caching, "P1D" keeps
    # cached results for one day. Switch cache_setting to toggle.
    no_artifact_cache = "P0D"
    artifact_cache_today = "P1D"
    cache_setting = artifact_cache_today

    # Predefined PVC in the namespace, mounted at data_root in every step.
    shared_volume = PipelineVolume("llm-models")

    def _finish(task, display_name, **resources):
        """Apply resources, cache staleness, shared volume and display name."""
        task = pod_resource_transformer(task, **resources)
        task.execution_options.caching_strategy.max_cache_staleness = cache_setting
        task.add_pvolumes({data_root: shared_volume})
        task.set_display_name(display_name)
        return task

    # Step 1: extract the raw text from the selected PDF.
    extract_task = _finish(
        text_extractor(
            data_root=data_root,
            data_sub_path=file_sub_path,
            file_pattern=file_pattern,
            file_idx=file_idx,
            show_log_txt=show_log_txt,
        ),
        "PDF-text extractor",
        mem_req="4000Mi",
        cpu_req="2000m",
        mem_lim="8000Mi",
        cpu_lim="4000m",
    )

    # Step 2: translate the extracted text; the step gets one GPU.
    translate_task = _finish(
        bart_translator(
            data_root=data_root,
            data_sub_path=data_sub_path,
            model_type=trans_model_type,
            show_log_txt=show_log_txt,
            input=extract_task.outputs["output"],
        ),
        "BART de_en translator",
        mem_req="12000Mi",
        cpu_req="4000m",
        mem_lim="12000Mi",
        cpu_lim="16000m",
        gpu_req=1,
        gpu_lim=1,
    )

    # Step 3: fill the prompt templates with the translated text and run
    # the LLM; the step gets one GPU.
    inference_task = _finish(
        llm_gen(
            data_root=data_root,
            data_sub_path=data_sub_path,
            model_type=gen_model_type,
            prompt_templates=prompts,
            prompt_context=translate_task.output,
            prompt_placeholder=place_holder,
            show_log_txt=show_log_txt,
        ),
        "LlaMA2 entity extractor",
        mem_req="24000Mi",
        cpu_req="1000m",
        mem_lim="24000Mi",
        cpu_lim="2000m",
        gpu_req=1,
        gpu_lim=1,
    )

    # Explicit ordering in addition to the data dependency above.
    inference_task.after(translate_task)
    
    

### (optional) pipeline compile step
Use the following command to compile the pipeline to a YAML file.

In [16]:
# Base name (without extension) of the compiled pipeline package.
PIPE_LINE_FILE_NAME = f"{PREFIX}_kfp1_info_extraction_pipeline"

# Compile the pipeline function into "<PIPE_LINE_FILE_NAME>.yaml".
kfp.compiler.Compiler().compile(
    pipeline_func=custom_pipeline,
    package_path=f"{PIPE_LINE_FILE_NAME}.yaml",
)

### Create Experiment Run

Create a run label with the current date and time:
```python
from datetime import datetime
from pytz import timezone as ptimezone
ts = datetime.strftime(datetime.now(ptimezone("Europe/Berlin")), "%Y-%m-%d %H-%M-%S")
print(ts)
```

Reference:
* https://stackoverflow.com/questions/25837452/python-get-current-time-in-right-timezone/25887393#25887393

In [17]:
from datetime import datetime
from pytz import timezone as ptimezone

def get_local_time_str(target_tz_str: str = "Europe/Berlin", format_str: str = "%Y-%m-%d %H-%M-%S") -> str:
    """
    Return the current time in the given timezone as a formatted string.

    This helper exists because the local timezone is misconfigured on the
    server, so the target timezone is named explicitly.

    @param target_tz_str: timezone name, default "Europe/Berlin"
    @param format_str: strftime format, the default "%Y-%m-%d %H-%M-%S"
                       returns e.g. 2022-07-07 12-08-45
    """
    # pytz timezone object; from python3.9 on the standard lib ZoneInfo
    # could be used instead
    now_in_target_tz = datetime.now(ptimezone(target_tz_str))
    return now_in_target_tz.strftime(format_str)

### Config pipeline run
* Setting imagePullSecrets for Pipeline with SDK: https://github.com/kubeflow/pipelines/issues/5843#issuecomment-859799181

In [18]:
# import pandas as pd
# from pandas import DataFrame, Series

# place_holder = "#"
# test_prompt='Q: Roger has 3 tennis balls. He buys 2 more cans of tennis balls. Each can has 4 tennis balls. How many tennis balls does he have now?\nA: Roger started with 3 balls. 2 cans of 4 tennis balls each is 8 tennis balls. 3 + 8 = 11. The answer is 11.\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\n'
# name_prompt = f"Context: Patient: Fried\nQuestion: what is the name of the patient? \nAnswer: Name of the patient is Fried\nContext: {place_holder}\nQuestion: what is the name of the patient?\nAnswer: the name of patient is"
# age_prompt = f"Context:\nPatient: Fried is a 34-year-old patient\nQuestion:\nhow old is the patient? \nAnswer:\nFried is a patient, 34 year-old, the answers is 34\nContext:\n{place_holder}\nQuestion:\nhow old is the patient?\nAnswer: "

# ser = Series([test_prompt, name_prompt, age_prompt])
# df = DataFrame(data=ser, columns=["prompt"])
# df.to_json()
# print(df)

In [19]:
# Pipeline-wide configuration object passed to the run below.
pipeline_config = dsl.PipelineConf()

# Reuse images already present on the node. Alternatives kept for reference:
#   from kubernetes import client as k8s_client
#   pipeline_config.set_image_pull_secrets([k8s_client.V1ObjectReference(name=K8_GIT_SECRET_NAME, namespace=NAME_SPACE)])
#   pipeline_config.set_image_pull_policy("Always")
pipeline_config.set_image_pull_policy("IfNotPresent")

# Arguments for custom_pipeline.
# Note on file_idx:
#   file_idx = 1 has 8 page 14K token, too long for the llm prompt
#   file_idx = 0 has 4 page 7K token, works for the llm prompt
pipeline_args = dict(
    data_root=DATA_ROOT,
    data_sub_path=DATA_SUB_PATH,
    file_sub_path=FILE_SUB_PATH,
    file_pattern=FILE_PATTERN,
    file_idx=0,  # 1
    gen_model_type=DEFAULT_GEN_MODEL_TYPE,
    trans_model_type=DEFAULT_TRANS_MODEL_TYPE,
    show_log_txt=True,
)
print(pipeline_args)

{'data_root': '/mnt', 'data_sub_path': 'core-kind/yinwang', 'file_sub_path': 'core-kind/yinwang/data/medreports', 'file_pattern': 'KK-SCIVIAS-*.pdf', 'file_idx': 0, 'gen_model_type': '7B', 'trans_model_type': 'custom', 'show_log_txt': True}


In [20]:
# Run label: prefix plus the current local timestamp.
RUN_NAME = f"{PREFIX}_extract_info_kfp1 {get_local_time_str()}"

# Submit the pipeline function as a run; `client` is expected to exist
# already (otherwise create it with: client = kfp.Client()).
client.create_run_from_pipeline_func(
    pipeline_func=custom_pipeline,
    arguments=pipeline_args,  # use {} to run with the pipeline defaults
    run_name=RUN_NAME,
    experiment_name=EXPERIMENT_NAME,
    namespace=NAMESPACE,
    pipeline_conf=pipeline_config,
)

RunPipelineResult(run_id=78fa3d87-dab9-42f2-a6ab-cbee1a5d59c6)