## Introduction

@author: Yingding Wang\
@created: 28.02.2024\
@updated: 28.02.2024\
@version: 1

This notebook contains llm inference pipeline (using transformer, pytorch, llama2, langchain) performed on an annotated properitory medical report dataset to extract weight information, the dataset is accessed from a s3 compatible storage.

This notebook also demonstrate the use of `applyllm` PyPI lib for building a LLM RAG inference system.


In [1]:
import sys

In [2]:
#!{sys.executable} -m pip install --upgrade pip
#!{sys.executable} -m pip install --user --upgrade kfp==1.8.22

In [3]:
#!{sys.executable} -m pip install --user --upgrade --no-cache-dir -i https://test.pypi.org/simple/ applyllm

In [4]:
#!cat ./requirements.txt

### Libraries used in the pipeline

In [5]:
from platform import python_version
import applyllm

print(python_version())
print(f"applyllm.__version__ : {applyllm.__version__}")

3.10.14
applyllm.__version__ : 0.0.6


In [6]:
import os
from applyllm.accelerators import (
    AcceleratorHelper,
    AcceleratorStatus,
    DIR_MODE_MAP,
    TokenHelper as th
)

dir_setting = dir_setting=DIR_MODE_MAP.get("kf_notebook")
gpu_status = AcceleratorStatus.create_accelerator_status()

In [7]:
gpu_helper = AcceleratorHelper()
# dynamically fetch attached accelerator devices
UUIDs = gpu_helper.nvidia_device_uuids_filtered_by(is_mig=True, log_output=False)

In [8]:
# init all the cuda torch env and model download cache directory
gpu_helper.init_cuda_torch(UUIDs, dir_setting)

print(os.environ["CUDA_VISIBLE_DEVICES"])
print(os.environ["XDG_CACHE_HOME"])

MIG-9579f618-ddae-5958-9285-3207382f0b36
/home/jovyan/llm-models/core-kind/yinwang/models


In [9]:
model_map = {
    "llama7B-chat":     "meta-llama/Llama-2-7b-chat-hf",
    "llama13B-chat":    "meta-llama/Llama-2-13b-chat-hf",
    "llama70B-chat":    "meta-llama/Llama-2-70b-chat-hf",
    "llama3-8B-inst":   "meta-llama/Meta-Llama-3-8B-Instruct",
    "llama3-70B-inst":  "meta-llama/Meta-Llama-3-70B-Instruct",
    "mistral7B-01":     "mistralai/Mistral-7B-v0.1",
    "mistral7B-inst02": "mistralai/Mistral-7B-Instruct-v0.2",
    "mixtral8x7B-01":   "mistralai/Mixtral-8x7B-v0.1",
    "mixtral8x7B-inst01": "mistralai/Mixtral-8x7B-Instruct-v0.1", 
    "gemma7b-it": "google/gemma-7b-it",
    "gemma7b":    "google/gemma-7b",
    "gemma2b-it": "google/gemma-2b-it",
    "gemma2b":    "google/gemma-2b",
    "gemma7b-it-1.1": "google/gemma-1.1-7b-it",
    "gemma2b-it-1.1": "google/gemma-1.1-2b-it",
}

default_model_type = "mistral7B-inst02"
default_dir_mode = "mac_local"

In [10]:
# model_type = default_model_type
# model_type = "mistral7B-inst02"
# model_type = "mixtral8x7B-01"
model_type = "mixtral8x7B-inst01"
# model_type = "llama3-70B-inst"
# model_type = "llama3-8B-inst"
# model_type = "llama13B-chat"

model_name = model_map.get(model_type, default_model_type)

print(model_name)

mistralai/Mixtral-8x7B-Instruct-v0.1


In [11]:
import transformers
import torch
# from transformers import pipeline
from transformers import AutoModelForCausalLM, AutoTokenizer
print(transformers.__version__)
print(torch.__version__)

4.39.3
2.2.2+cu118


In [12]:
"""
Load the token
"""
token_kwargs = th.gen_token_kwargs(model_type=model_type, dir_setting=dir_setting)

huggingface token is NOT needed


## Following this approach to load OSS LLM model with bitsandbytes quantization
* https://github.com/pinecone-io/examples/blob/master/learn/generation/llm-field-guide/llama-2/llama-2-13b-retrievalqa.ipynb

## 4bit  quantization

Load pretrained model first, then the tokenizer.

<table>
    <!-- row 1-->
<tr>
<th>
Llama-2-13b-chat-hf
</th>
<th>
Mixtral-8x7B-v0.1
</th>

<th>
Mixtral-8x7B-Instruct-v0.1
</th>
</tr>
    <!-- row 2 -->
<tr>

<td>
<pre>
num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 3g.40gb 
Device idx       : 0 
No. of processors: 42
Physical  memory : 39.250000 GB
Reserved  memory : 7.085938 GB
Allocated memory : 6.809501 GB
Free      memory : 0.276437 GB
--------------------
</pre>
</td>

<td>
<pre>
num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 3g.40gb 
Device idx       : 0 
No. of processors: 42
Physical  memory : 39.250000 GB
Reserved  memory : 23.496094 GB
Allocated memory : 23.303491 GB
Free      memory : 0.192603 GB
--------------------
</pre>
</td>

<td>
<pre>
num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 3g.40gb 
Device idx       : 0 
No. of processors: 42
Physical  memory : 39.250000 GB
Reserved  memory : 23.496094 GB
Allocated memory : 23.303491 GB
Free      memory : 0.192603 GB
--------------------
</pre>
</td>

</tr>
</table>

In [13]:
from applyllm.utils import time_func
from applyllm.pipelines import (
    LocalCausalLMConfig,
    ModelConfig,
    ModelCatalog,
)

#cuda_max_memory = {
#   0: "40GB", # GPU device 0
#   "cpu": "160GB", 
#} 

base_lm_config = ModelConfig(
    model_config = {
        "pretrained_model_name_or_path": model_name,
        "device_map": "auto",
        # "max_memory": cuda_max_memory,
        # "offload_state_dict": True,
    }
)

kwargs = {
    "quantized": True,
    "model_config": base_lm_config.get_config(),
    "quantization_config": {
        "quantization_config": transformers.BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )
    }
}
lm_config = LocalCausalLMConfig(**kwargs)

lm_config

LocalCausalLMConfig(quantized=True, model_config={'pretrained_model_name_or_path': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'device_map': 'auto'}, quantization_config={'quantization_config': BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}
})

In [14]:
@time_func
def fetch_model():
    return AutoModelForCausalLM.from_pretrained(
      **lm_config.get_config(),
      **token_kwargs,  
    )

model = fetch_model()

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

executed: fetch_model() python function
walltime: 223.96892833709717 in secs.


In [15]:
gpu_status.gpu_usage()

num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 3g.40gb 
Device idx       : 0 
No. of processors: 42
Physical  memory : 39.250000 GB
Reserved  memory : 23.527344 GB
Allocated memory : 23.302026 GB
Free      memory : 0.225318 GB
--------------------


### Llama2 max position embeddings
Default is set to be 2048
* https://huggingface.co/docs/transformers/model_doc/llama2#transformers.LlamaConfig.max_position_embeddings

Set teh max_length for the tokenizer, Transformer issues:
* https://github.com/huggingface/transformers/issues/1791#issuecomment-553397054
* https://github.com/huggingface/transformers/pull/1833


In [16]:
def config_tokenizer(model_name: str, config: dict, pad_token_id = 2):
    if model_name.startswith(ModelCatalog.MISTRAL_FAMILY):
        return {**config, "pad_token_id": pad_token_id}
    else:
        return config

In [17]:
MAX_POSITION_EMBEDDINGS = 3072
MAX_LENGTH = 4096

model_config= {
    "pretrained_model_name_or_path": model_name,
    "device": "cpu",
    # "device_map": "auto", # put to GPU if GPU is available
    "max_position_embeddings": MAX_LENGTH,
    "max_length": MAX_LENGTH,
}
model_config = config_tokenizer(model_name=model_name, config=model_config)


tokenizer_config = ModelConfig(model_config=model_config)
tokenizer_config

ModelConfig(model_config={'pretrained_model_name_or_path': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'device': 'cpu', 'max_position_embeddings': 4096, 'max_length': 4096, 'pad_token_id': 2})

In [18]:
tokenizer = AutoTokenizer.from_pretrained(
    **tokenizer_config.get_config(), 
    **token_kwargs,
)
tokenizer

LlamaTokenizerFast(name_or_path='mistralai/Mixtral-8x7B-Instruct-v0.1', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

### Inference with transformers pipeline

Reference:
* https://huggingface.co/docs/transformers/pipeline_tutorial

Note:
* batch is not activated by default, batch is not necessary faster for `transformers.pipeline`
* the max_new_tokens set in the pipeline initialization works with langchain.llms.HuggingFacePipeline, but not as a param for the TextGenerationPipeline 

max_new_tokens https://github.com/huggingface/transformers/issues/19358

#### Return only generated text as LLM output
new transformers.pipeline update
```python
"return_full_text": False
```

In [19]:
MAX_NEW_TOKENS = 200 # 80

tp_kwargs = {
    "task": "text-generation",
    "model": model,
    "tokenizer": tokenizer,
    "device_map": "auto",
    "max_length": None, # remove the total length of the generated response
    "max_new_tokens": MAX_NEW_TOKENS, # set 200 instead of 80, since it may cut off the json response. set the size of new generated token 
    "return_full_text": False, # return only the generated text, not the input text with the generated text
}

tp_config = ModelConfig(model_config = tp_kwargs)

generator = transformers.pipeline(
    **tp_config.get_config(),
    **token_kwargs,
)

In [20]:
type(generator)

transformers.pipelines.text_generation.TextGenerationPipeline

### Huggingface with Local LLM

* https://python.langchain.com/docs/integrations/llms/huggingface_pipelines

HuggingFacePipeline from langchain need pydantic>=1.10.13

```shell
import pydantic
print(pydantic.__version__)
```
* https://stackoverflow.com/questions/76313592/import-langchain-error-typeerror-issubclass-arg-1-must-be-a-class

In [21]:
from applyllm.pipelines import ModelCatalog, ModelInfo, PromptHelper

model_info = ModelCatalog.get_model_info(model_name=model_name)
prompt_helper = PromptHelper(model_info=model_info)

model_info

ModelInfo(model_family='mistralai', inst_msg_begin='<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don\'t know the answer to a question, please don\'t share false information. Just return "</s>"\n', inst_msg_end='[/INST]')

In [22]:
import pydantic
import langchain
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

print(f"pydantic.__version__: {pydantic.__version__}")
print(f"langchain.__version__: {langchain.__version__}")

pydantic.__version__: 1.10.13
langchain.__version__: 0.1.16


### Init a HuggingFacePipeline with pipeline_kwargs

https://github.com/langchain-ai/langchain/issues/8280#issuecomment-1652085694

In [23]:
llm = HuggingFacePipeline(
    pipeline=generator 
)

print(llm)
print(llm.pipeline.model)

[1mHuggingFacePipeline[0m
Params: {'model_id': 'gpt2', 'model_kwargs': None, 'pipeline_kwargs': None}
MixtralForCausalLM(
  (model): MixtralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MixtralDecoderLayer(
        (self_attn): MixtralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MixtralRotaryEmbedding()
        )
        (block_sparse_moe): MixtralSparseMoeBlock(
          (gate): Linear4bit(in_features=4096, out_features=8, bias=False)
          (experts): ModuleList(
            (0-7): 8 x MixtralBlockSparseTop2MLP(
              (w1): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (w2): Linear4bit

In [24]:
# there is a bug, the HuggingFacePipeine is not getting the param directly
# https://github.com/langchain-ai/langchain/issues/8280

# this must be set for the generator (HuggingFacePipeline) to work
llm.model_id = model_name
pipeline_kwargs_config = {
    "device_map": "auto",
    "max_length": MAX_LENGTH, # deactivate to use max_new_tokens
    "max_new_tokens": MAX_NEW_TOKENS, # this is not taken by the model ?
    "eos_token_id": tokenizer.eos_token_id, # also making trouble (optional)
    "temperature": 0.01,
    "repetition_penalty": 1.15, # 1.15,
}
model_kwargs_config = {
    "do_sample": True, # also making trouble with langchain (optional)
    "top_k": 3, # this param result in trouble with langchain (optional)
    "num_return_sequences": 1, # (optional)
    "eos_token_id": tokenizer.eos_token_id, # also making trouble (optional)
    "max_length": MAX_LENGTH, # deactivate to use max_new_tokens
    "max_new_tokens": MAX_NEW_TOKENS, # this is not taken by the model ?
    "temperature": 0.01,
    "top_p": 0.8, # 0.95 # alternative to top_k summerized probability while do_sample=True
    "repetition_penalty": 1.15, # 1.15,
}

llm.model_kwargs = config_tokenizer(model_name=model_name, config=model_kwargs_config, pad_token_id=tokenizer.eos_token_id)
llm.model_kwargs["trust_remote_code"] = True
llm.pipeline_kwargs = config_tokenizer(model_name=model_name, config=pipeline_kwargs_config, pad_token_id=tokenizer.eos_token_id)

print(llm.model_kwargs)
print(llm.pipeline_kwargs)

{'do_sample': True, 'top_k': 3, 'num_return_sequences': 1, 'eos_token_id': 2, 'max_length': 4096, 'max_new_tokens': 200, 'temperature': 0.01, 'top_p': 0.8, 'repetition_penalty': 1.15, 'pad_token_id': 2, 'trust_remote_code': True}
{'device_map': 'auto', 'max_length': 4096, 'max_new_tokens': 200, 'eos_token_id': 2, 'temperature': 0.01, 'repetition_penalty': 1.15, 'pad_token_id': 2}


In [25]:
gpu_status.gpu_usage()

num_of_gpus: 1
--------------------
Device name      : NVIDIA A100 80GB PCIe MIG 3g.40gb 
Device idx       : 0 
No. of processors: 42
Physical  memory : 39.250000 GB
Reserved  memory : 23.527344 GB
Allocated memory : 23.302026 GB
Free      memory : 0.225318 GB
--------------------


## Sequential Doc Chain

https://github.com/langchain-ai/langchain/discussions/8383

In [26]:
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import S3DirectoryLoader, S3FileLoader
# from langchain.document_loaders import S3DirectoryLoader, S3FileLoader
from langchain_community.vectorstores import DocArrayInMemorySearch
# from langchain.vectorstores import DocArrayInMemorySearch
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
# from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
# from langchain.text_splitter import TextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document
from langchain.prompts import PromptTemplate
from typing import List
import boto3
from applyllm.io import S3PdfObjHelper, DocMetaInfo, DocCorpusS3

print(boto3.__version__)

1.34.84


## Loading S3 objects

In [27]:
bucket_name = "scivias-medreports"
file_prefix = "KK-SCIVIAS"
PREFIX = f"{S3PdfObjHelper.DataContract.key_lead}/{file_prefix}"
access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
s3_endpoint = os.environ.get('S3_ENDPOINT')
# VERIFY = False
VERIFY = True
print(PREFIX)

trans2en/KK-SCIVIAS


### Loading one text file
* https://python.langchain.com/docs/integrations/document_loaders/aws_s3_file

In [28]:
def get_single_file_loader(key: str):
    return S3FileLoader(bucket=bucket_name, 
                      key = key,
                      aws_access_key_id=access_key_id,
                      aws_secret_access_key=secret_access_key,
                      endpoint_url=s3_endpoint,
                      verify = VERIFY,
                      use_ssl = True)

In [29]:
from typing import List


@time_func
def fetch_s3_object(key: str) -> List[Document]:
    """
    Returns:
      a list of LangChain Document object
    """
    loader = get_single_file_loader(key)
    return loader.load()

### Loading all data from s3

In [30]:
def get_prefix_files_loader(prefix: str):
    return S3DirectoryLoader(bucket=bucket_name,
                           prefix=prefix, 
                           aws_access_key_id=access_key_id, 
                           aws_secret_access_key=secret_access_key,
                           endpoint_url=s3_endpoint,
                           verify = VERIFY,
                           use_ssl = True)

In [31]:
@time_func
def fetch_s3_corpus(prefix: str) -> List[Document]:
    loader = get_prefix_files_loader(prefix=prefix)
    return loader.load()

In [32]:
TEST_SINGLE_DOC = True

# missing closing }\n```
# TEST_S3_DOC_KEY = "trans2en/KK-SCIVIAS-00400^0054947100^2021-08-05^KIIID.txt"
# TEST_S3_DOC_KEY = "trans2en/KK-SCIVIAS-00401^0052906626^2017-12-18^KIIID.txt"
TEST_S3_DOC_KEY = "trans2en/KK-SCIVIAS-00403^0049215191^2011-06-30^KIIMUKO.txt"

# TEST_S3_DOC_KEY = "trans2en/KK-SCIVIAS-00003^0053360847^2018-09-28^KIIGAS.txt"

In [33]:
if TEST_SINGLE_DOC:
    data = fetch_s3_object(key=TEST_S3_DOC_KEY)
else:
    data = fetch_s3_corpus(pefix=PREFIX)

executed: fetch_s3_object() python function
walltime: 3.9831676483154297 in secs.


In [34]:
s3_corpus = DocCorpusS3(data)
print("--- Max Length Doc Info ---")
print(s3_corpus.max_doc_meta)
print("--- Min Length Doc Info ---")
print(s3_corpus.min_doc_meta)

--- Max Length Doc Info ---
source:s3://scivias-medreports/trans2en/KK-SCIVIAS-00403^0049215191^2011-06-30^KIIMUKO.txt
name:KK-SCIVIAS-00403^0049215191^2011-06-30^KIIMUKO.txt
tokens:1734
characters:13048
--- Min Length Doc Info ---
source:s3://scivias-medreports/trans2en/KK-SCIVIAS-00403^0049215191^2011-06-30^KIIMUKO.txt
name:KK-SCIVIAS-00403^0049215191^2011-06-30^KIIMUKO.txt
tokens:1734
characters:13048


### Setting the current file index from the total corpus

In [35]:
if TEST_SINGLE_DOC:
    file_idx = 0 # ID 0003 has weight 43.2 kg
else:
    file_idx = 0 # ID 0003 has weight 43.2 kg
    # file_idx = 1
    # file_idx = idx_of_max_token

show_content = False
# show_content = True

In [36]:
# the s3_corpus can be single doc corpus or multiple doc corpus
# for testing purpose, just get one doc using file_idx
CUR_DOC, CUR_DOC_INFO = s3_corpus.get_s3_obj_info(file_idx, show_content=show_content)

total objects: 1
s3 key     :s3://scivias-medreports/trans2en/KK-SCIVIAS-00403^0049215191^2011-06-30^KIIMUKO.txt
obj name   :KK-SCIVIAS-00403^0049215191^2011-06-30^KIIMUKO.txt
token size :1734
char. size :13048


### Langchain text splitter

* https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/recursive_text_splitter

## Restarting Point (Rerun below cells)

In [37]:
from applyllm.utils import token_size

CHUNK_SIZE = (MAX_POSITION_EMBEDDINGS // 1000) * 1000
model_config = {
    # Set a really small chunk size, just to show.
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": 20,
    "length_function": token_size, # len,
    "is_separator_regex": False,
}

splitter_config = ModelConfig(model_config=model_config)
text_splitter = RecursiveCharacterTextSplitter(
    **splitter_config.get_config()
)

splitter_config

ModelConfig(model_config={'chunk_size': 3000, 'chunk_overlap': 20, 'length_function': <function token_size at 0x7f31da610160>, 'is_separator_regex': False})

In [38]:
# Optional test of RecursiveCharacterTextSplitter on \n and other chars
test_text = CUR_DOC.page_content
text_split_list = text_splitter.split_text(test_text)

print(len(text_split_list))

for seg in text_split_list:
    print(f"len:    {len(seg)}")
    print(f"tokens: {token_size(seg)}")
# print(text_split_list[-1])    

1
len:    13048
tokens: 1734


### Langchain embeddings

use sentence-transformers  

* all-MiniLM-L12-v2 : 134MB https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2 
* all-MiniLM-L6-v2 : 90MB https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/tree/main

Llama2 does not support document embedding by default
* https://stackoverflow.com/questions/77037897/how-to-create-an-embeddings-model-in-langchain

HuggingFaceEmbedding embed_documents example
* https://python.langchain.com/docs/modules/data_connection/text_embedding/

In-memory vectorstore need DocArray
* https://python.langchain.com/docs/integrations/providers/docarray

TextEmbeddings in LangChain
* https://python.langchain.com/docs/modules/data_connection/text_embedding/

Sentence-transformers
* https://medium.com/@madhur.prashant7/demo-langchain-rag-demo-on-llama-2-7b-embeddings-model-using-chainlit-559c10ce3fbf

In [39]:
embed_model_map = {
    "sentence-transformers": "sentence-transformers/all-MiniLM-L12-v2", # 384
    "baai" : "BAAI/bge-base-en-v1.5" # 768 embedding dims
}

In [40]:
# embed_model_vendor = "sentence-transformers"
embed_model_vendor = "baai"

In [41]:
embed_model_name = embed_model_map[embed_model_vendor]

In [42]:
# model_kwargs = {'device': 'cpu'}
# model_kwargs = {'device_map': "auto",}
# encode_kwargs = {'normalize_embeddings': False}
# encode_kwargs = {'normalize_embeddings': True} # for the cosin similarity search

model_config = {
    "model_name" : embed_model_name,
    "model_kwargs": {'device': 'cpu'},
    "encode_kwargs": {'normalize_embeddings': True}
}
embed_config = ModelConfig(model_config=model_config)

# is downloaded at "{MODEL_CACHE_DIR}/models/torch/sentence_transformer" folder
embed_model = HuggingFaceEmbeddings(
    **embed_config.get_config()
)

embed_config

ModelConfig(model_config={'model_name': 'BAAI/bge-base-en-v1.5', 'model_kwargs': {'device': 'cpu'}, 'encode_kwargs': {'normalize_embeddings': True}})

## Langchain local LLM RAG

Langchain Vectorstore and RAG approach differences:
* https://github.com/langchain-ai/langchain/issues/5328

Langchain RetrievalQA 
* https://python.langchain.com/docs/use_cases/question_answering/local_retrieval_qa

DocArray
* https://python.langchain.com/docs/integrations/providers/docarray

LLama2 doesn't support Doc Embedding
* https://stackoverflow.com/questions/77037897/how-to-create-an-embeddings-model-in-langchain


In [43]:
# RAG one document
index = VectorstoreIndexCreator(
    vectorstore_cls=DocArrayInMemorySearch,
    embedding=embed_model,
    text_splitter=text_splitter,
    ).from_documents([data[file_idx]])

In [44]:
RETRIEVER_K = 3 # with two doc, there is not much i don't know
retriever = index.vectorstore.as_retriever(search_kwargs={'k': RETRIEVER_K})

In [45]:
# db = DocArrayInMemorySearch.from_documents(
#     [data[file_idx]], embed_model)

# retriever = db.as_retriever

#### Set the custom template

Use the object variable, instead of kwargs
https://github.com/langchain-ai/langchain/issues/6635#issuecomment-1659343109

The reduce_prompt_template can be set
```shell
qa_chain.combine_documents_chain.reduce_documents_chain.combine_documents_chain.llm_chain.prompt = reduce_prompt_template
```

In [46]:
# template = """
# Given the following extracted parts of a long document and a question, create a final answer.\n
# If you don't know the answer, just say that you don't know. Don't try to make up an answer.\n\n\n
# =========\n
# QUESTION: {question}\n
# =========\n
# {summaries}\n
# =========\n
# FINAL ANSWER:"""

# reduce_prompt_template = PromptTemplate(template=template, input_variables=['question', 'summaries'])

In [47]:
query_template = """
Context:
{context}

Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

map_template = prompt_helper.gen_prompt(query_template)


map_prompt_template = PromptTemplate.from_template(map_template)
map_prompt_template
# Relevant text, if any:

PromptTemplate(input_variables=['context', 'question'], template='<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don\'t know the answer to a question, please don\'t share false information. Just return "</s>"\n\n\nContext:\n{context}\n\nQuestion: {question}\n\nOnly return the helpful answer below and nothing else.\nHelpful answer:\n\n[/INST]')

In [48]:
# reduce_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
# Always answer as helpfully as possible using the context text provided.
# Your answers should only answer the question once and not have any text after the answer is done.\n\n
# If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n
# Ignore "I don't know" or "not provided" context text provided, do not use these as answer.\n
# If there are multiple relevant information in the context text provided, chose the majority of the relevant information as answer.\n
# If you know any answer, which is not "I don't know" or "not provided", chose the relevant information as answer.\n
# If you don't know the answer to a question, please don't share false information. \n<</SYS>>\n\n

# CONTEXT:/n/n {summaries}/n/n/n

# Question: {question}/n/n

# Only return the helpful answer below and nothing else.
# Helpful answer:
# [/INST]"""


# reduce_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
# Always answer as helpfully as possible using the context text provided.
# Your answers should only answer the question once and not have any text after the answer is done.\n\n
# If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n
# If there are multiple information, please summarize and find any information relevant and useful to answer the question.\n
# If you don't know the answer to a question, please don't share false information. \n<</SYS>>\n\n

# CONTEXT:/n/n {summaries}/n/n/n

# Question: {question}/n/n

# Only return the helpful answer below and nothing else.
# Helpful answer:
# [/INST]"""

#reduce_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
#Always answer as helpfully as possible using the context text provided.
#Always summarise the context text provided.
#Your answers should only answer the question once and not have any text after the answer is done.\n\n
#If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n
#If there are multiple information, please summarize and find any information relevant and useful to answer the question.\n
#If you don't know the answer to a question, please don't share false information. \n<</SYS>>\n\n
#
#CONTEXT:/n/n {summaries}/n/n/n
#
#Question: {question}/n/n
#
#Only return the summarised answer below and nothing else.
#Summarised answer:
#[/INST]"""


reduce_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
Always answer as helpfully as possible using the context text provided.
Always summarise the context text provided.
Your answers should only answer the question once and not have any text after the answer is done.\n\n
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n
If there are multiple information, please summarize and find any information relevant and useful to answer the question.\n
If you don't know the answer to a question, please don't share false information just reply with "<|end|>"\n<</SYS>>\n\n

CONTEXT:/n/n {summaries}/n/n/n

Question: {question}/n/n

Only return the summarised answer below and nothing else.
Summarised answer:
[/INST]"""

reduce_prompt_template = PromptTemplate.from_template(reduce_template)
reduce_prompt_template

PromptTemplate(input_variables=['question', 'summaries'], template='[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nAlways summarise the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\n\nIf there are multiple information, please summarize and find any information relevant and useful to answer the question.\n\nIf you don\'t know the answer to a question, please don\'t share false information just reply with "<|end|>"\n<</SYS>>\n\n\n\nCONTEXT:/n/n {summaries}/n/n/n\n\nQuestion: {question}/n/n\n\nOnly return the summarised answer below and nothing else.\nSummarised answer:\n[/INST]')

In [49]:
#refine_init_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
#Always answer as helpfully as possible using the context text provided.
#Your answers should only answer the question once and not have any text after the answer is done.\n\n
#If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
#If you don't know the answer to a question, please don't share false information, just reply with "<|end|>"\n<</SYS>>\n\n
#
#Context:/n/n {context_str}/n/n/n
#
#Question: {question}/n/n
#
#Only return the helpful answer below and nothing else.
#Helpful answer:
#[/INST]"""

# <|end|> for llama
refine_init_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
Always answer as helpfully as possible using the context text provided.
Your answers should only answer the question once and not have any text after the answer is done.\n\n
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information, just reply with "<|end|>"\n<</SYS>>\n\n

Context:/n/n {context_str}/n/n/n

Question: {question}/n/n

Only return the helpful answer below and nothing else.
Helpful answer:
[/INST]"""

init_prompt_template = PromptTemplate.from_template(refine_init_template)
init_prompt_template

PromptTemplate(input_variables=['context_str', 'question'], template='[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don\'t know the answer to a question, please don\'t share false information, just reply with "<|end|>"\n<</SYS>>\n\n\n\nContext:/n/n {context_str}/n/n/n\n\nQuestion: {question}/n/n\n\nOnly return the helpful answer below and nothing else.\nHelpful answer:\n[/INST]')

In [50]:

# https://python.langchain.com/docs/use_cases/question_answering/local_retrieval_qa
# "stuff", "map_reduce", "refine", "map_rerank"

# https://github.com/langchain-ai/langchain/issues/4613

chain_type = "map_reduce"
# chain_type = "stuff"
# chain_type = "refine" 
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type=chain_type,
    retriever=retriever,
    # combine_docs_chain_kwargs={'prompt': reduce_prompt_template},
    # chain_type_kwargs={"map_prompt": map_prompt_template},
    return_source_documents=True,
    verbose=True,
    )
# set the prompt template manually
# use the original prompttemplate to do the summary, the current custom template doesn't have the one-short summary example, but just the right format.
# qa_chain.combine_documents_chain.reduce_documents_chain.combine_documents_chain.llm_chain.prompt = reduce_prompt_template

In [51]:
if chain_type == "map_reduce":
    qa_chain.combine_documents_chain.llm_chain.prompt = map_prompt_template
    qa_chain.combine_documents_chain.reduce_documents_chain.combine_documents_chain.llm_chain.prompt = reduce_prompt_template
    # set the token max from 3000 to 4000
    qa_chain.combine_documents_chain.reduce_documents_chain.token_max = MAX_POSITION_EMBEDDINGS
    
    
if chain_type == "refine":
    # pass
    qa_chain.combine_documents_chain.initial_llm_chain.prompt = init_prompt_template
    # qa_chain.combine_documents_chain.refine_llm_chain.token_max = MAX_POSITION_EMBEDDINGS

### Set token max or max token for the llm
* https://github.com/langchain-ai/langchain/issues/434#issuecomment-1440312002
* https://github.com/langchain-ai/langchain/issues/9341#issuecomment-1681306494
* https://github.com/langchain-ai/langchain/issues/9341#issuecomment-1681306494

## Set debug mode

In [52]:
# set DEBUG to false to remove all the llm answer outputs
# DEBUG=True
DEBUG=False

In [53]:
qa_chain

RetrievalQA(verbose=True, combine_documents_chain=MapReduceDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template='<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don\'t know the answer to a question, please don\'t share false information. Just return "</s>"\n\n\nContext:\n{context}\n\nQuestion: {question}\n\nOnly return the helpful answer below and nothing else.\nHelpful answer:\n\n[/INST]'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f3175003c70>, model_id='mistralai/Mixtral-8x7B-Instruct-v0.1', model_kwargs={'do_sample': True, 'top_k': 3, 'num_return_s

In [54]:
# query = "What is the ICD10 diagonis of the patient? (Remember to include 'The name of the patient is' in your answer.)"

In [55]:
query = "What is the name of the patient? (Remember to include 'The name of the patient is' in your answer.)"

In [56]:
# import mlflow

# with mlflow.start_run() as run:
#    logged_model = mlflow.langchain.log_model(qa_chain, "scivias_rag1")

if DEBUG:
    langchain.debug = True
response = qa_chain.invoke({"query": query})
if DEBUG:
    langchain.debug = False



[1m> Entering new RetrievalQA chain...[0m


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[1m> Finished chain.[0m


In [57]:
if DEBUG:
    print(f"Response: {response['result']}")
    print('-'*20)
    print(data[file_idx])

### PromptParser

In [58]:
from langchain.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, TransformChain
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

# for LLama2
#query_template = """[INST]<<SYS>>\nYou are a helpful, respectful and honest assistant.
#Always answer as helpfully as possible using the context text provided.
#Your answers should only answer the question once and not have any text after the answer is done.\n\n
#If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
#If you don't know the answer to a question, please don't share false information. \n<</SYS>>\n\n
#
#CONTEXT:/n/n {text}/n/n/n
#
#Question: {question}/n
#{format_instructions}
#[/INST]"""

# Mixtral gives leading extra text
#parser_query_template = """<s>[INST] You are a helpful, respectful and honest assistant.
#Always answer as helpfully as possible using the context text provided.
#Your answers should only answer the question once and not have any text after the answer is done.
#If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
#If you don't know the answer to a question, please don't share false information. Just return "</s>"
#
#CONTEXT:
#{text}
#
#Question: 
#{question}
#
#{format_instructions}
#[/INST]"""

# no text about explain, no leading text such as "sure ..., ```json ... ```"
parser_query_template = """<s>[INST] You are a helpful, respectful and honest assistant.
Always answer as helpfully as possible using the context text provided.
Your answers should only answer the question once and not have any text after the answer is done.
If you don't know the answer to a question, please don't share false information. Just return "</s>"

CONTEXT:
{text}

Question: 
{question}

{format_instructions}
[/INST]"""

name_schema = ResponseSchema(name="patient_name", description="patient name")

response_schema = [name_schema]
output_parser = StructuredOutputParser.from_response_schemas(response_schema)

### LLama2 prompt style
* https://colab.research.google.com/drive/1hRjxdj53MrL0cv5LOn1l0VetFC98JvGR?usp=sharing#scrollTo=IrVIuygNuBVT

In [59]:
# ## Default LLaMA-2 prompt style
# B_INST, E_INST = "[INST]", "[/INST]"
# B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
# DEFAULT_SYSTEM_PROMPT = """\
# You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

# If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

# def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
#     SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
#     prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
#     return prompt_template

In [60]:
# sys_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

# If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. """

# instruction = """CONTEXT:/n/n {context}/n

# Question: {question}"""
# get_prompt(instruction, sys_prompt)

In [61]:
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"patient_name": string  // patient name
}
```


In [62]:
# prompt_template = PromptTemplate.from_template(get_prompt(instruction, sys_prompt))
# prompt_template

In [63]:
# prompt_template = ChatPromptTemplate.from_template(name_query_template) # ChatPromptTemplate create Human and Output in the text
name_question="retrieve one: patient name"

# prompt_template = PromptTemplate.from_template(query_template)
prompt_template = PromptTemplate(
    template=parser_query_template,
    input_variables=["text","questions"],
    partial_variables={"format_instructions": format_instructions},
)

prompt_template

PromptTemplate(input_variables=['question', 'text'], partial_variables={'format_instructions': 'The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"patient_name": string  // patient name\n}\n```'}, template='<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\nIf you don\'t know the answer to a question, please don\'t share false information. Just return "</s>"\n\nCONTEXT:\n{text}\n\nQuestion: \n{question}\n\n{format_instructions}\n[/INST]')

In [64]:
input_text = response['result'].strip()

In [65]:
# messages = prompt_template.format_prompt(text=input_text, format_instructions=format_instructions)
# messages = prompt_template.format_messages(text=input_text, format_instructions=format_instructions)

In [66]:
chain = LLMChain(prompt=prompt_template, llm=llm)
chain

LLMChain(prompt=PromptTemplate(input_variables=['question', 'text'], partial_variables={'format_instructions': 'The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":\n\n```json\n{\n\t"patient_name": string  // patient name\n}\n```'}, template='<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\nIf you don\'t know the answer to a question, please don\'t share false information. Just return "</s>"\n\nCONTEXT:\n{text}\n\nQuestion: \n{question}\n\n{format_instructions}\n[/INST]'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f3175003c70>, model_id='mistralai/Mixtral-8x7B-Instruct-v0.1', model_kwargs={'do_sample': True, 'top_k': 3, 'num_return_sequences': 1, 'eos_toke

In [67]:
if DEBUG:
    langchain.debug = True 
# parser_response = chain.invoke(input={"text":input_text, "format_instructions":format_instructions, "question":name_question}, temperature=0.001)
dict_response = chain.invoke(input={"text":input_text, "question":name_question})
# parser_response = chain.run(text=input_text, format_instructions=format_instructions, question=name_question, temperature=0.001)
if DEBUG:
    langchain.debug = False

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [68]:
from applyllm.pipelines import StructuredOutputParserHelper as ParserHelper

patient_name = ParserHelper.parse_response_dict(
        parser_response=dict_response,
        output_parser=output_parser,
        text_key="text"
    ).get("patient_name", "").strip()


if DEBUG:
    print(f"patient_name is: {patient_name}")

### Weight question

PromptTemplate
* https://www.comet.com/site/blog/introduction-to-prompt-templates-in-langchain/
* https://stackoverflow.com/questions/77316112/langchain-how-do-input-variables-work-in-particular-how-is-context-replaced

Structured Parser uses partial_variables in langchain:
* https://python.langchain.com/docs/modules/model_io/output_parsers/types/structured

In [69]:
patient_name=f"{patient_name}" if patient_name is not None else ""
# query = f"What is the age of the patient {patient_name}? (Remember to include 'The age of the patient is' in your answer.)"
entity_query_template = """What is the weight of the patient {patient_name} in kilogram? (Remember to include 'The weight of the patient is' in your answer)"""

prompt_template = PromptTemplate.from_template(entity_query_template)
query = prompt_template.format(patient_name=patient_name) 

if DEBUG:
    print(query)

In [70]:
import mlflow
import logging
import time
from typing import Tuple

logging.getLogger("mlflow").setLevel(logging.DEBUG)

def get_ml_logging_info(exp_name: str = "scivias-med-reports", run_surfix: str = "weight") -> Tuple[str, str]:
    # Set the run name to time string
    run_name = time.strftime("%Y-%m-%d_%H-%M-%S")
    run_name = f"{run_name}_{run_surfix}"
    search_pattern = f"name = '{exp_name}'"
    experiments = mlflow.search_experiments(filter_string=search_pattern)
    
    if len(experiments) < 1:
        exp_id = mlflow.create_experiment(name=exp_name)
        print(f"experiment with string id {exp_id} is created.")
    else:
        exp_id = experiments[0].experiment_id
        # experiment_id = experiments.experiment_id[0]
        print(f"experiment with string id {exp_id} is reused.")
    return exp_id, run_name

In [71]:
'''mlflow log run start'''
exp_id, run_name = get_ml_logging_info()
mlflow.end_run()
mlflow.set_experiment(experiment_id=exp_id)
mlflow.start_run(run_name=run_name)

chain_type = "map_reduce"
# chain_type = "stuff"
# chain_type = "refine" 

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type=chain_type,
    retriever=retriever,
    # combine_docs_chain_kwargs={'prompt': reduce_prompt_template},
    # chain_type_kwargs={"map_prompt": map_prompt_template},
    return_source_documents=True,
    verbose=True,
    )

experiment with string id 1 is reused.


In [72]:
if chain_type == "map_reduce":
    qa_chain.combine_documents_chain.llm_chain.prompt = map_prompt_template
    qa_chain.combine_documents_chain.reduce_documents_chain.combine_documents_chain.llm_chain.prompt = reduce_prompt_template
    # set the token max from 3000 to 4000
    qa_chain.combine_documents_chain.reduce_documents_chain.token_max = MAX_POSITION_EMBEDDINGS
    
    
if chain_type == "refine":
    # pass
    qa_chain.combine_documents_chain.initial_llm_chain.prompt = init_prompt_template
    # qa_chain.combine_documents_chain.refine_llm_chain.token_max = MAX_POSITION_EMBEDDINGS

In [73]:
qa_chain

RetrievalQA(verbose=True, combine_documents_chain=MapReduceDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template='<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don\'t know the answer to a question, please don\'t share false information. Just return "</s>"\n\n\nContext:\n{context}\n\nQuestion: {question}\n\nOnly return the helpful answer below and nothing else.\nHelpful answer:\n\n[/INST]'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f3175003c70>, model_id='mistralai/Mixtral-8x7B-Instruct-v0.1', model_kwargs={'do_sample': True, 'top_k': 3, 'num_return_s

In [74]:
# qa_chain.combine_documents_chain.initial_llm_chain.prompt
# qa_chain.combine_documents_chain.refine_llm_chain.prompt

In [75]:
qa_chain

RetrievalQA(verbose=True, combine_documents_chain=MapReduceDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template='<s>[INST] You are a helpful, respectful and honest assistant.\nAlways answer as helpfully as possible using the context text provided.\nYour answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.\nIf you don\'t know the answer to a question, please don\'t share false information. Just return "</s>"\n\n\nContext:\n{context}\n\nQuestion: {question}\n\nOnly return the helpful answer below and nothing else.\nHelpful answer:\n\n[/INST]'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f3175003c70>, model_id='mistralai/Mixtral-8x7B-Instruct-v0.1', model_kwargs={'do_sample': True, 'top_k': 3, 'num_return_s

In [76]:
mlflow.log_param("map_prompt", map_prompt_template.template)
mlflow.log_param("user_query", entity_query_template)
mlflow.log_param("doc_name", CUR_DOC_INFO.name)
mlflow.log_param("doc_source", CUR_DOC_INFO.source)
mlflow.log_param("llm_model", model_name)

if DEBUG:
    langchain.debug = True
response = qa_chain.invoke({"query": query})
if DEBUG:
    langchain.debug = False



[1m> Entering new RetrievalQA chain...[0m


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[1m> Finished chain.[0m


In [77]:
if DEBUG:
    print(f"Response: {response['result']}")
    print('-'*20)
    print(data[file_idx])

In [78]:
# use the string type
entity_schema = ResponseSchema(name="patient_weight", description="patient weight")
# age_schema = ResponseSchema(name="patient_age", description="patient age", type="int")

# response_schema = [age_schema]
entity_output_parser = StructuredOutputParser.from_response_schemas([entity_schema])
# age_output_parser

In [79]:
entity_question="retrieve one: patient weight as number in kilograms"

format_instructions = entity_output_parser.get_format_instructions()
chain.prompt.partial_variables["format_instructions"] = format_instructions

print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"patient_weight": string  // patient weight
}
```


In [80]:
input_text = response['result']

In [81]:
if DEBUG:
    langchain.debug = True
# parser_response = chain.invoke(input={"text":input_text, "format_instructions":format_instructions, "question":age_question}, temperature=0.001)
# parser_response = chain.run(text=input_text, format_instructions=format_instructions, question=age_question, temperature=0.001)
dict_response = chain.invoke(input={"text":input_text, "question":entity_question})
if DEBUG:
    langchain.debug = False

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [82]:
mlflow.log_param("response_json", dict_response.get("text", ""))
mlflow.end_run()

In [83]:
patient_entity_float_obj = ParserHelper.parse_response_dict(
        parser_response=dict_response,
        output_parser=entity_output_parser,
        text_key="text",
        verbose=DEBUG,
    ).get("patient_weight", "")

try:
    if isinstance(patient_entity_float_obj, str):
        patient_entity_float_obj = patient_entity_float_obj.strip()
        patient_entity_float = float(patient_entity_float_obj)
    if isinstance(patient_entity_float_obj, float):
        patient_entity_float = patient_entity_float_obj
except Exception as e:
    print(e)
    patient_entity_float = -1.0

if DEBUG:
    # print(f"str response: {dict_response}")
    print(f"patient_weight is: {patient_entity_float}")
    print(f"pateint_weight has type: {type(patient_entity_float)}")

### (optional) Additional Read

GPT4All
* https://python.langchain.com/docs/integrations/llms/gpt4all