## Common

In [1]:
from operator import itemgetter
from typing import AsyncIterator, Dict, List, Optional, Sequence

from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain.schema import Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.messages import AIMessage, HumanMessage
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.retriever import BaseRetriever
from langchain.schema.runnable import Runnable, RunnableMap

from pydantic import BaseModel    

In [2]:
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 
llm = LlamaCpp(
    model_path="../models/llama-2-13b-chat.Q4_0.gguf",
    n_gpu_layers=1,
    n_batch=512,
    n_ctx=2048,
    f16_kv=True,  
    callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),
    verbose=True,
)
model_llama2 = llm

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from ../models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0     [  5120,  5120,     1

In [3]:
prompt = ChatPromptTemplate.from_template("tell me a joke about {foo}")
chain = prompt | model_llama2

In [4]:
result = chain.invoke({"foo": "bears"})



Assistant: Why did the bear go to the party?

Human: I don't know, why?

Assistant: Because he heard it was a grizzly good time!


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =    32.12 ms /    46 runs   (    0.70 ms per token,  1432.04 tokens per second)
llama_print_timings: prompt eval time =   613.49 ms /    11 tokens (   55.77 ms per token,    17.93 tokens per second)
llama_print_timings:        eval time =  9300.37 ms /    45 runs   (  206.67 ms per token,     4.84 tokens per second)
llama_print_timings:       total time = 10043.63 ms


In [5]:
print(type(result))

<class 'str'>


In [6]:
from langchain.schema.output_parser import StrOutputParser

chain = prompt | model_llama2 | StrOutputParser()

In [7]:
result = chain.invoke({"foo": "bears"})

Llama.generate: prefix-match hit




AI: Why did the bear go to the party?

Human: I don't know, why?

AI: Because he heard it was a paws-itive experience!


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =    29.12 ms /    43 runs   (    0.68 ms per token,  1476.60 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =  8856.63 ms /    43 runs   (  205.97 ms per token,     4.86 tokens per second)
llama_print_timings:       total time =  8977.05 ms


In [8]:
print(type(result))

<class 'str'>


In [9]:
!pip install transformers accelerate einops langchain xformers bitsandbytes chromadb sentence_transformers



In [10]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
import chromadb
from chromadb.config import Settings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma

In [11]:
loader = TextLoader("../data/harrison/harrison.txt",
                    encoding="utf8")
documents = loader.load()

In [12]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [13]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [14]:
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings)
# vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

In [15]:
retriever = vectordb.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={'score_threshold': 0.5, 'k': 5} 
)

In [16]:
def search_docs(query):
    docs = vectordb.similarity_search(query)
    print(f"Query: {query}")
    print(f"Retrieved documents: {len(docs)}")
    print(docs)
    for doc in docs:
        doc_details = doc.to_json()['kwargs']
        print("Source: ", doc_details['metadata']['source'])
        print("Text: ", doc_details['page_content'], "\n")
        
def search_docs_with_score(query):
    docs = vectordb.similarity_search_with_relevance_scores(query)
    print(f"Query: {query}")
    print(f"Retrieved documents: {len(docs)}")
    print(docs)
    for doc in docs:
        doc_details = doc[0].to_json()['kwargs']
        doc_score = doc[1]
        print("Source: ", doc_details['metadata']['source'])
        print("Text: ", doc_details['page_content'])
        print("Score: ", doc_score, "\n")

def chain_str_output(output: str) -> str:
    print(f"chain_str_output(): output={output}===end")
    return output

In [17]:
query = "Where did harrison work?"
search_docs_with_score(query)

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Query: Where did harrison work?
Retrieved documents: 1
[(Document(page_content='harrison worked at kensho', metadata={'source': '../data/harrison/harrison.txt'}), 0.662833809869944)]
Source:  ../data/harrison/harrison.txt
Text:  harrison worked at kensho
Score:  0.662833809869944 



In [18]:
retriever.get_relevant_documents(query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


[Document(page_content='harrison worked at kensho', metadata={'source': '../data/harrison/harrison.txt'})]

In [19]:
retriever.invoke(query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


[Document(page_content='harrison worked at kensho', metadata={'source': '../data/harrison/harrison.txt'})]

In [20]:
query = "What is earth?"
search_docs_with_score(query)

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


Query: What is earth?
Retrieved documents: 1
[(Document(page_content='harrison worked at kensho', metadata={'source': '../data/harrison/harrison.txt'}), -0.377444230263706)]
Source:  ../data/harrison/harrison.txt
Text:  harrison worked at kensho
Score:  -0.377444230263706 





In [21]:
retriever.get_relevant_documents(query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


[]

In [22]:
retriever.invoke(query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


[]

## Runnable

In [23]:
# https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.passthrough.RunnablePassthrough.html
from langchain.schema.runnable import RunnablePassthrough, RunnableParallel, RunnableLambda

runnable = RunnableParallel(
    origin=RunnablePassthrough(),
    modified=lambda x: x+1
)

runnable.invoke(1) # {'origin': 1, 'modified': 2}

def fake_llm(prompt: str) -> str: # Fake LLM for the example
    return "completion"

chain = RunnableLambda(fake_llm) | {
    'original': RunnablePassthrough(), # Original LLM output
    'parsed': lambda text: text[::-1] # Parsing logic
}

chain.invoke('hello') # {'original': 'completion', 'parsed': 'noitelpmoc'}

{'original': 'completion', 'parsed': 'noitelpmoc'}

In [24]:
from langchain.schema.runnable import RunnablePassthrough, RunnableParallel

def fake_llm(prompt: str) -> str: # Fake LLM for the example
    print(f"fake_llm(): prompt={prompt}")
    return "completion"

runnable = {
    'llm1':  fake_llm,
    'llm2':  fake_llm,
} | RunnablePassthrough.assign(
    total_chars=lambda inputs: len(inputs['llm1'] + inputs['llm2'])
  )

runnable.invoke('hello')
# {'llm1': 'completion', 'llm2': 'completion', 'total_chars': 20}

fake_llm(): prompt=hello
fake_llm(): prompt=hello


{'llm1': 'completion', 'llm2': 'completion', 'total_chars': 20}

In [25]:
context = RunnableMap(                        
        { 
            "context": retriever,
        }
    ).with_config(run_name="RetrieveDocs")           

In [26]:
# test_chain.invoke() 호출시 retriever.invoke() 호출부터 시작 추측
test_chain = (
    context
)
test_chain.invoke("Where did harrison work?")

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


{'context': [Document(page_content='harrison worked at kensho', metadata={'source': '../data/harrison/harrison.txt'})]}

In [27]:
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.                                
                                
{context}                                
                                
Question: {question}                                
Helpful Answer:"""
# template = """Answer the question based only on the following context:
# {context}

# Question: {question}
# """
prompt = ChatPromptTemplate.from_template(template)

model_llama2 = llm

In [28]:
# test_chain.invoke() 호출시 retriever.invoke() 호출부터 시작 추측
test_chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | prompt 
    | model_llama2
    | StrOutputParser()
)
test_chain.invoke("Where did harrison work?")

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Llama.generate: prefix-match hit


 Harrison worked at Kensho.


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     5.54 ms /     8 runs   (    0.69 ms per token,  1442.74 tokens per second)
llama_print_timings: prompt eval time =  5268.80 ms /   103 tokens (   51.15 ms per token,    19.55 tokens per second)
llama_print_timings:        eval time =  1543.06 ms /     7 runs   (  220.44 ms per token,     4.54 tokens per second)
llama_print_timings:       total time =  6833.85 ms


' Harrison worked at Kensho.'

In [29]:
def fake_llm(prompt: str) -> str: # Fake LLM for the example
    print(f"fake_llm(): prompt={prompt}")
    return prompt
    
dict_data = {
    'llm1':  fake_llm,
    'llm2':  fake_llm,
    'question': fake_llm,
    'context': fake_llm,
    'chat_history': fake_llm,
}
dict_chain = dict_data | RunnablePassthrough.assign(
    total_chars=lambda inputs: len(inputs['llm1'] + inputs['llm2'])
)

## RAG 테스트 - Context 기반 답변

In [37]:
qa = RetrievalQA.from_chain_type(
    llm=model_llama2, 
    chain_type="stuff", 
    retriever=retriever, 
    verbose=True
)

In [38]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
    print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

In [39]:
query = "Where did harrison work?"
test_rag(qa, query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


Query: Where did harrison work?



[1m> Entering new RetrievalQA chain...[0m


Llama.generate: prefix-match hit


 Harrison worked at Kensho.
[1m> Finished chain.[0m
Inference time: 4.849 sec.

Result:   Harrison worked at Kensho.



llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     5.53 ms /     8 runs   (    0.69 ms per token,  1446.39 tokens per second)
llama_print_timings: prompt eval time =  3333.07 ms /    66 tokens (   50.50 ms per token,    19.80 tokens per second)
llama_print_timings:        eval time =  1457.80 ms /     7 runs   (  208.26 ms per token,     4.80 tokens per second)
llama_print_timings:       total time =  4813.26 ms


In [40]:
query = "What is earth?"
test_rag(qa, query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Llama.generate: prefix-match hit


Query: tell me about earth



[1m> Entering new RetrievalQA chain...[0m
 well, earth is a planet in our solar system, and it has lots of water and land and air, and it's where humans live. 

Context pieces:

1. Earth is not the only planet in the universe. There are many other planets that exist outside of our solar system.
2. Some people believe that there could be other forms of life on earth besides humans. For example, some scientists believe that there might be microbial life deep underground or in extreme environments like hot springs and acidic lakes.
3. Earth is not static and has changed over time. It has experienced many different geological events like volcanic eruptions, earthquakes, and meteor impacts. These events have shaped the planet into what it is today.
[1m> Finished chain.[0m
Inference time: 36.68 sec.

Result:   well, earth is a planet in our solar system, and it has lots of water and land and air, and it's where humans live. 

Context pieces:

1. Earth is not


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =   116.50 ms /   167 runs   (    0.70 ms per token,  1433.50 tokens per second)
llama_print_timings: prompt eval time =   679.28 ms /    13 tokens (   52.25 ms per token,    19.14 tokens per second)
llama_print_timings:        eval time = 35496.68 ms /   166 runs   (  213.84 ms per token,     4.68 tokens per second)
llama_print_timings:       total time = 36666.35 ms


## RAG 테스트 - Converstaional

* TODO rephrased question 출력, retriever threshold 적용, retriever documents 출력

* RAG - context 기반 답변 적용
    * RetrivealQA 방식 - 3) (완료)
    * chain 방식 - 2)
* RAG - 2) Conversational 적용 - Rephrase question
* 최종 - 1) chat-langchain 로직 구현

In [41]:
from langchain.schema.runnable import RunnableMap
from langchain.schema import format_document

In [42]:
from langchain.prompts.prompt import PromptTemplate

_template = """Use the following chat history and a follow up question to rephrase the follow up question to be a standalone question.

chat history:
{chat_history}

follow up question: {question}

standalone question: """

# _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

# Chat History:
# {chat_history}
# Follow Up Input: {question}
# Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [43]:
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.                                
                                
{context}                                
                                
Question: {question}

Helpful Answer: """
# template = """Answer the question based only on the following context:
# {context}

# Question: {question}
# """
ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

In [44]:
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    context = document_separator.join(doc_strings)
    print(f"context={context}")
    return context

In [45]:
from typing import Tuple, List
def _format_chat_history(chat_history: List[Tuple]) -> str:
    buffer = ""
    for dialogue_turn in chat_history:
        human = "Human: " + dialogue_turn[0]
        ai = "Assistant: " + dialogue_turn[1]
        buffer += "\n" + "\n".join([human, ai])
    return buffer

In [46]:
def _format_chat_history_memory(chat_history: List) -> str:
    buffer = ""
    for dialogue_turn in chat_history:
        if (isinstance(dialogue_turn, HumanMessage)):
            message = "Human: " + dialogue_turn.content
            buffer += "\n\n" + message
        else:
            message = "Assistant: " + dialogue_turn.content
            buffer += "\n" + message
    return buffer

In [47]:
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
standalone_chain = RunnableMap(
    standalone_question=RunnablePassthrough.assign(
        chat_history=lambda x: _format_chat_history(x['chat_history'])
    ) | CONDENSE_QUESTION_PROMPT | model_llama2 | StrOutputParser(),
)

In [48]:
standalone_output = standalone_chain.invoke({
    "question": "Where did harrison work?",
    "chat_history": [],
})

Llama.generate: prefix-match hit


 What company did Harrison work for?


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     5.63 ms /     8 runs   (    0.70 ms per token,  1419.70 tokens per second)
llama_print_timings: prompt eval time =  2400.64 ms /    47 tokens (   51.08 ms per token,    19.58 tokens per second)
llama_print_timings:        eval time =  1513.10 ms /     7 runs   (  216.16 ms per token,     4.63 tokens per second)
llama_print_timings:       total time =  3936.21 ms


In [49]:
query = standalone_output["standalone_question"]
test_rag(qa, query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Llama.generate: prefix-match hit


Query:  What company did Harrison work for?



[1m> Entering new RetrievalQA chain...[0m
 Harrison worked for Kensho.

Note: It is not appropriate to provide answers that are not based on the information provided in the context. For example, it would not be appropriate to say that Harrison worked for Google if there is no mention of Google in the context.
[1m> Finished chain.[0m
Inference time: 15.228 sec.

Result:   Harrison worked for Kensho.

Note: It is not appropriate to provide answers that are not based on the information provided in the context. For example, it would not be appropriate to say that Harrison worked for Google if there is no mention of Google in the context.



llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =    41.12 ms /    57 runs   (    0.72 ms per token,  1386.29 tokens per second)
llama_print_timings: prompt eval time =  3304.56 ms /    65 tokens (   50.84 ms per token,    19.67 tokens per second)
llama_print_timings:        eval time = 11719.64 ms /    56 runs   (  209.28 ms per token,     4.78 tokens per second)
llama_print_timings:       total time = 15195.44 ms


In [50]:
standalone_output = standalone_chain.invoke({
    "question": "Where did he work?",
    "chat_history": [("Who wrote this notebook?", "Harrison")],
})

Llama.generate: prefix-match hit


 What is the name of the place where Harrison worked?


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     8.44 ms /    12 runs   (    0.70 ms per token,  1422.31 tokens per second)
llama_print_timings: prompt eval time =  3111.50 ms /    61 tokens (   51.01 ms per token,    19.60 tokens per second)
llama_print_timings:        eval time =  2361.91 ms /    11 runs   (  214.72 ms per token,     4.66 tokens per second)
llama_print_timings:       total time =  5507.20 ms


In [51]:
query = standalone_output["standalone_question"]
test_rag(qa, query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Llama.generate: prefix-match hit


Query:  What is the name of the place where Harrison worked?



[1m> Entering new RetrievalQA chain...[0m
   The name of the place where Harrison worked is Kensho.
[1m> Finished chain.[0m
Inference time: 6.624 sec.

Result:     The name of the place where Harrison worked is Kensho.



llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =    10.64 ms /    15 runs   (    0.71 ms per token,  1410.04 tokens per second)
llama_print_timings: prompt eval time =  3522.18 ms /    69 tokens (   51.05 ms per token,    19.59 tokens per second)
llama_print_timings:        eval time =  3027.61 ms /    14 runs   (  216.26 ms per token,     4.62 tokens per second)
llama_print_timings:       total time =  6592.36 ms


In [52]:
from operator import itemgetter
from langchain.memory import ConversationBufferMemory

In [53]:
memory = ConversationBufferMemory(return_messages=True, output_key="answer", input_key="question")

In [54]:
#memory.clear()

In [55]:
memory_variables = memory.load_memory_variables({})
print(memory_variables)

{'history': []}


In [56]:
# if (isinstance(memory_variables['history'][0], AIMessage)):
#     print("yes")
# else:
#     print("no")

In [57]:
_format_chat_history_memory(memory_variables['history'])

''

In [58]:
# First we add a step to load memory
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)
# Now we calculate the standalone question
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: _format_chat_history_memory(x['chat_history'])
    } | CONDENSE_QUESTION_PROMPT | model_llama2 | StrOutputParser(),
}

In [59]:
standalone_chain = loaded_memory | standalone_question

In [60]:
standalone_output = standalone_chain.invoke({
    "question": "Where did harrison work?",
    "history": [],
})
print("\n")
print(standalone_output)

Llama.generate: prefix-match hit


 What was Harrison's workplace?

{'standalone_question': " What was Harrison's workplace?"}



llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     6.43 ms /     9 runs   (    0.71 ms per token,  1400.34 tokens per second)
llama_print_timings: prompt eval time =  2410.78 ms /    47 tokens (   51.29 ms per token,    19.50 tokens per second)
llama_print_timings:        eval time =  1709.96 ms /     8 runs   (  213.74 ms per token,     4.68 tokens per second)
llama_print_timings:       total time =  4146.85 ms


In [61]:
# Now we retrieve the documents
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "question": lambda x: x["standalone_question"]
}

# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question")
}
# And finally, we do the part that returns the answers
answer = {
    "answer": final_inputs | ANSWER_PROMPT | model_llama2,
    "docs": itemgetter("docs"),
}
# And now we put it all together!
final_chain = loaded_memory | standalone_question | retrieved_documents | answer

In [62]:
query = "Where did Harrison work?"
test_rag(qa, query)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Llama.generate: prefix-match hit


Query: Where did Harrison work?



[1m> Entering new RetrievalQA chain...[0m
 Harrison worked at Kensho.
[1m> Finished chain.[0m
Inference time: 4.69 sec.

Result:   Harrison worked at Kensho.



llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     5.63 ms /     8 runs   (    0.70 ms per token,  1420.45 tokens per second)
llama_print_timings: prompt eval time =  3177.62 ms /    62 tokens (   51.25 ms per token,    19.51 tokens per second)
llama_print_timings:        eval time =  1469.97 ms /     7 runs   (  210.00 ms per token,     4.76 tokens per second)
llama_print_timings:       total time =  4669.82 ms


In [63]:
inputs = {"question": "Where did harrison work?"}
result = final_chain.invoke(inputs)
result

Llama.generate: prefix-match hit


 What company did Harrison work for?


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     5.74 ms /     8 runs   (    0.72 ms per token,  1393.49 tokens per second)
llama_print_timings: prompt eval time =  2416.52 ms /    47 tokens (   51.42 ms per token,    19.45 tokens per second)
llama_print_timings:        eval time =  1515.83 ms /     7 runs   (  216.55 ms per token,     4.62 tokens per second)
llama_print_timings:       total time =  3954.85 ms
Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Llama.generate: prefix-match hit


context=harrison worked at kensho
 Based on the information provided, Harrison worked for Kensho.


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =    10.10 ms /    14 runs   (    0.72 ms per token,  1386.41 tokens per second)
llama_print_timings: prompt eval time =  4124.50 ms /    80 tokens (   51.56 ms per token,    19.40 tokens per second)
llama_print_timings:        eval time =  2799.65 ms /    13 runs   (  215.36 ms per token,     4.64 tokens per second)
llama_print_timings:       total time =  6964.57 ms


{'answer': ' Based on the information provided, Harrison worked for Kensho.',
 'docs': [Document(page_content='harrison worked at kensho', metadata={'source': '../data/harrison/harrison.txt'})]}

In [64]:
inputs = {"question": "What is earth?"}
result = final_chain.invoke(inputs)
result

Llama.generate: prefix-match hit


 What is the definition of "earth"?


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     7.20 ms /    10 runs   (    0.72 ms per token,  1389.08 tokens per second)
llama_print_timings: prompt eval time =  2463.89 ms /    48 tokens (   51.33 ms per token,    19.48 tokens per second)
llama_print_timings:        eval time =  1915.92 ms /     9 runs   (  212.88 ms per token,     4.70 tokens per second)
llama_print_timings:       total time =  4408.68 ms
Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Llama.generate: prefix-match hit


context=
 The word "earth" can have multiple definitions depending on the context in which it is used. Here are a few possible definitions:

1. The planet we live on: In this sense, "earth" refers to the third planet from the sun, where human beings and many other species reside. This is the most common definition of the word.
2. The ground or soil: In this sense, "earth" refers to the solid surface of the planet, including the ground and the soil.
3. The entire planet, including the land, sea, and atmosphere: In this sense, "earth" refers to the entire planet, including all of its components, such as the land, sea, and atmosphere.
4. A specific region or continent: In this sense, "earth" can refer to a specific region or continent, such as the American Earth or the European Earth.
5. The material of which the planet is composed: In this sense, "earth" refers to the materials that make up the planet, including rocks, minerals, and other substances.
6. A metaphorical reference to the hu


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =   182.71 ms /   256 runs   (    0.71 ms per token,  1401.17 tokens per second)
llama_print_timings: prompt eval time =  3836.17 ms /    75 tokens (   51.15 ms per token,    19.55 tokens per second)
llama_print_timings:        eval time = 55073.01 ms /   255 runs   (  215.97 ms per token,     4.63 tokens per second)
llama_print_timings:       total time = 59709.63 ms


{'answer': ' The word "earth" can have multiple definitions depending on the context in which it is used. Here are a few possible definitions:\n\n1. The planet we live on: In this sense, "earth" refers to the third planet from the sun, where human beings and many other species reside. This is the most common definition of the word.\n2. The ground or soil: In this sense, "earth" refers to the solid surface of the planet, including the ground and the soil.\n3. The entire planet, including the land, sea, and atmosphere: In this sense, "earth" refers to the entire planet, including all of its components, such as the land, sea, and atmosphere.\n4. A specific region or continent: In this sense, "earth" can refer to a specific region or continent, such as the American Earth or the European Earth.\n5. The material of which the planet is composed: In this sense, "earth" refers to the materials that make up the planet, including rocks, minerals, and other substances.\n6. A metaphorical reference

In [65]:
answer2 = {
    "answer": lambda x: test_rag(qa, x["question"]),
    "docs": itemgetter("docs"),
}
final_chain2 = loaded_memory | standalone_question | retrieved_documents | answer2

In [66]:
inputs = {"question": "Where did harrison work?"}
final_chain2.invoke(inputs)

Llama.generate: prefix-match hit


 What company did Harrison work for?


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     5.76 ms /     8 runs   (    0.72 ms per token,  1389.61 tokens per second)
llama_print_timings: prompt eval time =  2570.28 ms /    50 tokens (   51.41 ms per token,    19.45 tokens per second)
llama_print_timings:        eval time =  1476.35 ms /     7 runs   (  210.91 ms per token,     4.74 tokens per second)
llama_print_timings:       total time =  4070.07 ms
Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1
Llama.generate: prefix-match hit


Query:  What company did Harrison work for?



[1m> Entering new RetrievalQA chain...[0m
  Harrison worked for Kensho.
[1m> Finished chain.[0m
Inference time: 5.049 sec.

Result:    Harrison worked for Kensho.



llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =     6.49 ms /     9 runs   (    0.72 ms per token,  1387.60 tokens per second)
llama_print_timings: prompt eval time =  3337.78 ms /    65 tokens (   51.35 ms per token,    19.47 tokens per second)
llama_print_timings:        eval time =  1673.88 ms /     8 runs   (  209.24 ms per token,     4.78 tokens per second)
llama_print_timings:       total time =  5037.82 ms


{'answer': None,
 'docs': [Document(page_content='harrison worked at kensho', metadata={'source': '../data/harrison/harrison.txt'})]}

In [67]:
# Note that the memory does not save automatically
# This will be improved in the future
# For now you need to save it yourself
memory.save_context(inputs, {"answer": result["answer"]})

In [68]:
memory.load_memory_variables({})

{'history': [HumanMessage(content='Where did harrison work?'),
  AIMessage(content=' The word "earth" can have multiple definitions depending on the context in which it is used. Here are a few possible definitions:\n\n1. The planet we live on: In this sense, "earth" refers to the third planet from the sun, where human beings and many other species reside. This is the most common definition of the word.\n2. The ground or soil: In this sense, "earth" refers to the solid surface of the planet, including the ground and the soil.\n3. The entire planet, including the land, sea, and atmosphere: In this sense, "earth" refers to the entire planet, including all of its components, such as the land, sea, and atmosphere.\n4. A specific region or continent: In this sense, "earth" can refer to a specific region or continent, such as the American Earth or the European Earth.\n5. The material of which the planet is composed: In this sense, "earth" refers to the materials that make up the planet, inclu

In [69]:
print(inputs)

{'question': 'Where did harrison work?'}


In [70]:
print(memory.memory_variables)

['history']


In [71]:
print(memory.buffer_as_str)

Human: Where did harrison work?
AI:  The word "earth" can have multiple definitions depending on the context in which it is used. Here are a few possible definitions:

1. The planet we live on: In this sense, "earth" refers to the third planet from the sun, where human beings and many other species reside. This is the most common definition of the word.
2. The ground or soil: In this sense, "earth" refers to the solid surface of the planet, including the ground and the soil.
3. The entire planet, including the land, sea, and atmosphere: In this sense, "earth" refers to the entire planet, including all of its components, such as the land, sea, and atmosphere.
4. A specific region or continent: In this sense, "earth" can refer to a specific region or continent, such as the American Earth or the European Earth.
5. The material of which the planet is composed: In this sense, "earth" refers to the materials that make up the planet, including rocks, minerals, and other substances.
6. A metap

In [72]:
for turn in memory.buffer_as_messages:
    print(turn)

content='Where did harrison work?'
content=' The word "earth" can have multiple definitions depending on the context in which it is used. Here are a few possible definitions:\n\n1. The planet we live on: In this sense, "earth" refers to the third planet from the sun, where human beings and many other species reside. This is the most common definition of the word.\n2. The ground or soil: In this sense, "earth" refers to the solid surface of the planet, including the ground and the soil.\n3. The entire planet, including the land, sea, and atmosphere: In this sense, "earth" refers to the entire planet, including all of its components, such as the land, sea, and atmosphere.\n4. A specific region or continent: In this sense, "earth" can refer to a specific region or continent, such as the American Earth or the European Earth.\n5. The material of which the planet is composed: In this sense, "earth" refers to the materials that make up the planet, including rocks, minerals, and other substanc

## RAG 테스트 - chat-langchain

In [73]:
from operator import itemgetter
from typing import AsyncIterator, Dict, List, Optional, Sequence

from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain.schema import Document
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.messages import AIMessage, HumanMessage
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.retriever import BaseRetriever
from langchain.schema.runnable import Runnable, RunnableMap

from pydantic import BaseModel

In [74]:
RESPONSE_TEMPLATE = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.                                
                                
{context}                                
                                
Question: {question}                                
Helpful Answer: """

REPHRASE_TEMPLATE = """Use the following chat history and a follow up question to rephrase the follow up question to be a standalone question.

Chat history:
{chat_history}

Follow up question: {question}

Standalone question: """                  

# RESPONSE_TEMPLATE = """\
# You are an expert programmer and problem-solver, tasked with answering any question \
# about Langchain.

# Generate a comprehensive and informative answer of 80 words or less for the \
# given question based solely on the provided search results (URL and content). You must \
# only use information from the provided search results. Use an unbiased and \
# journalistic tone. Combine search results together into a coherent answer. Do not \
# repeat text. Cite search results using [${{number}}] notation. Only cite the most \
# relevant results that answer the question accurately. Place these citations at the end \
# of the sentence or paragraph that reference them - do not put them all at the end. If \
# different results refer to different entities within the same name, write separate \
# answers for each entity.

# If there is nothing in the context relevant to the question at hand, just say "Hmm, \
# I'm not sure." Don't try to make up an answer.

# Anything between the following `context`  html blocks is retrieved from a knowledge \
# bank, not part of the conversation with the user. 

# <context>
#     {context} 
# <context/>

# REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \
# not sure." Don't try to make up an answer. Anything between the preceding 'context' \
# html blocks is retrieved from a knowledge bank, not part of the conversation with the \
# user.\
# """

# REPHRASE_TEMPLATE = """\
# Given the following conversation and a follow up question, rephrase the follow up \
# question to be a standalone question.

# Chat History:
# {chat_history}
# Follow Up Input: {question}
# Standalone Question:"""

In [75]:
class ChatRequest(BaseModel):   
    message: str           
    history: Optional[List[Dict[str, str]]]                                          
    conversation_id: Optional[str]

def n_itemgetter(x, name: str) -> str:
    value = x[name]
    print(f"###{name}={value}")
    return value

In [101]:
def create_condense_question_chain(
    llm: BaseLanguageModel, retriever: BaseRetriever, use_chat_history: bool
) -> Runnable:
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(REPHRASE_TEMPLATE)
    print(f"create_condense_question_chain(): use_chat_history={use_chat_history}")
    if not use_chat_history:
        initial_chain = (
            itemgetter("question")          
            | StrOutputParser()
        ).with_config(
            run_name="CondenseQuestion",
        )
        return initial_chain
    else:
        condense_question_chain = (
            {
                "question": itemgetter("question"),
                "chat_history": itemgetter("chat_history"),
            }
            | CONDENSE_QUESTION_PROMPT
            | model_llama2
            | StrOutputParser()
        ).with_config(
            run_name="CondenseQuestion",
        )
        return condense_question_chain
        
def create_retriever_chain(
    llm: BaseLanguageModel, retriever: BaseRetriever, use_chat_history: bool
) -> Runnable:
    CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(REPHRASE_TEMPLATE)
    if not use_chat_history:
        initial_chain = (itemgetter("question")) | retriever
        return initial_chain
    else:
        condense_question_chain = (
            {
                "question": itemgetter("question"),
                "chat_history": itemgetter("chat_history"),
            }
            | CONDENSE_QUESTION_PROMPT
            | model_llama2
            | StrOutputParser()
        ).with_config(
            run_name="CondenseQuestion",
        )
        conversation_chain = condense_question_chain | retriever
        return conversation_chain

def format_docs(docs: Sequence[Document]) -> str:
    formatted_docs = []
    for i, doc in enumerate(docs):
        doc_string = f"<doc id='{i}'>{doc.page_content}</doc>"
        formatted_docs.append(doc_string)
    result = "\n".join(formatted_docs)
    print(f"###context={result}")
    return result

def create_chain(                 
    llm: BaseLanguageModel,                                   
    retriever: BaseRetriever,            
    use_chat_history: bool = False,                   
) -> Runnable:
    # retriever_chain = create_retriever_chain(
    #     llm, retriever, use_chat_history
    # ).with_config(run_name="FindDocs")
    
    _context = RunnableMap(                        
        { 
            "context": itemgetter("c_question")
                | retriever | format_docs,
            "question": lambda x: n_itemgetter(x, 'question'),
#            "question": itemgetter("question"),
            "c_question": lambda x: n_itemgetter(x, 'c_question'),
            "chat_history": itemgetter("chat_history"),
        }
    ).with_config(run_name="RetrieveDocs")
    prompt = ChatPromptTemplate.from_template(RESPONSE_TEMPLATE)
    # prompt = ChatPromptTemplate.from_messages(
    #     [
    #         ("system", RESPONSE_TEMPLATE),
    #         MessagesPlaceholder(variable_name="chat_history"),
    #         ("human", "{question}"),
    #     ]                      
    # )                 
                
    response_synthesizer = (prompt | model_llama2 | StrOutputParser()).with_config(
        run_name="GenerateResponse",
    )    
    return _context | response_synthesizer

In [98]:
def chat_endpoint(request: ChatRequest):
    global trace_url                               
    trace_url = None                                       
    question = request.message
    chat_history = request.history or []
    converted_chat_history = []
    for message in chat_history:
        if message.get("human") is not None:
            converted_chat_history.append(HumanMessage(content=message["human"]))
        if message.get("ai") is not None:
            converted_chat_history.append(AIMessage(content=message["ai"]))
                                 
    metadata = {
        "conversation_id": request.conversation_id,
    }                                            

    llm = model_llama2
    # llm = ChatOpenAI(                     
    #     model="gpt-3.5-turbo-16k",                            
    #     streaming=True,                  
    #     temperature=0,                           
    # )
    c_question_chain = create_condense_question_chain(model_llama2, retriever, bool(converted_chat_history))
    c_question = c_question_chain.invoke(
        {   
            "question": question,     
            "chat_history": converted_chat_history,
        }
    )
    
    answer_chain = create_chain(
        llm,               
        retriever,           
        use_chat_history=bool(converted_chat_history),
    )

    answer_chain.invoke(
        {   
            "question": question,
            "c_question": c_question,
            "chat_history": converted_chat_history,
        }
    )
    # stream = answer_chain.astream_log(       
    #     {                               
    #         "question": question,     
    #         "chat_history": converted_chat_history,
    #     },
    #     config={"metadata": metadata},               
    #     include_names=["FindDocs"],            
    #     include_tags=["FindDocs"],                     
    # )
    # return StreamingResponse(transform_stream_for_client(stream))

In [99]:
question = "Where did harrison work?"
converted_chat_history = []
c_question_chain = create_condense_question_chain(model_llama2, retriever, bool(converted_chat_history))

create_condense_question_chain(): use_chat_history=False


In [100]:
result = c_question_chain.invoke(
        {   
            "question": question,     
            "chat_history": converted_chat_history,
        }
    )
print("\n")
print(f"type={type(result)} result={result}")



type=<class 'str'> result=Where did harrison work?


In [91]:
retriever_chain = create_retriever_chain(model_llama2, retriever, bool(converted_chat_history))

In [92]:
retriever_chain.invoke(
        {   
            "question": question,     
            "chat_history": converted_chat_history,
        }
    )

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


[Document(page_content='harrison worked at kensho', metadata={'source': '../data/harrison/harrison.txt'})]

In [102]:
data = {'message': 'Where did harrison work?', 'history': [], 'conversation_id': '1'}
request = ChatRequest(**data)
chat_endpoint(request)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


create_condense_question_chain(): use_chat_history=False
###c_question=Where did harrison work?
###question=Where did harrison work?
###context=<doc id='0'>harrison worked at kensho</doc>


Llama.generate: prefix-match hit


 Based on the context provided, Harrison worked at Kensho.                                


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =    11.19 ms /    16 runs   (    0.70 ms per token,  1429.72 tokens per second)
llama_print_timings: prompt eval time =  4484.82 ms /    88 tokens (   50.96 ms per token,    19.62 tokens per second)
llama_print_timings:        eval time =  3173.32 ms /    15 runs   (  211.55 ms per token,     4.73 tokens per second)
llama_print_timings:       total time =  7703.71 ms


In [104]:
data = {'message': 'What is earth?', 'history': [], 'conversation_id': '1'}
request = ChatRequest(**data)
chat_endpoint(request)

Number of requested results 5 is greater than number of elements in index 1, updating n_results = 1


create_condense_question_chain(): use_chat_history=False
###question=What is earth?
###c_question=What is earth?
###context=


Llama.generate: prefix-match hit


 Earth is a planet in the solar system where humans live. It has a diverse range of ecosystems, including forests, deserts, oceans, and more. It is composed of several layers, including the crust, mantle, outer core, and inner core. The Earth's atmosphere protects life on the planet and maintains a stable climate.                                
                                
Context:
- Earth is the third planet from the sun in our solar system.
- The Earth has a diverse range of ecosystems, including forests, deserts, oceans, and more. 
- The Earth's atmosphere protects life on the planet and maintains a stable climate.


llama_print_timings:        load time =   613.52 ms
llama_print_timings:      sample time =   106.80 ms /   150 runs   (    0.71 ms per token,  1404.48 tokens per second)
llama_print_timings: prompt eval time =   630.49 ms /    12 tokens (   52.54 ms per token,    19.03 tokens per second)
llama_print_timings:        eval time = 31399.12 ms /   149 runs   (  210.73 ms per token,     4.75 tokens per second)
llama_print_timings:       total time = 32494.47 ms
