### Goal:
The idea is to generate multiple step-back questions for a query and use them for retrieval.

In [1]:
from dotenv import load_dotenv
load_dotenv()
import rich

In [15]:
from langchain_core.prompts import FewShotChatMessagePromptTemplate, ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnablePassthrough

from pydantic import BaseModel, Field
from typing import List

In [6]:
# Few-shot demonstrations: each entry pairs a specific question ('input') with
# the broader step-back question ('output') it should be abstracted to.
examples = [
    dict(
        input='What happens to the pressure, P, of an ideal gas if the temperature is increased by a factor of 2 and the volume is increased by a factor of 8?',
        output='What are the physics principles behind this question?',
    ),
    dict(
        input='Estella Leopold went to which school between Aug 1954 and Nov 1954?',
        output="What was Estella Leopold's education history?",
    ),
]

# Every example is rendered as a human turn followed by the AI's reply.
example_prompt = ChatPromptTemplate.from_messages([('human', '{input}'), ('ai', '{output}')])

# Few-shot prompt that applies `example_prompt` to each entry of `examples`.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

In [44]:
# Inspect the few-shot prompt object itself.
# Alternative view (rendered text instead of the object):
# rich.print(few_shot_prompt.format())
rich.print(few_shot_prompt)

In [117]:
# Output schema for the query generator: a single field, `queries`, holding the
# step-back questions as a list of strings.
# NOTE(review): documented with `#` comments on purpose - a class docstring
# would presumably be picked up by the generated JSON schema and so change the
# format instructions sent to the model; confirm before adding one.
class Multi_Step_Back(BaseModel):
    queries: List[str] = Field(description="step back and paraphrase of the original query, The number of step back questions is depend on the complexity of the original question, range from 1 to 5.")

# JSON parser bound to the schema above, and the format-instruction text it
# produces; the latter is injected into the system prompt in a later cell.
multi_step_back_parser = JsonOutputParser(pydantic_object=Multi_Step_Back)
multi_step_back_formater = multi_step_back_parser.get_format_instructions()

In [128]:
# System message: describes the step-back task, then appends the JSON format
# instructions and the rendered few-shot examples. Both values are bound up
# front through `partial_variables`, so this template takes no input at
# invoke time.
#
# Fix: the "simple question" rule used to end with "it should have more step
# back queries" (a copy of the previous line), which contradicted itself and
# told the model to produce more queries for simple questions.
system_message = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        template="""You are an expert at world knowledge. Your task is to step back and paraphrase a question to more generic step-back questions, which is easier to answer.

        The number of step back questions is depend on the complexity of the original question, range from 1 to 5.
        If the question need multiple steps of thinking, it should have more step back queries.
        If the question is simple, it can have just one step back query.
         
         {format_instructions}
         
         Here are a few examples:
         {few_shot_examples}
         """,
        partial_variables={
            'format_instructions': multi_step_back_formater,
            'few_shot_examples': few_shot_prompt.format(),
        },
    )
)

# Human message: carries the user's original question verbatim.
human_message = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        template='{question}',
        input_variables=['question'],
    )
)

# Final chat prompt: system instructions followed by the question. The
# few-shot examples are not added as separate chat messages because they are
# already embedded in the system message text above.
final_prompt = ChatPromptTemplate.from_messages(
    [
        system_message,
        human_message,
    ]
)

### Multi Query generator

In [129]:
def _pick_queries(parsed):
    """Return the value stored under the 'queries' key of the parsed reply."""
    return parsed['queries']


# question string -> chat prompt -> chat model -> parsed JSON -> 'queries' value
multi_step_back_queries_generator = (
    {"question": RunnablePassthrough()}
    | final_prompt
    | ChatOpenAI(temperature=0.9, model="gpt-4o-mini")
    | multi_step_back_parser
    | _pick_queries
)

In [133]:
# Smoke-test the generator on a sample question; `test` receives whatever the
# chain's last step extracts from the 'queries' key.
test = multi_step_back_queries_generator.invoke("What need to consider when using LLM to eval LLM generation?")
# Simpler sample question, kept for comparison:
# test = multi_step_back_queries_generator.invoke("How to pick rock from floor?")

In [134]:
# Show how many step-back queries came back, then the queries themselves.
print(len(test))
rich.print(test)

4


### Building Retriever

In [50]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [51]:
# Embedding model handed to the Chroma vector store when it is built below.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [52]:
# Load every file matching *.pdf from the source folder with PyPDFLoader.
loader = DirectoryLoader(
    '../../pdf_files/',
    glob="*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

# Split text into chunks

text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=500)
text_chunks = text_splitter.split_documents(documents)

# Embed the chunks and build a Chroma store backed by data/vectorstore.
vectorstore = Chroma.from_documents(
    documents=text_chunks,
    embedding=embedding,
    persist_directory="data/vectorstore",
)
# NOTE(review): the captured output below this cell looks like a warning raised
# on this call - presumably the Chroma `persist()` deprecation; confirm against
# the installed version before removing it.
vectorstore.persist()

# Retriever with default settings; later cells pipe queries into it.
retriever = vectorstore.as_retriever()

  vectorstore.persist()


### Add retriever into queries generator

In [64]:
# Feed the generated step-back queries to the retriever; `.map()` applies the
# retriever to each element of the incoming list of queries.
multi_step_back_queries_chain = multi_step_back_queries_generator | retriever.map()

In [65]:
# Run query generation plus per-query retrieval for the sample question.
test = multi_step_back_queries_chain.invoke("What need to consider when using LLM to eval LLM generation?")

In [66]:
# Inspect the raw retrieval results.
rich.print(test)

### RRF

In [69]:
from langchain.load import dumps, loads

def rrf(results: list[list], k=60):
    """Fuse several ranked document lists with Reciprocal Rank Fusion.

    Each document contributes ``1 / (rank + k)`` for every list it appears in,
    where ``rank`` is its 0-based position in that list. Documents are
    identified by their serialized form (``dumps(doc)``), so the same document
    returned for several queries is scored once, with its contributions summed.

    Args:
        results: One list of documents per query; each list is assumed to be
            ordered from most to least relevant.
        k: Constant added to the rank before taking the reciprocal; larger
            values flatten the difference between top and lower ranks.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending
        score. Documents with equal scores keep first-seen order.
    """
    fused_scores = {}
    # Keep the first object seen for each serialized key so the result can be
    # built without deserializing every document again via `loads`.
    first_seen = {}
    for docs in results:
        # assumes the docs are returned in the order of relevance
        for rank, doc in enumerate(docs):
            doc_key = dumps(doc)
            first_seen.setdefault(doc_key, doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # `sorted` is stable, so ties preserve insertion (first-seen) order.
    ranked_keys = sorted(fused_scores, key=fused_scores.get, reverse=True)
    return [(first_seen[doc_key], fused_scores[doc_key]) for doc_key in ranked_keys]

In [90]:
def _render_fused_docs(obj_list):
    """Wrap each fused document's text in <doc_i>...</doc_i> tags, one per line."""
    tagged = []
    for i, obj in enumerate(obj_list):
        # Entries that are empty or whose document has no text are skipped;
        # `i` still counts them, so tag numbers follow the fused ranking.
        if not obj or not obj[0].page_content:
            continue
        tagged.append(f"<doc_{i}>{obj[0].page_content}</doc_{i}>")
    return "\n".join(tagged)


# step-back queries -> per-query retrieval -> RRF fusion -> tagged context text
multi_step_back_queries_chain = (
    multi_step_back_queries_generator
    | retriever.map()
    | rrf
    | _render_fused_docs
)

In [91]:
# Run the full step-back retrieval chain (generation, retrieval, fusion,
# formatting) for the sample question.
test = multi_step_back_queries_chain.invoke("What need to consider when using LLM to eval LLM generation?")

In [95]:
# Print the fused context string (documents wrapped in <doc_i> tags).
print(test)

<doc_0>Model-Based Evaluation Metrics for Text Generation.
In Proceedings of the 61st Annual Meeting of the
Association for Computational Linguistics (Volume
1: Long Papers), 12067–12097. Toronto, Canada:
Association for Computational Linguistics.</doc_0>
<doc_1>Table 1: The evaluation template with three slots ({Q},
{R1} and {R2}) from Zheng et al. (2023). Even though
the template emphasizes not letting the order affect the
results (red text), large language models still have a
large positional bias.
promising their fairness as evaluators; 2) We de-
velop a calibration framework with three simple yet
effective strategies to calibrate the positional bias of
LLMs; 3) We manually annotate the “win/tie/lose”
outcomes of responses from ChatGPT and Vicuna-</doc_1>
<doc_2>Large Language Models are not Fair Evaluators
Peiyi Wang1 Lei Li1 Liang Chen1 Zefan Cai1 Dawei Zhu1
Binghuai Lin3 Yunbo Cao3 Qi Liu2 Tianyu Liu3 Zhifang Sui1
1 National Key Laboratory for Multimedia Information Processing, 

In [96]:
# Answer prompt: the model sees the context retrieved for the original question
# (`normal_context`), the context retrieved through the step-back queries
# (`step_back_context`), and the original question itself.
response_prompt_template = """You are an expert of world knowledge. 
I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. 
Otherwise, ignore them if they are not relevant.

<normal_context>
# {normal_context}
</normal_context>

<step_back_context>
# {step_back_context}
</step_back_context>


# Original Question: {question}
# Answer:"""

response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

# The chain is invoked with the raw question string, and every branch of the
# mapping below receives that same string.
#
# Fix: the chain used to start with an extra `{"question": RunnablePassthrough()}`
# step, which turned the input into a dict before this mapping ran. The
# retriever, the step-back chain and the `{question}` slot then all received
# `{"question": ...}` instead of the question text.
step_back_and_response_chain = (
    {
        "normal_context": retriever,
        "step_back_context": multi_step_back_queries_chain,
        "question": RunnablePassthrough(),
    }
    | response_prompt
    | ChatOpenAI(model="gpt-4o-mini", temperature=0.2)
    | StrOutputParser()
)

In [97]:
# Ask the sample question through the full retrieve-and-answer chain.
res = step_back_and_response_chain.invoke("What need to consider when using LLM to eval LLM generation?")

In [98]:
# Display the model's final answer.
rich.print(res)