# Retrieval Augmented Generation modification and Evaluation

note: httpx verion 0.27.0  is necessary to use the httpx.AsyncClient with groq. langchain issue that needs fixing.

In [63]:
import os
import json

# Third-party library imports
from dotenv import load_dotenv  # For loading environment variables from a .env file
from PyPDF2 import PdfReader  # For reading PDF files
import tqdm  # For displaying progress bars in loops
import tqdm as notebook_tqdm
from langchain_community.vectorstores import Chroma
import glob
from langchain.text_splitter import RecursiveCharacterTextSplitter  # For splitting text into manageable chunks
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate, ChatPromptTemplate  # For defining and managing prompt templates
from langchain_groq import ChatGroq
from langchain.chains.combine_documents import create_stuff_documents_chain  # For combining retrieved documents into a coherent chain
from langchain_core.messages import HumanMessage, AIMessage  # For handling human and AI messages
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser  # For parsing string outputs from models
from langchain_openai import ChatOpenAI
from huggingface_hub import InferenceClient
import openai
import random
import pandas as pd



In [11]:
load_dotenv()
groq_key = os.getenv("GROQ_API_KEY")
hf_key = os.getenv("HF_ACCESS_TOKEN")
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
### load the pdf from the path
glob_path = "data/*.pdf"
text = ""
for pdf_path in tqdm.tqdm(glob.glob(glob_path)):
    with open(pdf_path, "rb") as file:
        reader = PdfReader(file)
         # Extract text from all pages in the PDF
        text += " ".join(page.extract_text() for page in reader.pages if page.extract_text())

text[:50]

100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


'Hyper tension in adul ts: \ndiagnosis and manag eme'

In [4]:
def split_texts(text:str, chunk_size:int, chunk_overlap:int):
    """
    Split a text into chunks of a given size with a given overlap.
    
    Args:
        text (str): The text to split.
        chunk_size (int): The size of the chunks to split the text into.
        chunk_overlap (int): The number of characters to overlap between chunks.
        
    Returns:
        List[str]: A list of text chunks.
    """
    
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    # Split the extracted text into manageable chunks
    chunks = splitter.split_text(text)
    return chunks



In [5]:
chunks = split_texts(text, 2000, 200)
print(len(chunks))
print(chunks[0])

130
Hyper tension in adul ts: 
diagnosis and manag emen t 
NICE guideline 
Published: 28 August 2019 
Last updat ed: 21 No vember 2023 
www .nice.or g.uk/guidance/ng136 
© NICE 202 4. All right s reserved. Subject t o Notice of right s (https://www .nice.or g.uk/t erms-and-
conditions#notice-of -right s). Your r esponsi bility 
The r ecommendations in t his guideline r epresent t he view of NICE, arriv ed at aft er car eful 
consideration of t he evidence a vailable. When e xercising t heir judgement, pr ofessionals 
and practitioners ar e expect ed to tak e this guideline fully int o account, alongside t he 
individual needs, pr eferences and v alues of t heir patient s or t he people using t heir ser vice. 
It is not mandat ory to apply t he recommendations, and t he guideline does not o verride t he 
responsibility t o mak e decisions appr opriat e to the cir cumstances of t he individual, in 
consultation wit h them and t heir f amilies and car ers or guar dian. 
All pr oblems (adv

In [6]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")



  from .autonotebook import tqdm as notebook_tqdm


In [7]:
vector_store = Chroma.from_texts(chunks, embeddings)

In [8]:
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 2})

In [13]:
docs = retriever.invoke("How do I diagnose Asthma?")
docs

[Document(metadata={}, page_content='medical r ecords, alongside t he coded diagnostic entr y. [NICE 2017 , amended \nBTS/NICE/SIGN 202 4] Asthma: diagnosis, monit oring and chr onic ast hma management (BTS, NICE, SIGN)\n(NG2 45)\n© NICE 202 4. All right s reserved. Subject t o Notice of right s (https://www .nice.or g.uk/t erms-and-\nconditions#notice-of -right s).Page 9 of\n64 Physical examina tion \n1.1.4 Examine people wit h suspect ed ast hma t o identify e xpirat ory polyphonic wheez e \nand signs of ot her causes of r espirat ory sympt oms but be awar e that e ven if \nexamination r esult s are normal, t he person ma y still ha ve ast hma. [NICE 2017] \nInitial tr eatmen t and obje ctive tests f or acu te sym ptoms a t \npresen tation \n1.1.5 Treat people immediat ely if t hey are acut ely unw ell or highly sympt omatic at \npresentation, and per form objectiv e tests that ma y help suppor t a diagnosis of \nasthma (f or example, eosinophil count , fractional e xhaled nitric o x

In [35]:
# Define the template for answering user questions based on a provided context
system_template = """
Answer the users question based on the below context:
<context> {context} </context>
Here is the question: <question> {question} </question>
"""
# Create a prompt template for the question-answering system
question_answering_prompt = PromptTemplate(template=system_template, input_variables=["context", "question"])
output_parser = StrOutputParser()

# Initialize the generative model for question answering
model = ChatGroq(model="llama-3.2-3b-preview", temperature=0, max_tokens=None, timeout=None, max_retries=2,)



In [42]:
#from langchain.globals import set_debug
#set_debug(True)
rag_chain = question_answering_prompt | model | output_parser

In [43]:
query = "How do I diagnose Asthma?"


In [44]:
print(rag_chain.invoke({"context": docs, "question": query}))

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > prompt:PromptTemplate] Entering Prompt run with input:
[0m[inputs]
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > prompt:PromptTemplate] [0ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[chain:RunnableSequence > llm:ChatGroq] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: \nAnswer the users question based on the below context:\n<context> [Document(metadata={}, page_content='medical r ecords, alongside t he coded diagnostic entr y. [NICE 2017 , amended \\nBTS/NICE/SIGN 202 4] Asthma: diagnosis, monit oring and chr onic ast hma management (BTS, NICE, SIGN)\\n(NG2 45)\\n© NICE 202 4. All right s reserved. Subject t o Notice of right s (https://www .nice.or g.uk/t erms-and-\\nconditions#notice-of -right s).Page 9 of\\n64 Physical examina tion \\n1.1.4 Examine peopl

# Setup Question Generation

In [41]:
# Define the QA generation prompt template
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::
"""

In [55]:
# Function to call the OpenAI API
from openai import OpenAI
def call_llm(prompt: str):
    """
    Calls the OpenAI API to generate a response for a given prompt.

    Args:
        prompt (str): The input prompt for the LLM.
        model (str): The OpenAI model to use (default is "gpt-4").

    Returns:
        str: The generated response from the LLM.
    """
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

    response = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": prompt}
        ],
        temperature=0.7
    )
    return response.choices[0].message.content


In [42]:
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [48]:
N_GENERATIONS = 10

print(f"Generating {N_GENERATIONS} QA couples...")

# Generate QA pairs
outputs = []
for sampled_context in tqdm.tqdm(random.sample(chunks, min(N_GENERATIONS, len(chunks)))):
    # Generate QA couple
    try:
        formatted_prompt = QA_generation_prompt.format(context=sampled_context)
        output_QA_couple = call_llm(formatted_prompt)
        # Extract question and answer from the output
        question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0].strip()
        answer = output_QA_couple.split("Answer: ")[-1].strip()
        # Validate and append to outputs
        assert len(answer) < 300, "Answer is too long"
        outputs.append(
            {
                "context": sampled_context,
                "question": question,
                "answer": answer,
                
            }
        )
    except Exception as e:
        print(f"Skipped a context due to error: {e}")
        continue

# Print generated outputs
for output in outputs:
    print(output)

Generating 10 QA couples...


100%|██████████| 10/10 [00:13<00:00,  1.35s/it]

{'context': 'is not t he int ention of t he committ ee to stop people fr om tr ying r elaxation t herapies if t hey \nwish t o, but t o mak e people awar e that t here is less e vidence f or benefit of t his \nintervention compar ed wit h other lif estyle int erventions or pharmacological tr eatment. The \ncommitt ee agr eed t hat t he clinical f ocus f or non-pharmacological tr eatment of \nhyper tension should be on encouraging people t o mak e lifestyle changes, such as taking \nregular e xercise and maintaining a healt hy weight. \nThe committ ee agr eed t hat fur ther r esear ch w ould be useful t o det ermine whet her \nrelaxation t herapies ar e a clinically eff ectiv e treatment f or hyper tension in t erms of \nreducing car diovascular e vents or impr oving quality of lif e (see t he recommendation f or \nresear ch on r elaxation t herapies ). The y also not ed that a lar ger study w ould be needed t o \nobtain meaningful r esult s. \nHow this mig ht affect practice \nRelaxati




In [51]:
display(pd.DataFrame(outputs).head(1))

Unnamed: 0,context,question,answer
0,is not t he int ention of t he committ ee to s...,What was the committee's consensus on the upta...,The committee consensus was that uptake has be...


### Question Filtering with Critiques

In [56]:
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [58]:
print("Generating critique for each QA couple...")
for output in tqdm.tqdm(outputs):
    evaluations = {
        "groundedness": call_llm(
            question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]),
        ),
        "relevance": call_llm(
            question_relevance_critique_prompt.format(question=output["question"]),
        ),
        "standalone": call_llm(
            question_standalone_critique_prompt.format(question=output["question"]),
        ),
    }
    try:
        for criterion, evaluation in evaluations.items():
            score, eval = (
                int(evaluation.split("Total rating: ")[-1].strip()),
                evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
            )
            output.update(
                {
                    f"{criterion}_score": score,
                    f"{criterion}_eval": eval,
                }
            )
    except Exception as e:
        continue



Generating critique for each QA couple...


100%|██████████| 10/10 [00:51<00:00,  5.16s/it]


In [61]:

pd.set_option("display.max_colwidth", None)

generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)
generated_questions = generated_questions.loc[
    (generated_questions["groundedness_score"] >= 4)
    & (generated_questions["relevance_score"] >= 4)
    & (generated_questions["standalone_score"] >= 4)
]
print("============================================")
print("Final evaluation dataset:")
display(
    generated_questions[
        [
            "question",
            "answer",
            "groundedness_score",
            "relevance_score",
            "standalone_score",
        ]
    ]
)

eval_dataset = datasets.Dataset.from_pandas(generated_questions, split="train", preserve_index=False)

Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,What was the committee's consensus on the uptake of relaxation therapies for hypertension treatment according to the 2011 guideline?,The committee consensus was that uptake has been low.,5,1,1
1,How long should the medicine be given for a trial period when using antagonist (LAMA) in addition to moderate-dose MART?,8 to 12 weeks.,5,1,2
2,What should be checked when asthma is uncontrolled according to the guidelines?,The fractional exhaled nitric oxide (FeNO) level.,5,1,1
3,What treatment pathway is recommended for people aged 12 and over for asthma management?,"MART with increasing dose of regular ICS/formoterol, depending on response to treatment.",3,1,5
4,"What type of medication is cheaper and simpler to add for asthma treatment, LTRA or LAMA?",LTRA (leukotriene receptor antagonist) is cheaper and simpler to add.,5,1,5
5,What was the lower systolic blood pressure target compared to the higher target in the SPRINT trial for people with primary hypertension without type 2 diabetes?,120 mmHg compared with 140 mmHg.,5,1,2
6,What is the ISBN number for the NICE guideline on hypertension in adults?,978-1-4731-5589-3,5,1,1
7,What type of drug is recommended for adults of Black African or African–Caribbean family origin when choosing antihypertensive treatment?,Angiotensin II receptor blocker (ARB),5,1,5
8,What treatment is recommended for young children with recurrent wheeze and features suggesting asthma?,A low dose of ICS for 8 to 12 weeks.,5,1,5
9,What test is used to check for the presence of protein in the urine for people with hypertension?,Estimation of the albumin:creatinine ratio.,5,1,5


Final evaluation dataset:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score


NameError: name 'datasets' is not defined