# Retrieval Augmented Generation (RAG) for Question Answering tasks

In [1]:
# Mount Google Drive at /content/drive; the data files, FAISS indexes and
# result files used by later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [24]:
import os

# Credentials are taken from the environment so that real secrets never have to
# be typed into (and saved with) the notebook. The placeholder strings are kept
# as a fallback so the cell still runs when the variables are unset or empty.
HF_TOKEN = os.environ.get("HF_TOKEN") or "Your HuggingFace Token"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "Your Open AI API key"


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q tqdm openai pandas
%pip install -q ragatouille datasets torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap

In [4]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import datasets

# Do not truncate long text columns when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

In [None]:
import os
from huggingface_hub import notebook_login

# Publish the token through the HF_TOKEN environment variable.
os.environ["HF_TOKEN"] = HF_TOKEN

# NOTE(review): presumably opens the interactive Hugging Face login prompt —
# confirm it is still needed given that HF_TOKEN is already exported above.
notebook_login()

### Load your knowledge base

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument

NO_OF_FILES = 6
noOfFiles = NO_OF_FILES  # legacy alias, kept in case other cells still use it
DATA_DIR = "/content/drive/MyDrive/content/data"

# Build one LangChain document per CSV row; output<i>.csv holds the text of book <i>.
langchain_docs = []
for i in range(1, NO_OF_FILES + 1):
    df = pd.read_csv(f"{DATA_DIR}/output{i}.csv")
    # Empty cells are read as NaN (a float), which cannot be used as page
    # content, so they are dropped before the documents are created.
    texts = df["text"].dropna().astype(str)
    # later add page number to the source
    langchain_docs.extend(
        LangchainDocument(page_content=text, metadata={"source": f"book{i}"})
        for text in tqdm(texts)
    )

# Print a summary only: dumping every document would flood the cell output.
print(f"Loaded {len(langchain_docs)} raw documents from {NO_OF_FILES} files")

# Character-based chunks, used only as contexts for QA generation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    add_start_index=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

docs_processed = text_splitter.split_documents(langchain_docs)

# The unsplit documents are what the retriever pipeline re-chunks later on.
RAW_KNOWLEDGE_BASE = langchain_docs

  0%|          | 0/241 [00:00<?, ?it/s]

  0%|          | 0/230 [00:00<?, ?it/s]

  0%|          | 0/241 [00:00<?, ?it/s]

  0%|          | 0/268 [00:00<?, ?it/s]

  0%|          | 0/294 [00:00<?, ?it/s]

  0%|          | 0/202 [00:00<?, ?it/s]



### 1.2. Setup agents for question generation

We use [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) for QA couple generation because it has excellent performance in leaderboards such as [Chatbot Arena](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard).

In [7]:
from huggingface_hub import InferenceClient


repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
api_token = HF_TOKEN

# Client used for QA generation and for the critique agents.
llm_client = InferenceClient(
    model=repo_id,
    token=api_token,
    timeout=120,
)


def call_llm(
    inference_client: InferenceClient, prompt: str, max_new_tokens: int = 1000
):
    """Send `prompt` as a text-generation request and return the generated text.

    Args:
        inference_client: client used to post the request.
        prompt: raw prompt text sent as the request's "inputs".
        max_new_tokens: generation budget passed in the request parameters
            (defaults to 1000, the value this notebook has always used).

    Returns:
        The "generated_text" field of the first element of the decoded JSON
        response. In the smoke test below the returned text starts with the
        prompt itself, so callers should expect the prompt to be echoed.
    """
    response = inference_client.post(
        json={
            "inputs": prompt,
            "parameters": {"max_new_tokens": max_new_tokens},
            "task": "text-generation",
        },
    )
    return json.loads(response.decode())[0]["generated_text"]


# Smoke test: checks that the endpoint is reachable with the configured token.
call_llm(llm_client, "This is a test context")

'This is a test context for the `@mui/material` library.\n\n## Installation\n\n```sh\nnpm install @mui/material\n```\n\n## Usage\n\n```jsx\nimport React from \'react\';\nimport { Button } from \'@mui/material\';\n\nfunction App() {\n  return (\n    <div className="App">\n      <Button variant="contained" color="primary">\n        Hello World\n      </Button>\n    </div>\n  );\n}\n\nexport default App;\n```\n\n## Documentation\n\n- [Material-UI](https://material-ui.com/)\n- [Material Design](https://material.io/)'

In [8]:
# Prompt used to turn one context chunk into a question/answer couple.
# It is filled with str.format(context=...). The literal markers
# "Factoid question: " and "Answer: " are what the generation loop splits on,
# so they must stay in sync with that parsing code.
# NOTE(review): the output marker still says "Factoid question" although the
# instructions ask for elaborate, non-factoid questions — confirm this is intended.
QA_generation_prompt = """
Your task is to write a context-based question and provide a comprehensive answer based on the given context.
The context-based question should seek detailed explanations, reasoning, or descriptions that can be inferred from the context.
Avoid questions that require only short, factual responses and instead focus on questions that allow for elaborated answers.
Your answer should utilize the information from the context, synthesizing it to form a cohesive and informative response as if the author were elaborating on the topic.

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""


Now let's generate our QA couples.
For this example, we generate only 100 QA couples and will load the rest from the Hub.



In [None]:
import random

N_GENERATIONS = 100  # Kept small on purpose: every sample costs one LLM call
MAX_ANSWER_CHARS = 300  # answers of this length or longer are discarded

# random.sample raises ValueError when asked for more items than the
# population holds, so never request more chunks than are available.
n_samples = min(N_GENERATIONS, len(docs_processed))

print(f"Generating {n_samples} QA couples...")

outputs = []
for sampled_context in tqdm(random.sample(docs_processed, n_samples)):
    # Generate QA couple
    output_QA_couple = call_llm(
        llm_client, QA_generation_prompt.format(context=sampled_context.page_content)
    )

    # The text after the LAST occurrence of each marker is used, so a copy of
    # the prompt at the start of the output does not interfere with parsing.
    question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
    answer = output_QA_couple.split("Answer: ")[-1]

    # Explicit filter instead of `assert` inside a bare `except`: asserts are
    # removed under `python -O`, and a bare except also hides real errors.
    if len(answer) >= MAX_ANSWER_CHARS:
        continue

    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            "source_doc": sampled_context.metadata["source"],
        }
    )

###  Setup critique agents


In [10]:
# Prompts for the three critique agents. Each answer is expected to contain the
# literal markers "Evaluation: " and "Total rating: ", which the critique loop
# splits on. Only the groundedness prompt is formatted with a context; the
# relevance and standalone prompts receive the question alone, so their wording
# must not claim that a context is supplied.
question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'groundedness rating' assessing how deeply the question is anchored in and can be answered using the provided context.
This rating should reflect whether the question draws substantially on the content and nuances of the context.

Rate the groundedness on a scale from 1 to 5, where:
- 1 means the question is not answerable at all based on the context (the context provides no support for answering the question).
- 5 means the question is deeply grounded and can be clearly and unambiguously answered with the context (the context provides robust support for answering the question).

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'relevance rating' representing how useful this question is for people with little to no background in finance or commerce to understand concepts related to wealth generation, finance, and economy.

Rate the relevance on a scale from 1 to 5, where:
- 1 means the question is not useful at all for understanding these topics.
- 5 means the question is extremely useful and provides significant insights into financial and economic concepts.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'stand-alone rating' representing how understandable this question is without needing to refer back to the context it was generated from.
Give your answer on a scale of 1 to 5, where:
- 1 means that the question cannot be understood without additional context specific to the given document or situation.
- 5 means that the question is completely understandable on its own, even without that context, to someone with domain knowledge or internet access.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """





In [None]:
import re

print("Generating critique for each QA couple...")
for output in tqdm(outputs):
    # One LLM call per criterion; only groundedness needs the source context.
    evaluations = {
        "groundedness": call_llm(
            llm_client,
            question_groundedness_critique_prompt.format(
                context=output["context"], question=output["question"]
            ),
        ),
        "relevance": call_llm(
            llm_client,
            question_relevance_critique_prompt.format(question=output["question"]),
        ),
        "standalone": call_llm(
            llm_client,
            question_standalone_critique_prompt.format(question=output["question"]),
        ),
    }

    # Each criterion is parsed independently, so one malformed critique no
    # longer discards the criteria that come after it.
    for criterion, evaluation in evaluations.items():
        segments = evaluation.split("Total rating: ")
        if len(segments) < 2:
            continue  # no rating marker at all

        # The rationale sits between the last "Evaluation: " / "Total rating: "
        # pair; the score is the integer that opens the final segment. Any text
        # the model adds after the number is ignored.
        rationale_parts = segments[-2].split("Evaluation: ")
        score_match = re.match(r"\s*(\d+)", segments[-1])
        if len(rationale_parts) < 2 or score_match is None:
            continue  # unparseable critique: leave this criterion unset

        output.update(
            {
                f"{criterion}_score": int(score_match.group(1)),
                f"{criterion}_eval": rationale_parts[1],
            }
        )

Now let us filter out bad questions based on our critique agent scores:

In [13]:
import pandas as pd

pd.set_option("display.max_colwidth", None)

generated_questions = pd.DataFrame.from_dict(outputs)

# Columns shown in both the "before" and "after" previews.
_summary_columns = [
    "question",
    "answer",
    "groundedness_score",
    "relevance_score",
    "standalone_score",
]

print("Evaluation dataset before filtering:")
display(generated_questions[_summary_columns])

# Keep a question when it is fully stand-alone AND either sufficiently
# grounded or sufficiently relevant.
_grounded_or_relevant = (generated_questions["groundedness_score"] >= 3) | (
    generated_questions["relevance_score"] >= 3
)
_fully_standalone = generated_questions["standalone_score"] >= 5
generated_questions = generated_questions.loc[_grounded_or_relevant & _fully_standalone]

print("============================================")
print()
print()
print("Final evaluation dataset:")
print("No of questions: ", len(generated_questions))
display(generated_questions[_summary_columns])

eval_dataset = datasets.Dataset.from_pandas(
    generated_questions,
    split="train",
    preserve_index=False,
)

Evaluation dataset before filtering:


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
0,Why did the derelict hurry off after getting the dollar from rich dad?\n,The derelict hurried off after getting the dollar from rich dad because he was ecstatic with his good fortune.,2.0,2.0,4.0
1,How much did Luke and his wife save annually?\n,"Luke and his wife saved $50,000 annually.",5.0,5.0,3.0
2,What is Ken's occupation?\n,Ken is a key executive with a major communications and entertainment corporation.,2.0,5.0,1.0
3,What is the title of the guide to getting out of the Rat Race and onto the Fast Track?\n,Rich Dad’s CASHFLOW Quadrant,5.0,2.0,3.0
4,Who started the business called Kinko's?\n,"Paul Orfalea started the business called Kinko's with a $5,000 loan co-signed by his father in 1969. He initially rented a small garage and sold about $2,000 worth of services daily with the help of a few friends.",5.0,5.0,5.0
5,What is the name of the book that the author is best known for?\n,"The author is best known for his book ""Rich Dad Poor Dad"".",5.0,1.0,1.0
6,How long did it take for the family to pay off all of their consumer debt?\n,It took the family ten months to pay off all of their consumer debt.,,,
7,What is an example of a future goal that young children may have?\n,"An example of a future goal that young children may have is driving a tractor when they grow up. This goal reflects the interests and values of a young child, who may not yet fully understand the complexities and challenges of pursuing such a goal in the context of their future lives.",3.0,2.0,5.0
8,What was the surprise present given to Mr. Allan?\n,The surprise present given to Mr. Allan was a Rolls-Royce.,5.0,2.0,3.0
9,What is the focus of the authors regarding the wealth accumulation habits of stock brokers?\n,"The authors express an interest in studying the wealth accumulation habits of stock brokers, but no specific information or findings are provided in the context.",2.0,5.0,




Final evaluation dataset:
No of questions:  10


Unnamed: 0,question,answer,groundedness_score,relevance_score,standalone_score
4,Who started the business called Kinko's?\n,"Paul Orfalea started the business called Kinko's with a $5,000 loan co-signed by his father in 1969. He initially rented a small garage and sold about $2,000 worth of services daily with the help of a few friends.",5.0,5.0,5.0
7,What is an example of a future goal that young children may have?\n,"An example of a future goal that young children may have is driving a tractor when they grow up. This goal reflects the interests and values of a young child, who may not yet fully understand the complexities and challenges of pursuing such a goal in the context of their future lives.",3.0,2.0,5.0
23,What percentage of American households have a money market deposit account?\n,"According to the context, approximately 15 percent of American households have a money market deposit account.",5.0,4.0,5.0
28,Who is responsible for budgeting in the Rule household?\n,Mrs. Rule is responsible for budgeting in the Rule household. She is in charge of both household and business budgeting and spending.,5.0,2.0,5.0
32,"Who gets the chance to buy shares in small, newborn companies at the initial offering price?\n","Generally, ""big"" investors, such as fund managers who have millions, and even billions of dollars to work with, get the chance to buy shares in small, newborn companies at the initial offering price. These opportunities are rarely available to individual investors.",5.0,4.0,5.0
34,What is the purpose of the beginner emergency fund?\n,"The purpose of the beginner emergency fund is to serve as a financial safety net for unexpected expenses or emergencies, helping individuals avoid borrowing to cover these costs and maintain financial stability.",2.0,5.0,5.0
37,What do people do when they experience an increase in their realized income?\n,"When people encounter an increase in their realized income, they tend to spend it rather than save it. This behavior is attributed to their need for immediate satisfaction or consumption, rather than deferring the use of the additional income for future needs or wants.",2.0,4.0,5.0
40,Why are entrepreneurs becoming more common in the millionaire market?\n,"The real growth in the millionaire market continues to come from the entrepreneurial segment. Entrepreneurs, as a rule, are becoming more common in the millionaire market because they contribute significantly to the market's expansion.",3.0,5.0,5.0
41,When did Home Depot become the largest home-repair chain in the country?\n,"Home Depot became the largest home-repair chain in the country in 1989, three years after selling more stock and using the proceeds to pay some of its debts in 1986.",5.0,3.0,5.0
42,Who was John Law?\n,"John Law was a flashy wheeler-dealer and the founder of the Mississippi Company, a pet project that involved selling shares to tens of thousands of gullible customers in the stock markets of Paris and London. He was one of the most interesting characters of his century.",2.0,4.0,5.0


Now our synthetic evaluation dataset is complete! We can evaluate different RAG systems on this evaluation dataset.

We have generated only a few QA couples here to reduce time and cost. But let's kick start the next part by loading a pre-generated dataset:

###  Preprocessing documents to build our vector database


In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    tokenizer_name: str,
) -> List[LangchainDocument]:
    """
    Split documents into chunks of at most `chunk_size` tokens and return the unique chunks.

    Chunk length is measured with the Hugging Face tokenizer `tokenizer_name`
    (not in characters), and consecutive chunks overlap by `chunk_size / 10`
    tokens.

    Args:
        chunk_size: maximum chunk length, in tokens of `tokenizer_name`
        knowledge_base: documents to split
        tokenizer_name: Hugging Face model id whose tokenizer measures chunk length

    Returns:
        The chunks in their original order; when several chunks share the same
        text only the first one is kept.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=["\n\n", "\n", ".", " ", ""],
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    # Remove duplicates, keeping the first occurrence of each chunk text
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique

###  Retriever - embeddings 🗂️


In [15]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import os


def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model_name: Optional[str] = "sentence-transformers/all-MiniLM-L6-v2",
) -> FAISS:
    """
    Return the FAISS index for a (chunk size, embedding model) pair.

    The index is cached on disk in a folder named after both settings: when
    that folder exists the index is loaded from it, otherwise the documents are
    split, embedded, indexed and the new index is saved there.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model_name: name of the embedding model to use (also used as
            the tokenizer name when splitting)

    Returns:
        FAISS index
    """
    # Normalised embeddings are requested so that cosine similarity can be used.
    encode_options = {"normalize_embeddings": True}
    embedder = HuggingFaceEmbeddings(
        model_name=embedding_model_name,
        multi_process=True,
        model_kwargs={"device": "cuda"},
        encode_kwargs=encode_options,
    )

    # "/" in the model id is replaced so that the name is a single folder.
    safe_model_name = embedding_model_name.replace('/', '~')
    index_name = f"index_chunk:{chunk_size}_embeddings:{safe_model_name}"
    index_folder_path = f"/content/drive/MyDrive/content/data/indexes/{index_name}/"

    # Fast path: reuse the index saved by a previous run.
    if os.path.isdir(index_folder_path):
        return FAISS.load_local(
            index_folder_path,
            embedder,
            distance_strategy=DistanceStrategy.COSINE,
            allow_dangerous_deserialization=True,
        )

    print("Index not found, generating it...")
    chunks = split_documents(chunk_size, langchain_docs, embedding_model_name)
    knowledge_index = FAISS.from_documents(
        chunks,
        embedder,
        distance_strategy=DistanceStrategy.COSINE,
    )
    knowledge_index.save_local(index_folder_path)
    return knowledge_index

### 2.3. Reader - LLM 💬



In [16]:
# Reader prompt with system / user / assistant turns. The instructions form the
# system turn, so it has to be opened with <|system|> before being closed by
# </s>. Filled with str.format(context=..., question=...).
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information provided in the context, elaborate on the question asked by integrating insights and interpretations from the context.
Ensure the response is thorough and explores the dimensions of the question, drawing extensively on the contextual information provided.
The response should not only answer the question but should also illuminate the question's broader implications or underlying themes as suggested by the context.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [17]:
from langchain_community.llms import HuggingFaceHub

# READER_MODEL_NAME is the short label used in result-file names;
# repo_id is the full Hub id the reader is loaded from.
READER_MODEL_NAME = "zephyr-7b-beta"
repo_id = "HuggingFaceH4/zephyr-7b-beta"
hugging_face_api_token = HF_TOKEN

# Generation settings forwarded to the hosted model.
_reader_generation_kwargs = dict(
    max_new_tokens=512,
    top_k=30,
    temperature=0.1,
    repetition_penalty=1.03,
)

READER_LLM = HuggingFaceHub(
    repo_id=repo_id,
    task="text-generation",
    model_kwargs=_reader_generation_kwargs,
    huggingfacehub_api_token=hugging_face_api_token,
)

  warn_deprecated(


In [18]:
from ragatouille import RAGPretrainedModel
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    reranker: Optional[RAGPretrainedModel] = None,
    num_retrieved_docs: int = 30,
    num_docs_final: int = 7,
) -> Tuple[str, List[LangchainDocument]]:
    """Answer a question using RAG with the given knowledge index.

    Args:
        question: the question to answer
        llm: reader model, called with the fully formatted prompt
        knowledge_index: vector store queried by similarity search
        reranker: optional reranker applied to the retrieved texts
        num_retrieved_docs: number of documents fetched from the index
        num_docs_final: number of documents kept in the prompt

    Returns:
        A tuple (answer, relevant_docs) where relevant_docs holds the texts
        (page contents, not document objects) that were put in the prompt.
    """
    # Gather documents with retriever
    relevant_docs = knowledge_index.similarity_search(
        query=question, k=num_retrieved_docs
    )
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    if reranker:
        relevant_docs = reranker.rerank(question, relevant_docs, k=num_docs_final)
        relevant_docs = [doc["content"] for doc in relevant_docs]

    relevant_docs = relevant_docs[:num_docs_final]

    # Build the final prompt. Documents are separated by a newline so that the
    # end of one chunk does not run into the "Document N:::" header of the next.
    context = "\nExtracted documents:\n"
    context += "\n".join(
        f"Document {i}:::\n{doc}" for i, doc in enumerate(relevant_docs)
    )

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Generate the answer
    answer = llm(final_prompt)

    return answer, relevant_docs

#  Benchmarking the RAG system


In [19]:
def run_rag_tests(
    eval_dataset: datasets.Dataset,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
    reranker: Optional[RAGPretrainedModel] = None,
    verbose: Optional[bool] = True,
    test_settings: Optional[str] = None,  # To document the test settings used
):
    """Runs RAG tests on the given dataset and saves the results to the given output file.

    Results already present in `output_file` are kept and their questions are
    skipped, so an interrupted run can be resumed. The file is rewritten after
    every new answer so that progress is never lost.

    Args:
        eval_dataset: examples providing "question", "answer" and "source_doc"
        llm: reader model passed to `answer_with_rag`
        knowledge_index: vector store passed to `answer_with_rag`
        output_file: path of the JSON file used for loading and saving results
        reranker: optional reranker passed to `answer_with_rag`
        verbose: when true, print each question with its generated and true answer
        test_settings: optional label stored with every result
    """
    try:  # load previous generations if they exist
        with open(output_file, "r") as f:
            outputs = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # No usable checkpoint yet: start from scratch. Other errors (for
        # example a permission problem) are deliberately not hidden.
        outputs = []

    # Set of already answered questions, so the skip test is O(1) instead of
    # rebuilding a list for every example.
    answered_questions = {output["question"] for output in outputs}

    for example in tqdm(eval_dataset):
        question = example["question"]
        if question in answered_questions:
            continue

        answer, relevant_docs = answer_with_rag(
            question, llm, knowledge_index, reranker=reranker
        )
        if verbose:
            print("=======================================================")
            print(f"Question: {question}")
            print(f"Answer: {answer}")
            print(f'True answer: {example["answer"]}')
        result = {
            "question": question,
            "true_answer": example["answer"],
            "source_doc": example["source_doc"],
            "generated_answer": answer,
            "retrieved_docs": list(relevant_docs),
        }
        if test_settings:
            result["test_settings"] = test_settings
        outputs.append(result)
        answered_questions.add(question)

        # Checkpoint after every example
        with open(output_file, "w") as f:
            json.dump(outputs, f)


In [20]:
# Judge prompt. It exposes three placeholders — {instruction}, {response} and
# {reference_answer}; the doubled braces in step 3 are escaped so they survive
# templating as literal braces. The judge is told to end its reply with
# "[RESULT] <score>", which is the marker the evaluation code splits on.
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assesses the quality of the response based on how effectively it uses the context to address the question, rather than evaluating merely on factual accuracy.
2. After writing the feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

from langchain.schema import SystemMessage

# Two-message chat prompt: a fixed system persona followed by the rubric
# prompt as the human turn.
_judge_system_message = SystemMessage(content="You are a fair evaluator language model.")
_judge_human_message = HumanMessagePromptTemplate.from_template(EVALUATION_PROMPT)

evaluation_prompt_template = ChatPromptTemplate.from_messages(
    [_judge_system_message, _judge_human_message]
)


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q langchain-openai

In [22]:
from langchain_openai import ChatOpenAI

# Judge model; temperature 0 keeps the grading as repeatable as possible.
eval_chat_model = ChatOpenAI(model="gpt-4-turbo-preview",temperature=0, openai_api_key=OPENAI_API_KEY)
evaluator_name = "GPT"


def evaluate_answers(
    answer_path: str,
    eval_chat_model,
    evaluator_name: str,
    evaluation_prompt_template: ChatPromptTemplate,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing.

    Entries that already carry an `eval_score_<evaluator_name>` key are skipped.
    When the judge's reply does not contain the "[RESULT]" marker the entry is
    left without a score (and reported), so a later run can retry it instead of
    the whole evaluation being aborted.

    Args:
        answer_path: JSON file produced by the RAG run; updated in place
        eval_chat_model: chat model used as the judge
        evaluator_name: suffix of the score / feedback keys that are written
        evaluation_prompt_template: prompt formatted with instruction, response
            and reference_answer
    """
    answers = []
    if os.path.isfile(answer_path):  # load previous generations if they exist
        with open(answer_path, "r") as f:
            answers = json.load(f)

    for experiment in tqdm(answers):
        if f"eval_score_{evaluator_name}" in experiment:
            continue

        eval_prompt = evaluation_prompt_template.format_messages(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.invoke(eval_prompt)

        # Split on the LAST "[RESULT]": feedback comes before it, the score after.
        feedback, marker, score = eval_result.content.rpartition("[RESULT]")
        if not marker:
            print(
                f"No [RESULT] marker in the evaluation of {experiment['question']!r}; "
                "leaving it unscored."
            )
            continue

        experiment[f"eval_score_{evaluator_name}"] = score.strip()
        experiment[f"eval_feedback_{evaluator_name}"] = feedback.strip()

        # Checkpoint after every evaluation
        with open(answer_path, "w") as f:
            json.dump(answers, f)


Let's run the tests and evaluate answers

In [None]:
OUTPUT_DIR = "/content/drive/MyDrive/content/data/output"
# makedirs also creates missing parent folders and tolerates an existing one.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for chunk_size in [320]:  # Add other chunk sizes (in tokens) as needed like [100,200,300,400,500]
    for embeddings in ["thenlper/gte-small"]:  # Add other embeddings as needed like ["thenlper/gte-small","sentence-transformers/all-MiniLM-L6-v2","sentence-transformers/roberta-base-nli-stsb-mean-tokens"]
        # The index depends only on (chunk_size, embeddings), so it is loaded
        # once and shared by the runs with and without reranking.
        print("Loading knowledge base embeddings...")
        knowledge_index = load_embeddings(
            RAW_KNOWLEDGE_BASE,
            chunk_size=chunk_size,
            embedding_model_name=embeddings,
        )

        for rerank in [True, False]:
            settings_name = f"chunk:{chunk_size}_embeddings:{embeddings.replace('/', '~')}_rerank:{rerank}_reader-model:{READER_MODEL_NAME}"
            output_file_name = f"{OUTPUT_DIR}/rag_{settings_name}.json"

            print(f"Running evaluation for {settings_name}:")

            print("Running RAG...")
            reranker = (
                RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
                if rerank
                else None
            )
            run_rag_tests(
                eval_dataset=eval_dataset,
                llm=READER_LLM,
                knowledge_index=knowledge_index,
                output_file=output_file_name,
                reranker=reranker,
                verbose=False,
                test_settings=settings_name,
            )

            print("Running evaluation...")
            evaluate_answers(
                output_file_name,
                eval_chat_model,
                evaluator_name,
                evaluation_prompt_template,
            )

### Inspect results

In [None]:
import glob

# One DataFrame per result file, tagged with the file it came from. The files
# are sorted so that the numbering of the exported copies is reproducible.
result_files = sorted(glob.glob("/content/drive/MyDrive/content/data/output/*.json"))

outputs = []
for i, file in enumerate(result_files):
    with open(file, "r") as f:
        output = pd.DataFrame(json.load(f))
    output["settings"] = file
    outputs.append(output)
    output.to_json("/content/drive/MyDrive/content/data/results"+str(i)+".json")

if not outputs:
    # pd.concat would fail with a much less helpful message on an empty list.
    raise FileNotFoundError(
        "No result files found in /content/drive/MyDrive/content/data/output/"
    )
result = pd.concat(outputs)


In [None]:
def _parse_eval_score(raw_score) -> int:
    """Convert a raw judge score to an int; missing or malformed scores count as 1."""
    if not isinstance(raw_score, str):
        return 1
    try:
        return int(raw_score)
    except ValueError:
        return 1


# Keep the untouched scores in their own column the first time this cell runs.
# Normalising from that column makes the cell safe to re-run: previously a
# second run saw floats, mapped every row to 1 and turned all scores into 0.
if "eval_score_GPT_raw" not in result.columns:
    result["eval_score_GPT_raw"] = result["eval_score_GPT"]

# Map the 1-5 rating onto the 0-1 range.
result["eval_score_GPT"] = (
    result["eval_score_GPT_raw"].apply(_parse_eval_score) - 1
) / 4

In [None]:
# Mean normalised score per settings file, sorted in ascending order.
# sort_values() returns a new Series, so its result has to be kept.
average_scores = result.groupby("settings")["eval_score_GPT"].mean().sort_values()
evaluation_questions = pd.DataFrame(average_scores)
evaluation_questions.to_json("/content/drive/MyDrive/content/data/average_scores.json")