# Evaluation of RAG Using Ragas

# Task 1: Installing Required Libraries

In [1]:
pip install -U -q langchain langchain-openai langchain_core langchain-community langchainhub openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install -qU ragas nest-asyncio

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install -qU qdrant-client pymupdf pandas

Note: you may need to restart the kernel to use updated packages.


# Task 2: Set Environment Variables

In [4]:
import os
import openai
from getpass import getpass

# Reuse a key that is already exported in the environment so that "Restart & Run All"
# does not block on an interactive prompt; otherwise ask for it without echoing it.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key
# NOTE(review): PYTHONTRACEMALLOC is normally read at interpreter start-up, so setting it
# here presumably only affects child processes started from this kernel — confirm intent.
os.environ["PYTHONTRACEMALLOC"] = "1"

# Task 3: Creating a Simple RAG Pipeline with LangChain v0.1.0

## Building our RAG pipeline

### Creating an Index

### Loading Data

In [5]:
from langchain_community.document_loaders import PyMuPDFLoader

# Point the loader at the hosted PDF and read it into LangChain documents.
loader = PyMuPDFLoader("https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf")
documents = loader.load()

In [6]:
# Inspect the metadata attached to the first loaded document.
documents[0].metadata

{'source': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf',
 'file_path': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf',
 'page': 0,
 'total_pages': 195,
 'format': 'PDF 1.3',
 'title': 'The Pmarca Blog Archives',
 'author': '',
 'subject': '',
 'keywords': '',
 'creator': '',
 'producer': 'Mac OS X 10.10 Quartz PDFContext',
 'creationDate': "D:20150110020418Z00'00'",
 'modDate': "D:20150110020418Z00'00'",
 'trapped': ''}

### Transforming Data

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into small overlapping chunks for indexing
# (chunk_size=200 with an overlap of 50).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

# NOTE: `documents` is rebound to the chunked list, so re-running this cell
# would split the already-split chunks again.
documents = text_splitter.split_documents(documents)

In [8]:
# Number of chunks produced by the splitter.
len(documents)

1864

### Loading OpenAI Embeddings Model

In [9]:
from langchain_openai import OpenAIEmbeddings

# Embedding model used to index the chunks for the baseline pipeline.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

### Creating a QDrant VectorStore

In [10]:
from langchain_community.vectorstores import Qdrant

# Embed the chunks and index them in an in-memory Qdrant collection.
qdrant_vector_store = Qdrant.from_documents(
    documents, embeddings, location=":memory:", collection_name="PMarca Blogs"
)

### ❓ Question #1:
List out a few of the techniques that Qdrant uses that make it performant.

Qdrant uses many techniques to reduce search latency, including caching disk data in RAM and preloading data from disk to RAM. As a result, the Qdrant process might use more memory than the minimum required to run the service. 

Approximate Nearest Neighbor Search (ANNS): Qdrant utilizes approximate nearest neighbor search algorithms, such as HNSW (Hierarchical Navigable Small World) graphs, to quickly find approximate closest vectors. This significantly reduces search time compared to an exact (brute-force) comparison against every stored vector.

Indexing Techniques: Qdrant uses advanced indexing techniques, including quantization and clustering, to organize and partition the vector space. This improves search efficiency by narrowing down the search space.

## Creating a Retriever

In [11]:
# Expose the vector store through the retriever interface; no search arguments
# are passed, so as_retriever()'s defaults apply.
retriever = qdrant_vector_store.as_retriever()


## Testing our Retriever

In [12]:
# Sanity-check retrieval: print every document returned for a sample query,
# each one preceded by a separator banner.
retrieved_documents = retriever.invoke("What is a rule of thumb for selecting an industry to invest in?")
for doc in retrieved_documents:
    print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', doc, sep='\n')

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
page_content='the existing order — and make sure that those forces of change
have a reasonable chance at succeeding.
Second rule of thumb:
Once you have picked an industry, get right to the center of it' metadata={'source': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf', 'file_path': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf', 'page': 125, 'total_pages': 195, 'format': 'PDF 1.3', 'title': 'The Pmarca Blog Archives', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Mac OS X 10.10 Quartz PDFContext', 'creationDate': "D:20150110020418Z00'00'", 'modDate': "D:20150110020418Z00'00'", 'trapped': '', '_id': 'ba4cae00ee58482ba45c7ad82d236557', '_collection_name': 'PMarca Blogs'}
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
page_content='Third rule:
In a rapidly changing Held like technology, the best place to
get experience when you’re starting out is in younger, high-
growth c

## Creating a RAG Chain

## Creating a Prompt Template

In [13]:
from langchain import hub

# Pull the shared "retrieval-qa-chat" prompt from the LangChain Hub.
retrieval_qa_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

In [14]:
# Show the template text of the hub prompt's first message.
print(retrieval_qa_prompt.messages[0].prompt.template)


Answer any use questions based solely on the context below:

<context>
{context}
</context>


In [15]:
from langchain.prompts import ChatPromptTemplate

# Custom RAG prompt: the model is told to answer only from the supplied context
# and to reply 'I don't know' when the context is insufficient.
# Placeholders: {context} and {question}.
template = """Answer the question based only on the following context. If you cannot answer the question with the context, please respond with 'I don't know':

Context:
{context}

Question:
{question}
"""

prompt = ChatPromptTemplate.from_template(template)
     

# Setting Up our Basic QA Chain

In [16]:
from operator import itemgetter

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Chat model shared by the QA chains in this notebook; temperature is set to 0.
primary_qa_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Baseline RAG chain. Input: {"question": "<text>"}. Output: {"response": <LLM message>, "context": <retrieved documents>}.
retrieval_augmented_qa_chain = (
    # INVOKE CHAIN WITH: {"question" : "<>"}
    # "question" : populated by getting the value of the "question" key
    # "context"  : populated by getting the value of the "question" key and piping it into `retriever`
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # "context"  : re-assigned from the "context" key of the previous step, so the dict is
    #              passed along with the same keys ("question" is kept, "context" keeps its value)
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # "response" : the "context" and "question" values are used to format our prompt object and then piped
    #              into the LLM and stored in a key called "response"
    # "context"  : populated by getting the value of the "context" key from the previous step,
    #              so callers can inspect which documents were retrieved
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

# 🏗️ Activity #1:
Describe the pipeline shown above in simple terms. You can include a diagram if desired.


The provided pipeline is a structured process that leverages an LLM (Large Language Model) to perform a retrieval-augmented question-answering task. Here's a breakdown of each step in simpler terms:

1. Initialize LLM:

    - primary_qa_llm is a ChatOpenAI instance, set to use the "gpt-3.5-turbo" model with temperature=0 to minimize randomness in its responses.

2. Setup Pipeline:

    - The pipeline is a sequence of steps that take in a user's question and process it to generate an answer using both retrieval and the LLM.

3. Step-by-Step Explanation:

    Step 1:

        - The input {"question" : "<<SOME USER QUESTION>>"} is received.
        - The "context" is populated by retrieving relevant information based on the "question".
        - The itemgetter("question") | retriever part indicates that the value of "question" is passed to a retriever to fetch the context.
    Step 2:

        - The "context" obtained from the previous step is passed through a RunnablePassthrough, which simply passes the value along without modifying it.

    Step 3:

        - The final step formats the input (context and question) into a prompt for the LLM.
        - The LLM generates a response based on this prompt, and the response is stored in a key called "response".

In [17]:
# Ask a sample question; the chain returns a dict with "response" and "context" keys.
response = retrieval_augmented_qa_chain.invoke({"question": "What is the The Moby Dick theory of big companies, explain it to me?"})
# Display only the text content of the model's reply.
response["response"].content

'The Moby Dick theory of big companies is a concept that involves comparing big companies to the character Captain Ahab from the novel Moby Dick. It suggests that companies should not become obsessed or consumed by pursuing a single goal or target, similar to how Captain Ahab was consumed by his obsession with hunting the white whale. Instead, companies should engage in various discussions and activities with big companies but avoid becoming fixated on one particular aspect.'

In [18]:
# Print each retrieved document that backed the previous answer, framed by a header and a rule.
for context in response["context"]:
    print("Context:", context, "----", sep="\n")

Context:
page_content='Part 5: The Moby Dick theory of big companies
37' metadata={'source': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf', 'file_path': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf', 'page': 41, 'total_pages': 195, 'format': 'PDF 1.3', 'title': 'The Pmarca Blog Archives', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Mac OS X 10.10 Quartz PDFContext', 'creationDate': "D:20150110020418Z00'00'", 'modDate': "D:20150110020418Z00'00'", 'trapped': '', '_id': 'd91bb9c85121408697385d1905ca8ecb', '_collection_name': 'PMarca Blogs'}
----
Context:
page_content='Part 5: The Moby Dick theory of big
companies
“There she blows,” was sung out from the mast-head.
“Where away?” demanded the captain.
“Three points oE the lee bow, sir.”' metadata={'source': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf', 'file_path': 'https://d1lamhf6l6yk6d.cloudfr

In [19]:
# A question whose answer is expected to be in the indexed PDF.
question = "What is a rule of thumb for selecting an industry to invest in?"

result = retrieval_augmented_qa_chain.invoke({"question" : question})

# Only the answer text is printed here.
print(result["response"].content)

Get right to the center of it.


In [20]:
# A question the context should not be able to answer, to exercise the prompt's
# 'I don't know' instruction.
question = "What did Pink Floyd have to say about how to proceed when investing in a new industry?"

result = retrieval_augmented_qa_chain.invoke({"question" : question})

# Print the answer and then the raw retrieved documents for inspection.
print(result["response"].content)
print(result["context"])

I don't know.
[Document(metadata={'source': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf', 'file_path': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf', 'page': 15, 'total_pages': 195, 'format': 'PDF 1.3', 'title': 'The Pmarca Blog Archives', 'author': '', 'subject': '', 'keywords': '', 'creator': '', 'producer': 'Mac OS X 10.10 Quartz PDFContext', 'creationDate': "D:20150110020418Z00'00'", 'modDate': "D:20150110020418Z00'00'", 'trapped': '', '_id': '657b9e4f8e644894bb1c605cc8200ccc', '_collection_name': 'PMarca Blogs'}, page_content='ask if you can call them again if things change.\nTrust me — they’d much rather be saying “yes” than “no” —\nthey need all the good investments they can get.\nSecond, consider the environment.'), Document(metadata={'source': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf', 'file_path': 'https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-

# Task 4: Synthetic Dataset Generation for Evaluation using Ragas

# Synthetic Test Set Generation

In [21]:
# Reload the source PDF for the evaluation set (this rebinds `loader`).
loader = PyMuPDFLoader("https://d1lamhf6l6yk6d.cloudfront.net/uploads/2021/08/The-pmarca-Blog-Archives.pdf")
eval_documents = loader.load()

# The evaluation corpus is chunked with chunk_size=600, unlike the 200 used for the index.
text_splitter_eval = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
eval_documents = text_splitter_eval.split_documents(eval_documents)

# ❓ Question #2:
Why is it important to split our documents using different parameters when creating our synthetic data?

We need to split documents differently because using the same chunks as the original RAG setup might not challenge the system effectively. If our evaluation questions are too much like the original documents, we won't be able to create varied and tough questions that truly test the RAG pipeline. It's like giving students a test with questions they've already seen, which doesn't really check their knowledge. For a proper evaluation, our test needs to be realistic and varied to see how the model will do in real-world situations.


In [22]:
# Number of evaluation chunks produced by the 600-size splitter.
len(eval_documents)


624

In [24]:
import nest_asyncio
# Patch asyncio to allow nested event loops — presumably needed because the notebook
# kernel already runs a loop while Ragas drives async calls; confirm.
nest_asyncio.apply()

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Generator and critic models handed to the Ragas test-set generator.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo-16k")
# Alternative critic if you don't have GPT-4 access, or to reduce cost/rate limiting issues:
# critic_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o")
# NOTE: this rebinds the notebook-level `embeddings` name to a default-configured OpenAIEmbeddings.
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Share of generated questions per evolution type (the values sum to 1.0).
distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}

# Request 20 synthetic questions from the evaluation chunks, running synchronously
# and surfacing any generation error instead of swallowing it.
testset = generator.generate_with_langchain_docs(
    eval_documents,
    20,
    distributions,
    is_async=False,
    raise_exceptions=True,
)
testset.to_pandas()

embedding nodes:   0%|          | 0/1248 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the importance of having a technical e...,[workforce in a high-impact way when you gradu...,Having a technical element in undergraduate de...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
1,How does the faster cycle time in startups mak...,[would have spent 10 or 20 or 30 years buildin...,The faster cycle time in startups makes it eas...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
2,What is the emotional incentive that founders ...,[and give her a large stock-option grant with ...,Founders naturally have an emotional incentive...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
3,How does the quality of output in a creative c...,[becomes irrelevant to determining the success...,Quality of output in a creative career does no...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
4,How does precociousness and longevity affect t...,[These three components are conspicuously link...,The answer to given question is not present in...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
5,What are some techniques that can help entrepr...,[I want to tell you about my new startup” is a...,If you engage in a set of these techniques ove...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
6,What are the potential consequences of not bei...,[Here’s why you shouldn’t do that:\nWhat are t...,Not being able to secure additional funding wh...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
7,How can structured procrastination be used to ...,[like?\nStructured procrastination\nThis is a ...,Structured procrastination can be used to acco...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
8,What criteria should be valued when evaluating...,[How to hire the best people you've\never work...,Criteria that should be valued when evaluating...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
9,What factors determine the peak age for creati...,"[ods when productivity is highest, the peak ag...",The expected age optimum for quantity and qual...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True


## ❓ Question #3:
{simple: 0.5, reasoning: 0.25, multi_context: 0.25}

What exactly does this mapping refer to?

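This mapping specifies the proportion of generated questions that Ragas should produce for each evolution type (the values sum to 1):
1. simple: straightforward questions, 50%.
2. reasoning: questions that require reasoning to answer, 25%.
3. multi_context: questions that require combining multiple contexts to answer, 25%.

Note: the generation cell above actually uses a different split (simple 0.5, multi_context 0.4, reasoning 0.1).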

In [25]:
# Look at the first generated row in full (question, contexts, ground truth, ...).
testset.test_data[0]

DataRow(question='What is the importance of having a technical element in undergraduate degrees?', contexts=['workforce in a high-impact way when you graduate, and you’ll\nhave to go get a useful graduate degree.\nAnd even if you are already planning to get a useful graduate\ndegree, you are much better oW combining it with a substantive\nundergraduate degree — thereby becoming a “double threat”.\nMore on this in a bit.\nWhich undergraduate degrees are useful in\nthe real world?\nTypically, those that have a technical element of some form —\nthat teach you how to do something substantive.\nEngineering degrees obviously qualify. The current myth that\nengineering and computer science degrees are less useful'], ground_truth='Having a technical element in undergraduate degrees is important because it teaches students how to do something substantive and prepares them for the real world. Engineering degrees are a prime example of degrees that have a technical element.', evolution_type='simp

# Generating Responses with RAG Pipeline

In [26]:
# Materialise the generated test set as a pandas DataFrame for the steps below.
test_df = testset.to_pandas()

In [27]:
# Display the synthetic test set.
test_df

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the importance of having a technical e...,[workforce in a high-impact way when you gradu...,Having a technical element in undergraduate de...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
1,How does the faster cycle time in startups mak...,[would have spent 10 or 20 or 30 years buildin...,The faster cycle time in startups makes it eas...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
2,What is the emotional incentive that founders ...,[and give her a large stock-option grant with ...,Founders naturally have an emotional incentive...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
3,How does the quality of output in a creative c...,[becomes irrelevant to determining the success...,Quality of output in a creative career does no...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
4,How does precociousness and longevity affect t...,[These three components are conspicuously link...,The answer to given question is not present in...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
5,What are some techniques that can help entrepr...,[I want to tell you about my new startup” is a...,If you engage in a set of these techniques ove...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
6,What are the potential consequences of not bei...,[Here’s why you shouldn’t do that:\nWhat are t...,Not being able to secure additional funding wh...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
7,How can structured procrastination be used to ...,[like?\nStructured procrastination\nThis is a ...,Structured procrastination can be used to acco...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
8,What criteria should be valued when evaluating...,[How to hire the best people you've\never work...,Criteria that should be valued when evaluating...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True
9,What factors determine the peak age for creati...,"[ods when productivity is highest, the peak ag...",The expected age optimum for quantity and qual...,simple,[{'source': 'https://d1lamhf6l6yk6d.cloudfront...,True


In [28]:
# Pull the questions and their reference answers out of the test set as plain Python lists.
test_questions = test_df["question"].tolist()
test_groundtruths = test_df["ground_truth"].tolist()

In [29]:
# Run every synthetic question through the baseline chain, collecting the answer text
# and the page content of each retrieved document.
answers, contexts = [], []

for question in test_questions:
    response = retrieval_augmented_qa_chain.invoke({"question": question})
    answers.append(response["response"].content)
    contexts.append([doc.page_content for doc in response["context"]])

In [31]:

from datasets import Dataset

# Assemble the evaluation dataset: question, generated answer, retrieved contexts, reference answer.
response_dataset = Dataset.from_dict(
    dict(
        question=test_questions,
        answer=answers,
        contexts=contexts,
        ground_truth=test_groundtruths,
    )
)

In [32]:
# Spot-check the first assembled record.
response_dataset[0]


{'question': 'What is the importance of having a technical element in undergraduate degrees?',
 'answer': 'The importance of having a technical element in undergraduate degrees is that it teaches individuals how to do something substantive, difficult, and useful that matters in the real world.',
 'contexts': ['Which undergraduate degrees are useful in\nthe real world?\nTypically, those that have a technical element of some form —\nthat teach you how to do something substantive.',
  'ify, as do mathematics and economics.\nWhy do I take this stance?\n•\nTechnical degrees teach you how to do something diZcult\nand useful that matters in the real world. Even if you don’t',
  'that teach you how to do something substantive.\nEngineering degrees obviously qualify. The current myth that\nengineering and computer science degrees are less useful',
  'path to doing anything big.\n•\nPlus, technical degrees indicate seriousness of purpose to\nfuture employers and partners. You get coded right up 

# Task 1: Evaluating our Pipeline with Ragas

In [33]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# Metrics computed for every pipeline variant, in the order they are reported.
metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

In [37]:
from ragas.run_config import RunConfig
# Throttled run settings. Per the original author's notes: max_wait defaults to 60 and
# max_workers to 16; a single worker is believed to be what avoids rate-limit exceptions.
run_config = RunConfig(timeout=60, max_retries=10, max_wait=180, max_workers=1)

# Score the baseline pipeline's responses with the selected metrics.
results = evaluate(response_dataset, metrics, run_config=run_config)


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

No statements were generated from the answer.


In [38]:
# Aggregate baseline scores per metric.
results

{'faithfulness': 0.6723, 'answer_relevancy': 0.8507, 'context_recall': 0.5700, 'context_precision': 0.7292, 'answer_correctness': 0.4132}

In [39]:
# Per-question baseline scores as a DataFrame.
results_df = results.to_pandas()
results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What is the importance of having a technical e...,The importance of having a technical element i...,[Which undergraduate degrees are useful in\nth...,Having a technical element in undergraduate de...,1.0,1.0,1.0,1.0,0.454154
1,How does the faster cycle time in startups mak...,The faster cycle time in startups makes it eas...,[product families or grow market share. This o...,The faster cycle time in startups makes it eas...,0.666667,0.993744,0.0,0.75,0.455512
2,What is the emotional incentive that founders ...,Founders have an emotional incentive to see th...,[have an emotional incentive to see the compan...,Founders naturally have an emotional incentive...,1.0,0.945197,1.0,0.25,0.998458
3,How does the quality of output in a creative c...,The quality of output in a creative career rel...,[creator’s most distinguished work will appear...,Quality of output in a creative career does no...,0.0,0.999998,0.0,1.0,0.230813
4,How does precociousness and longevity affect t...,Precociousness and longevity are positively as...,"[early, end late, and produce at above-average...",The answer to given question is not present in...,1.0,0.954687,1.0,0.0,0.176977
5,What are some techniques that can help entrepr...,Pitching VCs and having a working product that...,[If\nyou\nengage\nin\na\nset\nof\nthese\ntechn...,If you engage in a set of these techniques ove...,1.0,0.890365,1.0,1.0,0.726996
6,What are the potential consequences of not bei...,Not raising enough money risks the survival of...,[Here’s why you shouldn’t do that:\nWhat are t...,Not being able to secure additional funding wh...,1.0,0.935973,0.2,1.0,0.480676
7,How can structured procrastination be used to ...,Structured procrastination can be used to acco...,[standing.)\nThe gist of Structured Procrastin...,Structured procrastination can be used to acco...,1.0,0.970069,1.0,1.0,0.490186
8,What criteria should be valued when evaluating...,Criteria: what to value when evaluating candid...,[with — particularly for a startup.\nI’m going...,Criteria that should be valued when evaluating...,1.0,0.909506,0.0,0.25,0.22172
9,What factors determine the peak age for creati...,Possible intrinsic factors that could explain ...,[creator’s most distinguished work will appear...,The expected age optimum for quantity and qual...,0.5,0.910477,1.0,0.916667,0.217767


# Task 2: Making Adjustments to our RAG Pipeline


In [40]:
from langchain.retrievers import MultiQueryRetriever

# Wrap the base retriever in a MultiQueryRetriever driven by the QA LLM
# (presumably it issues several rephrasings of the query — see the LangChain docs).
advanced_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=primary_qa_llm)

In [41]:
from langchain.chains.combine_documents import create_stuff_documents_chain

# Build the document-combining chain from the QA LLM and the prompt pulled from the hub.
document_chain = create_stuff_documents_chain(primary_qa_llm, retrieval_qa_prompt)

In [42]:
from langchain.chains import create_retrieval_chain

# Join the multi-query retriever and the document chain; the result is invoked with an
# "input" key and its output is read through the "answer" and "context" keys below.
retrieval_chain = create_retrieval_chain(advanced_retriever, document_chain)

In [43]:
# Off-topic probe: a question the indexed PDF is not expected to cover.
response = retrieval_chain.invoke({"input": "Who is Taylor Swift fueding with?"})


In [44]:
# Show the chain's answer to the off-topic probe.
print(response["answer"])


I'm sorry, but based on the context provided, I cannot determine who Taylor Swift is feuding with.


In [45]:
# Follow-up that relies on the previous turn; only "input" is passed, so no chat history is supplied.
response = retrieval_chain.invoke({"input": "Why are they fueding?"})


In [46]:
# Show the chain's answer to the follow-up question.
print(response["answer"])


The text does not provide any information about a feud or conflict between individuals or groups. It mainly discusses factors that contribute to success or failure in business, human behavior, and decision-making processes.


In [47]:
# Re-answer every synthetic question with the multi-query retrieval chain.
# This rebinds `answers` and `contexts`, replacing the baseline lists.
answers, contexts = [], []

for question in test_questions:
    response = retrieval_chain.invoke({"input": question})
    answers.append(response["answer"])
    contexts.append([doc.page_content for doc in response["context"]])

In [48]:
# Evaluation dataset for the multi-query pipeline, using the same columns as the baseline dataset.
response_dataset_advanced_retrieval = Dataset.from_dict(
    dict(
        question=test_questions,
        answer=answers,
        contexts=contexts,
        ground_truth=test_groundtruths,
    )
)

In [50]:
from ragas.run_config import RunConfig
# Same throttled settings as the baseline run. Per the original author's notes: max_wait
# defaults to 60 and max_workers to 16; one worker is believed to avoid rate-limit exceptions.
run_config = RunConfig(timeout=60, max_retries=10, max_wait=180, max_workers=1)

# Score the multi-query pipeline with the same metrics as the baseline.
advanced_retrieval_results = evaluate(response_dataset_advanced_retrieval, metrics, run_config=run_config)


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

In [52]:
# Aggregate scores per metric for the multi-query pipeline.
advanced_retrieval_results

{'faithfulness': 0.8085, 'answer_relevancy': 0.8880, 'context_recall': 0.4828, 'context_precision': 0.7495, 'answer_correctness': 0.5415}

In [51]:
# Per-question scores for the multi-query pipeline as a DataFrame.
advanced_retrieval_results_df = advanced_retrieval_results.to_pandas()
advanced_retrieval_results_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What is the importance of having a technical e...,Having a technical element in undergraduate de...,[Which undergraduate degrees are useful in\nth...,Having a technical element in undergraduate de...,1.0,0.987819,1.0,0.95,0.881542
1,How does the faster cycle time in startups mak...,The faster cycle time in startups makes it eas...,[product families or grow market share. This o...,The faster cycle time in startups makes it eas...,0.428571,0.955483,0.0,0.81,0.377099
2,What is the emotional incentive that founders ...,Founders have an emotional incentive to see th...,[have an emotional incentive to see the compan...,Founders naturally have an emotional incentive...,1.0,0.889491,1.0,0.325,0.741114
3,How does the quality of output in a creative c...,The quality of output in a creative career is ...,"[the connection between productive precocity, ...",Quality of output in a creative career does no...,0.666667,0.949234,0.0,0.866667,0.228545
4,How does precociousness and longevity affect t...,Creators who are precocious (showing early tal...,[creator’s most distinguished work will appear...,The answer to given question is not present in...,0.333333,0.945921,0.0,0.0,0.175416
5,What are some techniques that can help entrepr...,Some techniques that can help entrepreneurs en...,[venture capitalists for follow-on funding. It...,If you engage in a set of these techniques ove...,0.583333,0.978161,1.0,0.770833,0.500679
6,What are the potential consequences of not bei...,Not being able to secure additional funding wh...,[Here’s why you shouldn’t do that:\nWhat are t...,Not being able to secure additional funding wh...,1.0,0.936073,0.2,0.767857,0.521837
7,How can structured procrastination be used to ...,Structured procrastination can be used to acco...,[standing.)\nThe gist of Structured Procrastin...,Structured procrastination can be used to acco...,1.0,0.970069,1.0,1.0,0.742655
8,What criteria should be valued when evaluating...,"When evaluating candidates for a startup, it i...",[with — particularly for a startup.\nI’m going...,Criteria that should be valued when evaluating...,1.0,0.997149,0.0,0.444444,0.664004
9,What factors determine the peak age for creati...,The text mentions that there are possible intr...,[creator’s most distinguished work will appear...,The expected age optimum for quantity and qual...,0.666667,0.0,0.0,0.8875,0.216452


# Task 3: Evaluating our Adjusted Pipeline Against Our Baseline

In [53]:
# Baseline aggregate scores, shown again for comparison.
results

{'faithfulness': 0.6723, 'answer_relevancy': 0.8507, 'context_recall': 0.5700, 'context_precision': 0.7292, 'answer_correctness': 0.4132}

In [54]:
# Multi-query aggregate scores, shown again for comparison.
advanced_retrieval_results

{'faithfulness': 0.8085, 'answer_relevancy': 0.8880, 'context_recall': 0.4828, 'context_precision': 0.7495, 'answer_correctness': 0.5415}

In [55]:
import pandas as pd

# Turn each score mapping into a two-column frame: metric name + that pipeline's score.
df_original = pd.DataFrame(
    list(results.items()),
    columns=['Metric', 'Baseline'],
)
df_comparison = pd.DataFrame(
    list(advanced_retrieval_results.items()),
    columns=['Metric', 'MultiQueryRetriever with Document Stuffing'],
)

# Join on the metric name so every row carries both scores for one metric.
df_merged = df_original.merge(df_comparison, on='Metric')

# Positive Delta: the MultiQueryRetriever pipeline scored higher than the baseline.
df_merged['Delta'] = df_merged['MultiQueryRetriever with Document Stuffing'].sub(df_merged['Baseline'])

df_merged

Unnamed: 0,Metric,Baseline,MultiQueryRetriever with Document Stuffing,Delta
0,faithfulness,0.672306,0.808452,0.136147
1,answer_relevancy,0.850735,0.888024,0.037289
2,context_recall,0.57,0.482778,-0.087222
3,context_precision,0.729167,0.749532,0.020365
4,answer_correctness,0.413226,0.541522,0.128296


# Task 4: Testing OpenAI's Claim

In [56]:
# New embedding model under comparison; it will generate the embeddings for both
# the documents and the queries.
new_embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [57]:
# Embed `documents` with `text-embedding-3-small` and keep them in an in-memory
# Qdrant collection.
vector_store = Qdrant.from_documents(
    documents,
    new_embeddings_model,
    collection_name="PMarca Blogs - TE3 - MQR",
    location=":memory:",
)

In [58]:
new_retriever = vector_store.as_retriever() # <--- The vector store is used as a retriever of context to add to the prompt. Context is retrieved based on the embeddings closest to the question.

In [59]:
# Wrap the new retriever in a MultiQueryRetriever, handing it `primary_qa_llm` as its LLM.
new_advanced_retriever = MultiQueryRetriever.from_llm(
    llm=primary_qa_llm,
    retriever=new_retriever,
)

In [60]:
# New retrieval chain: the new advanced retriever combined with the existing `document_chain`.
new_retrieval_chain = create_retrieval_chain(
    new_advanced_retriever,
    document_chain,
)

In [61]:
# Run every test question through the new retrieval chain, collecting each
# generated answer together with the text of the documents returned as its context.
answers, contexts = [], []

for question in test_questions:
    response = new_retrieval_chain.invoke({"input": question})
    answers.append(response["answer"])
    contexts.append([doc.page_content for doc in response["context"]])

In [62]:
# Bundle the test questions, generated answers, retrieved contexts and ground
# truths into a single HuggingFace dataset.
new_response_dataset_advanced_retrieval = Dataset.from_dict(
    dict(
        question=test_questions,
        answer=answers,
        contexts=contexts,
        ground_truth=test_groundtruths,
    )
)

In [63]:
# Score the new dataset with Ragas, using the notebook's existing `metrics` and `run_config`.
new_advanced_retrieval_results = evaluate(
    new_response_dataset_advanced_retrieval,
    metrics,
    run_config=run_config,
)


Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

In [64]:
# Aggregate scores of the `text-embedding-3-small` + MultiQueryRetriever run.
new_advanced_retrieval_results

{'faithfulness': 0.8054, 'answer_relevancy': 0.9347, 'context_recall': 0.4725, 'context_precision': 0.6653, 'answer_correctness': 0.4469}

In [65]:
# One (Metric, score) frame per pipeline variant.
df_baseline = pd.DataFrame(
    list(results.items()),
    columns=['Metric', 'ADA + Baseline'],
)
df_original = pd.DataFrame(
    list(advanced_retrieval_results.items()),
    columns=['Metric', 'ADA + MQR'],
)
df_comparison = pd.DataFrame(
    list(new_advanced_retrieval_results.items()),
    columns=['Metric', 'TE3 + MQR'],
)

# Join the three frames on the metric name, baseline column first.
df_merged = (
    df_baseline
    .merge(df_original, on='Metric')
    .merge(df_comparison, on='Metric')
)

# Positive values mean TE3 + MQR scored higher than the pipeline it is compared with.
df_merged['ADA + MQR -> TE3 + MQR'] = df_merged['TE3 + MQR'].sub(df_merged['ADA + MQR'])
df_merged['Baseline -> TE3 + MQR'] = df_merged['TE3 + MQR'].sub(df_merged['ADA + Baseline'])

df_merged

Unnamed: 0,Metric,ADA + Baseline,ADA + MQR,TE3 + MQR,ADA + MQR -> TE3 + MQR,Baseline -> TE3 + MQR
0,faithfulness,0.672306,0.808452,0.805417,-0.003036,0.133111
1,answer_relevancy,0.850735,0.888024,0.93468,0.046656,0.083946
2,context_recall,0.57,0.482778,0.4725,-0.010278,-0.0975
3,context_precision,0.729167,0.749532,0.665262,-0.08427,-0.063905
4,answer_correctness,0.413226,0.541522,0.446936,-0.094586,0.03371


# ❓ Question #4:
In your opinion, is text-embedding-3-small significantly better than ada?

I am seeing a regression from ADA to TE3 in 4 of the 5 metrics, so in this run text-embedding-3-small does not look better than ada. There may be an issue with my pipeline, and I need to rerun it, since it seems like everyone else noticed an improvement.

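1. faithfulness: decreased by 0.3 percentage points
2. answer_relevancy: increased by 4.7 percentage points
3. context_recall: decreased by 1.0 percentage points
4. context_precision: decreased by 8.4 percentage points
5. answer_correctness: decreased by 9.5 percentage points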

# BONUS ACTIVITY: Using a Better Generator

In [73]:
# Build a MultiQueryRetriever chain whose retriever LLM is GPT-4o, on top of the
# same `new_retriever` used for the TE3 + MQR run.
gpt4o_llm = ChatOpenAI(model_name="gpt-4o")
gpt4o_retriever = MultiQueryRetriever.from_llm(
    llm=gpt4o_llm,
    retriever=new_retriever,
)
# NOTE(review): `document_chain` is reused from earlier in the notebook. If it was
# built with `primary_qa_llm`, the answers are still generated by that model and
# GPT-4o is only used inside the MultiQueryRetriever — confirm, and build a
# GPT-4o document chain if the goal is a better *generator*.
gpt4o_retrieval_chain = create_retrieval_chain(gpt4o_retriever, document_chain)

# Collect the answer and the retrieved context texts for every test question.
gpt4o_answers, gpt4o_contexts = [], []
for question in test_questions:
    response = gpt4o_retrieval_chain.invoke({"input": question})
    gpt4o_answers.append(response["answer"])
    gpt4o_contexts.append([doc.page_content for doc in response["context"]])

In [74]:
# Bundle the GPT-4o run's answers and contexts with the shared test questions
# and ground truths into a HuggingFace dataset.
gpt4o_response_dataset_advanced_retrieval = Dataset.from_dict(
    dict(
        question=test_questions,
        answer=gpt4o_answers,
        contexts=gpt4o_contexts,
        ground_truth=test_groundtruths,
    )
)

In [67]:
# Score the GPT-4o dataset with the notebook's existing `metrics` and `run_config`.
gpt4o_advanced_retrieval_results = evaluate(
    gpt4o_response_dataset_advanced_retrieval,
    metrics,
    run_config=run_config,
)

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

{'faithfulness': 0.8054, 'answer_relevancy': 0.9347, 'context_recall': 0.4725, 'context_precision': 0.6653, 'answer_correctness': 0.4469}


In [70]:
# Show the GPT-4o run's aggregate scores. The TE3 + MQR scores were already
# displayed earlier, so repeating them here added nothing to this section.
gpt4o_advanced_retrieval_results

{'faithfulness': 0.8542, 'answer_relevancy': 0.9200, 'context_recall': 0.6050, 'context_precision': 0.7232, 'answer_correctness': 0.4652}

In [71]:
# One (Metric, score) frame per pipeline variant, including the GPT-4o run.
df_baseline = pd.DataFrame(
    list(results.items()),
    columns=['Metric', 'ADA + Baseline'],
)
df_original = pd.DataFrame(
    list(advanced_retrieval_results.items()),
    columns=['Metric', 'ADA + MQR'],
)
df_comparison = pd.DataFrame(
    list(new_advanced_retrieval_results.items()),
    columns=['Metric', 'TE3 + MQR'],
)
df_comparison2 = pd.DataFrame(
    list(gpt4o_advanced_retrieval_results.items()),
    columns=['Metric', 'TE3 + MQR (GPT4o)'],
)

# Join all four frames on the metric name, baseline column first.
df_merged = (
    df_baseline
    .merge(df_original, on='Metric')
    .merge(df_comparison, on='Metric')
    .merge(df_comparison2, on="Metric")
)

# Positive values mean the GPT-4o run scored higher than the plain TE3 + MQR run.
df_merged['TE3 + MQR -> TE3 + MQR (GPT4o)'] = df_merged['TE3 + MQR (GPT4o)'].sub(df_merged['TE3 + MQR'])

df_merged

Unnamed: 0,Metric,ADA + Baseline,ADA + MQR,TE3 + MQR,TE3 + MQR (GPT4o),TE3 + MQR -> TE3 + MQR (GPT4o)
0,faithfulness,0.672306,0.808452,0.805417,0.854167,0.04875
1,answer_relevancy,0.850735,0.888024,0.93468,0.919956,-0.014724
2,context_recall,0.57,0.482778,0.4725,0.605,0.1325
3,context_precision,0.729167,0.749532,0.665262,0.723209,0.057947
4,answer_correctness,0.413226,0.541522,0.446936,0.465185,0.01825


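From the above results, it seems like GPT-4o is better than gpt-3.5-turbo: four of the five metrics improved (faithfulness, context_recall, context_precision, and answer_correctness), while answer_relevancy dropped slightly (−0.015).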