In [1]:
# This notebook is used to evaluate the performance of our RAG ingestion and query pipeline.
# Method:
#   - Generate test data from a sample GitHub repository (offline process, not in this notebook)
#     - https://docs.ragas.io/en/stable/getstarted/rag_testset_generation/
#   - Use our ingestion pipeline to parse the same repository and index it in our vector store
#   - Run RAGAS on the test dataset + the answers from our RAG and visualise the metrics.
import os
import sys
import pandas as pd

# Add the project's source directory to the module search path before the local imports below.
# NOTE(review): presumably config, config_helper, pipeline and localrag live in this
# directory — confirm. The path is hard-coded and absolute, so it only works where
# the code is mounted at this exact location.
sys.path.insert(1, '/home/jovyan/work/code')
from opentelemetry import trace
from config import VectorDBConfig, EmbeddingConfig, ProcessingConfig, ChatConfig
from config_helper import ConfigHelper
from pipeline import DocumentPipeline
from localrag import LocalRAG

# We are using Aspire, so telemetry and logs show up in its dashboard.
# See config_helper.py for the not-so-tidy details.
# Module-level OpenTelemetry tracer; the cells below open spans on it.
tracer = trace.get_tracer(__name__)

# Shared configuration object; the cells below read vector_db_config,
# embedding_config and chat_config from it.
# NOTE(review): the meaning of the positional True is not visible here — check ConfigHelper.
config_helper = ConfigHelper(True)


In [2]:
# Load the pre-generated test set and preview its first rows.
# It was produced offline following:
#    https://docs.ragas.io/en/stable/getstarted/rag_testset_generation/
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
file_name = "test_data_50.pkl"
test_dataset = pd.read_pickle(filepath_or_buffer=file_name)
test_dataset.head()
#for index, row in test_dataset.iterrows():
#    print(row["reference_contexts"])
#    reference= row["reference"]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,How can Zoom be effectively utilized to addres...,[Skills and Behaviors of allies To be an effec...,Zoom can be effectively utilized to address is...,single_hop_specifc_query_synthesizer
1,How can corporate sales teams effectively supp...,[Skills and Behaviors of allies To be an effec...,Corporate sales teams can effectively support ...,single_hop_specifc_query_synthesizer
2,Wht is GitLab's role in allyship?,[Tips on being an ally Identifying your power ...,GitLab promotes allyship through initiatives l...,single_hop_specifc_query_synthesizer
3,Wht is the Ally Lab Larning Group and how does...,[Tips on being an ally Identifying your power ...,The Ally Lab Learning Group is an initiative d...,single_hop_specifc_query_synthesizer
4,Wht is GitLab's role in diversity and inclusion?,[What it means to be an ally Take on the strug...,"GitLab provides Diversity, Inclusion & Belongi...",single_hop_specifc_query_synthesizer


In [3]:
# Ingest the source documents into the vector store so the evaluation
# queries further down have something to retrieve from.
# NOTE(review): the recorded output of this cell lists files such as
# being-an-ally.md, which does not look like dotnet/docs-aspire content —
# confirm that this repository label matches merged_output.txt.
repository = "https://github.com/dotnet/docs-aspire"
input_file_name = "merged_output.txt"

pipeline = DocumentPipeline(
    vector_db_config=config_helper.vector_db_config,
    embedding_config=config_helper.embedding_config,
)

# Wrap the ingestion in a span named after the input file.
ingest_span_name = f"Starting ingesting file {input_file_name}"
with tracer.start_as_current_span(ingest_span_name):
    pipeline.process_single_file(input_file_name, repository)

  self.qdrant = QdrantClient(url=vector_db_config.url, api_key=vector_db_config.api_key)


Total parts found: 25
Total files processed: 12
Sample file paths:
 1. dib-events-program.md
 2. advisory-group-members.md
 3. being-an-ally.md
done


In [4]:
# Accumulator for the per-question evaluation records; see
# https://docs.ragas.io/en/latest/getstarted/rag_eval/#basic-setup
evaluation_data = list()

def query_using_rag(rag, question, k=5):
    """Ask the RAG one question and collect the retrieved context texts.

    Args:
        rag: Object exposing ``get_relevant_chunks(question, k=...)`` and
            ``retrieve_and_answer(question, k=...)``.
        question: The question text to send.
        k: Number of chunks requested from both calls. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        A ``(answer, references)`` tuple, where ``references`` is the list of
        ``page_content`` values of the chunks returned by
        ``get_relevant_chunks``.
    """
    with tracer.start_as_current_span("Getting answer and context."):
        print(f"Question: {question}")
        with tracer.start_as_current_span("rag get context"):
            chunks = rag.get_relevant_chunks(question, k=k)
            references = [chunk.page_content for chunk in chunks]
        # NOTE(review): the answer comes from a separate call; whether it is
        # generated from the same chunks collected above is not visible here.
        with tracer.start_as_current_span("Retrieve answers."):
            answer = rag.retrieve_and_answer(question, k=k)
    return (answer, references)

# RAG client built from the shared vector store, embedding and chat settings.
rag = LocalRAG(
    vector_db_config=config_helper.vector_db_config,
    embedding_config=config_helper.embedding_config,
    chat_config=config_helper.chat_config,
)

# Run every test question through the RAG and keep one record per question
# (question, retrieved contexts, generated answer, reference answer).
with tracer.start_as_current_span("Starting demo"):
    for index, row in test_dataset.iterrows():
        print(f"Question {index}:")
        question, reference = row["user_input"], row["reference"]
        answer, contexts = query_using_rag(rag, question)
        # Print a short preview only; the full answer is stored in the record.
        print(answer[:50])
        record = {
            "user_input": question,
            "retrieved_contexts": contexts,
            "response": answer,
            "reference": reference,
        }
        evaluation_data.append(record)

  self.qdrant = QdrantClient(url=vector_db_config.url, api_key=vector_db_config.api_key)


Question 0:
Question: How can Zoom be effectively utilized to address issues of diversity and inclusion within a sales team, particularly when addressing the misuse of pronouns?
While the context mentions using Zoom calls for sp
Question 1:
Question: How can corporate sales teams effectively support the Black Lives Matter movement?
I cannot find the answer in the provided context. 
Question 2:
Question: Wht is GitLab's role in allyship?
GitLab requires its employees to be inclusive, mea
Question 3:
Question: Wht is the Ally Lab Larning Group and how does it help in allyship?


KeyboardInterrupt: 

In [11]:
import os
import pickle  # required by pickle.dump below; it was never imported in this notebook

# Save the eval data so we can skip the question-answering step for the same
# dataset next time. An existing file is never overwritten, so an empty result
# is not written either: it would otherwise be kept as if it were a finished run.
EVAL_CACHE_PATH = 'eval_data_with_answers_50.pkl'
if evaluation_data and not os.path.exists(EVAL_CACHE_PATH):
    with open(EVAL_CACHE_PATH, 'wb') as f:
        pickle.dump(evaluation_data, f)

In [12]:
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness
from ragas import EvaluationDataset

# Chat model that is wrapped below and handed to evaluate() as the evaluator LLM.
llm = ChatOpenAI(model="gpt-4o")
# Created here but not passed to evaluate() in this cell.
embeddings = OpenAIEmbeddings()
# Convert the collected records and the chat model into the RAGAS wrapper types.
evaluation_dataset = EvaluationDataset.from_list(evaluation_data)
evaluator_llm = LangchainLLMWrapper(llm)

with tracer.start_as_current_span("Starting model evaluation"):
    result = evaluate(
        dataset=evaluation_dataset,
        metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],
        llm=evaluator_llm,
    )
    print(result)
# Scores recorded from a previous run:
# {'context_recall': 0.5450, 'faithfulness': 0.5920, 'factual_correctness': 0.3941}

Evaluating:   0%|          | 0/159 [00:00<?, ?it/s]

{'context_recall': 0.5450, 'faithfulness': 0.5920, 'factual_correctness': 0.3941}
