# Evaluating each RAG pipeline using Giskard

In [1]:
import os
from dotenv import load_dotenv, find_dotenv

# Load environment variables from the project's .env file: find_dotenv()
# resolves the given relative filename to a path and load_dotenv() reads it.
# NOTE(review): presumably find_dotenv searches upward through parent
# directories for this relative path — confirm it resolves from the directory
# the notebook is run in.
load_dotenv(find_dotenv(filename='SURF-Project_Optimizing-PerunaBot/setup/.env'))

# Indexing os.environ raises KeyError if the key is not set.
openai_api_key = os.environ['OPENAI_API_KEY']

In [2]:
# Print the LANGCHAIN_TRACING_V2 setting from the environment (KeyError if unset).
print(os.environ["LANGCHAIN_TRACING_V2"])

false


In [3]:
from langchain_openai import ChatOpenAI

# Smoke test: build the chat model with default settings and send one message.
llm = ChatOpenAI()
llm.invoke("Hello")

AIMessage(content='Hello! How can I assist you today?', response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-cdeb23f5-6efd-474c-84fc-e1f9224bd6fc-0', usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17})

In [4]:
import nest_asyncio
# Patch asyncio so that event loops can be nested.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and later calls rely on asyncio — confirm.
nest_asyncio.apply()

In [5]:
from OG_PerunaBot_chain import Original_PerunaBot_eval_chain
from chain_0 import base_retriever_eval_chain_0
from chain_1 import parent_retriever_eval_chain_1
from chain_2 import ensemble_retriever_eval_chain_2

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# PDFs that are loaded, chunked and used as the evaluation knowledge base.
pdf_paths = [
    "../Data/Evaluation Data/Southern Methodist University - 2023-2024 Undergraduate Catalog from About SMU to Right to Know.pdf",
    "../Data/Evaluation Data/Important University Resources from SMU Student Handbook 23-24.pdf",
    "../Data/Evaluation Data/Important SMU Numbers and Websites.pdf",
]

# Function to load PDFs using LangChain's PyPDFLoader
def load_pdfs_with_langchain(pdf_paths):
    """Load every PDF in ``pdf_paths`` and return all parsed documents.

    Each path is loaded independently with ``PyPDFLoader``. If loading a path
    raises, the error is printed and that path is skipped, so one bad file
    does not abort the whole run.

    Args:
        pdf_paths: Iterable of PDF file paths.

    Returns:
        A flat list with the documents produced for every path that loaded
        successfully, in input order. Empty if nothing could be loaded.
    """
    loaded_docs = []
    for pdf_path in pdf_paths:
        try:
            # Parse the PDF and append its documents to the running list.
            loaded_docs += PyPDFLoader(pdf_path).load()
        except Exception as err:
            print(f"Error loading {pdf_path}: {err}")
    return loaded_docs

# Load PDF documents using the function
evaluation_pdf_docs = load_pdfs_with_langchain(pdf_paths)

# Sanity checks: number of loaded documents, the first 100 characters of the
# first document, and the metadata of one arbitrary document (index 7).
# NOTE(review): the hard-coded indices raise IndexError when fewer than 8
# documents were loaded, e.g. if a PDF failed to load above.
print(len(evaluation_pdf_docs))
print(evaluation_pdf_docs[0].page_content[0:100])
print(evaluation_pdf_docs[7].metadata)

93
14 
 About SMU  
The Vision of Southern Methodist University  
To create and impart knowledge that w
{'source': '../Data/Evaluation Data/Southern Methodist University - 2023-2024 Undergraduate Catalog from About SMU to Right to Know.pdf', 'page': 7}


In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings: chunk_size=1000 with chunk_overlap=100, both measured via
# len() (i.e. in characters); add_start_index=True is requested as well.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)

# Break every loaded PDF document into chunks and report how many resulted.
split_evaluation_pdf_docs = text_splitter.split_documents(evaluation_pdf_docs)
print(len(split_evaluation_pdf_docs))

395


In [8]:
import pandas as pd

# Single-column frame ("text") holding the content of every chunk.
df = pd.DataFrame(
    data=[chunk.page_content for chunk in split_evaluation_pdf_docs],
    columns=["text"],
)
df.head(10)

Unnamed: 0,text
0,14 \n About SMU \nThe Vision of Southern Meth...
1,"Metroplex, Southern Methodist University seeks..."
2,Education and Human Development. \nFounded in...
3,"The full -time equivalent enrollment was 8,703..."
4,"university, but a great university."
5,15 \n Academic Accreditation \nSouthern Method...
6,Dedman School of Law \n(7-year cycle) The Am...
7,Dance \n(10-year cycle) National Association...
8,16 \n UNIT State/Regional/National Accreditin...
9,17 \n Admission \nSouthern Methodist Universi...


In [9]:
from giskard.rag import KnowledgeBase

# Build the Giskard knowledge base from the chunk DataFrame; it is reused by
# every generate_testset() and evaluate() call in this notebook.
knowledge_base = KnowledgeBase(df)

In [22]:
prompt = """
    You are PerunaBot, an AI assistant trained on domain-specific information on
    Southern Methodist University.  Your primary role is to provide detailed, accurate, 
    and helpful responses to questions based on the following retrieved context. 
    You assist faculty, administrators, prospective students, and current students by 
    offering precise and specific answers. Maintain a welcoming and friendly tone. 
    Try to keep responses to 3-5 sentences maximum unless more detail is requested.
"""

from giskard.rag import generate_testset

# Generate 25 questions from the knowledge base, using `prompt` as the
# description of the agent under test.
# NOTE(review): generation is LLM-driven, so re-running this cell presumably
# yields a different test set — reload the saved .jsonl for repeatable runs.
testset = generate_testset(
    knowledge_base=knowledge_base,
    num_questions=25,
    agent_description=prompt,
)

2024-08-01 16:51:11,255 pid:11724 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-08-01 16:52:31,478 pid:11724 MainThread giskard.rag  INFO     Found 19 topics in the knowledge base.


Generating questions:   0%|          | 0/25 [00:00<?, ?it/s]

In [23]:
test_set_df = testset.to_pandas()

# Preview the first five samples: question, reference answer, reference context.
for number, (sample_id, sample) in enumerate(test_set_df.head(5).iterrows(), start=1):
    print(f"Question {number}: {sample['question']}")
    print(f"Reference Answer: {sample['reference_answer']}")
    print("Reference Context:", sample["reference_context"], sep="\n")
    print("********************", end="\n\n")



Question 1: Where can I find information about the certification process?
Reference Answer: Information about the certification process is available in the Enrollment and Academic Records section of the catalog, as well as the www.smu.edu/EnrollmentServices/Veterans website.
Reference Context:
Document 95: certification process is available in the Enrollment and Academic Records  section of this catalog, as well as the 
www.smu.edu/EnrollmentS ervices/Veterans website.
********************

Question 2: What is the policy for students who have been reinstated to the University following suspension?
Reference Answer: Students who have been reinstated to the University following suspension remain on probation and are normally allowed two regular terms within which they can make up their academic deficiencies and return to good standing. However, special conditions for the first term may be set in individual cases. If they fail to meet the terms of academic probation, they will be suspende

In [24]:
testset.save("../Data/Evaluation Data/giskard-testset_1.jsonl")

In [37]:
def predict_OG_chain(question, history=None):
    """Run ``Original_PerunaBot_eval_chain`` on ``question`` and return its answer.

    Args:
        question: Question text, passed to the chain under the ``"question"`` key.
        history: Accepted but not used by this function.
            NOTE(review): Giskard presumably supplies prior turns here for
            conversational questions — confirm whether the chain should see them.

    Returns:
        The ``"output"`` entry of the chain's response.
    """
    chain_input = {"question": question}
    return Original_PerunaBot_eval_chain.invoke(chain_input)["output"]

def predict_chain_0(question, history=None):
    """Run ``base_retriever_eval_chain_0`` on ``question`` and return its answer.

    Args:
        question: Question text, passed to the chain under the ``"question"`` key.
        history: Accepted but not used by this function.
            NOTE(review): Giskard presumably supplies prior turns here for
            conversational questions — confirm whether the chain should see them.

    Returns:
        The ``"output"`` entry of the chain's response.
    """
    chain_input = {"question": question}
    return base_retriever_eval_chain_0.invoke(chain_input)["output"]

def predict_chain_1(question, history=None):
    """Run ``parent_retriever_eval_chain_1`` on ``question`` and return its answer.

    Args:
        question: Question text, passed to the chain under the ``"question"`` key.
        history: Accepted but not used by this function.
            NOTE(review): Giskard presumably supplies prior turns here for
            conversational questions — confirm whether the chain should see them.

    Returns:
        The ``"output"`` entry of the chain's response.
    """
    chain_input = {"question": question}
    return parent_retriever_eval_chain_1.invoke(chain_input)["output"]

def predict_chain_2(question, history=None):
    """Run ``ensemble_retriever_eval_chain_2`` on ``question`` and return its answer.

    Args:
        question: Question text, passed to the chain under the ``"question"`` key.
        history: Accepted but not used by this function.
            NOTE(review): Giskard presumably supplies prior turns here for
            conversational questions — confirm whether the chain should see them.

    Returns:
        The ``"output"`` entry of the chain's response.
    """
    chain_input = {"question": question}
    return ensemble_retriever_eval_chain_2.invoke(chain_input)["output"]

In [28]:
from giskard.rag import evaluate

In [38]:
# Evaluate the original PerunaBot chain on the first generated test set and
# render the resulting report inline.
OG_PerunaBot_report = evaluate(
    answer_fn=predict_OG_chain,
    testset=testset,
    knowledge_base=knowledge_base
)
display(OG_PerunaBot_report)

Asking questions to the agent:   0%|          | 0/25 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

In [39]:
OG_PerunaBot_report.to_html("../Data/Evaluation Results/OG_PerunaBot_giskard_report_8-1.html")

In [55]:
OG_PerunaBot_report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,0.75
conversational,0.25
distracting element,0.25
double,0.75
simple,1.0
situational,0.75


In [56]:
OG_PerunaBot_report.correctness_by_topic()

Unnamed: 0_level_0,correctness
topic,Unnamed: 1_level_1
Academic Credit and Placement,1.0
Others,0.333333
Southern Methodist University Life,0.75
University Academic Policies,0.5
University Admission Process,1.0
University Admission and Credit Transfer,1.0
University Admissions and English Proficiency,0.0
University Credit and Course Policies,0.0
University Education Curriculum,0.0
University Financial Policies,0.5


In [40]:
# Evaluate chain 0 (base retriever) on the first generated test set and render
# the resulting report inline.
PerunaBot_0_report = evaluate(
    answer_fn=predict_chain_0,
    testset=testset,
    knowledge_base=knowledge_base
)
display(PerunaBot_0_report)

Asking questions to the agent:   0%|          | 0/25 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

In [41]:
PerunaBot_0_report.to_html("../Data/Evaluation Results/PerunaBot_0_giskard_report_8-1.html")

In [53]:
PerunaBot_0_report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,1.0
conversational,0.5
distracting element,0.5
double,1.0
simple,0.8
situational,0.75


In [54]:

PerunaBot_0_report.correctness_by_topic()

Unnamed: 0_level_0,correctness
topic,Unnamed: 1_level_1
Academic Credit and Placement,1.0
Others,0.666667
Southern Methodist University Life,1.0
University Academic Policies,0.0
University Admission Process,1.0
University Admission and Credit Transfer,1.0
University Admissions and English Proficiency,0.5
University Credit and Course Policies,1.0
University Education Curriculum,0.0
University Financial Policies,0.5


In [42]:
# Evaluate chain 1 (parent retriever) on the first generated test set and
# render the resulting report inline.
PerunaBot_1_report = evaluate(
    answer_fn=predict_chain_1,
    testset=testset,
    knowledge_base=knowledge_base
)
display(PerunaBot_1_report)

Asking questions to the agent:   0%|          | 0/25 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

In [43]:
PerunaBot_1_report.to_html("../Data/Evaluation Results/PerunaBot_1_giskard_report_8-1.html")

In [51]:
PerunaBot_1_report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,1.0
conversational,0.25
distracting element,0.75
double,0.75
simple,1.0
situational,1.0


In [52]:

PerunaBot_1_report.correctness_by_topic()

Unnamed: 0_level_0,correctness
topic,Unnamed: 1_level_1
Academic Credit and Placement,1.0
Others,0.666667
Southern Methodist University Life,0.5
University Academic Policies,0.5
University Admission Process,1.0
University Admission and Credit Transfer,1.0
University Admissions and English Proficiency,1.0
University Credit and Course Policies,1.0
University Education Curriculum,1.0
University Financial Policies,0.5


In [45]:
# Evaluate chain 2 (ensemble retriever) on the first generated test set; the
# report is displayed in the next cell.
PerunaBot_2_report = evaluate(
    answer_fn=predict_chain_2,
    testset=testset,
    knowledge_base=knowledge_base
)

Asking questions to the agent:   0%|          | 0/25 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

In [46]:
display(PerunaBot_2_report)

In [47]:
PerunaBot_2_report.to_html("../Data/Evaluation Results/PerunaBot_2_giskard_report_8-1.html")

In [49]:
PerunaBot_2_report.correctness_by_question_type()


Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,1.0
conversational,0.0
distracting element,0.75
double,1.0
simple,0.8
situational,1.0


In [50]:

PerunaBot_2_report.correctness_by_topic()

Unnamed: 0_level_0,correctness
topic,Unnamed: 1_level_1
Academic Credit and Placement,1.0
Others,0.666667
Southern Methodist University Life,0.5
University Academic Policies,0.5
University Admission Process,1.0
University Admission and Credit Transfer,1.0
University Admissions and English Proficiency,1.0
University Credit and Course Policies,1.0
University Education Curriculum,1.0
University Financial Policies,0.5


In [10]:
from OG_PerunaBot_chain import Original_PerunaBot_eval_chain_v1
from chain_0 import base_retriever_eval_chain_0_v1
from chain_1 import parent_retriever_eval_chain_1_v1
from chain_2 import ensemble_retriever_eval_chain_2_v1

In [12]:
prompt_v1 = """
    You are Dr. Alex, a university academic advisor with over 15 years
    of experience in higher education as a academic advisor at Southern
    Methodist University. You have engaged with hundreds of students and 
    understand the ins and outs of the university's academic programs,
    policies, and procedures. You are going to mimic the role of 
    prospective and current students, asking questions about the university.
    Then you will provide detailed, accurate, and specific answers 
    to these questions based on your knowledge. Try to keep answers to 
    3-5 sentences unless more detail is requested.
"""

from giskard.rag import generate_testset

# Second generated test set (25 questions), this time with prompt_v1 as the
# agent description.
# NOTE(review): generation is LLM-driven, so re-running this cell presumably
# yields a different test set — reload the saved .jsonl for repeatable runs.
testset_2 = generate_testset(
    knowledge_base=knowledge_base,
    num_questions=25,
    agent_description=prompt_v1
)

2024-08-05 08:06:16,941 pid:432 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-08-05 08:08:03,255 pid:432 MainThread giskard.rag  INFO     Found 23 topics in the knowledge base.


Generating questions:   0%|          | 0/25 [00:00<?, ?it/s]

In [13]:
test_set_df_2 = testset_2.to_pandas()

# Preview the first five samples: question, reference answer, reference context.
for number, (sample_id, sample) in enumerate(test_set_df_2.head(5).iterrows(), start=1):
    print(f"Question {number}: {sample['question']}")
    print(f"Reference Answer: {sample['reference_answer']}")
    print("Reference Context:", sample["reference_context"], sep="\n")
    print("********************", end="\n\n")

Question 1: What is the focus of the Fort Burgwin Library?
Reference Answer: The Fort Burgwin Library is focused on the history, literature, cultures and environment of New Mexico and the Southwest.
Reference Context:
Document 344: data from premier providers. The Business Library includes the Kitt Investing and Trading Center, quiet and group 
study areas, a periodicals area, facility wide wireless access, more than 400 electronic resources, th e Hillcrest 
Foundation International Resource Library, the Edwin L. Cox Business Leadership Center Resource Collection, the 
Maguire Energy Institute Resource Collection and the Cox Career Services Collection. Librarians are available all 
hours that the bus iness library is open, providing research assistance both in person and virtually.  
Fort Burgwin Library 
The Fort Burgwin Library, located in Taos, New Mexico, serves students and faculty in the SMU -in-Taos program. 
It is focused on the history, literature, cultures and environment of 

In [14]:
testset_2.save("../Data/Evaluation Data/giskard-testset_2.jsonl")

In [11]:
# Display name -> v1 chain. The evaluation loop below iterates over this
# mapping and uses each key in the saved report's filename.
chain_experiment_v1 = {
    "Original PerunaBot v1": Original_PerunaBot_eval_chain_v1,
    "PerunaBot 0 v1": base_retriever_eval_chain_0_v1,
    "PerunaBot 1 v1": parent_retriever_eval_chain_1_v1,
    "PerunaBot 2 v1": ensemble_retriever_eval_chain_2_v1
}


In [12]:
evaluation_prompt = """
    You are Dr. Alexa Morgana, a university academic advisor with over 15 years
    of experience in higher education as an academic advisor at Southern Methodist
    University. You have engaged with hundreds of students and understand the ins
    and outs of the university's academic programs, policies, and procedures. You
    are in the process of training academic advisors at the university to provide
    detailed, accurate, and specific answers to prospective and current students'
    questions. Evaluate the responses to the following questions based on these
    criteria: helpfulness, relevance, accuracy, depth, and level of detail.
"""

In [13]:
from giskard.rag import evaluate

In [21]:
def _make_answer_fn(chain):
    """Return an ``answer_fn(question, history=None)`` bound to ``chain``.

    Binding the chain through a factory fixes it per iteration, instead of
    relying on a loop variable that is looked up only when the function runs.
    """
    def predict_chain(question, history=None):
        # history is accepted for signature compatibility but is not forwarded.
        response = chain.invoke({"question": question})
        return response["output"]

    return predict_chain


for name, chain in chain_experiment_v1.items():
    results = evaluate(
        answer_fn=_make_answer_fn(chain),
        testset=testset_2,
        knowledge_base=knowledge_base,
        agent_description=evaluation_prompt,
    )

    display(results)

    # The report-display cells later in this notebook open
    # "<pipeline>_v1_giskard_report_8-5.html", so write exactly that name.
    report_name = name.replace(" v1", "_v1")
    results.to_html(f"../Data/Evaluation Results/{report_name}_giskard_report_8-5.html")

    # A bare expression inside a loop body is never rendered by the notebook,
    # so the breakdown tables have to be displayed explicitly.
    display(results.correctness_by_question_type())
    display(results.correctness_by_topic())
    display(results.get_failures())

Asking questions to the agent:   0%|          | 0/25 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

Asking questions to the agent:   0%|          | 0/25 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

Asking questions to the agent:   0%|          | 0/25 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

Asking questions to the agent:   0%|          | 0/25 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/25 [00:00<?, ?it/s]

In [14]:
from OG_PerunaBot_chain import Original_PerunaBot_eval_chain_v2
from chain_0 import base_retriever_eval_chain_0_v2
from chain_1 import parent_retriever_eval_chain_1_v2
from chain_2 import ensemble_retriever_eval_chain_2_v2

In [15]:
# Display name -> v2 chain. The evaluation loop below iterates over this
# mapping and uses each key in the saved report's filename and in all_results.
chain_experiment_v2 = {
    "Original PerunaBot v2": Original_PerunaBot_eval_chain_v2,
    "PerunaBot 0 v2": base_retriever_eval_chain_0_v2,
    "PerunaBot 1 v2": parent_retriever_eval_chain_1_v2,
    "PerunaBot 2 v2": ensemble_retriever_eval_chain_2_v2
}


In [16]:
from giskard.rag import QATestset
# Load the combined question set from disk.
# NOTE(review): no visible cell writes giskard-testset_combined.jsonl (only
# giskard-testset_1.jsonl and giskard-testset_2.jsonl are saved) — presumably
# it was merged outside this notebook; confirm and document how it is built.
combined_testset = QATestset.load("../Data/Evaluation Data/giskard-testset_combined.jsonl")


In [17]:
from giskard.rag import evaluate

In [19]:
def _make_answer_fn(chain):
    """Return an ``answer_fn(question, history=None)`` bound to ``chain``.

    Binding the chain through a factory fixes it per iteration, instead of
    relying on a loop variable that is looked up only when the function runs.
    """
    def predict_chain(question, history=None):
        # history is accepted for signature compatibility but is not forwarded.
        response = chain.invoke({"question": question})
        return response["output"]

    return predict_chain


# Reports keyed by chain display name, kept for the per-chain breakdown cells.
all_results = {}

for name, chain in chain_experiment_v2.items():
    results = evaluate(
        answer_fn=_make_answer_fn(chain),
        testset=combined_testset,
        knowledge_base=knowledge_base,
        agent_description=evaluation_prompt,
    )

    display(results)

    # The report-display cells later in this notebook open
    # "<name>_giskard_report_8-5.html", so save under that date suffix.
    results.to_html(f"../Data/Evaluation Results/{name}_giskard_report_8-5.html")

    all_results[name] = results


Asking questions to the agent:   0%|          | 0/50 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

2024-08-05 09:58:47,703 pid:29184 MainThread giskard.rag  INFO     Finding topics in the knowledge base.
2024-08-05 10:00:37,530 pid:29184 MainThread giskard.rag  INFO     Found 17 topics in the knowledge base.


Asking questions to the agent:   0%|          | 0/50 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Asking questions to the agent:   0%|          | 0/50 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

Asking questions to the agent:   0%|          | 0/50 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

In [28]:
all_results

{'Original PerunaBot v2': <giskard.rag.report.RAGReport at 0x279c28fb4d0>,
 'PerunaBot 0 v2': <giskard.rag.report.RAGReport at 0x279c274ff50>,
 'PerunaBot 1 v2': <giskard.rag.report.RAGReport at 0x279c983bb50>,
 'PerunaBot 2 v2': <giskard.rag.report.RAGReport at 0x279c98ed210>}

In [34]:
all_results["Original PerunaBot v2"].correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,0.75
conversational,0.125
distracting element,0.625
double,0.875
simple,0.8
situational,0.625


In [36]:
all_results["PerunaBot 0 v2"].correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,1.0
conversational,0.25
distracting element,0.75
double,1.0
simple,0.9
situational,1.0


In [37]:
all_results["PerunaBot 1 v2"].correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,1.0
conversational,0.25
distracting element,0.75
double,1.0
simple,0.9
situational,1.0


In [38]:
all_results["PerunaBot 2 v2"].correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,1.0
conversational,0.125
distracting element,1.0
double,1.0
simple,0.7
situational,1.0


In [40]:
# Group the three variants of each pipeline (per the inline comments: v1 =
# gpt-3.5-turbo, unsuffixed = gpt-4o, v2 = gpt-4o-mini) under display names.
# NOTE(review): none of these four dicts is referenced by the cells visible
# below — confirm whether they are still needed.
OG_PerunaBot_chains = {
    "Original PerunaBot v1": Original_PerunaBot_eval_chain_v1, # gpt-3.5-turbo
    "OG PerunaBot": Original_PerunaBot_eval_chain, # gpt-4o
    "Original PerunaBot v2": Original_PerunaBot_eval_chain_v2, # gpt-4o-mini

}

PerunaBot_0_chains = {
    "PerunaBot 0 v1": base_retriever_eval_chain_0_v1, # gpt-3.5-turbo
    "PerunaBot 0": base_retriever_eval_chain_0, # gpt-4o
    "PerunaBot 0 v2": base_retriever_eval_chain_0_v2, # gpt-4o-mini
}

PerunaBot_1_chains = {
    "PerunaBot 1 v1": parent_retriever_eval_chain_1_v1, # gpt-3.5-turbo
    "PerunaBot 1": parent_retriever_eval_chain_1, # gpt-4o
    "PerunaBot 1 v2": parent_retriever_eval_chain_1_v2, # gpt-4o-mini
}

PerunaBot_2_chains = {
    "PerunaBot 2 v1": ensemble_retriever_eval_chain_2_v1, # gpt-3.5-turbo
    "PerunaBot 2": ensemble_retriever_eval_chain_2, # gpt-4o
    "PerunaBot 2 v2": ensemble_retriever_eval_chain_2_v2 # gpt-4o-mini
}

In [50]:
from IPython.display import display, HTML

# Saved Giskard HTML reports for the original PerunaBot pipeline.
OG_PerunaBot_reports = [
    "../Data/Evaluation Results/OG_PerunaBot_giskard_report_8-1.html",
    "../Data/Evaluation Results/Original PerunaBot_v1_giskard_report_8-5.html",
    "../Data/Evaluation Results/Original PerunaBot v2_giskard_report_8-5.html",
]

# Display the HTML reports
for report_path in OG_PerunaBot_reports:
    # Decode explicitly as UTF-8: without it open() uses the platform default
    # (e.g. cp1252 on Windows), which can fail on or garble non-ASCII text.
    # NOTE(review): assumes the reports were written as UTF-8 — confirm.
    with open(report_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    display(HTML(html_content))

In [52]:
# Saved Giskard HTML reports for pipeline 0 (base retriever).
PerunaBot_0_reports = [
    "../Data/Evaluation Results/PerunaBot 0_v1_giskard_report_8-5.html",
    "../Data/Evaluation Results/PerunaBot_0_giskard_report_8-1.html",
    "../Data/Evaluation Results/PerunaBot 0 v2_giskard_report_8-5.html",
]

# Display the HTML reports
for report_path in PerunaBot_0_reports:
    # Decode explicitly as UTF-8: without it open() uses the platform default
    # (e.g. cp1252 on Windows), which can fail on or garble non-ASCII text.
    # NOTE(review): assumes the reports were written as UTF-8 — confirm.
    with open(report_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    display(HTML(html_content))

In [53]:
# Saved Giskard HTML reports for pipeline 1 (parent retriever).
PerunaBot_1_reports = [
    "../Data/Evaluation Results/PerunaBot 1_v1_giskard_report_8-5.html",
    "../Data/Evaluation Results/PerunaBot_1_giskard_report_8-1.html",
    "../Data/Evaluation Results/PerunaBot 1 v2_giskard_report_8-5.html",
]

# Display the HTML reports
for report_path in PerunaBot_1_reports:
    # Decode explicitly as UTF-8: without it open() uses the platform default
    # (e.g. cp1252 on Windows), which can fail on or garble non-ASCII text.
    # NOTE(review): assumes the reports were written as UTF-8 — confirm.
    with open(report_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    display(HTML(html_content))

In [54]:
# Saved Giskard HTML reports for pipeline 2 (ensemble retriever).
PerunaBot_2_reports = [
    "../Data/Evaluation Results/PerunaBot 2_v1_giskard_report_8-5.html",
    "../Data/Evaluation Results/PerunaBot_2_giskard_report_8-1.html",
    "../Data/Evaluation Results/PerunaBot 2 v2_giskard_report_8-5.html",
]

# Display the HTML reports
for report_path in PerunaBot_2_reports:
    # Decode explicitly as UTF-8: without it open() uses the platform default
    # (e.g. cp1252 on Windows), which can fail on or garble non-ASCII text.
    # NOTE(review): assumes the reports were written as UTF-8 — confirm.
    with open(report_path, "r", encoding="utf-8") as f:
        html_content = f.read()
    display(HTML(html_content))