In [2]:
import os
from dotenv import load_dotenv, find_dotenv

# Locate the project's .env file with find_dotenv and load its variables
# into the process environment.
load_dotenv(
    find_dotenv(filename='SURF-Project_Optimizing-PerunaBot/setup/.env')
)

# Direct indexing raises KeyError right here if the key is not in the environment.
openai_api_key = os.environ['OPENAI_API_KEY']

In [3]:
#langsmith
from langsmith import Client
import os

# LangSmith connection settings. Direct indexing is deliberate: a missing
# variable raises KeyError in this cell instead of surfacing later mid-run.
langsmith_api_key = os.environ["LANGSMITH_API_KEY"]
langchain_endpoint = os.environ["LANGCHAIN_ENDPOINT"]
langsmith_project = os.environ["LANGCHAIN_PROJECT"]
# Bound to a name so the presence check is explicit rather than a discarded
# expression. NOTE(review): only presence is verified here, not the value.
langchain_tracing_v2 = os.environ["LANGCHAIN_TRACING_V2"]

# Initialize the LangSmith client with the endpoint and key validated above,
# rather than relying on the client's own implicit environment lookup.
langsmith_client = Client(api_url=langchain_endpoint, api_key=langsmith_api_key)

In [4]:
from OG_PerunaBot_chain import Original_PerunaBot_eval_chain
from chain_0 import base_retriever_eval_chain_0
from chain_1 import parent_retriever_eval_chain_1
from chain_2 import ensemble_retriever_eval_chain_2

  warn_deprecated(


In [6]:
# Apply the nest_asyncio patch once, before any async work is started.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the evaluation / test-set generation calls below start their
# own — confirm against the nest_asyncio documentation.
import nest_asyncio
nest_asyncio.apply()

In [8]:
from ragas.integrations.langchain import EvaluatorChain
from langchain.smith import run_on_dataset, RunEvalConfig
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

# LangSmith dataset names and the shared prefix used to build evaluation project names
data_set_1 = "SMU Schools Basic Info"
data_set_2 = "SMU Campus Facts"
project_name = "First test eval for "

# Wrap each RAGAS metric in an EvaluatorChain so it can be used from LangChain
faithfulness_chain = EvaluatorChain(metric=faithfulness)
answer_rel_chain = EvaluatorChain(metric=answer_relevancy)
# Despite the "rel" in its name, this chain wraps the context_precision metric
context_rel_chain = EvaluatorChain(metric=context_precision)
context_recall_chain = EvaluatorChain(metric=context_recall)

# Evaluation config carrying the four wrapped metrics as custom evaluators
evaluators = [faithfulness_chain, answer_rel_chain, context_rel_chain, context_recall_chain]
eval_config = RunEvalConfig(custom_evaluators=evaluators)


In [None]:

# Original PerunaBot chain on dataset 1
import uuid  # imported in this cell: only used to make the project name unique

# A fixed project name makes this cell fail when it is re-run, because LangSmith
# rejects a test project whose name already exists. A short random suffix keeps
# the cell re-runnable while preserving the readable prefix.
original_chain_project = f"{project_name}Original PerunaBot chain - {uuid.uuid4().hex[:8]}"

chain_results = run_on_dataset(
    client=langsmith_client,
    llm_or_chain_factory=Original_PerunaBot_eval_chain,
    dataset_name=data_set_1,
    verbose=True,
    evaluation=eval_config,
    project_name=original_chain_project,
    # Metadata stored with the test project to identify what was evaluated
    project_metadata={
        "chain": "Original_PerunaBot_eval_chain",
        "dataset": data_set_1,
        "version": "0.1",
    },
)

In [19]:
from langchain_community.document_loaders import PyPDFLoader

# Relative paths of the PDFs that serve as source material for the evaluation data
pdf_paths = [
    '../Data/Evaluation Data/Southern Methodist University - 2023-2024 Undergraduate Catalog from About SMU to Right to Know.pdf',
    '../Data/Evaluation Data/Important University Resources from SMU Student Handbook 23-24.pdf',
    '../Data/Evaluation Data/Important SMU Numbers and Websites.pdf',
]

# Helper that reads a batch of PDFs through LangChain's PyPDFLoader
def load_pdfs_with_langchain(pdf_paths):
    """Load every PDF in ``pdf_paths`` and return the parsed documents as one list.

    Args:
        pdf_paths: Iterable of PDF file paths handed to ``PyPDFLoader``.

    Returns:
        A flat list holding the documents returned by each loader's ``load()``,
        in the order the paths were given.

    Loading is best-effort: if anything goes wrong for a path, the error is
    printed and that path is skipped, so the result may hold fewer documents
    than expected (or be empty).
    """
    loaded_docs = []
    for pdf_path in pdf_paths:
        try:
            # Parse the PDF and append its documents to the running list
            loaded_docs += PyPDFLoader(pdf_path).load()
        except Exception as err:
            print(f"Error loading {pdf_path}: {err}")
    return loaded_docs

# Run the loader over every configured PDF path
evaluation_pdf_docs = load_pdfs_with_langchain(pdf_paths)

# Sanity output: number of loaded documents, the first 100 characters of the
# first document, and the metadata of the document at index 7
print(len(evaluation_pdf_docs))
print(evaluation_pdf_docs[0].page_content[:100])
print(evaluation_pdf_docs[7].metadata)

93
14 
 About SMU  
The Vision of Southern Methodist University  
To create and impart knowledge that w
{'source': '../Data/Evaluation Data/Southern Methodist University - 2023-2024 Undergraduate Catalog from About SMU to Right to Know.pdf', 'page': 7}


In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunking settings: chunk_size 500 and chunk_overlap 50, both measured with len;
# add_start_index records each chunk's start offset in its metadata
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    add_start_index=True,
)

# Break the loaded PDF documents into chunks, then preview the count,
# the first 100 characters of the first chunk, and its metadata
split_evaluation_pdf_docs = text_splitter.split_documents(evaluation_pdf_docs)
print(len(split_evaluation_pdf_docs))
print(split_evaluation_pdf_docs[0].page_content[:100])
print(split_evaluation_pdf_docs[0].metadata)

798
14 
 About SMU  
The Vision of Southern Methodist University  
To create and impart knowledge that w
{'source': '../Data/Evaluation Data/Southern Methodist University - 2023-2024 Undergraduate Catalog from About SMU to Right to Know.pdf', 'page': 0, 'start_index': 0}


In [23]:
from openai import embeddings
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Test-set generator backed by OpenAI models: the same chat model is used for
# both the generator and the critic role
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-3.5-turbo")
# NOTE: this rebinds `embeddings`, replacing the name imported from `openai` in this cell
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# Test set 1: test_size 10, weighted 50% simple / 30% reasoning / 20% multi_context
testset_1 = generator.generate_with_langchain_docs(
    split_evaluation_pdf_docs,
    test_size=10,
    distributions={simple: 0.5, reasoning: 0.3, multi_context: 0.2},
)


embedding nodes:   0%|          | 0/1596 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/10 [00:00<?, ?it/s]

In [25]:
# Test set 2: same size and evolution weights as test set 1 (50% / 30% / 20%)
testset_2 = generator.generate_with_langchain_docs(
    split_evaluation_pdf_docs,
    test_size=10,
    distributions={simple: 0.5, reasoning: 0.3, multi_context: 0.2},
)

embedding nodes:   0%|          | 0/1596 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/10 [00:00<?, ?it/s]

In [29]:
# Test set 3: test_size 10, weighted 20% simple / 40% reasoning / 40% multi_context
testset_3 = generator.generate_with_langchain_docs(
    split_evaluation_pdf_docs,
    test_size=10,
    distributions={simple: 0.2, reasoning: 0.4, multi_context: 0.4},
)

embedding nodes:   0%|          | 0/1596 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/10 [00:00<?, ?it/s]

In [1]:

# NOTE(review): this cell repeats the conversion performed in the following cell.
# Its execution count is 1 and its recorded output is a NameError for testset_1,
# i.e. it was run before the test sets existed in the kernel. Consider removing it.
# Convert each generated test set with its to_pandas() method
test_df_1 = testset_1.to_pandas()
test_df_2 = testset_2.to_pandas()
test_df_3 = testset_3.to_pandas()

NameError: name 'testset_1' is not defined

In [None]:
# Turn each generated test set into a pandas data frame
test_df_1 = testset_1.to_pandas()
test_df_2 = testset_2.to_pandas()
test_df_3 = testset_3.to_pandas()

# Write every data frame to its own CSV file, leaving out the index column
for csv_name, frame in (
    ('testset_1.csv', test_df_1),
    ('testset_2.csv', test_df_2),
    ('testset_3.csv', test_df_3),
):
    frame.to_csv(csv_name, index=False)