In [None]:
import os
from dotenv import load_dotenv, find_dotenv

# Locate the project's .env file with find_dotenv, then load its variables
# into the process environment with load_dotenv.
load_dotenv(
    dotenv_path=find_dotenv(filename='SURF-Project_Optimizing-PerunaBot/setup/.env'),
)

In [None]:
from OG_PerunaBot_chain import Original_PerunaBot_eval_chain
from chain_0 import base_retriever_eval_chain_0
from chain_1 import parent_retriever_eval_chain_1
from chain_2 import ensemble_retriever_eval_chain_2

In [None]:
#langsmith
from langsmith import Client
import os

# Read the LangSmith settings from the environment. Indexing os.environ
# (instead of using .get) raises KeyError right here if a variable is missing.
langsmith_api_key = os.environ["LANGSMITH_API_KEY"]
langchain_endpoint = os.environ["LANGCHAIN_ENDPOINT"]
langsmith_project = os.environ["LANGCHAIN_PROJECT"]
# Presence check only: the value itself is not used in this cell.
os.environ["LANGCHAIN_TRACING_V2"]

# Initialize the LangSmith client with the endpoint and key read above, so the
# connection does not depend on which environment-variable names the installed
# langsmith version looks up on its own.
langsmith_client = Client(api_url=langchain_endpoint, api_key=langsmith_api_key)

In [None]:
# NOTE(review): presumably needed so async code can run inside the notebook's
# already-running event loop — confirm against the nest_asyncio docs.
import nest_asyncio
nest_asyncio.apply()

In [None]:
def predict_function(chain, input_key: str = "question", default_question: str = "test"):
    """Wrap ``chain`` in a function that maps a dataset example to a prediction.

    Args:
        chain: Object exposing ``invoke``; it is called with
            ``{"input": <question>}``.
        input_key: Key of the example's input dict that holds the question.
        default_question: Question sent to the chain when ``input_key`` is
            absent from the example inputs.

    Returns:
        A ``predict(inputs: dict) -> dict`` callable that returns
        ``{"output": <chain result>}``.
    """
    def predict(inputs: dict) -> dict:
        # Read the question from the example inputs, falling back to the default.
        question = inputs.get(input_key, default_question)
        # The chain takes its question under the "input" key.
        result = chain.invoke({"input": question})
        return {"output": result}
    return predict

In [None]:
from langchain.smith import RunEvalConfig, run_on_dataset
from langchain.evaluation import EvaluatorType, load_evaluator
from langchain.evaluation.criteria import CriteriaEvalChain, Criteria
from langchain_openai import ChatOpenAI


# Names of the LangSmith datasets to evaluate against
data_set_1 = "SMU Schools Basic Info"
data_set_2 = "SMU Campus Facts"
# Prefix of the LangSmith project name built in evaluate_chain
project_name = "First test eval for "

# Single grader LLM, shared by the criteria evaluators and the run config so
# the model settings are defined in exactly one place.
eval_llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Evaluation configuration
# NOTE(review): the three load_evaluator(...) calls are given no llm argument,
# so they are not built from eval_llm here — confirm which model they grade with.
eval_config = RunEvalConfig(
    evaluators=[
        load_evaluator(EvaluatorType.QA),
        load_evaluator(EvaluatorType.CONTEXT_QA),
        load_evaluator(EvaluatorType.COT_QA),
        CriteriaEvalChain.from_llm(eval_llm, criteria=Criteria.RELEVANCE),
        CriteriaEvalChain.from_llm(eval_llm, criteria=Criteria.COHERENCE),
        CriteriaEvalChain.from_llm(eval_llm, criteria=Criteria.DETAIL),
        CriteriaEvalChain.from_llm(eval_llm, criteria=Criteria.HELPFULNESS),
    ],
    eval_llm=eval_llm,
)



# Evaluate the target task
def evaluate_chain(chain, dataset, chain_name):
    """Run ``chain`` over a LangSmith dataset and score it with ``eval_config``.

    Args:
        chain: Chain object handed to ``predict_function``.
        dataset: Name of the LangSmith dataset to run on.
        chain_name: Label for the chain; used in the project name and metadata.

    Returns:
        Whatever ``run_on_dataset`` returns for this run. In a notebook, assign
        the result or end the call with ``;`` to avoid echoing it.
    """
    import uuid  # local import: only needed to build a unique project name

    # A LangSmith test project name can only be used once. Including the
    # dataset keeps the same chain on two datasets from colliding, and the
    # random suffix lets the notebook be re-run from a fresh kernel.
    run_project_name = f"{project_name}{chain_name} - {dataset} - {uuid.uuid4().hex[:8]}"

    chain_results = run_on_dataset(
        client=langsmith_client,
        llm_or_chain_factory=predict_function(chain),
        evaluation=eval_config,
        dataset_name=dataset,
        verbose=True,
        project_name=run_project_name,
        project_metadata={
            "chain": chain_name,
            "dataset": dataset,
            "version": "0.1"
        }
    )
    return chain_results

In [None]:

# OG PerunaBot Chain on data set 1
evaluate_chain(
    chain=Original_PerunaBot_eval_chain,
    dataset=data_set_1,
    chain_name="Original PerunaBot chain",
)

In [None]:

# Base retriever chain (chain 0) evaluated on data set 1
evaluate_chain(
    chain=base_retriever_eval_chain_0,
    dataset=data_set_1,
    chain_name="Base Retriever Chain",
)

In [None]:

# Parent retriever chain (chain 1) evaluated on data set 1
evaluate_chain(
    chain=parent_retriever_eval_chain_1,
    dataset=data_set_1,
    chain_name="Parent Retriever Chain",
)

In [None]:

# Ensemble retriever chain (chain 2) evaluated on data set 1
evaluate_chain(
    chain=ensemble_retriever_eval_chain_2,
    dataset=data_set_1,
    chain_name="Ensemble Retriever Chain",
)

In [None]:
# OG PerunaBot Chain on data set 2
evaluate_chain(Original_PerunaBot_eval_chain, data_set_2, "Original PerunaBot chain")

In [None]:

# Chain 0 on data set 2
evaluate_chain(base_retriever_eval_chain_0, data_set_2, "Base Retriever Chain")

In [None]:

# Chain 1 on data set 2
evaluate_chain(parent_retriever_eval_chain_1, data_set_2, "Parent Retriever Chain")

In [None]:

# Chain 2 on data set 2
evaluate_chain(ensemble_retriever_eval_chain_2, data_set_2, "Ensemble Retriever Chain")