In [14]:
from langchain_openai import ChatOpenAI
import pandas as pd
from dotenv import load_dotenv
from datasets import load_dataset, Dataset


from langchain_openai.chat_models import ChatOpenAI
from langchain_core.tools import Tool
from langchain_experimental.plan_and_execute import (
    PlanAndExecute,
    load_agent_executor,
    load_chat_planner,
)
from langchain_openai import ChatOpenAI

from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

load_dotenv()

True

In [15]:
# eval_df=pd.read_csv("tesla.csv")

In [16]:

# amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english")
# eval_df = amnesty_qa["eval"].to_pandas()
# eval_df

In [17]:
eval_df = load_dataset("harpreetsahota/ragas-example-dataset")["train"].to_pandas()

eval_df = eval_df.rename(columns={"answer":"ground_truth"})
eval_df


Unnamed: 0,question,ground_truth,contexts
0,Who was the first person to walk on the moon?,Buzz Aldrin was the first person to walk on th...,[Neil Armstrong was the first person to walk o...
1,What is the capital city of France?,Berlin is the capital city of France,"[Paris is the capital city of France, not Berl..."
2,Who is the current president of the United Sta...,Joe Biden is the current president of the Unit...,[Joe Biden is the current president of the Uni...
3,What is the square root of 81?,The square root of 81 is 9,"[The square root of 81 is indeed 9., This is a..."
4,"Who wrote the play ""Romeo and Juliet""?","William Shakespeare wrote the play ""Romeo and ...",[William Shakespeare is the playwright who wro...
5,What is the largest planet in our solar system?,Saturn is the largest planet in our solar system,[Jupiter is the largest planet in our solar sy...
6,Who is the founder of Microsoft?,Bill Gates is the founder of Microsoft,"[Bill Gates is the co-founder of Microsoft, al..."
7,What is the chemical symbol for hydrogen?,The chemical symbol for hydrogen is Hg,"[The chemical symbol for hydrogen is H, not Hg..."
8,Who is the current prime minister of Canada?,Justin Trudeau is the current prime minister o...,[Justin Trudeau is the current prime minister ...
9,What is the name of the world's largest desert?,The Sahara is the name of the world's largest ...,[The Sahara is the world's largest hot desert....


In [18]:
def get_answers(eval_df, agent):
    answers = []
    for index, row in eval_df.iterrows(): 
        context = row["contexts"]
        question = row["question"]
        
        prompt_input = f"""Answer the question: {question}
        Use the provided context to answer the question. The relevant contexts extracted from relevant documents are listed below:
        {context}"""
        
        result = agent.invoke(prompt_input)
        answers.append(result)
        
        return answers
    

def evaluate_answers(eval_df, answers, caption):

    result_df = eval_df.copy(deep=True)
    result_df["answer"] = pd.Series(answers)
    eval_dataset = Dataset.from_pandas(result_df)

    evaluation = evaluate(
        eval_dataset,
        metrics=[
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
        ],
    )
    print(f"Evaluation results - {caption}")
    print(evaluation)
    return evaluation


In [20]:

plain_llm = ChatOpenAI(model= "gpt-4o-mini", temperature=0.7)

plain_answers = get_answers(eval_df, plain_llm)
plain_answers = [answer.content for answer in plain_answers]
evaluate_answers(eval_df, plain_answers, "Plain OpenAI")


Evaluating:   3%|▎         | 3/100 [00:03<01:33,  1.04it/s]No statements were generated from the answer.
Evaluating:  14%|█▍        | 14/100 [00:08<00:50,  1.71it/s]No statements were generated from the answer.
Evaluating:  17%|█▋        | 17/100 [00:10<00:52,  1.57it/s]No statements were generated from the answer.
Evaluating:  31%|███       | 31/100 [00:25<01:00,  1.15it/s]No statements were generated from the answer.
No statements were generated from the answer.
Evaluating:  37%|███▋      | 37/100 [00:27<00:29,  2.12it/s]No statements were generated from the answer.
Evaluating:  38%|███▊      | 38/100 [00:28<00:28,  2.14it/s]No statements were generated from the answer.
Evaluating:  39%|███▉      | 39/100 [00:29<00:35,  1.70it/s]No statements were generated from the answer.
Evaluating:  41%|████      | 41/100 [00:30<00:43,  1.35it/s]No statements were generated from the answer.
Evaluating:  46%|████▌     | 46/100 [00:33<00:26,  2.07it/s]No statements were generated from the answer.
E

Evaluation results - Plain OpenAI
{'context_precision': 0.8400, 'faithfulness': 1.0000, 'answer_relevancy': 0.1197, 'context_recall': 0.8400}


{'context_precision': 0.8400, 'faithfulness': 1.0000, 'answer_relevancy': 0.1197, 'context_recall': 0.8400}

In [21]:

model = ChatOpenAI(model= "gpt-4o-mini", temperature=0.7)
planner = load_chat_planner(model)
executor = load_agent_executor(model, [])
plan_execute_agent = agent = PlanAndExecute(planner=planner, executor=executor)


plan_and_execute_answers = get_answers(eval_df, plan_execute_agent)
plan_and_execute_answers = [ answer["output"] for answer in plan_and_execute_answers]
evaluate_answers(eval_df, plan_and_execute_answers, "Plan and Execute")

Evaluating:   8%|▊         | 8/100 [00:06<01:32,  1.01s/it]No statements were generated from the answer.
Evaluating:   9%|▉         | 9/100 [00:07<01:21,  1.11it/s]No statements were generated from the answer.
Evaluating:  12%|█▏        | 12/100 [00:07<00:43,  2.01it/s]No statements were generated from the answer.
No statements were generated from the answer.
Evaluating:  21%|██        | 21/100 [00:10<00:28,  2.73it/s]No statements were generated from the answer.
Evaluating:  25%|██▌       | 25/100 [00:12<00:32,  2.33it/s]No statements were generated from the answer.
Evaluating:  27%|██▋       | 27/100 [00:15<00:50,  1.45it/s]No statements were generated from the answer.
No statements were generated from the answer.
Evaluating:  29%|██▉       | 29/100 [00:15<00:37,  1.87it/s]No statements were generated from the answer.
Evaluating:  32%|███▏      | 32/100 [00:16<00:26,  2.52it/s]No statements were generated from the answer.
Evaluating:  38%|███▊      | 38/100 [00:21<00:52,  1.19it/s]No

Evaluation results - Plan and Execute
{'context_precision': 0.8400, 'faithfulness': 0.5000, 'answer_relevancy': 0.1581, 'context_recall': 0.8400}


{'context_precision': 0.8400, 'faithfulness': 0.5000, 'answer_relevancy': 0.1581, 'context_recall': 0.8400}