In [9]:
import os

from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.testset.evolutions import simple, reasoning, multi_context
from ragas.testset.generator import TestsetGenerator

# Read key/value pairs from a local .env file into the process environment
# (presumably this is where the OpenAI credentials live — confirm).
load_dotenv()

# Directory holding the source documents. The previous hard-coded path only
# exists on one machine, so allow RAG_DOCS_DIR (set in the shell or in .env,
# which is loaded just above) to override it; the old path stays the default.
DOCS_DIR = os.environ.get("RAG_DOCS_DIR", "/Users/tomasz/plan-and-execute-rag/docs/")

loader = DirectoryLoader(DOCS_DIR)
documents = loader.load()


# generator with openai models
generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# generate testset
# Currently disabled. Uncomment to synthesize 10 test questions from `documents`
# (50% simple, 25% reasoning, 25% multi-context).
# testset = generator.generate_with_langchain_docs(documents, test_size=10, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})


Filename and doc_id are the same for all nodes.               
Generating:  70%|███████   | 7/10 [00:09<00:02,  1.00it/s]max retries exceeded for MultiContextEvolution(generator_llm=LangchainLLMWrapper(run_config=RunConfig(timeout=180, max_retries=15, max_wait=90, max_workers=16, exception_types=<class 'openai.RateLimitError'>, log_tenacity=False, seed=42)), docstore=InMemoryDocumentStore(splitter=<langchain_text_splitters.base.TokenTextSplitter object at 0x162de5790>, nodes=[Node(metadata={'source': '/Users/tomasz/plan-and-execute-rag/docs/transform-data-databricks-notebook.md'}, page_content='title: Transform data with Databricks Notebook titleSuffix: Azure Data Factory & Azure Synapse description: Learn how to process or transform data by running a Databricks notebook in Azure Data Factory and Synapse Analytics pipelines. ms.custom: synapse author: nabhishek ms.author: abnarain ms.topic: conceptual ms.date: 05/15/2024\n\nTransform data by running a Databricks notebook\n\n[!INCLUDEap

In [21]:
from datasets import load_dataset
# Load the "english_v2" configuration of the Amnesty QA dataset and show
# its structure as the cell output.
amnesty_qa = load_dataset(
    path="explodinggradients/amnesty_qa",
    name="english_v2",
)
amnesty_qa

Downloading builder script: 100%|██████████| 5.72k/5.72k [00:00<00:00, 43.6kB/s]
Downloading readme: 100%|██████████| 1.90k/1.90k [00:00<00:00, 14.7kB/s]
Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 70.8k/70.8k [00:00<00:00, 303kB/s]
Generating eval split: 20 examples [00:00, 2972.58 examples/s]


DatasetDict({
    eval: Dataset({
        features: ['question', 'ground_truth', 'answer', 'contexts'],
        num_rows: 20
    })
})

In [22]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)



Evaluating: 100%|██████████| 80/80 [01:00<00:00,  1.31it/s]


In [23]:
# Display whatever is currently bound to `result`.
# NOTE(review): no visible cell assigns `result` before this point — the cell
# above only imports metrics, yet its output shows an "Evaluating" progress
# bar. The `evaluate(...)` call that produced these baseline scores appears to
# have been deleted; restore it so the notebook survives Restart & Run All.
result

{'context_precision': 0.9417, 'faithfulness': 0.5336, 'answer_relevancy': 0.9250, 'context_recall': 0.9500}

In [41]:
from langchain_openai.chat_models import ChatOpenAI
from dotenv import load_dotenv

from langchain_core.tools import Tool
from langchain_experimental.plan_and_execute import (
    PlanAndExecute,
    load_agent_executor,
    load_chat_planner,
)


from langchain_openai import ChatOpenAI


from langchain_core.tools import Tool

# Load variables from a local .env file into the process environment.
# This was already called in the first cell; it is repeated here, presumably
# so this cell can be run on its own.
load_dotenv()




def generate_response(prompt_input, model_name="gpt-4o-mini", temperature=0.7, verbose=True):
    """Answer a prompt with a LangChain plan-and-execute agent.

    The same chat model is used both to plan the steps and to execute them.
    The defaults reproduce the previously hard-coded configuration.

    Args:
        prompt_input: Full prompt text, passed to the agent under the
            "input" key.
        model_name: OpenAI chat model used for the planner and the executor.
        temperature: Sampling temperature for the chat model. Values above 0
            make answers vary between runs, so evaluation scores are not
            exactly reproducible.
        verbose: Whether the agent and its executor print their intermediate
            chain traces. Pass False to keep notebook output small.

    Returns:
        The value stored under the "output" key of the agent's response.
    """
    model = ChatOpenAI(model=model_name, temperature=temperature)

    planner = load_chat_planner(model)
    # No tools are registered: the executor can only answer each planned step
    # from the model itself plus whatever context is embedded in the prompt.
    tools = []

    executor = load_agent_executor(model, tools, verbose=verbose)

    agent = PlanAndExecute(planner=planner, executor=executor, verbose=verbose)
    response = agent.invoke({"input": prompt_input})
    return response["output"]



# Convert the eval split to pandas so its columns can be iterated directly.
ds_for_parse = amnesty_qa["eval"].to_pandas()
results = []  # agent answers, in dataset row order
prompts = []  # the exact prompts sent, kept for later inspection

# Only two columns are needed, so iterate them in lockstep instead of paying
# for DataFrame.iterrows(), which builds a Series per row.
for question, context in zip(ds_for_parse["question"], ds_for_parse["contexts"]):
    # The prompt text (including the indentation inside the literal) is kept
    # exactly as before so generated answers stay comparable across runs.
    prompt_input = f"""Answer the question: {question}
    Use the provided context to answer the question. The relevant contexts extracted from relevant documents are listed below:
    {context}"""
    prompts.append(prompt_input)
    # Bound to `answer`, not `result`: `result` is the notebook-level name
    # used for ragas evaluation scores and must not be clobbered by a string.
    answer = generate_response(prompt_input)
    results.append(answer)






[1m> Entering new PlanAndExecute chain...[0m
steps=[Step(value='Analyze the implications of the USA Supreme Court ruling on abortion within the United States, particularly regarding access to abortion and maternal health.'), Step(value="Examine the ruling's influence on international perspectives and policies regarding abortion, especially in countries that look to the USA for guidance."), Step(value='Explore how the ruling has affected global reproductive rights organizations and their advocacy efforts.'), Step(value='Discuss the potential for the ruling to inspire anti-abortion legislation and policy shifts in other countries.'), Step(value="Given the above steps taken, please respond to the user's original question regarding the global implications of the USA Supreme Court ruling on abortion. \n")]

[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to analyze the implications of the USA Supreme Court ruling on abortion, focusing on access to abortion and

In [44]:
from pandas import Series
# Preview the generated answers as a pandas Series (one entry per element of
# `results`); long strings are truncated by the default display.
Series(results)

0     The USA Supreme Court ruling on abortion has s...
1     The main companies responsible for greenhouse ...
2     The largest GHG emitting private companies in ...
3     Amnesty International urged its supporters to ...
4     Amnesty International's recommendations to the...
5     The target audiences for Amnesty International...
6     The right that guarantees access to informatio...
7     Key stakeholders who should be informed about ...
8     Individuals can be found guilty under Article ...
9     Statements are considered 'false' under Articl...
10    Independent civil society organizations in Nic...
11    The Ramsar Convention designates wetlands as R...
12          COP15 was held in Montreal, Canada in 2022.
13    The 30x30 agreement aims to protect 30% of the...
14    At COP15, there was significant emphasis on th...
15    The criminalization of abortion has significan...
16    Social media companies have a vital role in re...
17    Social media companies have a crucial role

In [45]:
# Replace the dataset's original `answer` column with the agent-generated answers.
# NOTE(review): assigning a Series aligns on index, so if `results` has fewer
# entries than the frame has rows, the missing rows become NaN instead of
# raising. This cell also looks redundant: the next cell rebuilds a fresh
# frame (`my_dataset`) and never reads `ds_for_parse` — confirm and remove.
ds_for_parse["answer"] = Series(results)

In [48]:
from datasets import Dataset

# Start from a fresh pandas copy of the eval split so the question,
# ground_truth and contexts columns are the untouched originals.
my_dataset = amnesty_qa["eval"].to_pandas()

# Assigning a Series aligns on index: a `results` list shorter than the frame
# would silently leave NaN answers (e.g. after an interrupted generation
# loop) and skew the evaluation. Fail loudly instead.
if len(results) != len(my_dataset):
    raise ValueError(
        f"expected {len(my_dataset)} generated answers, got {len(results)}"
    )
my_dataset["answer"] = Series(results)

# Convert back to a Hugging Face Dataset for the evaluation step.
my_ds_for_eval = Dataset.from_pandas(my_dataset)

In [50]:
from ragas import evaluate
# Score the dataset that carries the agent-generated answers on the four
# ragas metrics imported earlier.
result = evaluate(
    dataset=my_ds_for_eval,
    metrics=[context_precision, faithfulness, answer_relevancy, context_recall],
)

Evaluating: 100%|██████████| 80/80 [00:32<00:00,  2.46it/s]


In [52]:
# Display the scores returned by `evaluate` for the dataset with the
# agent-generated answers (`my_ds_for_eval`).
result

{'context_precision': 0.9417, 'faithfulness': 0.6190, 'answer_relevancy': 0.9113, 'context_recall': 0.9500}