In [1]:
import os
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    context_precision,
    context_recall,
    faithfulness,
    answer_relevancy,
)
from rag import RAGAgent

# Evaluation set: (question, expected answer) pairs about Apple's reported
# financials, expanded into {"question", "ground_truth"} records.
#
# NOTE(review): the captured output further down lists the third question as
# "...for the December q...", which cannot come from the current
# "December 2024 quarter" wording -- the questions appear to have been edited
# after the last run, so re-run the evaluation before trusting that output.
# NOTE(review): "$2.18" EPS and "$33.9 billion" net income look like the
# figures for the quarter ended December 2023 (fiscal Q1 2024) -- confirm that
# "December 2024 quarter" is the intended wording for those two questions.
test_samples = [
    {"question": question_text, "ground_truth": expected_answer}
    for question_text, expected_answer in (
        (
            "What was Apple’s revenue in Q1 2024?",
            "$119.58 billion",
        ),
        (
            "What earnings per share (EPS) did Apple report for the December 2024 quarter?",
            "$2.18",
        ),
        (
            "What was Apple’s net income for the December 2024 quarter?",
            "$33.9 billion",
        ),
        (
            "What guidance did Apple give for company gross margin in the March 2024 quarter?",
            "between 46% and 47%",
        ),
        (
            "What were Apple’s total assets at September 28, 2024?",
            "$364.98 billion",
        ),
        (
            "What dividend per share did Apple declare in fiscal year 2024?",
            "$0.98 per share or RSU",
        ),
        (
            "What was Apple’s net income for fiscal year 2024?",
            "$93.7 billion",
        ),
        (
            "What was Apple’s basic earnings per share for fiscal year 2024?",
            "$6.11",
        ),
        (
            "What was Apple’s diluted earnings per share for fiscal year 2024?",
            "$6.08",
        ),
        (
            "How much did Apple spend on repurchasing its common stock in fiscal year 2024?",
            "$95.8 billion",
        ),
    )
]

In [3]:
from dotenv import load_dotenv

# Pull settings from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API keys used by RAGAgent and
# ragas -- confirm. `rag` is imported in an earlier cell, before this call
# runs, so anything that module reads from the environment at import time
# would not see these values.
load_dotenv()
agent = RAGAgent(model_name="gpt-4o")

# Column-oriented buffer for Dataset.from_dict: index i of every list belongs
# to test sample i.
dataset_dict = {"question": [], "ground_truth": [], "contexts": [], "answer": []}

# Run every test question through the agent and record what it retrieved and
# what it answered next to the expected answer.
for sample in test_samples:
    question = sample["question"]
    ground_truth = sample["ground_truth"]

    out = agent.process_query(question)
    # out["source_documents"] is a list of dicts with a "content" field;
    # only the text of each retrieved chunk is kept.
    contexts = [doc["content"] for doc in out["source_documents"]]

    dataset_dict["question"].append(question)
    dataset_dict["ground_truth"].append(ground_truth)
    dataset_dict["contexts"].append(contexts)
    dataset_dict["answer"].append(out["response"])

dataset = Dataset.from_dict(dataset_dict)

# Score retrieval quality (context precision / recall) and answer quality
# (faithfulness / relevancy) for every collected sample.
result = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
    # llm= or embeddings= overrides can also be passed here if needed
)

df = result.to_pandas()
# Leave the frame as the cell's last expression so the notebook renders it as
# a rich table; print(df) wraps and truncates this wide frame, which hides the
# metric columns.
df

Collection selected: aapl_10k_10q_forms
Collection selected: earnings_calls
Collection selected: aapl_10k_10q_forms
Collection selected: earnings_calls
Collection selected: aapl_10k_10q_forms
Collection selected: aapl_10k_10q_forms
Collection selected: aapl_10k_10q_forms
Collection selected: aapl_10k_10q_forms


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

                                          user_input  \
0               What was Apple’s revenue in Q1 2024?   
1  What earnings per share (EPS) did Apple report...   
2  What was Apple’s net income for the December q...   
3  What guidance did Apple give for company gross...   
4  What were Apple’s total assets at September 28...   
5  What dividend per share did Apple declare in f...   
6  What was Apple’s net income for fiscal year 2024?   
7  What was Apple’s basic earnings per share for ...   
8  What was Apple’s diluted earnings per share fo...   
9  How much did Apple spend on repurchasing its c...   

                                  retrieved_contexts  \
0  [, 2023 (the “2023 Form 10-K”).\nThe Company’s...   
1  [n additional week in the\nquarter. And second...   
2  [ment for the three- and six-month periods end...   
3  [l turn it over to Luca.\nLuca Maestri\nThank ...   
4  [s, and par value)\nMarch 30,\n2024\nSeptember...   
5                                              