In [3]:
"""
RAGAs Evaluation for NaiveRAG and then Enhanced RAG
"""
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from datasets import Dataset
import json
import pandas as pd
import os

# Setting OPENAI Key
openai_key = os.getenv('OPENAI_API_KEY')
if not openai_key:
    raise ValueError("Set OPENAI_API_KEY environment variable")
os.environ['OPENAI_API_KEY'] = openai_key
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model="gpt-4o-mini")

# Load both datasets
with open('../results/ragas_naive_data.json', 'r') as f:
	naive_data = json.load(f)

with open('../results/ragas_enhanced_data.json', 'r') as f:
	enhanced_data = json.load(f)

print(f"Loaded naive: {len(naive_data)} predictions")
print(f"Loaded enhanced: {len(enhanced_data)} predictions")

Loaded naive: 50 predictions
Loaded enhanced: 50 predictions


In [4]:
print("\n")
print("EVALUATING NAIVE RAG")
print("\n")

naive_dataset = Dataset.from_dict({
	'question': [d['question'] for d in naive_data],
	'answer': [d['answer'] for d in naive_data],
	'contexts': [d['contexts'] for d in naive_data],
	'ground_truth': [d['ground_truth'] for d in naive_data]
})

naive_results = evaluate(
	naive_dataset,
	metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=llm
)



EVALUATING NAIVE RAG




Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[0]: TimeoutError()
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[17]: TimeoutError()
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[68]: TimeoutError()
Exception raised in Job[72]: TimeoutError()
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[106]: TimeoutError()
Exception raised in Job[107]: TimeoutError()
Exception raised in Job[123]: TimeoutError()
Exception raised in Job[127]: TimeoutError()
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[172]: TimeoutError()
Exception raised in Job[186]: TimeoutError()
Exception raised in Job[188]: TimeoutError()


In [5]:
#Extracting the evaluation scores - calculating means from individual metric per question.
#Ignoring nans.
import numpy as np

naive_scores = {
	'faithfulness': np.nanmean(naive_results['faithfulness']),
	'answer_relevancy': np.nanmean(naive_results['answer_relevancy']),
	'context_precision': np.nanmean(naive_results['context_precision']),
	'context_recall': np.nanmean(naive_results['context_recall'])
}

print("\nNaive RAG Results:")
for metric, score in naive_scores.items():
	print(f"  {metric}: {score:.3f}")
	
# Count how many questions actually evaluated
valid_count = sum(1 for x in naive_results['faithfulness'] if not np.isnan(x))
print(f"\nSuccessfully evaluated: {valid_count}/50 questions")


Naive RAG Results:
  faithfulness: 0.693
  answer_relevancy: 0.728
  context_precision: 0.729
  context_recall: 0.617

Successfully evaluated: 44/50 questions


In [6]:
print("\n")
print("EVALUATING ENHANCED RAG")
print("\n")

enhanced_dataset = Dataset.from_dict({
	'question': [d['question'] for d in enhanced_data],
	'answer': [d['answer'] for d in enhanced_data],
	'contexts': [d['contexts'] for d in enhanced_data],
	'ground_truth': [d['ground_truth'] for d in enhanced_data]
})

enhanced_results = evaluate(
	enhanced_dataset,
	metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=llm
)



EVALUATING ENHANCED RAG




Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[23]: TimeoutError()
Exception raised in Job[35]: TimeoutError()
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[63]: TimeoutError()
Exception raised in Job[86]: TimeoutError()
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[96]: TimeoutError()
LLM returned 1 generations instead of

In [7]:
enhanced_scores = {
	'faithfulness': np.nanmean(enhanced_results['faithfulness']),
	'answer_relevancy': np.nanmean(enhanced_results['answer_relevancy']),
	'context_precision': np.nanmean(enhanced_results['context_precision']),
	'context_recall': np.nanmean(enhanced_results['context_recall'])
}

print("\nEnhanced RAG Results:")
for metric, score in enhanced_scores.items():
	print(f"  {metric}: {score:.3f}")
	
# Count how many questions actually evaluated
valid_count = sum(1 for x in enhanced_results['faithfulness'] if not np.isnan(x))
print(f"\nSuccessfully evaluated: {valid_count}/50 questions")


Enhanced RAG Results:
  faithfulness: 0.775
  answer_relevancy: 0.719
  context_precision: 0.909
  context_recall: 0.667

Successfully evaluated: 40/50 questions


In [12]:
comparison = pd.DataFrame({
	'Metric': ['faithfulness', 'answer_relevancy', 'context_precision', 'context_recall'],
	'Naive RAG': [
		naive_scores['faithfulness'],
		naive_scores['answer_relevancy'],
		naive_scores['context_precision'],
		naive_scores['context_recall']
	],
	'Enhanced RAG': [
		enhanced_scores['faithfulness'],
		enhanced_scores['answer_relevancy'],
		enhanced_scores['context_precision'],
		enhanced_scores['context_recall']
	]
})

comparison['Improvement'] = comparison['Enhanced RAG'] - comparison['Naive RAG']

print("\n")
print("RAGAs COMPARISON")
print("\n")
print(comparison.to_string(index=False))
print(f"\nNote: Naive evaluated on 44 questions, Enhancement on 40 questions; Random pickings.")

comparison.to_csv('../results/06_ragas_comparison.csv', index=False)
print("Results Saved")



RAGAs COMPARISON


           Metric  Naive RAG  Enhanced RAG  Improvement
     faithfulness   0.693182      0.775000     0.081818
 answer_relevancy   0.727969      0.719251    -0.008718
context_precision   0.729167      0.909091     0.179924
   context_recall   0.617021      0.666667     0.049645

Note: Naive evaluated on 44 questions, Enhancement on 40 questions; Random pickings.
Results Saved
