In [None]:
import json
import pandas as pd
import config
import os
from deepeval import evaluate
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ContextualRelevancyMetric
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# Load the evaluation records. The code below treats the file as a JSON object
# mapping an entry id to a record with 'Query', 'Suggested_Response',
# 'Expected_Response' and (optionally) 'Context'.
with open('result.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# The metric definition does not depend on the entry being scored, so build it
# once and reuse it for every test case. A misconfiguration (e.g. a missing
# judge-model credential) therefore surfaces immediately instead of being
# recorded as the same error on every row.
correctness_metric = GEval(
    name="Context Utilization",
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "Verify that no facts in the response contradict the provided context.",
        "Assess whether the response omits critical details from the context.",
        "Evaluate if the language is clear and the response is logically structured based on the context."
    ],
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
        # Three of the four steps refer to "the context"; without this
        # parameter the judge is never shown the retrieval context.
        LLMTestCaseParams.RETRIEVAL_CONTEXT,
    ]
)

results = []  # One row per entry: score + reason, or the error that occurred

# Score every entry; a failure on one entry must not abort the whole run.
for key, value in data.items():
    try:
        query = value['Query']
        actual_output = value['Suggested_Response']
        retrieval_context = value.get('Context', [])
        # LLMTestCase.expected_output is a single string, not a list.
        expected_output = value['Expected_Response']

        # Ensure retrieval_context is a list of strings
        if not isinstance(retrieval_context, list):
            retrieval_context = [retrieval_context]
        retrieval_context = [str(context) for context in retrieval_context]

        test_case = LLMTestCase(
            input=query,
            actual_output=actual_output,
            retrieval_context=retrieval_context,
            expected_output=expected_output
        )

        # Measure the test case (populates .score and .reason on the metric)
        correctness_metric.measure(test_case)

        # Collect the score and reason into the results list
        results.append({
            'Entry': key,
            'correctness Score': correctness_metric.score,
            'correctness Reason': correctness_metric.reason
        })

    except Exception as e:
        # Deliberate best-effort boundary: record the error as the row's
        # reason and keep processing the remaining entries.
        results.append({
            'Entry': key,
            'correctness Score': None,
            'correctness Reason': f'Error: {str(e)}'
        })

# Convert the list to a DataFrame. Naming the columns explicitly keeps them
# present even when there are no entries at all.
df = pd.DataFrame(
    results,
    columns=['Entry', 'correctness Score', 'correctness Reason']
)

# Show full cell contents and every row when the DataFrame is rendered
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Failed entries carry None; coerce to numeric so they become NaN and are
# skipped by mean() even when the column ended up with object dtype.
print(pd.to_numeric(df['correctness Score'], errors='coerce').mean())

# Keep the DataFrame as the last expression so the notebook renders it.
df


In [None]:
import json
from datasets import Dataset
from ragas.metrics import faithfulness
from ragas import evaluate

# Load JSON data from the file. The code below treats it as a JSON object
# whose values are records with 'Query', 'Suggested_Response' and 'Context'.
with open('result.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


def _as_context_list(raw_context):
    """Return the context as a list of strings.

    A bare (non-list) value is wrapped in a single-element list, and every
    element is converted with str(), matching how the deepeval cell prepares
    its retrieval context.
    """
    if not isinstance(raw_context, list):
        raw_context = [raw_context]
    return [str(context) for context in raw_context]


# Prepare the data in the column layout passed to ragas: one question, one
# answer and one list of context strings per sample.
entries = list(data.values())
data_samples = {
    'question': [entry['Query'] for entry in entries],
    'answer': [entry['Suggested_Response'] for entry in entries],
    # A missing 'Context' becomes an empty list rather than a KeyError.
    'contexts': [_as_context_list(entry.get('Context', [])) for entry in entries]
}

# Convert the dictionary to a Dataset object
dataset = Dataset.from_dict(data_samples)

# Evaluate the dataset using the RAGAS faithfulness metric
score = evaluate(dataset, metrics=[faithfulness])

# Hide the bulky context column from the displayed table.
# NOTE(review): newer ragas releases appear to name this column
# 'retrieved_contexts' — confirm against the installed version.
# errors='ignore' keeps the cell working whichever name is present.
df = score.to_pandas().drop(
    columns=['contexts', 'retrieved_contexts'],
    errors='ignore'
)
df
