In [2]:
import json
import os
from dotenv import load_dotenv

In [3]:
# Load variables from a .env file into the process environment.
# The bare call is the cell's last expression, so its return value is displayed.
load_dotenv()

True

In [4]:
# Verify that the OpenAI key is available before any client is constructed.
# Assigning `os.getenv(...)` straight back into `os.environ` raises an opaque
# "TypeError: str expected, not NoneType" when the variable is missing, so
# fail early with an actionable message instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

In [7]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

In [8]:
# Chat model wrapped for ragas; passed to the LLM-based metric further down.
chat_model = ChatOpenAI(model="gpt-4o-mini")
evaluator_llm = LangchainLLMWrapper(chat_model)

# Embedding model (library defaults), wrapped the same way.
embedding_model = OpenAIEmbeddings()
evaluator_embeddings = LangchainEmbeddingsWrapper(embedding_model)

In [45]:
from ragas.metrics import SemanticSimilarity, BleuScore, RougeScore, SemanticSimilarity
from ragas.dataset_schema import SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics._factual_correctness import FactualCorrectness

In [28]:
from datasets import load_dataset, Dataset
from ragas import EvaluationDataset

In [14]:
# Read the evaluation records from disk. JSON text is UTF-8 by specification,
# so decode explicitly rather than relying on the platform's default encoding
# (which is not UTF-8 on every system).
with open('../dataset/eval_dset.json', 'r', encoding='utf-8') as f:
    eval_dset = json.load(f)

In [15]:
# Inspect the field names of the first record (assumes a non-empty list of dicts).
eval_dset[0].keys()

dict_keys(['input', 'reference', 'prediction'])

In [None]:
def _rename_fields(record, mapping):
    """Return a new dict with only the keys of ``record`` found in ``mapping``, renamed.

    Keys are visited in ``record`` order, so the output preserves that order.
    """
    renamed = {}
    for field, value in record.items():
        if field in mapping:
            renamed[mapping[field]] = value
    return renamed


# Source field -> ragas field. Fields not listed here (e.g. 'input') are dropped.
key_map = {"prediction": "response", "reference": "reference"}

ragas_dset = [_rename_fields(item, key_map) for item in eval_dset]


In [34]:
# Build a `datasets.Dataset` from the list of renamed records.
dataset = Dataset.from_list(ragas_dset)

In [35]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['reference', 'response'],
    num_rows: 4324
})

In [36]:
# Wrap the `datasets.Dataset` in ragas' EvaluationDataset container.
eval_dataset = EvaluationDataset.from_hf_dataset(dataset)

# List the feature names of the evaluation dataset.
eval_dataset.features()

['response', 'reference']

In [39]:
# Preview only: the slice is displayed but not assigned, so it has no effect
# on which samples are evaluated later.
eval_dataset[:500]

EvaluationDataset(features=['response', 'reference'], len=500)

In [40]:
from ragas import evaluate

In [49]:
# Metrics passed to `evaluate` below.
# FactualCorrectness is the only one constructed with the evaluator LLM.
metric1 = FactualCorrectness(llm = evaluator_llm)
metric2 = BleuScore()
metric3 = RougeScore()

In [50]:
# Score the first 10 samples with all three metrics.
# NOTE(review): the full dataset has far more rows; scores from such a small
# subset are presumably only a smoke test - confirm before reporting them.
eval_subset = eval_dataset[:10]
selected_metrics = [metric1, metric2, metric3]
results = evaluate(eval_subset, metrics=selected_metrics)

Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]

In [51]:
# Display the evaluation scores, one entry per metric.
results

{'factual_correctness(mode=f1)': 0.0700, 'bleu_score': 0.0287, 'rouge_score(mode=fmeasure)': 0.0362}