In [1]:
import json
import os
from dotenv import load_dotenv

In [2]:
# Load environment configuration before any API client is constructed
# (presumably from a local .env file — confirm where it is expected to live).
load_dotenv()

True

In [3]:
# Fail early with a readable message when the key is missing: assigning the None
# returned by os.getenv() into os.environ raises an opaque TypeError instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in the .env file."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

In [4]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

In [5]:
# Build the underlying LangChain OpenAI clients first, then wrap each one
# in the adapter that ragas works with.
chat_model = ChatOpenAI(model="gpt-4o-mini")
embedding_model = OpenAIEmbeddings()

evaluator_llm = LangchainLLMWrapper(chat_model)
evaluator_embeddings = LangchainEmbeddingsWrapper(embedding_model)

In [6]:
from ragas.metrics import SemanticSimilarity, BleuScore, RougeScore, SemanticSimilarity
from ragas.dataset_schema import SingleTurnSample
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics._factual_correctness import FactualCorrectness

In [7]:
from datasets import load_dataset, Dataset
from ragas import EvaluationDataset

In [8]:
# Load the evaluation records produced by the fine-tuned model.
# The encoding is pinned to UTF-8 because the texts contain non-ASCII characters
# (e.g. curly apostrophes); without it, decoding depends on the platform default.
with open('../dataset/eval_dset_finetuned.json', 'r', encoding='utf-8') as f:
    eval_dset = json.load(f)

In [9]:
# Peek at the first record to see which fields each evaluation item carries.
eval_dset[0].keys()

dict_keys(['input', 'reference', 'prediction'])

In [10]:
# Source field name -> column name used for the ragas dataset.
# Fields that are not listed here (such as 'input') are dropped.
key_map = dict(prediction="response", reference="reference")


def _rename_fields(item):
    """Return a copy of `item` holding only the mapped fields, under their new names."""
    return {key_map[k]: v for k, v in item.items() if k in key_map}


ragas_dset = list(map(_rename_fields, eval_dset))


In [11]:
# Build a Hugging Face Dataset from the list of renamed records.
dataset = Dataset.from_list(ragas_dset)

In [12]:
# Display the dataset summary (feature names and row count) as a sanity check.
dataset

Dataset({
    features: ['reference', 'response'],
    num_rows: 300
})

In [13]:
# Convert the Hugging Face dataset into the ragas evaluation container.
eval_dataset = EvaluationDataset.from_hf_dataset(dataset)

# Show the feature names to confirm the expected columns survived the conversion.
eval_dataset.features()

['response', 'reference']

In [14]:
# NOTE(review): the slice bound of 500 exceeds the 300 rows loaded above, so this
# presumably selects every sample — a plain `eval_dataset` would likely display the same.
eval_dataset[:500]

EvaluationDataset(features=['response', 'reference'], len=300)

In [15]:
from ragas import evaluate

In [16]:
# Metrics computed for every (response, reference) pair.
# FactualCorrectness is given the configured evaluator LLM.
metric1 = FactualCorrectness(llm=evaluator_llm)
# BLEU and ROUGE are constructed without any model argument.
metric2 = BleuScore()
metric3 = RougeScore()
# Pass the embeddings wrapper configured earlier in the notebook explicitly.
# It was previously created but never used, which left the choice of embedding
# model to whatever ragas falls back to when none is supplied.
metric4 = SemanticSimilarity(embeddings=evaluator_embeddings)

In [18]:
# Score every sample with all four metrics. This is the expensive step of the
# notebook (the recorded run scheduled 1200 evaluation jobs).
results = evaluate(eval_dataset, metrics=[metric1, metric2, metric3, metric4])

# Persist the per-sample scores. The output directory is created first so that a
# missing folder cannot make the write fail right after the costly evaluation.
results_csv_path = "../results/data/finetuned.csv"
os.makedirs(os.path.dirname(results_csv_path), exist_ok=True)
results.to_pandas().to_csv(path_or_buf=results_csv_path, index=False)

Evaluating:   0%|          | 0/1200 [00:00<?, ?it/s]

In [19]:
# Display the per-sample scores as a DataFrame (one row per sample, one column per metric).
results.to_pandas()

Unnamed: 0,response,reference,factual_correctness(mode=f1),bleu_score,rouge_score(mode=fmeasure),semantic_similarity
0,\nDrake has a long history of conflicts. Chris...,Drake has a long history of conflicts. Chris B...,1.00,1.000000,0.793750,0.977184
1,\nMeek has been fighting off tensions between ...,"Although Drake squashed his major beef​, he’s ...",0.00,0.014712,0.091954,0.839520
2,\nDrake doesn’t want to be a booty call in the...,Drake doesn’t want to be a booty call in the m...,1.00,1.000000,1.000000,0.998880
3,\nDrake is referencing how the general public ...,Drake is referencing how the general public co...,1.00,1.000000,1.000000,0.999034
4,\nDrake may be referring to his late OVO affil...,Drake may be referring to his late OVO affilia...,0.22,1.000000,0.201493,0.883299
...,...,...,...,...,...,...
295,\nThe bumper is the trim around the wheels of ...,This line is straight from Popcaan’s 2014 song...,0.00,0.000182,0.149733,0.857132
296,\nDrake’s former girlfriend has been with anot...,Drake knows this girl has another man she’s se...,0.17,0.003824,0.168421,0.916183
297,"\nDrake wants to fix their relationship, so he...",The things Drake wants from the relationship a...,0.00,0.079559,0.240000,0.868439
298,\nInstead of allowing Rihanna to have her say ...,Despite all the points Drizzy has made about h...,0.17,0.007284,0.217391,0.873039
