In [1]:
from dotenv import load_dotenv
import os
 
# Populate the process environment from the local .env file.
load_dotenv(dotenv_path='.env')

# Read the API keys used by the rest of the notebook; each is None when unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [3]:
import os
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI, GoogleGenerativeAIEmbeddings

# Configure the google.generativeai SDK with the key read from the environment.
genai.configure(api_key=GOOGLE_API_KEY)

# Chat model; later cells wrap it so DeepEval can use it as the evaluation LLM.
model = ChatGoogleGenerativeAI(model='gemini-pro', google_api_key=GOOGLE_API_KEY)

# Embedding client. The Gemini embedding model id is 'models/embedding-001'
# (singular "embedding"); 'models/embeddings-001' is not a valid model id and
# would only fail later, on the first embed call. The API key is passed
# explicitly so this client is configured the same way as the chat model.
embeddings = GoogleGenerativeAIEmbeddings(
    model='models/embedding-001',
    google_api_key=GOOGLE_API_KEY,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from deepeval.models.base_model import DeepEvalBaseLLM

class GoogleVertextAI(DeepEvalBaseLLM):
    """Adapter that lets DeepEval use an already-built chat model as its LLM.

    The wrapped object must expose ``invoke(prompt)`` and an awaitable
    ``ainvoke(prompt)``, each returning a response with a ``.content``
    attribute; that is all this class relies on.
    """

    def __init__(self, model):
        # Keep the caller's model object as-is; nothing else is set up here.
        self.model = model

    def load_model(self):
        """Return the chat model handed to the constructor."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` synchronously and return the reply's ``content``."""
        reply = self.load_model().invoke(prompt)
        return reply.content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronous counterpart of ``generate``, built on ``ainvoke``."""
        llm = self.load_model()
        return (await llm.ainvoke(prompt)).content

    def get_model_name(self):
        """Return the fixed label reported for this model."""
        return "Vertex AI Model"
    

# Wrap the chat model, then make one call to confirm the wrapper works end to end.
vertexai_gemini = GoogleVertextAI(model)
print(vertexai_gemini.generate(prompt="write me a joke"))

What do you call a fish with no eyes?

Fsh!


In [5]:
from deepeval import assert_test
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

# A single question/answer pair to score.
test_case = LLMTestCase(input="Python or R? what is Better?", actual_output="Python is best")

# Answer relevancy evaluated by the wrapped chat model; 0.9 is the pass threshold.
relevancy_metric = AnswerRelevancyMetric(model=vertexai_gemini, threshold=0.9)
relevancy_metric.measure(test_case)

# Show the score followed by the explanation produced for it.
print(relevancy_metric.score, relevancy_metric.reason)

1.0 The score is 1.00 because there are no irrelevant statements in the actual output.


In [6]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval

# Test case that also carries a reference answer for the metric to compare against.
test_case = LLMTestCase(
    input="Python or R? what is Better?",
    actual_output="Python is better",
    expected_output="Python",
)

# GEval metric driven by a free-text criterion; it is given only the actual
# and expected outputs, and the wrapped chat model performs the evaluation.
correctness_metric = GEval(
    name='ABC',
    criteria="ABC - determine if output is short or not",
    model=vertexai_gemini,
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
)
correctness_metric.measure(test_case)

print(correctness_metric.score, correctness_metric.reason)

0.0 The length of Actual Output is not less than the length of Expected Output.


## Creating our own metric

In [7]:
from deepeval.scorer import Scorer
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase

class RougeMetric(BaseMetric):
    """Custom DeepEval metric that scores a test case with ROUGE-1.

    ``threshold`` is the minimum score (inclusive) for the metric to count
    as successful; it defaults to 0.5.
    """

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold
        self.scorer = Scorer()

    def measure(self, test_case: LLMTestCase):
        """Score ``actual_output`` against ``expected_output``.

        Stores the result on ``self.score`` and the pass/fail verdict on
        ``self.success``, then returns the score.
        """
        rouge1 = self.scorer.rouge_score(
            prediction=test_case.actual_output,
            target=test_case.expected_output,
            score_type="rouge1",
        )
        self.score = rouge1
        # Reaching the threshold exactly still counts as a pass.
        self.success = rouge1 >= self.threshold
        return rouge1

    async def a_measure(self, test_case: LLMTestCase):
        """Async entry point; it simply runs the synchronous ``measure``."""
        score = self.measure(test_case)
        return score

    def is_successful(self):
        """Return the verdict recorded by the most recent ``measure`` call."""
        return self.success

    @property
    def __name__(self):
        # Display name reported for this metric.
        return "Rouge Metric"
    
# Try the custom metric on one case using the default threshold.
test_case = LLMTestCase(
    input="Is python better than R",
    actual_output="Yes it is",
    expected_output="yes",
)
metric = RougeMetric()
metric.measure(test_case)

# Print whether the score reached the threshold.
print(metric.is_successful())
        

True


## Evaluating more than one test case

In [8]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCase

# Two independent question/answer cases, each with a reference answer.
first_test_case = LLMTestCase(
    input="who won the IPL 2024",
    actual_output="KKR",
    expected_output="KKR won it",
)
second_test_case = LLMTestCase(
    input="What is Virat's sirname?",
    actual_output="Kohli",
    expected_output="Virat Kohli is the sirname",
)
test_cases = [first_test_case, second_test_case]

# Group the cases into a dataset and score them with answer relevancy.
dataset = EvaluationDataset(test_cases=test_cases)
answer_relevancy_metric = AnswerRelevancyMetric(model=vertexai_gemini, threshold=0.5)

# Run the metric over the whole dataset via the module-level evaluate().
# The original author also noted `dataset.evaluate([answer_relevancy_metric])`
# as an alternative entry point.
evaluate(dataset, [answer_relevancy_metric])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 2 test case(s) in parallel: |██████████|100% (2/2) [Time Taken: 00:04,  2.49s/test case]




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Vertex AI Model, reason: The score is 1.00 because the output provides a direct and concise answer to the question without any irrelevant statements., error: None)

For test case:

  - input: who won the IPL 2024
  - actual output: KKR
  - expected output: KKR won it
  - context: None
  - retrieval context: None


Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: Vertex AI Model, reason: The score is 1.00 because the output provides a direct and accurate answer to the user's question without any irrelevant information., error: None)

For test case:

  - input: What is Virat's sirname?
  - actual output: Kohli
  - expected output: Virat Kohli is the sirname
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the output provides a direct and concise answer to the question without any irrelevant statements.', strict_mode=False, evaluation_model='Vertex AI Model', error=None, evaluation_cost=None, verbose_logs='Statements:\n[\n    "KKR"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]')], conversational=False, multimodal=False, input='who won the IPL 2024', actual_output='KKR', expected_output='KKR won it', context=None, retrieval_context=None), TestResult(name='test_case_1', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason="The score is 1.00 because the output provides a direct and accurate answer to the user's question without any irrelevant information.", strict_mode=False, evaluation_model='V

## Evaluation of Summarization

In [9]:
# Source document to be summarized; passed as the test case's `input` below.
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the kernel session. It is kept because the next cell reads it by this name.
input = """
The 'coverage score' is calculated as the percentage of assessment questions
for which both the summary and the original document provide a 'yes' answer. This
method ensures that the summary not only includes key information from the original
text but also accurately represents it. A higher coverage score indicates a
more comprehensive and faithful summary, signifying that the summary effectively
encapsulates the crucial points and details from the original content.
"""

# Candidate summary to be evaluated; replace this with the actual output of
# your LLM application.
actual_output="""
The coverage score quantifies how well a summary captures and
accurately represents key information from the original text,
with a higher score indicating greater comprehensiveness.
"""

In [11]:
from deepeval import evaluate
from deepeval.metrics import SummarizationMetric
from deepeval.test_case import LLMTestCase

# Pair the source text with its candidate summary.
test_case = LLMTestCase(actual_output=actual_output, input=input)

# Summarization metric with "gpt-4" as the evaluation model and three
# hand-written assessment questions; 0.5 is the pass threshold.
metric = SummarizationMetric(
    model="gpt-4",
    threshold=0.5,
    assessment_questions=[
        "Is the coverage score based on a percentage of 'yes' answers?",
        "Does the score ensure the summary's accuracy with the source?",
        "Does a higher score mean a more comprehensive summary?",
    ],
)
metric.measure(test_case)

# Score on the first line, explanation on the second.
print(metric.score, metric.reason, sep="\n")

# The same case and metric can also go through the bulk evaluate() entry point.
evaluate([test_case], [metric])

0.6666666666666666
The score is 0.67 because while there is no contradicting or extra information present in the summary, it fails to answer a question which the original text could answer.


Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:20, 20.18s/test case]




Metrics Summary

  - ✅ Summarization (score: 0.6666666666666666, threshold: 0.5, strict: False, evaluation model: gpt-4, reason: The score is 0.67 because the summary is mostly accurate without any contradictions or extra information. However, it fails to answer some questions that could be answered by the original text, hence the reduction in the score., error: None)

For test case:

  - input: 
The 'coverage score' is calculated as the percentage of assessment questions
for which both the summary and the original document provide a 'yes' answer. This
method ensures that the summary not only includes key information from the original
text but also accurately represents it. A higher coverage score indicates a
more comprehensive and faithful summary, signifying that the summary effectively
encapsulates the crucial points and details from the original content.

  - actual output: 
The coverage score quantifies how well a summary captures and
accurately represents key information from t

EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Summarization', threshold=0.5, success=True, score=0.6666666666666666, reason='The score is 0.67 because the summary is mostly accurate without any contradictions or extra information. However, it fails to answer some questions that could be answered by the original text, hence the reduction in the score.', strict_mode=False, evaluation_model='gpt-4', error=None, evaluation_cost=0.07937999999999999, verbose_logs='Truths (limit=None):\n[\n    "The \'coverage score\' is calculated as the percentage of assessment questions for which both the summary and the original document provide a \'yes\' answer.",\n    "The method of calculating the \'coverage score\' ensures the summary not only includes key information from the original text but also accurately represents it.",\n    "A higher coverage score indicates a more comprehensive and faithful summary.",\n    "The coverage score signifi