In [7]:
!pip install -U -q deepeval llama-index


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
# Tell DeepEval to use the local Ollama model `llama3.1` for all evals that
# require an LLM (the same model name is passed to `Ollama(...)` in the RAG setup cell).
!deepeval set-ollama llama3.1

🙌 Congratulations! You're now using a local Ollama model for all evals that 
require an LLM.


In [14]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.settings import Settings

# --- Configuration: the values a reader is most likely to tune ---------------
LLM_MODEL = "llama3.1"  # same model name as given to `deepeval set-ollama`
EMBED_MODEL = "mxbai-embed-large"
REQUEST_TIMEOUT = 300  # presumably seconds — TODO confirm against the Ollama client docs
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50
DATA_FILES = ["./data/text1.txt"]  # file(s) to index; must exist before this cell runs

# --- Models -------------------------------------------------------------------
embed_model = OllamaEmbedding(model_name=EMBED_MODEL)
llm = Ollama(model=LLM_MODEL, request_timeout=REQUEST_TIMEOUT, temperature=0.0)

# Register the chunking parameters and models as LlamaIndex global settings
Settings.chunk_size = CHUNK_SIZE
Settings.chunk_overlap = CHUNK_OVERLAP
Settings.llm = llm
Settings.embed_model = embed_model

# --- Index and query engine ---------------------------------------------------
# Load the listed file(s), build a vector index over them, and expose the index
# as a query engine that answers with `llm`.
# See LlamaIndex's quickstart for more details on preparing your data.
documents = SimpleDirectoryReader(input_files=DATA_FILES).load_data()
index = VectorStoreIndex.from_documents(documents)
rag_application = index.as_query_engine(llm=llm)

In [18]:
import nest_asyncio

# Patch asyncio to allow nested event loops — presumably required because the
# evaluators below run async code while the notebook's own loop is already
# running (TODO confirm).
nest_asyncio.apply()

from deepeval.integrations.llama_index import (
    DeepEvalAnswerRelevancyEvaluator,
    DeepEvalFaithfulnessEvaluator,
    DeepEvalContextualRelevancyEvaluator,
    DeepEvalSummarizationEvaluator,
    DeepEvalBiasEvaluator,
    DeepEvalToxicityEvaluator,
)

# Sample question to send through the RAG application
user_input = "Who is Twice?"

# Querying returns a LlamaIndex response object, which carries
# the generated answer string together with the retrieved nodes
response_object = rag_application.query(user_input)

# One default-constructed instance of each DeepEval evaluator;
# results are printed in this order.
evaluators = [
    evaluator_cls()
    for evaluator_cls in (
        DeepEvalAnswerRelevancyEvaluator,
        DeepEvalFaithfulnessEvaluator,
        DeepEvalContextualRelevancyEvaluator,
        DeepEvalSummarizationEvaluator,
        DeepEvalBiasEvaluator,
        DeepEvalToxicityEvaluator,
    )
]

# Score the same query/response pair with every evaluator.
# NOTE(review): the printed result does not name the evaluator that produced
# it, so each output block has to be matched to `evaluators` by position.
for evaluator in evaluators:
    evaluation_result = evaluator.evaluate_response(
        response=response_object,
        query=user_input,
    )
    # Result text followed by two blank lines, as a single write
    print(evaluation_result, end="\n\n\n")

query='Who is Twice?' contexts=None response='A South Korean girl group from the third generation of K-Pop, formed by JYP Entertainment through a casting show. The group consists of nine members who have gained international recognition for their music and performances. They are known for breaking records in album sales and achieving significant chart success worldwide.' passing=True feedback='The score is 0.83 because it is lowered by the irrelevance of mentioning record-breaking album sales and chart success as an indirect definition of Twice.' score=0.8333333333333334 pairwise_source=None invalid_result=False invalid_reason=None




query='Who is Twice?' contexts=None response='A South Korean girl group from the third generation of K-Pop, formed by JYP Entertainment through a casting show. The group consists of nine members who have gained international recognition for their music and performances. They are known for breaking records in album sales and achieving significant chart success worldwide.' passing=True feedback="The score is 0.75 because the actual output is partially faithful to the retrieval context, as it appears to acknowledge the indirect support for the group's international recognition despite not directly stating it." score=0.75 pairwise_source=None invalid_result=False invalid_reason=None




query='Who is Twice?' contexts=None response='A South Korean girl group from the third generation of K-Pop, formed by JYP Entertainment through a casting show. The group consists of nine members who have gained international recognition for their music and performances. They are known for breaking records in album sales and achieving significant chart success worldwide.' passing=True feedback='The score is 0.78 because the retrieval context contains information about the group\'s achievements and activities, not a person, which contradicts the input \'Who is Twice?\' - as stated in the first reason for irrelevancy: "Twice" is a South Korean girl group, not a person.' score=0.7777777777777778 pairwise_source=None invalid_result=False invalid_reason=None




query='Who is Twice?' contexts=None response='A South Korean girl group from the third generation of K-Pop, formed by JYP Entertainment through a casting show. The group consists of nine members who have gained international recognition for their music and performances. They are known for breaking records in album sales and achieving significant chart success worldwide.' passing=False feedback='The score is 0.00 because there is contradicting information, as the summary inaccurately claims the group has nine members, which contradicts the lack of any member count mentioned in the original text.' score=0.0 pairwise_source=None invalid_result=False invalid_reason=None




query='Who is Twice?' contexts=None response='A South Korean girl group from the third generation of K-Pop, formed by JYP Entertainment through a casting show. The group consists of nine members who have gained international recognition for their music and performances. They are known for breaking records in album sales and achieving significant chart success worldwide.' passing=True feedback='The score is 0.00 because there are no identified biases.' score=0.0 pairwise_source=None invalid_result=False invalid_reason=None




query='Who is Twice?' contexts=None response='A South Korean girl group from the third generation of K-Pop, formed by JYP Entertainment through a casting show. The group consists of nine members who have gained international recognition for their music and performances. They are known for breaking records in album sales and achieving significant chart success worldwide.' passing=True feedback='The score is 0.00 because there are no notable reasons for toxicity.' score=0.0 pairwise_source=None invalid_result=False invalid_reason=None


