In [1]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, get_response_synthesizer
from llama_index.llms.ollama import Ollama
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.evaluation import generate_question_context_pairs




In [2]:
# Global models: bge-base for embeddings, llama3 served by Ollama as the LLM.
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.llm = Ollama(model="llama3", request_timeout=360.0)

# Load the documents from the local "data" directory and index them.
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Retriever over the index, configured with similarity_top_k=5.
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)

# Response synthesizer with its default configuration.
response_synthesizer = get_response_synthesizer()

# Query engine = retriever + similarity post-processor (cutoff 0.7) + synthesizer.
query_engine = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

In [None]:
# Build the evaluation set: one generated question per node stored in the index.
# The LLM already configured on `Settings` (request_timeout=360.0) is reused
# instead of a second Ollama client created without that timeout, so question
# generation runs with the same model settings as the rest of the notebook.
qa_dataset = generate_question_context_pairs(
    list(index.docstore.docs.values()),
    llm=Settings.llm,
    num_questions_per_chunk=1,
)

# Retrieval Evaluation

In [None]:
from llama_index.core.evaluation import RetrieverEvaluator
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator

# Names of the retrieval metrics to compute.
metrics = [
    "hit_rate",
    "mrr",
    "precision",
    "recall",
    "ap",
    "ndcg",
]

In [None]:
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    metrics, retriever=retriever
)
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)
eval_results

# Response Evaluation

In [None]:
from llama_index.core.evaluation import BatchEvalRunner

runner = BatchEvalRunner(
    {"faithfulness": FaithfulnessEvaluator(), "relevancy": RelevancyEvaluator()},
    workers=8,
)

questions = qa_dataset.queries.values()
eval_results = await runner.aevaluate_queries(
    query_engine, queries=questions
)

In [None]:
import pandas as pd


def display_results(name, eval_results, metric_names=None):
    """Summarise per-query evaluation results as a one-row DataFrame.

    Args:
        name: Label written to the "retrievers" column of the summary row.
        eval_results: Iterable of result objects, each exposing a
            ``metric_vals_dict`` mapping of metric name to value.
        metric_names: Optional list of metrics to include. When omitted,
            every metric present in the results is reported, so the function
            no longer relies on a notebook-global list of metric names.

    Returns:
        A pandas DataFrame with a "retrievers" column followed by one column
        per metric, holding that metric's mean over all results. With no
        results and no ``metric_names`` only the "retrievers" column is
        returned.
    """
    # One row per evaluated query, one column per metric.
    full_df = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    if metric_names is None:
        metric_names = list(full_df.columns)

    columns = {
        "retrievers": [name],
        **{metric: [full_df[metric].mean()] for metric in metric_names},
    }

    return pd.DataFrame(columns)

In [None]:
# Render the one-row metric summary labelled "top-5 eval".
display_results(name="top-5 eval", eval_results=eval_results)