# <center> **Custom Evaluation with LlamaIndex** </center>

## **Load Dataset**

In [1]:
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset

# Load the evaluation dataset from JSON; the evaluation helpers below read its
# `corpus`, `queries` and `relevant_docs` attributes.
test_dataset = EmbeddingQAFinetuneDataset.from_json("data/test_dataset.json")

In [2]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

## **Hit Rate metric**

We use a simple hit rate metric for evaluation:

- for each (query, relevant_doc) pair,
- we retrieve top-k documents with the query, and
- it's a hit if the results contain the relevant_doc.

This approach is simple and intuitive, and we can apply it to the proprietary OpenAI embedding as well as to our open-source and fine-tuned embedding models.

In [3]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Compute per-query hit results for an embedding model.

    Every corpus entry is wrapped in a ``TextNode`` and indexed with
    ``embed_model``. Each query is then sent through a top-``top_k``
    retriever and counted as a hit when at least one of its relevant
    document ids is among the retrieved node ids.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id ->
            sequence of relevant corpus ids).
        embed_model: Forwarded unchanged to ``VectorStoreIndex`` as
            ``embed_model``.
        top_k: Number of nodes to retrieve per query.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        A list with one dict per query, in ``dataset.queries`` order:
        ``is_hit`` (bool), ``retrieved`` (node ids in retrieval order),
        ``expected`` (the first relevant id, or ``None`` when the query has
        no relevant ids) and ``query`` (the query *id*, not its text).
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    nodes = [TextNode(id_=id_, text=text) for id_, text in corpus.items()]
    index = VectorStoreIndex(
        nodes, embed_model=embed_model, show_progress=True
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    eval_results = []
    for query_id, query in tqdm(queries.items()):
        retrieved_nodes = retriever.retrieve(query)
        retrieved_ids = [node.node.node_id for node in retrieved_nodes]
        expected_ids = relevant_docs[query_id]
        # A query is a hit if ANY of its relevant documents was retrieved,
        # not only the first one listed; a query with no relevant documents
        # can never be a hit.
        is_hit = any(doc_id in retrieved_ids for doc_id in expected_ids)

        eval_results.append(
            {
                "is_hit": is_hit,
                "retrieved": retrieved_ids,
                # Keep reporting the first relevant id so the result schema
                # is unchanged for single-relevant-doc datasets.
                "expected": expected_ids[0] if expected_ids else None,
                "query": query_id,
            }
        )
    return eval_results

### OpenAI

In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd
import os

# Never hard-code credentials in a notebook. Use the key already present in
# the environment, and only prompt (without echoing) when it is missing, so a
# valid key is not overwritten by a placeholder.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# NOTE: the variable is called `ada`, but the model evaluated here is
# text-embedding-3-large.
ada = OpenAIEmbedding(model="text-embedding-3-large")
ada_val_results = evaluate(test_dataset, ada)

Generating embeddings:   0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

In [5]:
# One row per query, with the columns produced by evaluate():
# is_hit, retrieved, expected, query.
df_ada = pd.DataFrame(ada_val_results)

In [6]:
# Hit rate = mean of the boolean `is_hit` column, i.e. the fraction of queries
# whose expected document was among the retrieved ids.
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada

0.8188775510204082

### BAAI/bge-large-en

In [9]:
# The model is given as a string spec; evaluate() forwards it unchanged to
# VectorStoreIndex as `embed_model`.
# NOTE(review): the "local:" prefix presumably selects a locally run
# HuggingFace model — confirm against the LlamaIndex version in use.
bge = "local:BAAI/bge-large-en"
bge_val_results = evaluate(test_dataset, bge)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Generating embeddings:   0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

In [10]:
# One row per query, with the columns produced by evaluate():
# is_hit, retrieved, expected, query.
df_bge = pd.DataFrame(bge_val_results)

In [11]:
# Hit rate = mean of the boolean `is_hit` column, i.e. the fraction of queries
# whose expected document was among the retrieved ids.
hit_rate_bge = df_bge["is_hit"].mean()
hit_rate_bge

0.7755102040816326

### Finetuned model

In [12]:
# The model is given as a string spec; evaluate() forwards it unchanged to
# VectorStoreIndex as `embed_model`.
# NOTE(review): "recruit_finetune" is presumably a local directory holding the
# fine-tuned model — confirm it exists before running this cell.
finetuned = "local:recruit_finetune"
val_results_finetuned = evaluate(test_dataset, finetuned)

Generating embeddings:   0%|          | 0/196 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

In [13]:
# One row per query, with the columns produced by evaluate():
# is_hit, retrieved, expected, query.
df_finetuned = pd.DataFrame(val_results_finetuned)

## InformationRetrievalEvaluator metric

This provides a more comprehensive suite of metrics, but we can only run it against sentence-transformers-compatible models (the open-source model and our fine-tuned model, not the OpenAI embedding model).


In [16]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers import SentenceTransformer
from pathlib import Path


def evaluate_st(
    dataset,
    model_id,
    name,
    output_path="results/",
):
    """Run sentence-transformers' InformationRetrievalEvaluator on a model.

    Args:
        dataset: Object exposing ``queries``, ``corpus`` and
            ``relevant_docs``; they are passed to the evaluator in that order.
        model_id: Identifier handed to ``SentenceTransformer`` to load the
            model.
        name: Forwarded to the evaluator as ``name``.
        output_path: Directory handed to the evaluator as ``output_path``.
            It is created (including parents) if missing. Defaults to
            ``"results/"``, the location used before this parameter existed.

    Returns:
        Whatever calling the evaluator on the loaded model returns.
    """
    evaluator = InformationRetrievalEvaluator(
        dataset.queries, dataset.corpus, dataset.relevant_docs, name=name
    )
    model = SentenceTransformer(model_id)
    # Make sure the output directory exists before the evaluator is called.
    Path(output_path).mkdir(exist_ok=True, parents=True)
    return evaluator(model, output_path=output_path)

### BAAI/bge-large-en

In [17]:
# `name='bge'` is forwarded to the evaluator; the summary section later reads
# results/Information-Retrieval_evaluation_bge_results.csv.
evaluate_st(test_dataset, "BAAI/bge-large-en", name='bge')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

0.6409585792488717

### Finetuned model

In [18]:
# `name="finetuned"` is forwarded to the evaluator; the summary section later
# reads results/Information-Retrieval_evaluation_finetuned_results.csv.
evaluate_st(test_dataset, "recruit_finetune", name="finetuned")

0.8066139440442256

## **SUMMARY**

### Hit rate

In [19]:
# Label each per-query result frame with the model that produced it so the
# frames can be stacked and grouped by model afterwards.
for frame, label in (
    (df_ada, 'ada'),
    (df_bge, 'bge'),
    (df_finetuned, 'fine_tuned'),
):
    frame['model'] = label

In [20]:
# Stack the per-model results and average the boolean `is_hit` column per
# model to get each model's hit rate. The column is selected explicitly:
# GroupBy.mean()'s first parameter is `numeric_only`, not a column name, so
# passing 'is_hit' positionally only worked because the string is truthy.
df_all = pd.concat([df_ada, df_bge, df_finetuned])
df_all.groupby('model')[['is_hit']].mean()

Unnamed: 0_level_0,is_hit
model,Unnamed: 1_level_1
ada,0.818878
bge,0.77551
fine_tuned,0.869898


### InformationRetrievalEvaluator

In [21]:
# Read back the two CSV reports from the results/ directory, one per
# evaluated model, in the order (bge, finetuned).
df_st_bge, df_st_finetuned = map(
    pd.read_csv,
    (
        'results/Information-Retrieval_evaluation_bge_results.csv',
        'results/Information-Retrieval_evaluation_finetuned_results.csv',
    ),
)

In [22]:
# Tag each report with its model label, then stack both reports into a single
# comparison table indexed by model.
df_st_finetuned['model'] = 'fine_tuned'
df_st_bge['model'] = 'bge'
df_st_all = pd.concat([df_st_bge, df_st_finetuned]).set_index('model')
df_st_all

Unnamed: 0_level_0,epoch,steps,cos_sim-Accuracy@1,cos_sim-Accuracy@3,cos_sim-Accuracy@5,cos_sim-Accuracy@10,cos_sim-Precision@1,cos_sim-Recall@1,cos_sim-Precision@3,cos_sim-Recall@3,...,dot_score-Recall@1,dot_score-Precision@3,dot_score-Recall@3,dot_score-Precision@5,dot_score-Recall@5,dot_score-Precision@10,dot_score-Recall@10,dot_score-MRR@10,dot_score-NDCG@10,dot_score-MAP@100
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bge,-1,-1,0.561224,0.673469,0.727041,0.80102,0.561224,0.561224,0.22449,0.673469,...,0.561224,0.22449,0.673469,0.145408,0.727041,0.080102,0.80102,0.634025,0.673676,0.640959
fine_tuned,-1,-1,0.747449,0.844388,0.869898,0.908163,0.747449,0.747449,0.281463,0.844388,...,0.739796,0.277211,0.831633,0.175,0.875,0.090561,0.905612,0.796409,0.822865,0.800905
