# Evaluation data

In [2]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'

# documents from the DataTalks FAQ database with unique IDs
# Use a timeout and an explicit status check so a hung connection or an HTTP
# error page fails here with a clear error instead of a JSON decoding error.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)

# generated question-answer pairs, one dict per CSV row
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
# Sanity check: number of ground-truth records and number of documents.
len(ground_truth), len(documents)

(4627, 948)

In [4]:
# Show the first document to see which fields a record carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [85]:
# functions for evaluating
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: a sequence with one entry per query; each entry is a
            sequence of flags, one per returned result, that is True when that
            result is the expected document.

    Returns:
        The share of queries with at least one True flag, or 0.0 when
        ``relevance_total`` is empty.
    """
    # An empty evaluation set has no hits; without this guard the ratio below
    # would raise ZeroDivisionError.
    if len(relevance_total) == 0:
        return 0.0

    cnt = sum(1 for line in relevance_total if True in line)
    return cnt / len(relevance_total)


def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For every query the score is 1 / rank of the FIRST relevant result
    (ranks start at 1), or 0 when no result is relevant. The scores are then
    averaged over the number of queries.

    Args:
        relevance_total: a sequence with one entry per query; each entry is a
            sequence of flags ordered by rank, True where the result is the
            expected document.

    Returns:
        The average reciprocal rank, or 0.0 when ``relevance_total`` is empty.
    """
    # Avoid ZeroDivisionError on an empty evaluation set.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not truthiness) mirrors the `True in line` test in hit_rate.
            if is_relevant == True:
                total_score = total_score + 1 / rank
                # Only the first relevant result counts; without the break a
                # list with several matches would score above 1.
                break

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: iterable of records; each record must provide the
            expected document id under the 'document' key.
        search_function: callable that takes one record and returns an
            ordered sequence of results, each exposing an 'id' key.

    Returns:
        A dict with the keys 'hit_rate' and 'mrr'.
    """
    def relevance_flags(record):
        # One flag per returned result: True where the result is the expected document.
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    per_query = [relevance_flags(record) for record in tqdm(ground_truth)]

    scores = {}
    scores['hit_rate'] = hit_rate(per_query)
    scores['mrr'] = mrr(per_query)
    return scores

# Q1. Minsearch text
Now let's evaluate our usual minsearch approach, but tweak the parameters. Let's use the following boosting params:
```python
boost = {'question': 1.5, 'section': 0.1}
```

What's the hitrate for this approach?

In [12]:
import minsearch

# Build the minsearch index over the FAQ documents: question/text/section are
# registered as text fields, course/id as keyword fields.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

# Last expression of the cell, so the value returned by fit() is displayed.
index.fit(documents)

<minsearch.minsearch.Index at 0x2151d189b20>

In [22]:
# Display the installed minsearch version alongside the results.
minsearch.__version__

'0.0.4'

In [19]:
# Boosting parameters prescribed by Q1.
BOOST = dict(question=1.5, section=0.1)


def minsearch_search(query, course):
    """Query the minsearch text index for one question.

    Args:
        query: the question text to search for.
        course: course name, passed to the index as the 'course' filter.

    Returns:
        Whatever ``index.search`` returns for the 5 requested results.
    """
    course_filter = {'course': course}
    return index.search(
        query=query,
        filter_dict=course_filter,
        boost_dict=BOOST,
        num_results=5,
    )

In [24]:
# Per-query relevance flags for the boosted text search; kept as a plain list
# because the reporting cell feeds it to hit_rate() and mrr() directly.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    # True at every position whose result is the expected document.
    relevance_total.append([d['id'] == doc_id for d in results])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [29]:
# Report both retrieval metrics for the boosted text search.
# The second metric is the mean reciprocal rank, abbreviated MRR.
print(f"Performance of minsearch with boost {BOOST}:")
print(f"     Hit rate: {hit_rate(relevance_total):.3f}")
print(f"     MRR: {mrr(relevance_total):.3f}")

Performance of minsearch with boost {'question': 1.5, 'section': 0.1}:
     Hit rate: 0.849
     MMR: 0.729


# Embeddings
The latest version of minsearch also supports vector search. We will use it. 

We will also use TF-IDF and Singular Value Decomposition to create embeddings from texts. You can refer to our ["Create Your Own Search Engine"](https://github.com/alexeygrigorev/build-your-own-search-engine) workshop if you want to know more about it.

Let's create embeddings for the "question" field:

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# Vectorize only the question field of every document.
texts = [doc['question'] for doc in documents]

# TF-IDF followed by a 128-component truncated SVD; random_state is fixed so
# repeated runs use the same seed.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

# Q2. Vector search for question
Now let's index these embeddings with minsearch:

In [91]:
from minsearch import VectorSearch

# Index the embeddings in X with the documents as payload; "course" is
# registered as a keyword field so searches can pass it in filter_dict.
vindex = VectorSearch( keyword_fields={"course"})
vindex.fit(vectors=X, payload=documents)

<minsearch.vector.VectorSearch at 0x21565f03cd0>

Evaluate this search method. What's MRR for it?

In [92]:
def minsearch_vector_search(query, course):
    """Embed one question and look it up in the vector index.

    Reads the module-level ``pipeline`` and ``vindex`` at call time, so it
    always uses whichever objects those names are currently bound to.

    Args:
        query: the question text to embed with ``pipeline``.
        course: course name, passed to the index as the 'course' filter.

    Returns:
        Whatever ``vindex.search`` returns for the 5 requested results.
    """
    return vindex.search(
        query_vector=pipeline.transform([query]),
        filter_dict={'course': course},
        num_results=5,
    )

In [93]:
# Score the vector search on the ground truth; the lambda adapts evaluate()'s
# one-argument callback to the (question, course) signature.
metrics = evaluate(ground_truth, lambda q: minsearch_vector_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

In [95]:
# Report both retrieval metrics for the question-only embeddings.
# The second metric is the mean reciprocal rank, abbreviated MRR.
print("Performance of minsearch with TF-IDF vectors (question):")
print(f"     Hit rate: {metrics['hit_rate']:.3f}")
print(f"     MRR: {metrics['mrr']:.3f}")

Performance of minsearch with TF-IDF vectors (question):
     Hit rate: 0.482
     MMR: 0.357


# Q3. Vector search for question and answer
We only used question in Q2. We can use both question and answer.

Using the same `pipeline` (`min_df=3` for TF-IDF vectorizer and `n_components=128` for SVD), evaluate the performance of this approach.

What's the hitrate?

In [96]:
# Embed question and answer together this time.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

# `pipeline` and `vindex` are rebound here on purpose: minsearch_vector_search
# looks both names up at call time, so it now searches the new embeddings.
vindex = VectorSearch(keyword_fields={"course"})
vindex.fit(vectors=X, payload=documents)

<minsearch.vector.VectorSearch at 0x21565ef57f0>

In [97]:
# Same call as for the question-only embeddings: minsearch_vector_search reads the
# module-level `pipeline` and `vindex`, so this scores the most recently built pair.
metrics = evaluate(ground_truth, lambda q: minsearch_vector_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

In [98]:
# Report both retrieval metrics for the question + answer embeddings.
# The second metric is the mean reciprocal rank, abbreviated MRR.
print("Performance of minsearch with TF-IDF vectors (question + text):")
print(f"     Hit rate: {metrics['hit_rate']:.3f}")
print(f"     MRR: {metrics['mrr']:.3f}")

Performance of minsearch with TF-IDF vectors (question + text):
     Hit rate: 0.821
     MMR: 0.672


# Q4. Qdrant
Now let's evaluate the following settings in Qdrant:
```python
text = doc['question'] + ' ' + doc['text']
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5
```

What's the MRR?

In [41]:
from qdrant_client import QdrantClient, models

# Connect to the Qdrant server expected at localhost:6333 and display its
# version info to confirm the connection works.
client = QdrantClient("http://localhost:6333")
client.info()

VersionInfo(title='qdrant - vector search engine', version='1.14.1', commit='530430fac2a3ca872504f276d2c91a5c91f43fa0')

In [45]:
from pprint import pprint
# Print the document count and pretty-print one sample record (index 100).
print(f"In the list of documents, there are {len(documents)} records like that:\n")
pprint(documents[100])

In the list of documents, there are 948 records like that:

{'course': 'data-engineering-zoomcamp',
 'id': '176ce516',
 'question': 'PGCLI - INKhould we run pgcli inside another docker container?',
 'section': 'Module 1: Docker and Terraform',
 'text': 'In this section of the course, the 5432 port of pgsql is mapped to '
         'your computer’s 5432 port. Which means you can access the postgres '
         'database via pgcli directly from your computer.\n'
         'So No, you don’t need to run it inside another container. Your local '
         'system will do.'}


In [59]:
EMBEDDINGS_DIMENSIONALITY = 512
collection_name = "homework-3"
model_name = "jinaai/jina-embeddings-v2-small-en"

# Initialize the collection.
# The collection lives in the Qdrant server rather than in the kernel, so it
# survives a kernel restart. Create it only when it is missing, which keeps
# this cell re-runnable instead of failing on an already existing collection.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDINGS_DIMENSIONALITY,
            distance=models.Distance.COSINE
        )
    )

True

In [60]:
# Creating data entities for Qdrant (points)
points = []

# The loop variable is called point_id (not `id`) so the builtin id() is not
# shadowed in the notebook's global namespace.
for point_id, doc in enumerate(documents):
    # When adding the data, use both question and answer fields
    text = doc.get('question', "") + " " + doc.get('text', "")

    point = models.PointStruct(
        id=point_id,
        # The text is wrapped in a Document together with the model name
        # instead of being passed as a precomputed vector.
        vector=models.Document(text=text, model=model_name),
        # Payload keeps the original fields; evaluate() compares the 'id' key.
        payload={
            "text": doc['text'],
            "question": doc['question'],
            "section": doc['section'],
            "course": doc['course'],
            "id": doc['id']
        }
    )
    points.append(point)

In [61]:
# Index the data
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# Number of results requested per query (Q4 prescribes limit = 5).
LIMIT = 5


def qdrant_search(query):
    """Run a vector search in Qdrant for one question.

    Args:
        query: the question text; it is wrapped in a ``models.Document``
            together with ``model_name``.

    Returns:
        A list with the payload of every returned point, in the order the
        points come back.
    """
    query_document = models.Document(text=query, model=model_name)
    response = client.query_points(
        collection_name=collection_name,
        query=query_document,
        limit=LIMIT,        # top closest matches
        with_payload=True,  # to have metadata in the results
    )

    # return just payload
    # points are sorted by similarity score (first = the highest)
    payloads = []
    for point in response.points:
        payloads.append(point.payload)
    return payloads

In [108]:
# Score the Qdrant search; only the question text is passed (qdrant_search takes no course argument).
metrics = evaluate(ground_truth, lambda q: qdrant_search(q['question']))

  0%|          | 0/4627 [00:00<?, ?it/s]

In [109]:
# Report both retrieval metrics for the Qdrant vector search.
# The second metric is the mean reciprocal rank, abbreviated MRR.
print("Performance of qdrant vector search (question + text):")
print(f"     Hit rate: {metrics['hit_rate']:.3f}")
print(f"     MRR: {metrics['mrr']:.3f}")

Performance of qdrant vector search (question + text):
     Hit rate: 0.912
     MMR: 0.825


# Q5. Cosine similarity
In the second part of the module, we looked at evaluating the entire RAG approach. In particular, we looked at comparing the answer generated by our system with the actual answer from the FAQ.

One of the ways of doing it is using the cosine similarity. Let's see how to calculate it.

Cosine similarity is a dot product between two normalized vectors. In a geometrical sense, it's the cosine of the angle between the vectors. Look up "cosine similarity geometry" if you want to learn more about it.

For us, it means that we need two things:

* First, we normalize each of the vectors
* Then, compute the dot product


In [110]:
import numpy as np

def cosine(u, v):
    """Return the cosine similarity of two 1-D vectors.

    Computed as the dot product divided by the product of the Euclidean
    norms.

    Args:
        u: first vector; must support ``.dot`` (e.g. a NumPy array).
        v: second vector, same length as ``u``.

    Returns:
        The cosine of the angle between ``u`` and ``v``; 0.0 when either
        vector has zero norm.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denominator = u_norm * v_norm

    # A zero vector has no direction; dividing would give NaN, which would
    # silently turn any average over the similarities into NaN.
    if denominator == 0:
        return 0.0

    return u.dot(v) / denominator

In [116]:
# Load the results CSV (its columns include answer_llm, answer_orig, document,
# question and course) from the same repository as the retrieval data.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [129]:
# Preview the first rows to check the columns.
df_results.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [117]:
# Fresh, unfitted TF-IDF + SVD pipeline for the answer-similarity task; this
# rebinds the `pipeline` name used by the retrieval cells.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [118]:
# Fit on one string per row: LLM answer, original answer and question joined by spaces.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

Now use the `transform` method of the pipeline to create the embeddings and calculate the cosine similarity between each pair.

What's the average cosine?

In [137]:
def compute_similarity(record):
    """Cosine similarity between the LLM answer and the original answer of one record.

    Args:
        record: mapping with the keys 'answer_orig' and 'answer_llm'.

    Returns:
        The value of ``cosine`` for the two texts after each has been
        embedded with the module-level ``pipeline``.
    """
    original_text, generated_text = record['answer_orig'], record['answer_llm']

    # transform() is given a one-element list, so row 0 is the embedding of the text.
    generated_vec = pipeline.transform([generated_text])[0]
    original_vec = pipeline.transform([original_text])[0]

    return cosine(generated_vec, original_vec)

In [138]:
# One dict per dataframe row, so each record can be scored independently.
results_gpt4o_mini = df_results.to_dict(orient='records')

# Cosine similarity per record, in dataframe row order.
similarity = [compute_similarity(record) for record in tqdm(results_gpt4o_mini)]

  0%|          | 0/1830 [00:00<?, ?it/s]

In [141]:
# Mean of the per-record cosine similarities.
print(f"Average cosine similarity: {np.mean(similarity):.3f}")

Average cosine similarity: 0.842


In [None]:
# Attach the per-row similarity as a new column (this modifies df_results in
# place) and summarize its distribution.
df_results['cosine'] = similarity
df_results['cosine'].describe()

count    1830.000000
mean        0.841584
std         0.173737
min         0.079093
25%         0.806927
50%         0.905812
75%         0.950711
max         0.996457
Name: cosine, dtype: float64

# Q6. Rouge
An alternative way to see how similar two texts are is ROUGE.

This is a set of metrics that compares two answers based on the overlap of n-grams, word sequences, and word pairs.

It can give a more nuanced view of text similarity than just cosine similarity alone.

We don't need to implement it ourselves, there's a python package for it:
```
pip install rouge
```
(The latest version at the moment of writing is 1.0.1)

Let's compute the ROUGE score between the answers at the index 10 of our dataframe (`doc_id=5170565b`)

In [144]:
from rouge import Rouge
# Scorer reused for the single example here and for the full-dataframe loop.
rouge_scorer = Rouge()

# Row at positional index 10 of the results dataframe.
r = df_results.iloc[10]
print(f" document ID: {r.document}")
# get_scores is called with (LLM answer, original answer); element 0 is kept
# because a single pair was passed.
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

 document ID: 5170565b


{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

There are three scores: `rouge-1`, `rouge-2` and `rouge-l`, and precision, recall and F1 score for each.

* `rouge-1` - the overlap of unigrams,
* `rouge-2` - bigrams,
* `rouge-l` - the longest common subsequence

For the document at index 10, the Rouge-1 F1 score is 0.45.

Let's compute it for the pairs in the entire dataframe. What's the average Rouge-1 F1?

In [150]:
# Which ROUGE variant and which of its scores ("r", "p" or "f") to collect.
METRIC = "rouge-1"
SCORE = "f"

# One score per record, in dataframe row order.
all_scores = [
    rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0][METRIC][SCORE]
    for record in tqdm(results_gpt4o_mini)
]

  0%|          | 0/1830 [00:00<?, ?it/s]

In [156]:
# Human-readable label for the selected score: "f" is shown as "f1", the
# other keys are spelled out via score_map.
score_map = {"p": "precision", "r": "recall"}
if SCORE == "f":
    SCORE_STR = SCORE + "1"
else:
    SCORE_STR = score_map[SCORE]

print(f"Average {METRIC.title()} {SCORE_STR.title()}: {np.mean(all_scores):.3f}")

Average Rouge-1 F1: 0.352
