# LLM Zoomcamp Homework 03 

In [1]:
# !pip install -U minsearch qdrant_client

## Evaluation data

In [2]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

## Q1. Minsearch text

Now let's evaluate our usual minsearch approach, indexing documents with:

```
text_fields=["question", "section", "text"],
keyword_fields=["course", "id"]
```

but tweak the parameters for search. Let's use the following boosting params:

```
boost = {'question': 1.5, 'section': 0.1}
```

What's the hitrate for this approach?

In [4]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)


def minsearch_search(query, course):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results


result_ev = evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))



  0%|          | 0/4627 [00:00<?, ?it/s]

In [5]:
round(result_ev['hit_rate'], 2)

0.85

## Embeddings

In [6]:
from minsearch import VectorSearch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

import numpy as np

In [7]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

## Q2. Vector search for question

Now let's index these embeddings with minsearch

In [8]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

texts_ground_truth = []
for q in ground_truth:
    t = q['question']
    texts_ground_truth.append(t)

X_gt = pipeline.transform(texts_ground_truth)

for i in range(len(ground_truth)):
    ground_truth[i]['question_vector'] = X_gt[i]

relevance_total = []

def minsearch_vector_search(query_vector, course):

    results = vindex.search(
        query_vector=query_vector,
        filter_dict={'course': course},
        num_results=5
    )

    return results

result_ev = evaluate(ground_truth, lambda q: minsearch_vector_search(q['question_vector'], q['course']))


  0%|          | 0/4627 [00:00<?, ?it/s]

Evaluate this seach method. What's MRR for it?

In [9]:
round(result_ev['mrr'], 2)

0.36

## Q3. Vector search for question and answer

In [10]:
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [11]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

texts_ground_truth = []
for q in ground_truth:
    t = q['question']
    texts_ground_truth.append(t)

X_gt = pipeline.transform(texts_ground_truth)

for i in range(len(ground_truth)):
    ground_truth[i]['question_vector'] = X_gt[i]

relevance_total = []

def minsearch_vector_search(query_vector, course):

    results = vindex.search(
        query_vector=query_vector,
        filter_dict={'course': course},
        num_results=5
    )

    return results

result_ev = evaluate(ground_truth, lambda q: minsearch_vector_search(q['question_vector'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

In [12]:
round(result_ev['hit_rate'], 2)

0.82

## Q4. Qdrant

Now let's evaluate the following settings in Qdrant:
```
text = doc['question'] + ' ' + doc['text']
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5
```
What's the MRR?

In [13]:
# !sudo docker run --rm -p 6333:6333 -p 6334:6334    -v "$(pwd)/qdrant_storage:/qdrant/storage:z"    qdrant/qdrant

In [14]:
from qdrant_client import QdrantClient, models

client = QdrantClient("http://localhost:6333") 
collection_name = "collection_2"

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=512,  
        distance=models.Distance.COSINE  
    )
)


  client.recreate_collection(


True

In [15]:
points = []
id = 0

model_handle = "jinaai/jina-embeddings-v2-small-en"

for doc in documents:
   point = models.PointStruct(
      id=id,
      vector=models.Document(text=doc['question'] + ' ' + doc['text'], model=model_handle), 
      payload={
            "id":doc['id'],
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course'],
            
      } 
   )
   points.append(point)

   id += 1


client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [16]:
from qdrant_client.models import Filter, FieldCondition, MatchValue, SearchRequest

filter_ = Filter(
    must=[
        FieldCondition(
            key="course",
            match=MatchValue(value='course')
        )
    ]
)

def qdrant_search(text_query, course):
        
    results = client.query_points(
        collection_name=collection_name,
        query=models.Document( 
            text=text_query,
            model=model_handle,
            # filter=course 
        ),
        limit=5,
        with_payload=True 
    )

    return [r.payload for r in results.points]

result_ev = evaluate(ground_truth, lambda q: qdrant_search(q['question'], q['course']))


  0%|          | 0/4627 [00:00<?, ?it/s]

In [17]:
round(result_ev['mrr'], 2)

0.82

## Q5. Cosine similarity

In [18]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [19]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [20]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [21]:
df_results

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
1825,Some suggested titles for listing the Machine ...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles for listing the...,machine-learning-zoomcamp
1826,It is best advised that you do not list the Ma...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Should I list the Machine Learning Zoomcamp ex...,machine-learning-zoomcamp
1827,You can incorporate your Machine Learning Zoom...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,In which LinkedIn sections can I incorporate m...,machine-learning-zoomcamp
1828,The advice on including a project link in a CV...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Who gave advice on including a project link in...,machine-learning-zoomcamp


What's the average cosine?

In [22]:
v_llm = pipeline.transform(df_results['answer_llm'])
v_orig = pipeline.transform(df_results['answer_orig'])

cosine_sim = []
for v in zip(v_llm, v_orig):
    sim = cosine(v[0], v[1])
    cosine_sim.append(sim)

round(sum(cosine_sim)/len(cosine_sim), 2)

0.84

## Q6. Rouge

Let's compute the ROUGE score for the pairs in the entire dataframe. What's the average Rouge-1 F1?

In [23]:
# !pip install rouge

In [24]:
from rouge import Rouge
rouge_scorer = Rouge()

all_scores = []

for i in tqdm(range(df_results.shape[0])):       
    r = df_results.iloc[i]
    f1_score = rouge_scorer.get_scores(r['answer_llm'], r['answer_orig'])[0]['rouge-1']['f']
    all_scores.append(f1_score)

round(sum(all_scores)/len(all_scores),2)

  0%|          | 0/1830 [00:00<?, ?it/s]

0.35