In [46]:
import requests
import pandas as pd
from tqdm.auto import tqdm
import minsearch

In [47]:
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [48]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [49]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [67]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [73]:
### Minsearch(Q1)

In [69]:
s_text_fields=["question", "section", "text"]
skeyword_fields=["course", "id"]
s_boost = {'question': 1.5, 'section': 0.1}

In [70]:
index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x1b535f5f4d0>

In [71]:
def text_minsearch_search(query):
    boost = {'question': 1.5, 'section': 0.1}

    results = index.search(
        query=query['question'],
        filter_dict={'course': query['course']},
        boost_dict=boost,
        num_results=5
    )

    return results

In [72]:
evaluate(ground_truth,text_minsearch_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

In [74]:
###Embeddings(Q2)

In [120]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [121]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [122]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x1b52ed40b30>

In [123]:
def vector_minsearch_search(query):
    boost = {'question': 1.5, 'section': 0.1}

    results = vindex.search(
        query_vector=pipeline.transform([query['question']]),
        filter_dict={'course': query['course']},
        #boost=boost,
        num_results=5
    )

    return results

In [126]:
evaluate(ground_truth,vector_minsearch_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48195374972984656, 'mrr': 0.3573085512571141}

In [128]:
##Vector Search(QA+AS)[Q3]###

In [132]:
combine_texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    combine_texts.append(t)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
combineX = pipeline.fit_transform(combine_texts)

In [133]:
combineindex = VectorSearch(keyword_fields={'course'})
combineindex.fit(combineX, documents)

<minsearch.vector.VectorSearch at 0x1b545f7b680>

In [134]:
def vector_minsearch_combine_search(query):
    boost = {'question': 1.5, 'section': 0.1}

    results = combineindex.search(
        query_vector=pipeline.transform([query['question']]),
        filter_dict={'course': query['course']},
        #boost=boost,
        num_results=5
    )

    return results

In [135]:
evaluate(ground_truth,vector_minsearch_combine_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

In [136]:
###Qdrant(Q4)

In [144]:
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding

In [145]:
EMBEDDING_DIMENSIONALITY = 512
client = QdrantClient("http://localhost:6333")
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [146]:


# Define the collection name
collection_name = "search-evaluation-zoomcamp"

# Create the collection with specified vector parameters
client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)



UnexpectedResponse: Unexpected Response: 409 (Conflict)
Raw response content:
b'{"status":{"error":"Wrong input: Collection `search-evaluation-zoomcamp` already exists!"},"time":0.000223506}'

In [147]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [152]:
points = []
id = 0
for doc in documents:
    text = doc['question'] + ' ' + doc['text']
    point = models.PointStruct(
        id=id,
        vector=models.Document(text=text, model=model_handle), #embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        payload={
            "text": doc['text'],
            "section": doc['section'],
            "course": doc['course'],
            "id" : doc['id']
        } #save all needed metadata fields
    )
    points.append(point)
    id += 1

In [153]:
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [154]:
client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [166]:
def qdrant_search(query, limit=5):
    results = client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query['question'],
            model=model_handle
        ),
        query_filter=models.Filter( # filter by course name
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=query['course'])
                )
            ]
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )
    return [point.payload for point in results.points]



In [172]:
evaluate(ground_truth,qdrant_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

In [173]:
###Cosine simiarity(Q5)###

In [174]:
import numpy as np
def normalize(u):
    norm = np.sqrt(u.dot(u))
    return u / norm
def cosine(u, v):
    u = normalize(u)
    v = normalize(v)
    return u.dot(v)

In [175]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [213]:
df_results

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp
...,...,...,...,...,...
1825,Some suggested titles for listing the Machine ...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles for listing the...,machine-learning-zoomcamp
1826,It is best advised that you do not list the Ma...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Should I list the Machine Learning Zoomcamp ex...,machine-learning-zoomcamp
1827,You can incorporate your Machine Learning Zoom...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,In which LinkedIn sections can I incorporate m...,machine-learning-zoomcamp
1828,The advice on including a project link in a CV...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Who gave advice on including a project link in...,machine-learning-zoomcamp


In [336]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [338]:
full_vectorizer_text = pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [339]:
answer_llm_v = full_vectorizer_text.transform(df_results.answer_llm)

In [340]:
answer_orig_v = full_vectorizer_text.transform(df_results.answer_orig)

In [344]:
len(answer_llm_v)

1830

In [358]:
similarty_arr = []
for index,row in enumerate(answer_llm_v):
    sim_dot = cosine(answer_llm_v[index],answer_orig_v[index])
    similarty_arr.append(sim_dot)    

In [359]:
df_results['cosine_similarty'] = similarty_arr

In [360]:
df_results

Unnamed: 0,answer_llm,answer_orig,document,question,course,cosine_similarty
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,0.463526
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,0.781565
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,0.889158
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp,0.614962
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp,0.624086
...,...,...,...,...,...,...
1825,Some suggested titles for listing the Machine ...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,What are some suggested titles for listing the...,machine-learning-zoomcamp,0.907584
1826,It is best advised that you do not list the Ma...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Should I list the Machine Learning Zoomcamp ex...,machine-learning-zoomcamp,0.965069
1827,You can incorporate your Machine Learning Zoom...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,In which LinkedIn sections can I incorporate m...,machine-learning-zoomcamp,0.965395
1828,The advice on including a project link in a CV...,I’ve seen LinkedIn users list DataTalksClub as...,c6a22665,Who gave advice on including a project link in...,machine-learning-zoomcamp,0.716734


In [361]:
df_results['cosine_similarty'].describe()

count    1830.000000
mean        0.841584
std         0.173737
min         0.079093
25%         0.806927
50%         0.905812
75%         0.950711
max         0.996457
Name: cosine_similarty, dtype: float64

In [365]:
###Rouge

In [362]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [364]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)
scores

[{'rouge-1': {'r': 0.45454545454545453,
   'p': 0.45454545454545453,
   'f': 0.45454544954545456},
  'rouge-2': {'r': 0.21621621621621623,
   'p': 0.21621621621621623,
   'f': 0.21621621121621637},
  'rouge-l': {'r': 0.3939393939393939,
   'p': 0.3939393939393939,
   'f': 0.393939388939394}}]

In [370]:
similarty_rouge = []
for index,row in df_results.iterrows():
    scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
    similarty_rouge.append(scores['rouge-1']['f'])


In [371]:
df_results['rouge_similarty'] = similarty_rouge

In [373]:
df_results['rouge_similarty'].describe()

count    1830.000000
mean        0.351695
std         0.158905
min         0.000000
25%         0.238887
50%         0.356300
75%         0.460133
max         0.950000
Name: rouge_similarty, dtype: float64