# Homework

In [1]:
# dataset
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [3]:
ground_truth[:3]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'}]

### Question 1

In [4]:
import minsearch 

index = minsearch.Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

def search(query, course):
    # minsearch
    boost = {'question': 1.5, 'section': 0.1} # when one text field is more important than the other

    results = index.search(
        query = query,
        filter_dict = {'course':course},
        boost_dict = boost,
        num_results = 5
    )

    # build context
    return results

evaluate(ground_truth, lambda q: search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

### Question 2 - setup

In [5]:
from minsearch import VectorSearch

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [7]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [8]:
print(f"number of textx: {len(X)}")
print(f"number of components in each text: {len(X[0])}")

number of textx: 948
number of components in each text: 128


### Question 2

In [9]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x22803c47010>

In [10]:
def vector_search(query,course):
    # boost = {'question': 1.5, 'section': 0.1} # when one text field is more important than the other

    results = vindex.search(query,
                            filter_dict = {'course': course},
                            num_results = 5
                            )
    return results
    

In [11]:
def evaluate_vector_search(ground_truth, course):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        query_vector = pipeline.transform([q['question']])[0]
        results = vector_search(query_vector, course)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [12]:
evaluate_vector_search(ground_truth, 'data-engineering-zoomcamp')

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.2118003025718608, 'mrr': 0.15822707297745134}

In [13]:
def vector_search_wo_filter(query):
    # boost = {'question': 1.5, 'section': 0.1} # when one text field is more important than the other

    results = vindex.search(query,
                            num_results = 5
                            )
    return results
    

def evaluate_vector_search_wo_filter(ground_truth):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        query_vector = pipeline.transform([q['question']])[0]
        results = vector_search_wo_filter(query_vector)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

evaluate_vector_search_wo_filter(ground_truth)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.3939917873352064, 'mrr': 0.29028528204019916}

### Question 3

In [14]:
# create new query containing question and answer
texts = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)

# embed query with the same pipeline from Q2
Y = pipeline.transform(texts)

In [15]:
# fit vector search
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(Y, documents)

<minsearch.vector.VectorSearch at 0x22803cd77c0>

In [16]:
def vector_search(query,course):
    # boost = {'question': 1.5, 'section': 0.1} # when one text field is more important than the other

    results = vindex.search(query,
                            filter_dict = {'course': course},
                            num_results = 5
                            )
    return results
    
def evaluate_vector_search(ground_truth, course):
    def get_ground_truth_ans(doc_id, documents):
        for doc in documents:
            if doc['id'] == doc_id:
                return doc['text']
        return ''
    
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']

        ground_truth_ans = get_ground_truth_ans(doc_id, documents)
        query_vector = pipeline.transform([q['question'] + ' ' + ground_truth_ans])[0]
        results = vector_search(query_vector, course)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }
        

In [17]:
evaluate_vector_search(ground_truth, 'data-engineering-zoomcamp')

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.45709963259131187, 'mrr': 0.45515092572581234}

In [18]:
def vector_search_wo_filter(query):
    # boost = {'question': 1.5, 'section': 0.1} # when one text field is more important than the other

    results = vindex.search(query,
                            num_results = 5
                            )
    return results
    

def evaluate_vector_search_wo_filter(ground_truth):
    relevance_total = []
    def get_ground_truth_ans(doc_id, documents):
        for doc in documents:
            if doc['id'] == doc_id:
                return doc['text']
        return ''
    
    for q in tqdm(ground_truth):
        doc_id = q['document']
        ground_truth_ans = get_ground_truth_ans(doc_id, documents)
        query_vector = pipeline.transform([q['question'] + ' ' + ground_truth_ans])[0]
        results = vector_search_wo_filter(query_vector)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)
    
    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

evaluate_vector_search_wo_filter(ground_truth)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9907067214177653, 'mrr': 0.9832324760463946}

### Question 4

In [19]:
from qdrant_client import QdrantClient, models
qd_client = QdrantClient("http://localhost:6333")

In [20]:
# QDrant

# Choose the model
model_handle = "jinaai/jina-embeddings-v2-small-en"
EMBEDDING_DIMENSIONALITY = 512

# Define the collection name
collection_name = "zoomcamp-faq"

qd_client.delete_collection(collection_name)

# Create the collection with specified vector parameters
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

# Create and insert points to collection
points = []

for i, doc in enumerate(documents): # for each question-answer pair in course
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector, 
        payload=doc #save all needed metadata fields
    )
    points.append(point)

In [21]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)   

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [22]:
# search function
def vector_search(question):    
    course = 'data-engineering-zoomcamp'
    
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=question,
            model=model_handle
        ),
        query_filter=models.Filter( # filter by course name
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=5, # top closest matches
    )

    results = []
    for point in query_points.points:
        results.append(point.payload)

    return results

In [23]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    def get_ground_truth_ans(doc_id, documents):
        for doc in documents:
            if doc['id'] == doc_id:
                return doc['text']
        return ''
    
    for q in tqdm(ground_truth):
        doc_id = q['document']

        ground_truth_ans = get_ground_truth_ans(doc_id, documents)
        query_str = q['question'] + ' ' + ground_truth_ans
        results = search_function(query_str)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [24]:
evaluate(ground_truth, vector_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.45537065052950076, 'mrr': 0.4536200561919169}

In [25]:
# without course filter
def vector_search(question):    
    
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=question,
            model=model_handle
        ),
        limit=5, # top closest matches
    )

    results = []
    for point in query_points.points:
        results.append(point.payload)

    return results

def evaluate(ground_truth, search_function):
    relevance_total = []

    def get_ground_truth_ans(doc_id, documents):
        for doc in documents:
            if doc['id'] == doc_id:
                return doc['text']
        return ''
    
    for q in tqdm(ground_truth):
        doc_id = q['document']

        ground_truth_ans = get_ground_truth_ans(doc_id, documents)
        query_str = q['question'] + ' ' + ground_truth_ans
        results = search_function(query_str)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

evaluate(ground_truth, vector_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9954614220877458, 'mrr': 0.9929904185577407}

### Question 5

In [26]:
import numpy as np
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [27]:
# results from gpt-4o-mini evaluations:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [28]:
df_results.head(4)

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp


In [29]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)


0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [30]:
vector_llm_answer = pipeline.transform(df_results.answer_llm)
vector_actual_answer = pipeline.transform(df_results.answer_orig)

In [31]:
print(len(vector_llm_answer))
print(len(vector_actual_answer))
print(len(vector_llm_answer[8]))
print(len(vector_actual_answer[8]))

1830
1830
128
128


In [32]:
# how to calculate cos similarity for many vectors:
# calculate cosine for each llm-original answer pairs, then get the average
cos_sim_list = []
for i in range(len(vector_llm_answer)):
    cos_sim = cosine(vector_llm_answer[i], vector_actual_answer[i])
    cos_sim_list.append(cos_sim)

In [33]:
sum(cos_sim_list)/len(cos_sim_list)

np.float64(0.8415841233490399)

### Question 6

In [34]:
# example
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [35]:
# calculate rouge scores for entire dataset
rouge_score_list = []
for i in range(len(df_results)):
    scores = rouge_scorer.get_scores(df_results.loc[i,'answer_llm'], df_results.loc[i, 'answer_orig'])
    f1_score = scores[0]['rouge-1']['f']
    rouge_score_list.append(f1_score)

In [36]:
print(f"The average rouge-1 f1-score is {sum(rouge_score_list)/len(rouge_score_list)}.")

The average rouge-1 f1-score is 0.3516946452113944.
