In [1]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [3]:
import minsearch
print(minsearch.__version__)

0.0.4


In [4]:
boost = {'question': 1.5, 'section': 0.1}

In [5]:
from minsearch import Index

index = Index(
    text_fields=['question', 'answer', 'section'],
    keyword_fields=['id']
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7ab4c6135400>

In [6]:
def minsearch_search(q):
    return index.search(
        query=q['question'],
        filter_dict={'course': q['course']},
        boost_dict=boost,
        num_results=5
    )

results = evaluate(ground_truth, minsearch_search)
print(results['hit_rate'])

  0%|          | 0/4627 [00:00<?, ?it/s]

0.6263237518910741


In [7]:
from minsearch import VectorSearch

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [9]:
texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [10]:
vindex = VectorSearch(keyword_fields=['course'])

In [11]:
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7ab4c38c90a0>

In [12]:
def vector_search(q):
    x = pipeline.transform([q['question']])
    return vindex.search(x[0], filter_dict={'course': q['course']}, num_results=5)

In [13]:
result = evaluate(ground_truth, vector_search)
print(result['mrr'])

  0%|          | 0/4627 [00:00<?, ?it/s]

0.3572833369353793


In [14]:
texts = []
docs_filtered = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts.append(t)
    docs_filtered.append(doc)


In [15]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [16]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7ab4c38dfc80>

In [17]:
def vector_search(q):
    x = pipeline.transform([q['question']])
    return vindex.search(x[0], filter_dict={'course': q['course']}, num_results=5)

In [18]:
result = evaluate(ground_truth, vector_search)
print(result['hit_rate'])

  0%|          | 0/4627 [00:00<?, ?it/s]

0.8210503566025502


In [19]:
from sentence_transformers import SentenceTransformer
model_name = 'jinaai/jina-embeddings-v2-small-en'
model = SentenceTransformer(model_name)

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-small-en and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.2.intermediate.dense.bias', 'encoder.layer.2.intermediate.dense.weight', 'encoder.layer.2.output.LayerNorm.bias', 'encoder.layer.2.output.LayerNorm.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.2.output.dense.weight', 'encoder.layer.3.intermediate.dense.bias', 'encoder.layer.3.intermediate.den

In [20]:
from tqdm.auto import tqdm
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
vectors = []
for doc in tqdm(documents, desc="Encoding documents"):
    combined = 'search_document: ' + doc['question'] + ' ' + doc['text']
    vec = model.encode(
        combined,
        normalize_embeddings=True
    )
    vectors.append(vec)
vectors = np.vstack(vectors)

Encoding documents:   0%|          | 0/948 [00:00<?, ?it/s]



In [21]:
norms = np.linalg.norm(vectors, axis=1)
print("First 5 norms:", norms[:5])

First 5 norms: [1.0000001  0.99999994 1.         0.99999994 1.        ]


In [22]:
qdrant = QdrantClient(":memory:")
qdrant.recreate_collection(
    collection_name="faq",
    vectors_config=VectorParams(size=vectors.shape[1], distance=Distance.COSINE)
)

  qdrant.recreate_collection(


True

In [23]:
points = [
    PointStruct(
        id=i,
        vector=vec.tolist(),
        payload=doc
    )
    for i, (doc, vec) in enumerate(zip(documents, vectors))
]
qdrant.upsert(collection_name="faq", points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [24]:
from qdrant_client.models import Filter, FieldCondition, MatchValue

def qdrant_search(q):
    vec = model.encode(
        q["question"],
        normalize_embeddings=True
    ).tolist()

    flt = Filter(
        must=[
            FieldCondition(
                key="course",
                match=MatchValue(value=q["course"])
            )
        ]
    )
    hits = qdrant.search(
        collection_name="faq",
        query_vector=vec,
        query_filter=flt,
        limit=5,
        with_payload=True,
        with_vectors=False
    )

    return [hit.payload for hit in hits]

In [25]:
result = evaluate(ground_truth, qdrant_search)
print("MRR:", result["mrr"])

  0%|          | 0/4627 [00:00<?, ?it/s]

  hits = qdrant.search(


MRR: 0.13103522800950945


In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [27]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [29]:
results_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [30]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [32]:
text_data = df_results['answer_llm'] + ' ' + df_results['answer_orig'] + ' ' + df_results['question']
pipeline.fit(text_data)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [33]:
v_llm = pipeline.transform(df_results['answer_llm'])
v_orig = pipeline.transform(df_results['answer_orig'])

In [34]:
cosines = [cosine(u, v) for u, v in zip(v_llm, v_orig)]
avg_cosine = np.mean(cosines)
print(f"Average cosine similarity: {avg_cosine:.2f}")

Average cosine similarity: 0.84


In [35]:
from rouge import Rouge
import pandas as pd

In [36]:
results_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [37]:
rouge_scorer = Rouge()
rouge_1_f1_scores = []
for _, row in df_results.iterrows():
    try:
        scores = rouge_scorer.get_scores(row['answer_llm'], row['answer_orig'])[0]
        rouge_1_f1 = scores['rouge-1']['f']
        rouge_1_f1_scores.append(rouge_1_f1)
    except Exception as e:
        rouge_1_f1_scores.append(0)

In [38]:
avg_rouge_1_f1 = sum(rouge_1_f1_scores) / len(rouge_1_f1_scores)
print(f"Average ROUGE-1 F1 score: {avg_rouge_1_f1:.2f}")

Average ROUGE-1 F1 score: 0.35
