#### Data Evaluation


In [22]:
import requests
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import minsearch
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
from qdrant_client.http.models import Filter, FieldCondition, MatchValue, SearchParams
import uuid
from rouge import Rouge

In [3]:
# Base location of the evaluation assets used throughout this notebook.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to search over; each record carries an 'id' that the evaluation
# code compares against the ground-truth 'document' value.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# requests has no default timeout, so set one to avoid hanging the kernel.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth records; the cells below read their 'question', 'course'
# and 'document' keys.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: sequence of per-query relevance lists, one entry
            per returned result, where ``True`` marks the expected document.

    Returns:
        Share of inner lists containing ``True``; ``0.0`` when
        ``relevance_total`` is empty (previously a ``ZeroDivisionError``).
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the FIRST relevant result counts: it contributes
    ``1 / rank`` (rank starting at 1). Queries without a relevant result
    contribute 0.

    Args:
        relevance_total: sequence of per-query relevance lists, one entry
            per returned result, where ``True`` marks the expected document.

    Returns:
        Average reciprocal rank in ``[0, 1]``; ``0.0`` when
        ``relevance_total`` is empty.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:
                total_score += 1 / rank
                # Stop at the first hit; summing later hits as well would
                # let a single query score more than 1.
                break

    return total_score / total

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score it.

    Args:
        ground_truth: iterable of records; each must provide a 'document'
            key holding the id of the expected document.
        search_function: callable that receives the whole record and
            returns an iterable of results, each exposing an 'id' key.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    def relevance_for(record):
        # One boolean per returned result: is it the expected document?
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    per_query_relevance = [relevance_for(record) for record in tqdm(ground_truth)]

    return dict(
        hit_rate=hit_rate(per_query_relevance),
        mrr=mrr(per_query_relevance),
    )

#### Question 1. Hitrate for minsearch text 

In [5]:
# Text index over the documents: 'course' and 'id' are keyword fields
# ('course' is used as a filter by minsearch_search), the remaining fields
# are searched as free text.
index = minsearch.Index(
    keyword_fields=["course", "id"],
    text_fields=["question", "section", "text"],
)

index.fit(documents)

<minsearch.minsearch.Index at 0x77df473566c0>

In [6]:
def minsearch_search(query, course):
    """Search the text index for ``query`` within a single course.

    Args:
        query: free-text query string.
        course: value the 'course' keyword field must match.

    Returns:
        Whatever ``index.search`` returns for the top 5 results.
    """
    # 'question' is boosted and 'section' is damped; other text fields keep
    # the index's default weight.
    return index.search(
        query=query,
        filter_dict=dict(course=course),
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [7]:
# For every ground-truth question, record which of the returned results
# (if any) is the expected document.
relevance_total = []

for record in tqdm(ground_truth):
    hits = minsearch_search(query=record['question'], course=record['course'])
    relevance_total.append([hit['id'] == record['document'] for hit in hits])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [8]:
# What's the hitrate for this approach?
# i.e. the share of ground-truth questions whose expected document appears
# among the 5 results returned by minsearch_search.

hit_rate(relevance_total)

0.848714069591528

#### Embeddings

In [9]:
# Only the 'question' field of each document is embedded here.
texts = [doc['question'] for doc in documents]

# TF-IDF features reduced to 128 dimensions with truncated SVD
# (random_state fixed so the embeddings are reproducible).
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

#### Question 2. MRR Vector search (question field)

In [10]:
# Create the vector search index
vindex = VectorSearch(keyword_fields=['course'])
vindex.fit(X, documents)

# Pin the pipeline that produced X. The name `pipeline` is rebound to a
# separately fitted pipeline in the "Question 5" section; looking it up
# inside the function at call time would then encode queries with a model
# that does not match the vectors stored in `vindex`.
question_pipeline = pipeline

# Define a vector search function
def minsearch_vector_search(query, course):
    """Search the question-only vector index for ``query`` within a course.

    Args:
        query: free-text query string.
        course: value the 'course' keyword field must match.

    Returns:
        Whatever ``vindex.search`` returns for the top 5 results.
    """
    # transform() works on a batch, so wrap the query and take the single row.
    query_vector = question_pipeline.transform([query])

    results = vindex.search(
        query_vector[0],
        filter_dict={'course': course},
        num_results=5
    )

    return results

# Score the question-only vector search with the shared evaluate() helper;
# the lambda adapts a ground-truth record to the (query, course) signature.
results_vector = evaluate(
    ground_truth,
    lambda q: minsearch_vector_search(query=q['question'], course=q['course']),
)

print(f"MRR: {results_vector['mrr']}")

  0%|          | 0/4627 [00:00<?, ?it/s]

MRR: 0.3571284489590088


#### Question 3. Hitrate Vector search (question + text fields)

In [11]:
# Embed the concatenation of each document's question and answer text.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Same TF-IDF + SVD configuration as the question-only pipeline, fitted
# separately on the combined texts.
pipeline_combined = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X_combined = pipeline_combined.fit_transform(texts)

# Vector index over the combined embeddings, filterable by course.
vindex_combined = VectorSearch(keyword_fields=['course'])
vindex_combined.fit(X_combined, documents)

# Search function backed by the question + text embeddings
def minsearch_vector_search_combined(query, course):
    """Search the combined (question + text) vector index within a course.

    Args:
        query: free-text query string.
        course: value the 'course' keyword field must match.

    Returns:
        Whatever ``vindex_combined.search`` returns for the top 5 results.
    """
    # transform() works on a batch; take the single resulting row.
    encoded_query = pipeline_combined.transform([query])[0]

    return vindex_combined.search(
        encoded_query,
        filter_dict=dict(course=course),
        num_results=5,
    )

# Score the combined-field vector search with the shared evaluate() helper.
results_combined = evaluate(
    ground_truth,
    lambda q: minsearch_vector_search_combined(query=q['question'], course=q['course']),
)

print(f"Hit rate: {results_combined['hit_rate']}")

  0%|          | 0/4627 [00:00<?, ?it/s]

Hit rate: 0.8210503566025502


#### Question 4. MRR Qdrant

##### Note: `jinaai/jina-embeddings-v2-small-en` crashed my kernel, so I switched to `all-MiniLM-L6-v2`.

In [12]:
# In-memory Qdrant instance
client = QdrantClient(":memory:")

# Sentence-transformer used for both documents and queries
model_handle = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_handle)

# One text per document: question followed by the answer text
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Encode all documents in a single call
embeddings = model.encode(texts)

# The collection's vector size is taken from the embedding matrix itself
collection_name = "documents"
client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=embeddings.shape[1], distance=Distance.COSINE),
)

# One point per document: position as id, embedding as vector, and the
# full document as payload so search hits can be compared by 'id'.
points = [
    PointStruct(id=position, vector=vector.tolist(), payload=doc)
    for position, (doc, vector) in enumerate(zip(documents, embeddings))
]

client.upsert(collection_name=collection_name, points=points)

# Define search function for Qdrant
def qdrant_search(query, course, limit=5):
    """Return the payloads of the nearest documents for ``query`` in a course.

    Args:
        query: free-text query string; encoded with the same model that
            produced the stored document embeddings.
        course: value the payload's 'course' field must match.
        limit: maximum number of results to return.

    Returns:
        List of document payloads in the order Qdrant returned the hits.
    """
    # encode() works on a batch; take the single resulting vector.
    query_vector = model.encode([query])[0]

    # query_points() replaces the deprecated client.search(); the matched
    # points are exposed on the response's `.points` attribute.
    response = client.query_points(
        collection_name=collection_name,
        query=query_vector.tolist(),
        query_filter=Filter(
            must=[
                FieldCondition(
                    key="course",
                    match=MatchValue(value=course)
                )
            ]
        ),
        limit=limit,
        with_payload=True,
    )

    # evaluate() compares result['id'], which lives in the stored payload.
    return [point.payload for point in response.points]

# Score the Qdrant-backed search with the shared evaluate() helper.
results_qdrant = evaluate(
    ground_truth,
    lambda q: qdrant_search(query=q['question'], course=q['course'], limit=5),
)

print(f"MRR: {results_qdrant['mrr']}")

  0%|          | 0/4627 [00:00<?, ?it/s]

  search_result = client.search(


MRR: 0.8271990490598672


#### Question 5. Average cosine

In [13]:
def cosine(u, v):
    """Return the cosine similarity of two vectors.

    Both arguments must provide a ``.dot`` method (e.g. 1-D numpy arrays).
    No guard is applied for zero-length vectors.
    """
    inner_product = u.dot(v)
    norm_product = np.sqrt(u.dot(u)) * np.sqrt(v.dot(v))
    return inner_product / norm_product

In [14]:
# CSV of RAG results; the cells below read its 'answer_llm', 'answer_orig'
# and 'question' columns.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [15]:
# TF-IDF + truncated SVD pipeline for comparing answers (fitted in the next cell).
# NOTE(review): this rebinds the notebook-level name `pipeline`, which was
# previously the pipeline fitted on document questions. Any code that looks
# up `pipeline` at call time will see this new object once this cell has run;
# consider a distinct name here.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [16]:
# Fit on one string per row that concatenates the LLM answer, the original
# answer and the question, so all three contribute to the fitted model.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [20]:
# Project both answer columns with the fitted pipeline
answer_llm_vectors = pipeline.transform(df_results.answer_llm)
answer_orig_vectors = pipeline.transform(df_results.answer_orig)

# Row-wise cosine similarity between each LLM answer and its original answer
cosine_similarities = [
    cosine(llm_vec.flatten(), orig_vec.flatten())
    for llm_vec, orig_vec in zip(answer_llm_vectors, answer_orig_vectors)
]

# Mean similarity over all answer pairs
average_cosine = np.mean(cosine_similarities)
print(f"Average cosine similarity: {average_cosine}")

Average cosine similarity: 0.8415841233490402


#### Question 6. Average Rouge-1 F1

In [21]:
!pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [23]:
# Scorer reused by the full-dataframe computation below.
rouge_scorer = Rouge()

# Sanity check on a single row (position 10): compare the LLM answer with
# the original answer.
r = df_results.iloc[10]
# get_scores returns a list; take its first element, a dict keyed by
# 'rouge-1', 'rouge-2' and 'rouge-l' (see the output below).
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [24]:
# Let's compute it for the pairs in the entire dataframe. What's the average Rouge-1 F1?

# Rouge-1 F1 for every (LLM answer, original answer) pair, in row order
rouge_f1_scores = [
    rouge_scorer.get_scores(llm_answer, orig_answer)[0]['rouge-1']['f']
    for llm_answer, orig_answer in zip(df_results.answer_llm, df_results.answer_orig)
]

# Mean Rouge-1 F1 over all pairs
average_rouge_f1 = np.mean(rouge_f1_scores)
print(f"Average Rouge-1 F1: {average_rouge_f1}")

Average Rouge-1 F1: 0.3516946452113943
